<a href="https://colab.research.google.com/github/srimanthtenneti/Cuss-Word-Detector---LSTM/blob/master/LSTM_Based_Cuss_word_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cuss word Detector


This code implements a cuss word detector using some samples and an LSTM array. A total of 7 cuss words are going to be taken in various contexts and the detector is going to be implemented.



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
import matplotlib.pyplot as ply
import numpy as np

# Target Words

1. Fuck
2. Bastard
3. Dickhead
4. Prick
5. Pussy
6. Fuck off
7. Cock

The above are all the cuss words that the detector aims to detect.

# Data Set 

The dataset is a collection of certain sentences that contain abusive words and the corrosponding tags if the given word in the sentence is abusive or not.

Tags : {"O" : 0 , "CS" : 1}

O : OK not abusive

CS : Offensive language

In [3]:
data = [("What the fuck".lower().split() , ["O","O","CS"]),
        ("The boy asked him to fuckoff".lower().split() ,["O","O","O","O","O","CS"]),
        ("I hate that bastard".lower().split() , ["O","O","O","CS"]),
        ("He is a dicked".lower().split(),["O","O","O","CS"]),
        ("Hey prick".lower().split(),["O","CS"]),
        ("What a pussy you are".lower().split() , ["O","O","CS","O","O"]),
        ("Dont be a cock".lower().split(),["O","O","O","CS"])]

# Reprasenting the data in numerical format

In [4]:
word2idx = {}

for sent , tag in data:
  for word in sent:
    if word not in word2idx:
      word2idx[word] = len(word2idx)

tag2idx = {"O" : 0 , "CS" : 1}
tag2rev = {0 : "O" , 1 : "CS"}

In [5]:
print("The word set is : {}".format(word2idx))

The word set is : {'what': 0, 'the': 1, 'fuck': 2, 'boy': 3, 'asked': 4, 'him': 5, 'to': 6, 'fuckoff': 7, 'i': 8, 'hate': 9, 'that': 10, 'bastard': 11, 'he': 12, 'is': 13, 'a': 14, 'dicked': 15, 'hey': 16, 'prick': 17, 'pussy': 18, 'you': 19, 'are': 20, 'dont': 21, 'be': 22, 'cock': 23}


In [6]:
print("The training data is : {}".format(data))

The training data is : [(['what', 'the', 'fuck'], ['O', 'O', 'CS']), (['the', 'boy', 'asked', 'him', 'to', 'fuckoff'], ['O', 'O', 'O', 'O', 'O', 'CS']), (['i', 'hate', 'that', 'bastard'], ['O', 'O', 'O', 'CS']), (['he', 'is', 'a', 'dicked'], ['O', 'O', 'O', 'CS']), (['hey', 'prick'], ['O', 'CS']), (['what', 'a', 'pussy', 'you', 'are'], ['O', 'O', 'CS', 'O', 'O']), (['dont', 'be', 'a', 'cock'], ['O', 'O', 'O', 'CS'])]


In [7]:
def prepare_sequence(seq , to_idx):
  idxs = [to_idx[word] for word in seq]
  idxs = np.array(idxs)
  return torch.tensor(idxs)

In [10]:
testsent = "fuckoff boy".lower().split()
inp = prepare_sequence(testsent , word2idx)
print("The test sentence {} is tranlated to {}\r\n".format(testsent , inp))

The test sentence ['fuckoff', 'boy'] is tranlated to tensor([7, 3])



# Cuss word tagger

The class below implements a LSTM tagger that tags all the cuss words in the given sentence

In [11]:
class LSTMTagger(nn.Module):

  def __init__(self,embedding_dim,hidden_dim,vocab_size,tagset_size):

    super(LSTMTagger , self).__init__()

    self.hidden_dim = hidden_dim

    self.word_embedding = nn.Embedding(vocab_size , embedding_dim= embedding_dim)

    self.lstm = nn.LSTM(input_size= embedding_dim , hidden_size = hidden_dim)

    self.hidden2tag = nn.Linear(hidden_dim , tagset_size)

    self.hidden = self.init_hidden()

  def init_hidden(self):

    return (torch.randn(1 , 1 , self.hidden_dim),
           torch.randn(1 , 1 , self.hidden_dim))

  def forward(self , sentence):

    embeds = self.word_embedding(sentence)

    lstm_out , hidden_out = self.lstm(embeds.view(len(sentence) , 1 , -1) , self.hidden) 

    tag_outputs = self.hidden2tag(lstm_out.view(len(sentence) , -1))
    tag_scores = F.log_softmax(tag_outputs , dim = 1)

    return tag_scores   

In [12]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
model = LSTMTagger(EMBEDDING_DIM , HIDDEN_DIM , len(word2idx) , len(tag2idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters() , lr = 0.1)

In [17]:
print("Input sent : {}".format(inp))
tags = model(inp)
_,pred_tags = torch.max(tags , 1)
print("Pred tag : {}".format(pred_tags))

Input sent : tensor([7, 3])
Pred tag : tensor([0, 0])


In [21]:
n_epochs = 300

for epoch in range(n_epochs):

  epoch_loss = 0.0

  for sent , tags in data:

    model.zero_grad()

    input_sent = prepare_sequence(sent , word2idx)
    tag = prepare_sequence(tags , tag2idx)

    model.hidden = model.init_hidden()

    output = model(input_sent)

    loss = loss_function(output , tag)

    epoch_loss += loss.item()

    loss.backward()

    optimizer.step()

  if epoch % 20 == 19:
    print("Epoch : {} , loss : {}".format(epoch , epoch_loss / len(data)))

Epoch : 19 , loss : 0.5189785148416247
Epoch : 39 , loss : 0.4222014205796378
Epoch : 59 , loss : 0.3108013996056148
Epoch : 79 , loss : 0.1471867822110653
Epoch : 99 , loss : 0.08494963124394417
Epoch : 119 , loss : 0.03300481888332537
Epoch : 139 , loss : 0.0257016835468156
Epoch : 159 , loss : 0.017642608111990348
Epoch : 179 , loss : 0.008294256370780724
Epoch : 199 , loss : 0.007141864226598825
Epoch : 219 , loss : 0.005498719235349979
Epoch : 239 , loss : 0.01046386590626623
Epoch : 259 , loss : 0.004956835481737342
Epoch : 279 , loss : 0.003309302165039948
Epoch : 299 , loss : 0.002649571347449507


In [22]:
print("Input sent : {}".format(inp))
tags = model(inp)
_,pred_tags = torch.max(tags , 1)
print("Pred tag : {}".format(pred_tags))

Input sent : tensor([7, 3])
Pred tag : tensor([1, 0])


In [32]:
pred = np.array(pred_tags)
pred

array([1, 0])

In [45]:
testsent = "You ".lower().split()
inp = prepare_sequence(testsent , word2idx)

print("Input sent : {}".format(testsent))
tags = model(inp)
_,pred_tags = torch.max(tags , 1)
print("Pred tag : {}".format(pred_tags))
pred = np.array(pred_tags)

for i in range(len(testsent)):
  print("Word : {} , Predicted tag : {}".format(testsent[i] , tag2rev[pred[i]]))

Input sent : ['you', 'cock']
Pred tag : tensor([0, 1])
Word : you , Predicted tag : O
Word : cock , Predicted tag : CS
