# Structure Based Hate Speech Detection

In [1]:
import csv
import nltk
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

from nltk.corpus import stopwords
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split
DATA_PATH = "../data/"

In [2]:
vocab_set = set()

In [4]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter=',')]
    return data

def getTweets(raw):
    #pass
    data = [x[6] for x in raw]
    return np.array(data)

def getClass(raw):
    #pass
    classes = [x[5] for x in raw]
    return np.array(classes)

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        tweet = tokenize(tweet)
#         print(tweet)
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
#     print(text)
    return text.split()
    #return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("F1 score:   ", f1)
    print("Avg Recall: ", rec)    
    print("Accuracy:   ", acc)

In [None]:
DATA = DATA_PATH + "labeled_data.csv"

In [None]:
import pandas as pd
import numpy as np
ds = pd.read_csv(DATA)

In [None]:
for index, row in ds.iterrows(): 
    print(index)
    x = np.argmax([row['hate_speech'], row['offensive_language'], row['neither']])
    if x == 0:
        ds.loc[index, 'hate_speech'] = 1
        ds.loc[index, 'offensive_language'] = 0
        ds.loc[index, 'neither'] = 0
#         row['offensive_language']=0 
#         row['neither'] = 0
    elif x == 1:
        ds.loc[index, 'hate_speech'] = 0
        ds.loc[index, 'offensive_language'] = 1
        ds.loc[index, 'neither'] = 0
#         row['hate_speech'] = 0
        row['offensive_language']=1 
        row['neither'] = 0
    elif x == 2:
        ds.loc[index, 'hate_speech'] = 0
        ds.loc[index, 'offensive_language'] = 0
        ds.loc[index, 'neither'] = 1
#         row['hate_speech'] = 0
#         row['offensive_language']=0 
#         row['neither'] = 1

In [None]:
ds.to_csv(DATA_PATH + "labeled_data-mod.csv")

In [None]:
en_stopwords = set(stopwords.words("english")) 

raw = readData(DATA) 
tweets = getTweets(raw)
classes = getClass(raw)
tweets = preprocess(tweets)

In [None]:
X = [x for x in tweets]
X = np.delete(np.array(X), [0])
y = np.delete(classes, [0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# class LSTMClassifier(nn.Module):
    
#     def __init__(self, embedding_dim, hidden_dim, output_size, batch_size, num_layers = 1):

#         super(LSTMClassifier, self).__init__()
        
#         self.embedding_dim = embedding_dim
#         self.hidden_dim = hidden_dim
#         self.output_size = output_size
#         self.batch_size = batch_size
#         self.num_layers = num_layers
        
#         # Naive embeddings for testing purposes
#         self.embedding = nn.Embedding(1024, embedding_dim)
        
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = num_layers)
#         self.hidden2out = nn.Linear(hidden_dim, output_size)
                
#         self.hidden = self.init_hidden()
#         self.softmax = nn.LogSoftmax()
        
#         self.dropout_layer = nn.Dropout(p = 0.2)
    
#     def init_hidden(self):
#          return (autograd.Variable(torch.randn(self.num_layers, self.batch_size, self.hidden_dim)),
#                 autograd.Variable(torch.randn(self.num_layers, self.batch_size, self.hidden_dim)))
        
#     def forward(self, sents, lengths):
#         embeds = self.embedding(sents)
#         packed_input = pack_padded_sequence(embeds, lengths)
        
#         lstm_out, self.hidden = self.lstm(packed_input, self.hidden)
#         y = self.hidden2out(lstm_out[-1])
#         y = self.softmax(y)
#         return y

In [None]:
# model = LSTMClassifier(128, 32, 2, 1)
# loss_function = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.1)
# model(X, 10)

## Using Torchtext

In [5]:
from torchtext.data import Field, TabularDataset
def cust_preprocess(tweet):
    tweet = removePattern(tweet, "@[\w]*")
    tweet = tweet.replace("#", "") # Removing '#' from hashtags
    tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
    tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
    tweet = re.sub(" +", " ", tweet)
    tweet = tweet.lower()
    tweet = tokenize(tweet)
    return tweet

In [6]:
TEXT = Field(sequential = True, tokenize = cust_preprocess, lower=True)
LABEL = Field(sequential = False, use_vocab=False)

In [7]:
tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("ct", None),
                 ("count", None),
                 ("hate_speech", LABEL),
                 ("offensive", LABEL),
                 ("neither", LABEL),
                 ("label", None),
                 ("tweet", TEXT)]

dt = TabularDataset(
               path=DATA_PATH + "labeled_data-mod.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)
trn, dev,tst = dt.split([0.8,0.1,0.1])

In [8]:
TEXT.build_vocab(trn)

In [None]:
TEXT.vocab.freqs.most_common(10)

In [9]:
from torchtext.data import Iterator, BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, dev_iter = BucketIterator.splits(
     (trn, dev), # we pass in the datasets we want the iterator to draw data from
     batch_sizes=(64, 64),
     device=device, # if you want to use the GPU, specify the GPU number here
     sort_key=lambda x: len(x.tweet), # the BucketIterator needs to be told what function it should use to group the data.
     sort_within_batch=False,
     repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False)

In [10]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y

    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [11]:
train_dl = BatchWrapper(train_iter, "tweet", ["hate_speech","offensive","neither"])
test_dl = BatchWrapper(test_iter, "tweet", ["hate_speech","offensive","neither"])
dev_dl = BatchWrapper(dev_iter, "tweet", ["hate_speech","offensive","neither"])

In [12]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [13]:
class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 3)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [14]:
em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz)
model.to(device)

  "num_layers={}".format(dropout, num_layers))


SimpleBiLSTMBaseline(
  (embedding): Embedding(31438, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=500, out_features=3, bias=True)
)

In [None]:
import tqdm

opt = optim.Adam(model.parameters(), lr=1e-2)
# opt = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.1)
loss_func = nn.BCEWithLogitsLoss()
epochs = 10
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        running_loss += loss.data.item() * x.size(0)

    epoch_loss = running_loss / len(trn)

    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in tqdm.tqdm(dev_dl):
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(dev)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [00:05<00:00, 60.41it/s]
100%|██████████| 39/39 [00:00<00:00, 81.53it/s]
  1%|▏         | 4/310 [00:00<00:09, 32.58it/s]

Epoch: 1, Training Loss: 0.1853, Validation Loss: 0.0955


100%|██████████| 310/310 [00:05<00:00, 47.79it/s]
100%|██████████| 39/39 [00:00<00:00, 131.34it/s]
  3%|▎         | 10/310 [00:00<00:03, 93.89it/s]

Epoch: 2, Training Loss: 0.1470, Validation Loss: 0.0560


100%|██████████| 310/310 [00:05<00:00, 57.80it/s]
100%|██████████| 39/39 [00:00<00:00, 446.12it/s]
  3%|▎         | 8/310 [00:00<00:04, 67.69it/s]

Epoch: 3, Training Loss: 0.1003, Validation Loss: 0.0544


 14%|█▍        | 43/310 [00:00<00:06, 44.40it/s]

In [None]:
with torch.no_grad():
    test_preds = []
    for tt in tqdm.tqdm(test_iter):
        preds = model(tt.tweet)
#         print(tt.tweet)
        preds = preds.cpu()
        preds = preds.data.numpy()
        preds = preds/preds.max()
        
        # the actual outputs of the model are logits, so we need to pass these values to the sigmoid function
        preds = 1 / (1 + np.exp(-preds))
#         print(preds)
        test_preds.append(preds)
#     test_preds = np.hstack(test_preds)

In [None]:
test_preds

In [None]:
x = np.array([[1737.7166, 1942.4241, 1870.5917],[1,2,3]])
x - np.mean(x, axis=1).reshape(1,-1)

In [None]:
x