# Structure Based Hate Speech Detection

In [5]:
import csv
import nltk
import re
import numpy as np
from tqdm import tqdm

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from nltk.corpus import stopwords

In [3]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
DATA_PATH = "../data/"

In [5]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter=',')]
    return data

def getTweets(raw):
    #pass
    data = [x[6] for x in raw]
    return np.array(data)

def getClass(raw):
    #pass
    classes = [x[5] for x in raw]
    return np.array(classes)

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        tweet = tokenize(tweet)
#         print(tweet)
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
#     print(text)
    return text.split()
    #return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("F1 score:   ", f1)
    print("Avg Recall: ", rec)    
    print("Accuracy:   ", acc)

In [6]:
tweet = "!!! rt: as a woman"
tweet = re.sub(r"[^a-zA-Z]+", " ", tweet)
tweet

' rt as a woman'

In [6]:
DATA = DATA_PATH + "labeled_data.csv"

In [7]:
en_stopwords = set(stopwords.words("english")) 

raw = readData(DATA) 
r_tweets = getTweets(raw)
classes = getClass(raw)
tweets = preprocess(r_tweets)

In [8]:
X = [x for x in r_tweets]
X = np.delete(np.array(X), [0])
y = np.delete(classes, [0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# X_train = np.array(X_train)

## Word-level Tokens with CountVectorizer

In [9]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = True,
    tokenizer = tokenize,
    ngram_range=(1, 1),
    stop_words = en_stopwords)
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

### Logistic Regression

In [10]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [11]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)



F1 score:    0.8542912117582977
Avg Recall:  0.5827025804232632
Accuracy:    0.8769416986080291


### SVM

In [12]:
classifier = SVC(C = 0.1)

In [13]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)



F1 score:    0.6705231536335549
Avg Recall:  0.3333333333333333
Accuracy:    0.770425660681864


  'precision', 'predicted', average, warn_for)


## Char-level Tokens with CountVectorizer

In [14]:
vectorizer = CountVectorizer(
    analyzer = 'char',
    lowercase = True,
    tokenizer = tokenize,
    ngram_range=(2, 6),
    stop_words = en_stopwords)
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

### Logistic Regression

In [15]:
classifier = LogisticRegression(C = 0.1, solver='sag')

In [16]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)



F1 score:    0.888798971979161
Avg Recall:  0.6904279088071067
Accuracy:    0.8971151906394997


### SVM

In [17]:
classifier = SVC(C = 0.001)

In [18]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)



F1 score:    0.6705231536335549
Avg Recall:  0.3333333333333333
Accuracy:    0.770425660681864


  'precision', 'predicted', average, warn_for)


## DL Models

## Using Torchtext

In [19]:
from torchtext.data import Field, TabularDataset
def cust_preprocess(tweet):
    tweet = removePattern(tweet, "@[\w]*")
    tweet = tweet.replace("#", "") # Removing '#' from hashtags
    tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
    tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
    tweet = re.sub(" +", " ", tweet)
    tweet = tweet.lower()
    tweet = tokenize(tweet)
    return tweet

In [20]:
TEXT = Field(sequential = True, tokenize = cust_preprocess, lower=True)
LABEL = Field(sequential = False, use_vocab=False)

In [21]:
tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("ct", None),
                 ("count", None),
                 ("hate_speech", LABEL),
                 ("offensive", LABEL),
                 ("neither", LABEL),
                 ("label", None),
                 ("tweet", TEXT)]

dt = TabularDataset(
               path=DATA_PATH + "labeled_data-mod.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)
trn, dev,tst = dt.split([0.8,0.1,0.1])

In [22]:
TEXT.build_vocab(trn)

In [23]:
from torchtext.data import Iterator, BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, dev_iter = BucketIterator.splits(
     (trn, dev), # we pass in the datasets we want the iterator to draw data from
     batch_sizes=(64, 64),
     device=device, # if you want to use the GPU, specify the GPU number here
     sort_key=lambda x: len(x.tweet), # the BucketIterator needs to be told what function it should use to group the data.
     sort_within_batch=False,
     repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False)

In [24]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y

    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [25]:
train_dl = BatchWrapper(train_iter, "tweet", ["hate_speech","offensive","neither"])
test_dl = BatchWrapper(test_iter, "tweet", ["hate_speech","offensive","neither"])
dev_dl = BatchWrapper(dev_iter, "tweet", ["hate_speech","offensive","neither"])

In [26]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [27]:
class LSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 3)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

## LSTM 128-dim embedding

In [28]:
em_sz = 128
nh = 250
nl = 3
model = LSTMBaseline(nh, emb_dim=em_sz)
model.to(device)

  "num_layers={}".format(dropout, num_layers))


LSTMBaseline(
  (embedding): Embedding(31384, 128)
  (encoder): LSTM(128, 250, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=250, out_features=3, bias=True)
)

In [29]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 10
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        running_loss += loss.data.item() * x.size(0)

    epoch_loss = running_loss / len(trn)

    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in tqdm(dev_dl):
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(dev)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [00:01<00:00, 169.17it/s]
100%|██████████| 39/39 [00:00<00:00, 557.25it/s]
  4%|▍         | 13/310 [00:00<00:02, 127.05it/s]

Epoch: 1, Training Loss: 0.1695, Validation Loss: 0.0691


100%|██████████| 310/310 [00:01<00:00, 176.09it/s]
100%|██████████| 39/39 [00:00<00:00, 592.11it/s]
  4%|▍         | 13/310 [00:00<00:02, 124.24it/s]

Epoch: 2, Training Loss: 0.1143, Validation Loss: 0.0563


100%|██████████| 310/310 [00:01<00:00, 174.91it/s]
100%|██████████| 39/39 [00:00<00:00, 581.63it/s]
  4%|▍         | 12/310 [00:00<00:02, 119.31it/s]

Epoch: 3, Training Loss: 0.0860, Validation Loss: 0.0553


100%|██████████| 310/310 [00:01<00:00, 175.97it/s]
100%|██████████| 39/39 [00:00<00:00, 552.81it/s]
  4%|▍         | 13/310 [00:00<00:02, 123.44it/s]

Epoch: 4, Training Loss: 0.0685, Validation Loss: 0.0587


100%|██████████| 310/310 [00:01<00:00, 173.98it/s]
100%|██████████| 39/39 [00:00<00:00, 570.93it/s]
  4%|▍         | 13/310 [00:00<00:02, 123.43it/s]

Epoch: 5, Training Loss: 0.0563, Validation Loss: 0.0628


100%|██████████| 310/310 [00:01<00:00, 175.61it/s]
100%|██████████| 39/39 [00:00<00:00, 592.54it/s]
  4%|▍         | 12/310 [00:00<00:02, 117.82it/s]

Epoch: 6, Training Loss: 0.0437, Validation Loss: 0.0676


100%|██████████| 310/310 [00:01<00:00, 175.58it/s]
100%|██████████| 39/39 [00:00<00:00, 594.74it/s]
  4%|▍         | 13/310 [00:00<00:02, 123.45it/s]

Epoch: 7, Training Loss: 0.0375, Validation Loss: 0.0735


100%|██████████| 310/310 [00:01<00:00, 172.89it/s]
100%|██████████| 39/39 [00:00<00:00, 562.00it/s]
  4%|▍         | 12/310 [00:00<00:02, 118.92it/s]

Epoch: 8, Training Loss: 0.0333, Validation Loss: 0.0722


100%|██████████| 310/310 [00:01<00:00, 176.41it/s]
100%|██████████| 39/39 [00:00<00:00, 545.63it/s]
  5%|▍         | 14/310 [00:00<00:02, 133.11it/s]

Epoch: 9, Training Loss: 0.0278, Validation Loss: 0.0857


100%|██████████| 310/310 [00:01<00:00, 177.19it/s]
100%|██████████| 39/39 [00:00<00:00, 550.41it/s]

Epoch: 10, Training Loss: 0.0246, Validation Loss: 0.0886





In [30]:
# test_iter.
with torch.no_grad():
    test_preds = []
    ground_truth = []
    for x,y in tqdm(test_dl):
        preds = model(x)
        preds = F.softmax(preds)
        preds = preds.cpu()
        preds = preds.data.numpy()
        for result in preds:
            if np.argmax(result) == 0:
                test_preds.append([1, 0, 0])
            elif np.argmax(result) == 1:
                test_preds.append([0, 1, 0])
            elif np.argmax(result) == 2:
                test_preds.append([0, 0, 1])
        for val in y:
            ground_truth.append(val.cpu().data.numpy())

  import sys
100%|██████████| 39/39 [00:00<00:00, 243.43it/s]


In [31]:
test_preds = np.array(test_preds)
ground_truth = np.array(ground_truth)
evaluate(ground_truth, test_preds)

F1 score:    0.811162229805579
Avg Recall:  0.5276322175928475
Accuracy:    0.8256658595641646


## LSTM 256-dim embedding

In [32]:
em_sz = 256
nh = 250
nl = 3
model = LSTMBaseline(nh, emb_dim=em_sz)
model.to(device)

LSTMBaseline(
  (embedding): Embedding(31384, 256)
  (encoder): LSTM(256, 250, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=250, out_features=3, bias=True)
)

In [33]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 10
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        running_loss += loss.data.item() * x.size(0)

    epoch_loss = running_loss / len(trn)

    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in tqdm(dev_dl):
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(dev)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [00:02<00:00, 144.28it/s]
100%|██████████| 39/39 [00:00<00:00, 552.44it/s]
  3%|▎         | 10/310 [00:00<00:03, 98.03it/s]

Epoch: 1, Training Loss: 0.1856, Validation Loss: 0.0952


100%|██████████| 310/310 [00:02<00:00, 144.58it/s]
100%|██████████| 39/39 [00:00<00:00, 561.54it/s]
  3%|▎         | 10/310 [00:00<00:03, 99.32it/s]

Epoch: 2, Training Loss: 0.1844, Validation Loss: 0.0918


100%|██████████| 310/310 [00:02<00:00, 145.14it/s]
100%|██████████| 39/39 [00:00<00:00, 551.80it/s]
  3%|▎         | 10/310 [00:00<00:03, 99.37it/s]

Epoch: 3, Training Loss: 0.1762, Validation Loss: 0.0899


100%|██████████| 310/310 [00:02<00:00, 145.04it/s]
100%|██████████| 39/39 [00:00<00:00, 570.93it/s]
  3%|▎         | 10/310 [00:00<00:03, 98.83it/s]

Epoch: 4, Training Loss: 0.1275, Validation Loss: 0.0696


100%|██████████| 310/310 [00:02<00:00, 144.32it/s]
100%|██████████| 39/39 [00:00<00:00, 542.08it/s]
  4%|▎         | 11/310 [00:00<00:02, 104.39it/s]

Epoch: 5, Training Loss: 0.0948, Validation Loss: 0.0693


100%|██████████| 310/310 [00:02<00:00, 144.75it/s]
100%|██████████| 39/39 [00:00<00:00, 540.29it/s]
  4%|▎         | 11/310 [00:00<00:02, 105.55it/s]

Epoch: 6, Training Loss: 0.0745, Validation Loss: 0.0732


100%|██████████| 310/310 [00:02<00:00, 144.21it/s]
100%|██████████| 39/39 [00:00<00:00, 538.23it/s]
  4%|▎         | 11/310 [00:00<00:02, 105.62it/s]

Epoch: 7, Training Loss: 0.0642, Validation Loss: 0.0730


100%|██████████| 310/310 [00:02<00:00, 145.06it/s]
100%|██████████| 39/39 [00:00<00:00, 545.07it/s]
  4%|▎         | 11/310 [00:00<00:02, 102.83it/s]

Epoch: 8, Training Loss: 0.0529, Validation Loss: 0.0755


100%|██████████| 310/310 [00:02<00:00, 144.68it/s]
100%|██████████| 39/39 [00:00<00:00, 529.98it/s]
  4%|▎         | 11/310 [00:00<00:02, 107.04it/s]

Epoch: 9, Training Loss: 0.0481, Validation Loss: 0.0905


100%|██████████| 310/310 [00:02<00:00, 145.03it/s]
100%|██████████| 39/39 [00:00<00:00, 539.57it/s]

Epoch: 10, Training Loss: 0.0428, Validation Loss: 0.0925





In [34]:
# test_iter.
with torch.no_grad():
    test_preds = []
    ground_truth = []
    for x,y in tqdm(test_dl):
        preds = model(x)
        preds = F.softmax(preds)
        preds = preds.cpu()
        preds = preds.data.numpy()
        for result in preds:
            if np.argmax(result) == 0:
                test_preds.append([1, 0, 0])
            elif np.argmax(result) == 1:
                test_preds.append([0, 1, 0])
            elif np.argmax(result) == 2:
                test_preds.append([0, 0, 1])
        for val in y:
            ground_truth.append(val.cpu().data.numpy())

  import sys
100%|██████████| 39/39 [00:00<00:00, 244.13it/s]


In [35]:
test_preds = np.array(test_preds)
ground_truth = np.array(ground_truth)
evaluate(ground_truth, test_preds)

F1 score:    0.8160748716905766
Avg Recall:  0.5613161444854359
Accuracy:    0.8280871670702179
