# Structure Based Hate Speech Detection

In [11]:
import csv
import nltk
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

from nltk.corpus import stopwords
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split
# Directory (relative path) that the input data files are read from.
DATA_PATH = "../data/"

In [None]:
# Starts out empty; nothing in the visible part of this file adds to it.
# NOTE(review): presumably meant to collect the corpus vocabulary — confirm or remove.
vocab_set = set()

In [60]:
def readData(path):
    """Read a comma-delimited CSV file into a list of rows.

    Args:
        path: Location of the CSV file.

    Returns:
        A list with one entry per record in the file, first record included;
        each entry is the list of string fields produced by ``csv.reader``.
    """
    # newline='' is what the csv module asks of its input file: the reader
    # then handles line endings itself, so line breaks embedded in quoted
    # fields come through exactly as stored instead of being translated.
    with open(path, 'r', newline='') as file:
        return list(csv.reader(file, delimiter=','))

def getTweets(raw):
    """Collect the field at index 6 of every row into a NumPy array.

    Args:
        raw: Sequence of rows, each indexable up to position 6
            (presumably the tweet-text column — confirm against the CSV).

    Returns:
        ``np.ndarray`` holding one value per row, in row order.
    """
    return np.array([row[6] for row in raw])

def getClass(raw):
    """Collect the field at index 5 of every row into a NumPy array.

    Args:
        raw: Sequence of rows, each indexable up to position 5
            (presumably the class-label column — confirm against the CSV).

    Returns:
        ``np.ndarray`` holding one value per row, in row order.
    """
    labels = []
    for row in raw:
        labels.append(row[5])
    return np.array(labels)

def removePattern(tweet, pattern):
    """Delete every match of a regular expression from a string.

    Args:
        tweet: Text to clean.
        pattern: Regular expression (string or compiled pattern) whose
            matches are removed.

    Returns:
        ``tweet`` with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself rather than feeding each matched
    # text back into re.sub as a new regex. The latter misreads regex
    # metacharacters in the matched text, lets a short match (e.g. "@a")
    # strip the prefix of a longer one ("@ab" -> "b"), and breaks on patterns
    # with groups, where findall returns group contents instead of matches.
    return re.sub(pattern, '', tweet)

def preprocess(data):
    """Clean and tokenize a collection of raw tweets.

    Per tweet, in order: @-mentions are removed, '#' characters are dropped
    (the hashtag word itself stays), http(s) URLs are replaced by a
    placeholder, runs of spaces are collapsed to one, the text is lowercased,
    and the result is split into tokens with ``tokenize``.

    Args:
        data: Iterable of tweet strings.

    Returns:
        A list with one token list per input tweet, in input order.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    mention_pattern = r"@[\w]*"
    # Loop-invariant patterns are compiled once up front.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
    )
    space_run = re.compile(" +")

    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, mention_pattern)
        tweet = tweet.replace("#", "")  # Removing '#' from hashtags
        # NOTE(review): an earlier str.replace("[^a-zA-Z#]", " ") call sat
        # here. str.replace takes a literal, not a regex, and the literal
        # contains '#', which was just removed, so it could never match.
        # Punctuation is therefore NOT stripped. The dead call is dropped
        # and the output left unchanged; real punctuation stripping would
        # have to run after URL replacement and would alter the tokens.
        tweet = url_pattern.sub("<URL>", tweet)
        tweet = space_run.sub(" ", tweet)
        # Lowercasing runs after URL replacement, so the placeholder reaches
        # the tokens as "<url>".
        tweet = tweet.lower()
        cleanData.append(tokenize(tweet))
    return cleanData

def tokenize(text):
    """Split ``text`` into tokens on single space characters.

    Only the space character separates tokens, so consecutive, leading, or
    trailing spaces yield empty-string tokens.

    Args:
        text: String to split.

    Returns:
        List of substrings between spaces.
    """
    tokens = text.split(' ')
    return tokens

def evaluate(target, predicted):
    """Print classification metrics for a set of predictions.

    Reports the weighted-average F1 score, the macro-average recall, and the
    accuracy, one per line.

    Args:
        target: Ground-truth labels.
        predicted: Predicted labels, aligned with ``target``.
    """
    weighted_f1 = f1_score(target, predicted, average='weighted')
    accuracy = accuracy_score(target, predicted)
    macro_recall = recall_score(target, predicted, average='macro')

    report = (
        ("F1 score:   ", weighted_f1),
        ("Avg Recall: ", macro_recall),
        ("Accuracy:   ", accuracy),
    )
    for label, value in report:
        print(label, value)

In [61]:
# Full path of the labeled CSV file, built by appending the file name to DATA_PATH.
DATA = DATA_PATH + "labeled_data.csv"

In [62]:
# NLTK's English stop-word list as a set.
# NOTE(review): not referenced anywhere else in the visible part of this file.
en_stopwords = set(stopwords.words("english")) 

# Read every CSV row, pull out field 6 (getTweets) and field 5 (getClass) of
# each row, then replace the raw tweet strings with their cleaned token lists.
raw = readData(DATA) 
tweets = getTweets(raw)
classes = getClass(raw)
tweets = preprocess(tweets)

In [64]:
# Entry 0 of tweets/classes is skipped (presumably it comes from the CSV
# header row — confirm against the data file).
rows = tweets[1:]

# Each tweet is a token list and the lists differ in length. np.array(rows)
# would try to build a rectangular array: recent NumPy raises ValueError on
# ragged input, and equal-length input would silently become 2-D. Filling a
# 1-D object array element by element always keeps one token list per slot.
X = np.empty(len(rows), dtype=object)
for i, tokens in enumerate(rows):
    X[i] = tokens

y = np.delete(classes, [0])

# 80/20 split; no random_state is passed, so the partition differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [84]:
# from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# class LSTMClassifier(nn.Module):
    
#     def __init__(self, embedding_dim, hidden_dim, output_size, batch_size, num_layers = 1):

#         super(LSTMClassifier, self).__init__()
        
#         self.embedding_dim = embedding_dim
#         self.hidden_dim = hidden_dim
#         self.output_size = output_size
#         self.batch_size = batch_size
#         self.num_layers = num_layers
        
#         # Naive embeddings for testing purposes
#         self.embedding = nn.Embedding(1024, embedding_dim)
        
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = num_layers)
#         self.hidden2out = nn.Linear(hidden_dim, output_size)
                
#         self.hidden = self.init_hidden()
#         self.softmax = nn.LogSoftmax()
        
#         self.dropout_layer = nn.Dropout(p = 0.2)
    
#     def init_hidden(self):
#          return (autograd.Variable(torch.randn(self.num_layers, self.batch_size, self.hidden_dim)),
#                 autograd.Variable(torch.randn(self.num_layers, self.batch_size, self.hidden_dim)))
        
#     def forward(self, sents, lengths):
#         embeds = self.embedding(sents)
#         packed_input = pack_padded_sequence(embeds, lengths)
        
#         lstm_out, self.hidden = self.lstm(packed_input, self.hidden)
#         y = self.hidden2out(lstm_out[-1])
#         y = self.softmax(y)
#         return y

In [85]:
# model = LSTMClassifier(128, 32, 2, 1)
# loss_function = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.1)
# model(sampleT, 10)