In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle

### Importing tenosrflow libraries

In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

### Reading CSV

In [3]:
import pandas as pd
df = pd.read_csv('tweets.csv')

In [4]:
raw_text = df['text']
testing_indices = []
training_indices = []
for i in range(0,len(df)):
    if df.iloc[i,9] == 'test':
        testing_indices.append(i)
    else:
        training_indices.append(i)
        
testing_data = df[df['split'] == 'test']
training_data = df[(df['split'].isin(['train','dev']))]


In [5]:
Y = df['label']
Y = [1 if i == "hate" else 0 for i in Y]

### Text Preprocessing

### Converting to Lower Case

In [6]:
### Lower text
def toLower(data):
    res = []
    for sentence in data:
        res.append(str.lower(sentence))
    return res

In [8]:
cleanset = toLower(raw_text)

### Importing nltk libraries

In [9]:
# Tokenise
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
#nltk.download('punkt')

### Removing stopwords and punctuation

In [10]:

from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
  
filtered_sentences = []

# Removing stopwords
for text in cleanset:
    content = []
    for i in word_tokenize(text):
        if i not in stop_words:
            content.append(i)
    filtered_sentences.append(' '.join(content))
    

tokenizer = nltk.RegexpTokenizer(r"\w+")

filtered_sentences_2 = []

# Removing punctuation

for sentence in filtered_sentences:
    new_words = tokenizer.tokenize(sentence)
    filtered_sentences_2.append(' '.join(new_words))


    

### Removing numbers and text lemmatization

In [11]:
# function to remove numbers
#nltk.download('wordnet')
import re

def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)
 
#nltk.download('wordnet')
filtered_sentences_2 = [remove_numbers(sentence) for sentence in filtered_sentences_2]

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

filtered_sentences_2 = [lemmatizer.lemmatize(sentence) for sentence in filtered_sentences_2]
    

In [23]:
#filtered_sentences_2[:4]

### Filtering useful words

In [12]:
filtered_sentences_3 = []

for sentence in filtered_sentences_2:
    wordsToAdd = []
    for word in word_tokenize(sentence):
        if len(word) >= 3 and word.isalpha():
            wordsToAdd.append(word)
    filtered_sentences_3.append(' '.join(wordsToAdd))
    

In [33]:
#filtered_sentences_3[:3]

['legal husband rape wife', 'dalits lowlife scum', 'dalits lowlives']

### Removing whitespace

In [13]:
filtered_sentences_3 = [sentence.strip() for sentence in filtered_sentences_3]

In [14]:
#filtered_sentences_3[:4]

['legal husband rape wife',
 'dalits lowlife scum',
 'dalits lowlives',
 'better world women dare question']

In [14]:
corpus = filtered_sentences_3

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

### Function for POS Tagging

In [34]:
def posTagging(sentences):
    final = []
    for sentence in sentences:
        res = []
        tok=nltk.tokenize.word_tokenize(sentence) 
        pos=nltk.pos_tag(tok)
        for token in pos:
            res.append(token[0] + "_" + token[1])
        final.append(' '.join(res))
    return final

In [35]:
#tagged_corpus = posTagging(corpus)
#print(tagged_corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Creating BoW model with term frequency

In [16]:
vect = CountVectorizer(max_features=30000,ngram_range=(1,3))
vect.fit(corpus)

CountVectorizer(max_features=30000, ngram_range=(1, 3))

In [17]:
size = len(vect.vocabulary_)
size

30000

In [30]:
#vect.get_feature_names()

In [18]:
bag_of_words = vect.transform(corpus)

In [19]:
X = bag_of_words.toarray()

## LSTM Model

### One Hot Representation

In [20]:
import tensorflow as tf

vocab_size = size

onehot_repr = [one_hot(sentence, vocab_size) for words in corpus]

#onehot_repr


### Embedding Representation

In [21]:
sent_length = 15
embedded_docs = pad_sequences(onehot_repr, padding='pre',maxlen=sent_length)


In [22]:
## Model Creation
embedding_vector_features = 60
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(200)) ## 200 neurons
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 60)            1800000   
_________________________________________________________________
lstm (LSTM)                  (None, 200)               208800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 2,009,001
Trainable params: 2,009,001
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
import numpy as np
X_for_LSTM = np.array(embedded_docs)
Y = np.array(Y)

In [24]:
X_train = np.array([X_for_LSTM[i] for i in range(len(X_for_LSTM)) if i in training_indices])
X_test = np.array([X_for_LSTM[i] for i in range(len(X_for_LSTM)) if i in testing_indices])
#X_test = [i for i in X_for_LSTM if i in testing_indices]
Y_train = np.array([Y[i] for i in range(len(Y)) if i in training_indices])
Y_test = np.array([Y[i] for i in range(len(Y)) if i in testing_indices])

#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X_for_LSTM,Y,test_size=0.2, random_state=25)


In [28]:
## Training deep learning model
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23d9c023550>

### Performance Metrics and Accuracy

In [30]:
predict_x=model.predict(X_test) 
Y_pred=np.argmax(predict_x,axis=1)
from sklearn.metrics import accuracy_score,classification_report

print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.46      1.00      0.63      1857
           1       0.00      0.00      0.00      2205

    accuracy                           0.46      4062
   macro avg       0.23      0.50      0.31      4062
weighted avg       0.21      0.46      0.29      4062



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Gaussian NB

In [20]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=25)

from sklearn.metrics import accuracy_score,classification_report

X_train = [X[i] for i in range(len(X)) if i in training_indices]
X_test = [X[i] for i in range(len(X)) if i in testing_indices]
Y_train = [Y[i] for i in range(len(Y)) if i in training_indices]
Y_test = [Y[i] for i in range(len(Y)) if i in testing_indices]


from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

#print(Y_train[:4])
clf.fit(X_train, Y_train)



print(classification_report(Y_test, clf.predict(X_test)))


              precision    recall  f1-score   support

           0       0.49      0.76      0.60      1857
           1       0.62      0.33      0.43      2205

    accuracy                           0.53      4062
   macro avg       0.56      0.55      0.51      4062
weighted avg       0.56      0.53      0.51      4062



## Logistic Regression

In [26]:
import pickle
from sklearn.linear_model import LogisticRegression


#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=25)

logReg = LogisticRegression(max_iter = 1000)
logReg = logReg.fit(X, Y)

pickle.dump((vect,logReg),open('log_reg_model.pkl','wb'))
#log_from_pickle = pickle.loads(saved_model)

print(classification_report(Y_test, logReg.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1857
           1       0.90      0.91      0.90      2205

    accuracy                           0.89      4062
   macro avg       0.89      0.89      0.89      4062
weighted avg       0.89      0.89      0.89      4062



## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

decTree = DecisionTreeClassifier()

decTree = decTree.fit(X_train, Y_train)

accuracy_score(Y_test, decTree.predict(X_test))

## Basic Multinomial NB

In [31]:
#X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=25)
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [40]:
from sklearn import metrics
import numpy as np
#import itertools

In [34]:
classifier.fit(X_train, Y_train)
pred = classifier.predict(X_test)
print(classification_report(Y_test,classifier.predict(X_test)))

              precision    recall  f1-score   support

           0       0.56      0.56      0.56      1857
           1       0.63      0.63      0.63      2205

    accuracy                           0.60      4062
   macro avg       0.59      0.59      0.59      4062
weighted avg       0.60      0.60      0.60      4062



## Passive Aggressive Classifier Algorithm

In [35]:
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier()

In [37]:
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(classification_report(Y_test,pred))

              precision    recall  f1-score   support

           0       0.70      0.55      0.62      1857
           1       0.68      0.80      0.74      2205

    accuracy                           0.69      4062
   macro avg       0.69      0.68      0.68      4062
weighted avg       0.69      0.69      0.68      4062



## Multinomial Classifier with Hyperparameter

In [38]:
classifier=MultinomialNB(alpha=0.1)

In [41]:
previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,Y_train)
    y_pred=sub_classifier.predict(X_test)
    score = accuracy_score(Y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))
    
print(classification_report(Y_test,classifier.predict(X_test)))



Alpha: 0.0, Score : 0.587149187592319
Alpha: 0.1, Score : 0.5869030034465781
Alpha: 0.2, Score : 0.5876415558838011
Alpha: 0.30000000000000004, Score : 0.587149187592319
Alpha: 0.4, Score : 0.5898572131954702
Alpha: 0.5, Score : 0.5898572131954702
Alpha: 0.6000000000000001, Score : 0.5925652387986213
Alpha: 0.7000000000000001, Score : 0.5937961595273265
Alpha: 0.8, Score : 0.5945347119645494
Alpha: 0.9, Score : 0.5960118168389956
              precision    recall  f1-score   support

           0       0.56      0.56      0.56      1857
           1       0.63      0.63      0.63      2205

    accuracy                           0.60      4062
   macro avg       0.59      0.59      0.59      4062
weighted avg       0.60      0.60      0.60      4062

