In [3]:
import warnings
warnings.filterwarnings('ignore')

### Importing tenosrflow libraries

In [4]:
import tensorflow as tf

In [5]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

### Reading CSV

In [6]:
import pandas as pd
df = pd.read_csv('tweets.csv')

In [7]:
raw_text = df['text']

In [8]:
Y = df['label']
Y = [1 if i == "hate" else 0 for i in Y]

### Text Preprocessing

### Converting to Lower Case

In [9]:
### Lower text
def toLower(data):
    res = []
    for sentence in data:
        res.append(str.lower(sentence))
    return res

In [10]:
cleanset = toLower(raw_text)

### Importing nltk libraries

In [11]:
# Tokenise
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
#nltk.download('punkt')

### Removing stopwords and punctuation

In [12]:

from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
  
filtered_sentences = []

# Removing stopwords
for text in cleanset:
    content = []
    for i in word_tokenize(text):
        if i not in stop_words:
            content.append(i)
    filtered_sentences.append(' '.join(content))
    

tokenizer = nltk.RegexpTokenizer(r"\w+")

filtered_sentences_2 = []

# Removing punctuation

for sentence in filtered_sentences:
    new_words = tokenizer.tokenize(sentence)
    filtered_sentences_2.append(' '.join(new_words))


    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Removing numbers and text lemmatization

In [13]:
# function to remove numbers
#nltk.download('wordnet')
import re

def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)
 
#nltk.download('wordnet')
filtered_sentences_2 = [remove_numbers(sentence) for sentence in filtered_sentences_2]

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

filtered_sentences_2 = [lemmatizer.lemmatize(sentence) for sentence in filtered_sentences_2]
    

In [13]:
#filtered_sentences_2[:4]

['legal husband rape wife',
 'dalits lowlife scum',
 'dalits lowlives',
 'better world women dare question men']

### Filtering useful words

In [14]:
filtered_sentences_3 = []

for sentence in filtered_sentences_2:
    wordsToAdd = []
    for word in word_tokenize(sentence):
        if len(word) >= 3 and word.isalpha():
            wordsToAdd.append(word)
    filtered_sentences_3.append(' '.join(wordsToAdd))
    

In [33]:
#filtered_sentences_3[:3]

['legal husband rape wife', 'dalits lowlife scum', 'dalits lowlives']

### Removing whitespace

In [15]:
filtered_sentences_3 = [sentence.strip() for sentence in filtered_sentences_3]

In [14]:
#filtered_sentences_3[:4]

['legal husband rape wife',
 'dalits lowlife scum',
 'dalits lowlives',
 'better world women dare question']

In [16]:
corpus = filtered_sentences_3

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

### Function for POS Tagging

In [34]:
def posTagging(sentences):
    final = []
    for sentence in sentences:
        res = []
        tok=nltk.tokenize.word_tokenize(sentence) 
        pos=nltk.pos_tag(tok)
        for token in pos:
            res.append(token[0] + "_" + token[1])
        final.append(' '.join(res))
    return final

In [35]:
#tagged_corpus = posTagging(corpus)
#print(tagged_corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Creating BoW model with term frequency

In [25]:
vect = CountVectorizer(max_features=30000,ngram_range=(1,3))
vect.fit(corpus)

CountVectorizer(max_features=30000, ngram_range=(1, 3))

In [26]:
size = len(vect.vocabulary_)
size

30000

In [2]:
#vect.get_feature_names()

In [27]:
bag_of_words = vect.transform(corpus)

In [28]:
X = bag_of_words.toarray()

## LSTM Model

### One Hot Representation

In [29]:
import tensorflow as tf

vocab_size = size

onehot_repr = [one_hot(sentence, vocab_size) for words in corpus]

#onehot_repr


### Embedding Representation

In [30]:
sent_length = 15
embedded_docs = pad_sequences(onehot_repr, padding='pre',maxlen=sent_length)


In [31]:
## Model Creation
embedding_vector_features = 60
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(200)) ## 200 neurons
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 60)            1800000   
_________________________________________________________________
lstm (LSTM)                  (None, 200)               208800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 2,009,001
Trainable params: 2,009,001
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
import numpy as np
X_for_LSTM = np.array(embedded_docs)
Y = np.array(Y)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_for_LSTM,Y,test_size=0.2, random_state=25)


In [35]:
## Training deep learning model
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2049d91adc0>

### Performance Metrics and Accuracy

In [36]:
predict_x=model.predict(X_test) 
Y_pred=np.argmax(predict_x,axis=1)
from sklearn.metrics import accuracy_score

print(accuracy_score(Y_test,Y_pred))

0.46695384615384616


## Gaussian NB

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=25)

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

#print(Y_train[:4])
clf.fit(X_train, Y_train)

from sklearn.metrics import accuracy_score

accuracy_score(Y_test, clf.predict(X_test))


0.5384615384615384

## Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression(max_iter = 1000)
logReg = logReg.fit(X_train, Y_train)


accuracy_score(Y_test, logReg.predict(X_test))

0.7355076923076923

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

decTree = DecisionTreeClassifier()

decTree = decTree.fit(X_train, Y_train)

accuracy_score(Y_test, decTree.predict(X_test))

## Basic Multinomial NB

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=25)
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [40]:
from sklearn import metrics
import numpy as np
#import itertools

In [42]:
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
metrics.confusion_matrix(y_test, pred)


accuracy:   0.632


array([[2233, 1561],
       [1427, 2904]], dtype=int64)

## Passive Aggressive Classifier Algorithm

In [43]:
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier()

In [44]:
clf.fit(X_train, y_train)
pred = linear_clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
print(metrics.confusion_matrix(y_test, pred))

accuracy:   0.708
[[2559 1235]
 [1141 3190]]


## Multinomial Classifier with Hyperparameter

In [45]:
classifier=MultinomialNB(alpha=0.1)

In [46]:
previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))

Alpha: 0.0, Score : 0.6162461538461539
Alpha: 0.1, Score : 0.6201846153846153
Alpha: 0.2, Score : 0.6209230769230769
Alpha: 0.30000000000000004, Score : 0.6216615384615385
Alpha: 0.4, Score : 0.6247384615384616
Alpha: 0.5, Score : 0.6264615384615385
Alpha: 0.6000000000000001, Score : 0.6274461538461539
Alpha: 0.7000000000000001, Score : 0.6296615384615385
Alpha: 0.8, Score : 0.6291692307692308
Alpha: 0.9, Score : 0.6312615384615384


## SVM

In [1]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train,y_train)
print(accuracy_score(y_test,clf.predict(X_test)))

NameError: name 'X_train' is not defined