### Importing tenosrflow libraries

In [1]:
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
!pip install tensorflow

In [2]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

### Reading CSV

In [3]:
import pandas as pd
df = pd.read_csv('tweets.csv')

In [4]:
raw_text = df['text']

In [5]:
Y = df['label']
Y = [1 if i == "hate" else 0 for i in Y]

### Text Preprocessing

### Converting to Lower Case

In [6]:
### Lower text
def toLower(data):
    res = []
    for sentence in data:
        res.append(str.lower(sentence))
    return res

In [7]:
cleanset = toLower(raw_text)

### Importing nltk libraries

In [8]:
# Tokenise
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
#nltk.download('punkt')

### Removing stopwords and punctuation

In [9]:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
  
filtered_sentences = []

# Removing stopwords
for text in cleanset:
    content = []
    for i in word_tokenize(text):
        if i not in stop_words:
            content.append(i)
    filtered_sentences.append(' '.join(content))
    

tokenizer = nltk.RegexpTokenizer(r"\w+")

filtered_sentences_2 = []

# Removing punctuation

for sentence in filtered_sentences:
    new_words = tokenizer.tokenize(sentence)
    filtered_sentences_2.append(' '.join(new_words))


    

### Removing numbers and text lemmatization

In [10]:
# function to remove numbers
import re

def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)
 
#nltk.download('wordnet')
filtered_sentences_2 = [remove_numbers(sentence) for sentence in filtered_sentences_2]

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

filtered_sentences_2 = [lemmatizer.lemmatize(sentence) for sentence in filtered_sentences_2]
    

In [20]:
#filtered_sentences_2[:4]

['legal husband rape wife',
 'dalits lowlife scum',
 'dalits lowlives',
 'better world women dare question men']

### Filtering useful words

In [11]:
filtered_sentences_3 = []

for sentence in filtered_sentences_2:
    wordsToAdd = []
    for word in word_tokenize(sentence):
        if len(word) >= 3 and word.isalpha():
            wordsToAdd.append(word)
    filtered_sentences_3.append(' '.join(wordsToAdd))
    

In [33]:
#filtered_sentences_3[:3]

['legal husband rape wife', 'dalits lowlife scum', 'dalits lowlives']

### Removing whitespace

In [12]:
filtered_sentences_3 = [sentence.strip() for sentence in filtered_sentences_3]

In [14]:
#filtered_sentences_3[:4]

['legal husband rape wife',
 'dalits lowlife scum',
 'dalits lowlives',
 'better world women dare question']

In [13]:
corpus = filtered_sentences_3

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

### Function for POS Tagging

In [34]:
def posTagging(sentences):
    final = []
    for sentence in sentences:
        res = []
        tok=nltk.tokenize.word_tokenize(sentence) 
        pos=nltk.pos_tag(tok)
        for token in pos:
            res.append(token[0] + "_" + token[1])
        final.append(' '.join(res))
    return final

In [35]:
tagged_corpus = posTagging(corpus)
print(tagged_corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Creating BoW model with term frequency

In [15]:
vect = CountVectorizer(max_features=20000,ngram_range=(1,1))
vect.fit(corpus)

CountVectorizer(max_features=20000)

In [16]:
size = len(vect.vocabulary_)
size

19646

In [31]:
#vect.get_feature_names()

['abbos',
 'abhorrent',
 'ability',
 'able',
 'able get',
 'able take',
 'abnormal',
 'abolish',
 'abolished',
 'abomination',
 'aboriginal',
 'aboriginal people',
 'aboriginals',
 'abortion',
 'abroad',
 'absolute',
 'absolute shit',
 'absolutely',
 'absolutely adore',
 'absolutely adore plan',
 'absolutely fantastic',
 'absolutely fantastic love',
 'absolutely fucking',
 'absolutely hilarious',
 'absolutely love',
 'absolutely love plan',
 'absolutely nothing',
 'absolutely stunning',
 'absolutely stunning turned',
 'absurd',
 'abundance',
 'abuse',
 'abused',
 'abusers',
 'abusive',
 'abysmal',
 'academia',
 'academic',
 'academics',
 'academics therefore',
 'academics therefore experts',
 'accent',
 'accept',
 'accept kinds',
 'accept kinds immigrants',
 'accept others',
 'accept others call',
 'accept people',
 'accept people calling',
 'accept races',
 'accept races equal',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'access',
 'accidentally',
 'accommodation',
 'ac

In [17]:
bag_of_words = vect.transform(corpus)

In [18]:
X = bag_of_words.toarray()

'\nfor i in range(len(x)):\n    if x[i] >= 1:\n       print("Found one")\n'

In [40]:
tk = word_tokenize("Baby we done it")
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
pos_tag(tk)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Baby', 'IN'), ('we', 'PRP'), ('done', 'VBN'), ('it', 'PRP')]

## LSTM Model

### One Hot Representation

In [19]:
import tensorflow as tf

vocab_size = size

onehot_repr = [one_hot(sentence, vocab_size) for words in corpus]

#onehot_repr


### Embedding Representation

In [20]:
sent_length = 15
embedded_docs = pad_sequences(onehot_repr, padding='pre',maxlen=sent_length)


In [21]:
## Model Creation
embedding_vector_features = 60
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(200)) ## 200 neurons
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 60)            1178760   
_________________________________________________________________
lstm (LSTM)                  (None, 200)               208800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 1,387,761
Trainable params: 1,387,761
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
import numpy as np
X_for_LSTM = np.array(embedded_docs)
Y = np.array(Y)

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_for_LSTM,Y,test_size=0.2, random_state=25)


In [24]:
## Training deep learning model
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27ca0667eb0>

### Performance Metrics and Accuracy

In [25]:
predict_x=model.predict(X_test) 
Y_pred=np.argmax(predict_x,axis=1)
from sklearn.metrics import accuracy_score

print(accuracy_score(Y_test,Y_pred))

0.46695384615384616


## Gaussian NB

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=25)

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

#print(Y_train[:4])
clf.fit(X_train, Y_train)

from sklearn.metrics import accuracy_score

accuracy_score(Y_test, clf.predict(X_test))


0.5344

## Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression(max_iter = 1000)
logReg = logReg.fit(X_train, Y_train)


accuracy_score(Y_test, logReg.predict(X_test))

0.7166769230769231

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

decTree = DecisionTreeClassifier()

decTree = decTree.fit(X_train, Y_train)

accuracy_score(Y_test, decTree.predict(X_test))