# Twitter Sentiment Analysis

In [None]:
!pip install gensim --upgrade
!pip install keras --upgrade
!pip install pandas --upgrade

In [2]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rishabh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Settings

In [4]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC 
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 8
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

### Read Dataset

In [5]:
dataset_path = '/home/rishabh/NLP/hate speech/data.csv'
df = pd.read_csv(dataset_path, encoding =DATASET_ENCODING , names=DATASET_COLUMNS)

In [6]:
print("Dataset size:", len(df))

Dataset size: 1600000


In [7]:
df.head(5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
decode_map = {0: "HATE", 4:"NON HATE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [9]:
%%time
df.target = df.target.apply(lambda x: decode_sentiment(x))

CPU times: user 556 ms, sys: 6.57 ms, total: 563 ms
Wall time: 555 ms


### Pre-Process dataset

In [10]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [11]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [12]:
%%time
df.text = df.text.apply(lambda x: preprocess(x))

CPU times: user 53 s, sys: 112 ms, total: 53.1 s
Wall time: 53.1 s


### Split train and test

In [13]:
df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 1280000
TEST size: 320000


### Word2Vec 

In [14]:
%%time
documents = [_text.split() for _text in df_train.text] 

CPU times: user 2.24 s, sys: 200 ms, total: 2.44 s
Wall time: 2.44 s


In [15]:
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)

In [16]:
w2v_model.build_vocab(documents)

2020-12-04 21:05:46,658 : INFO : collecting all words and their counts
2020-12-04 21:05:46,661 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-04 21:05:46,695 : INFO : PROGRESS: at sentence #10000, processed 72565 words, keeping 14005 word types
2020-12-04 21:05:46,722 : INFO : PROGRESS: at sentence #20000, processed 144393 words, keeping 21587 word types
2020-12-04 21:05:46,765 : INFO : PROGRESS: at sentence #30000, processed 215826 words, keeping 27541 word types
2020-12-04 21:05:46,794 : INFO : PROGRESS: at sentence #40000, processed 288271 words, keeping 32764 word types
2020-12-04 21:05:46,823 : INFO : PROGRESS: at sentence #50000, processed 359772 words, keeping 37587 word types
2020-12-04 21:05:46,852 : INFO : PROGRESS: at sentence #60000, processed 431431 words, keeping 42198 word types
2020-12-04 21:05:46,880 : INFO : PROGRESS: at sentence #70000, processed 503103 words, keeping 46458 word types
2020-12-04 21:05:46,909 : INFO : PROGRESS: at s

2020-12-04 21:05:48,824 : INFO : PROGRESS: at sentence #720000, processed 5193881 words, keeping 200325 word types
2020-12-04 21:05:48,865 : INFO : PROGRESS: at sentence #730000, processed 5265467 words, keeping 202133 word types
2020-12-04 21:05:48,901 : INFO : PROGRESS: at sentence #740000, processed 5337518 words, keeping 203818 word types
2020-12-04 21:05:48,929 : INFO : PROGRESS: at sentence #750000, processed 5409321 words, keeping 205535 word types
2020-12-04 21:05:48,961 : INFO : PROGRESS: at sentence #760000, processed 5481512 words, keeping 207282 word types
2020-12-04 21:05:48,990 : INFO : PROGRESS: at sentence #770000, processed 5554093 words, keeping 209076 word types
2020-12-04 21:05:49,015 : INFO : PROGRESS: at sentence #780000, processed 5625382 words, keeping 210805 word types
2020-12-04 21:05:49,045 : INFO : PROGRESS: at sentence #790000, processed 5698066 words, keeping 212618 word types
2020-12-04 21:05:49,073 : INFO : PROGRESS: at sentence #800000, processed 577088

In [17]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 30369


In [18]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

2020-12-04 21:05:58,030 : INFO : training model with 8 workers on 30369 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7
2020-12-04 21:05:59,081 : INFO : EPOCH 1 - PROGRESS: at 6.08% examples, 480436 words/s, in_qsize 16, out_qsize 1
2020-12-04 21:06:00,087 : INFO : EPOCH 1 - PROGRESS: at 12.93% examples, 518903 words/s, in_qsize 13, out_qsize 2
2020-12-04 21:06:01,090 : INFO : EPOCH 1 - PROGRESS: at 19.73% examples, 532018 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:02,094 : INFO : EPOCH 1 - PROGRESS: at 25.91% examples, 525606 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:03,137 : INFO : EPOCH 1 - PROGRESS: at 32.19% examples, 519402 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:04,172 : INFO : EPOCH 1 - PROGRESS: at 39.00% examples, 523260 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:05,190 : INFO : EPOCH 1 - PROGRESS: at 45.92% examples, 528444 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:06,214 : INFO : EPOCH 1 - PROGRESS: 

2020-12-04 21:06:48,280 : INFO : EPOCH 4 - PROGRESS: at 26.23% examples, 525824 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:49,290 : INFO : EPOCH 4 - PROGRESS: at 32.41% examples, 521272 words/s, in_qsize 15, out_qsize 2
2020-12-04 21:06:50,301 : INFO : EPOCH 4 - PROGRESS: at 39.00% examples, 523867 words/s, in_qsize 16, out_qsize 0
2020-12-04 21:06:51,309 : INFO : EPOCH 4 - PROGRESS: at 45.16% examples, 520995 words/s, in_qsize 15, out_qsize 2
2020-12-04 21:06:52,329 : INFO : EPOCH 4 - PROGRESS: at 51.64% examples, 521455 words/s, in_qsize 15, out_qsize 3
2020-12-04 21:06:53,337 : INFO : EPOCH 4 - PROGRESS: at 58.47% examples, 525365 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:54,343 : INFO : EPOCH 4 - PROGRESS: at 65.17% examples, 527675 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:06:55,350 : INFO : EPOCH 4 - PROGRESS: at 71.67% examples, 527967 words/s, in_qsize 16, out_qsize 0
2020-12-04 21:06:56,396 : INFO : EPOCH 4 - PROGRESS: at 78.29% examples, 527906 words/s,

2020-12-04 21:07:40,300 : INFO : EPOCH 7 - PROGRESS: at 66.79% examples, 539102 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:07:41,305 : INFO : EPOCH 7 - PROGRESS: at 73.19% examples, 537618 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:07:42,307 : INFO : EPOCH 7 - PROGRESS: at 80.03% examples, 539425 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:07:43,324 : INFO : EPOCH 7 - PROGRESS: at 86.31% examples, 537015 words/s, in_qsize 11, out_qsize 4
2020-12-04 21:07:44,349 : INFO : EPOCH 7 - PROGRESS: at 93.49% examples, 539627 words/s, in_qsize 15, out_qsize 0
2020-12-04 21:07:45,222 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-04 21:07:45,236 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-04 21:07:45,249 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-04 21:07:45,269 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-04 21:07:45,286 : INFO : worker thread finished; awaiting 

CPU times: user 7min 36s, sys: 1.9 s, total: 7min 38s
Wall time: 2min 2s


(65780095, 73817632)

In [19]:
w2v_model.most_similar("love")

  """Entry point for launching an IPython kernel.
2020-12-04 21:08:00,542 : INFO : precomputing L2-norms of word weight vectors


[('adore', 0.5728391408920288),
 ('luv', 0.5702308416366577),
 ('loved', 0.517593502998352),
 ('loves', 0.5084329843521118),
 ('loooove', 0.4869951605796814),
 ('looove', 0.4839927554130554),
 ('loove', 0.4672917127609253),
 ('loveee', 0.4425637722015381),
 ('lovee', 0.4396896958351135),
 ('sings', 0.4157412648200989)]

### Tokenize Text

In [20]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 290419
CPU times: user 21.1 s, sys: 59.9 ms, total: 21.2 s
Wall time: 21 s


In [21]:
%%time
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.text), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.text), maxlen=SEQUENCE_LENGTH)

CPU times: user 34.2 s, sys: 493 ms, total: 34.7 s
Wall time: 34.5 s


### Label Encoder 

In [22]:
labels = df_train.target.unique().tolist()
labels

['NON HATE', 'HATE']

In [23]:
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

y_train = encoder.transform(df_train.target.tolist())
y_test = encoder.transform(df_test.target.tolist())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("y_train",y_train.shape)
print("y_test",y_test.shape)

y_train (1280000, 1)
y_test (320000, 1)


In [24]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (1280000, 300)
y_train (1280000, 1)

x_test (320000, 300)
y_test (320000, 1)


In [25]:
y_train[:10]

array([[1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

### Embedding layer

In [26]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(290419, 300)


In [27]:
embedding_layer = Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

### Build Model

In [28]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          87125700  
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 87,286,201
Trainable params: 160,501
Non-trainable params: 87,125,700
_________________________________________________________________


### Compile model

In [29]:
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

### Callbacks

In [30]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

test run


In [31]:
import time 
from tensorflow import keras

In [32]:
lmodel = keras.models.load_model('/home/rishabh/NLP/hate speech/model.h5')

### Train

In [30]:
%%time
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=1,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)





CPU times: user 11h 7min 37s, sys: 24min 43s, total: 11h 32min 20s
Wall time: 2h 59min 17s


### Evaluate

In [31]:
%%time
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])


ACCURACY: 0.7781187295913696
LOSS: 0.4646095633506775
CPU times: user 49min 17s, sys: 2min 34s, total: 51min 52s
Wall time: 13min 49s


In [33]:
def decode_sentiment(score):
    return 'HATE' if score < 0.5 else 'NON HATE'


In [34]:
def predict(text):
    start_at = time.time()
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    # Predict
    score = lmodel.predict([x_test])[0]
    # Decode sentiment
    label = decode_sentiment(score)

    return {"label": label, "score": float(score),
       "elapsed_time": time.time()-start_at}

In [35]:
predict("I love the music")


{'label': 'NON HATE',
 'score': 0.9656286239624023,
 'elapsed_time': 1.1235954761505127}

In [36]:
predict("I hate the rain")


{'label': 'HATE',
 'score': 0.010753682814538479,
 'elapsed_time': 0.1017141342163086}

In [37]:
predict("i don't know what i'm doing")


{'label': 'HATE',
 'score': 0.2742375433444977,
 'elapsed_time': 0.09618854522705078}

In [38]:
predict("i will kill myself")


{'label': 'HATE',
 'score': 0.16064141690731049,
 'elapsed_time': 0.07689070701599121}