# Twitter sentiment analysis

In [0]:

import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
import nltk
import re
from nltk.corpus import stopwords


In [0]:
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

In [0]:
df=pd.read_csv("/content/drive/My Drive/Colab Notebooks/twitter/training.1600000.processed.noemoticon.csv",encoding="ISO-8859-1",names=["sentiment_class", "ids", "date", "flag", "user", "original_text"])

In [0]:
df.head()


Unnamed: 0,sentiment_class,ids,date,flag,user,original_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [0]:
df=df.sample(80000)

In [0]:
df.isnull().sum()


sentiment_class    0
ids                0
date               0
flag               0
user               0
original_text      0
dtype: int64

In [0]:
df=df.dropna()

In [0]:

df = df.sample(frac=1)

In [0]:
df["sentiment_class"].value_counts()

0    40141
4    39859
Name: sentiment_class, dtype: int64

In [0]:
df = df.sample(frac=1, random_state=42)

In [0]:
df["sentiment_class"].value_counts()

0    40141
4    39859
Name: sentiment_class, dtype: int64

In [0]:
df=df.drop([ "ids", "date", "flag", "user"],axis=1)

In [0]:
df.isnull().sum()

sentiment_class    0
original_text      0
dtype: int64

In [0]:
X=df.drop('sentiment_class',axis=1)

In [0]:
## Get the Dependent features
y=df['sentiment_class']

In [0]:

y.value_counts()

0    40141
4    39859
Name: sentiment_class, dtype: int64

In [0]:
messages=X.copy()

In [0]:
messages.reset_index(inplace=True)

In [0]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [0]:
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in tqdm(range(0, len(messages))):
   
    review = re.sub('[^a-zA-Z]', ' ', messages['original_text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords]
    review = ' '.join(review)
    corpus.append(review)

100%|██████████| 80000/80000 [00:17<00:00, 4644.00it/s]


#MODEL

In [0]:
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

ReduceLROnPlateau = ReduceLROnPlateau(factor=0.1,
                                     min_lr = 0.01,
                                     monitor = 'val_loss',
                                     verbose = 1)

In [0]:

import numpy as np
X_final=np.array(corpus)
y_final=np.array(y)

In [0]:
X_final.shape,y_final.shape

((80000,), (80000,))

In [0]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [0]:
X_train.shape

(53600,)

In [0]:
documents = [_text.split() for _text in X_train] 

In [0]:
import gensim

In [0]:
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)

In [0]:
w2v_model.build_vocab(documents)

In [0]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 3800


In [0]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

CPU times: user 45.4 s, sys: 296 ms, total: 45.7 s
Wall time: 24 s


(9921924, 13462752)

In [0]:
w2v_model.most_similar("love")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('luv', 0.39705783128738403),
 ('amaz', 0.36800283193588257),
 ('grate', 0.3545593023300171),
 ('faith', 0.34759435057640076),
 ('mraz', 0.3458879590034485),
 ('wonder', 0.34360912442207336),
 ('acoust', 0.33260953426361084),
 ('fo', 0.3283953070640564),
 ('brazil', 0.3158532381057739),
 ('appreci', 0.31381887197494507)]

In [0]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 49641


In [0]:
SEQUENCE_LENGTH=300

In [0]:
x_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=SEQUENCE_LENGTH)

In [0]:
from tensorflow.keras import utils as np_utils

In [0]:
x_train.shape

(53600, 300)

In [0]:
y_train = pd.get_dummies(y_train).values
y_test = pd.get_dummies(y_test).values


In [0]:
y_train

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)

In [0]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
  if word in w2v_model.wv:
    embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(49641, 300)


In [0]:
embedding_layer = Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

In [0]:
from tensorflow.keras.layers import SpatialDropout1D
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(embedding_layer)
model.add(SpatialDropout1D(0.2))

model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))

model.add(Dense(2,activation='sigmoid'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          14892300  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               400800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               102912    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               2

In [0]:
## Creating model

model1=Sequential()
model1.add(embedding_layer)
model1.add(SpatialDropout1D(0.2))
model1.add(Conv1D(64, 5, activation='relu'))


model1.add(Bidirectional(LSTM(200)))
model1.add(Dropout(0.2))
model1.add(Dense(512, activation='relu'))
model1.add(Dropout(0.5))
model1.add(Dense(512, activation='relu'))


model1.add(Dense(2,activation='sigmoid'))
model1.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          14892300  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 300)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 296, 64)           96064     
_________________________________________________________________
bidirectional (Bidirectional (None, 400)               424000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               205312    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)              

In [0]:
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [0]:
EPOCHS = 8
BATCH_SIZE = 1024

In [0]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]


In [0]:
from tensorflow.keras.callbacks import EarlyStopping
### Finally Training
model.fit(x_train,y_train,validation_split=0.1,epochs=EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f1d99384128>

In [0]:
### Finally Training
hist1=model1.fit(x_train,y_train,validation_split=0.1,epochs=EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [0]:
accr = model.evaluate(x_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.489
  Accuracy: 0.763
