#### Importing Dependencies:

In [9]:
import numpy as np 
import pandas as pd 
import keras
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

#### Data Cleaning:

In [10]:
df = pd.read_csv('twtData.csv')
df.head(5)


Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


Only keeping relevant columns, and removing RT from the tweets that are retweets.

In [11]:
df = df[['text','sentiment']]

df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [39]:
for idx, row in df.iterrows():
    row[0] = row[0].replace('rt', ' ')

df.head()

Unnamed: 0,text,sentiment
1,scottwalker didnt catch the full gopdebate l...,Positive
3,robgeorge that carly fiorina is trending ho...,Positive
4,danscavino gopdebate w realdonaldtrump deliv...,Positive
5,gregabbott_tx tedcruz on my first day i will...,Positive
6,warriorwoman91 i liked her and was happy whe...,Negative


In [13]:
df = df[df.sentiment != "Neutral"]

n = df.size - df[df['sentiment'] == 'Positive'].size - df[df['sentiment'] == 'Negative'].size

print(n)

0


This check above ensures that we only have tweets labelled positive and negative in our dataset.

In [14]:
  max_features = 2000

# Initialising a special Tokenizer class object to tokenize our data (tweets). This will only "remember" max_features-1 words.
tokenizer = Tokenizer(num_words=max_features, split=' ')

# Fitting our internal vocabulary on our tweets, using the fit_on_texts method of the Tokenizer class.
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)
X.shape

(10729, 28)

In [15]:
vocabSize = len(tokenizer.word_index) + 1
print(vocabSize)

15793


#### Hyperparameter tuning: 

The variables embedDim, lstmOutDim, batchSize, and dropout are the hyperparameters in this experiment. These are the values that influence the derived parameters' values.

We use the softmax function 

In [16]:
embedDim=128
lstmOutDim=196
model = Sequential()                                                                
# Each layer has one input tensor and one output tensor.
model.add(Embedding(max_features, embedDim, input_length = X.shape[1]))

# drops out (disregards) 1D feature map instead of individual elements.
model.add(SpatialDropout1D(0.4))

# Creating an LSTM layer, keeping dropout low as we have a SpatialDropout layer already.
model.add(LSTM(lstmOutDim, dropout=0.2))

# Output layer with 2 nodes.
model.add(Dense(2, activation='softmax'))

# Now, we compile the model and get it ready for training, we use the Adam optimizer with bce loss.
bce = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer='adam', loss=bce, metrics=['accuracy'])
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 128)           256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 28, 128)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


#### Splitting the dataset into training and testing datasets:

In [17]:
# One hot encoding the labels
Y = pd.get_dummies(df['sentiment']).values


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.35, random_state = 26)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(6973, 28) (6973, 2)
(3756, 28) (3756, 2)


#### "Fitting" (Training) the model:

In [18]:
model.fit(X_train, Y_train, epochs = 10,  verbose = 2)

Epoch 1/10
218/218 - 11s - loss: 0.4525 - accuracy: 0.8128 - 11s/epoch - 50ms/step
Epoch 2/10
218/218 - 9s - loss: 0.3316 - accuracy: 0.8642 - 9s/epoch - 41ms/step
Epoch 3/10
218/218 - 9s - loss: 0.2824 - accuracy: 0.8838 - 9s/epoch - 42ms/step
Epoch 4/10
218/218 - 9s - loss: 0.2519 - accuracy: 0.8952 - 9s/epoch - 40ms/step
Epoch 5/10
218/218 - 8s - loss: 0.2230 - accuracy: 0.9085 - 8s/epoch - 37ms/step
Epoch 6/10


KeyboardInterrupt: 

#### Setting aside a validation set to check accuracy.

In [34]:
validation_size = 1500
batch_size = 32

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))


71/71 - 2s - loss: 0.3944 - accuracy: 0.8493 - 2s/epoch - 35ms/step
score: 0.39
acc: 0.85


In [19]:
X_test.shape

(3756, 28)

#### Calculating Accuracy of the model.

In [38]:
posCount, negCount, posCorrect, negCorrect = 0, 0, 0, 0

for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1, X_test.shape[1]), batch_size=1, verbose=2)[0]
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            negCorrect += 1
        else:
            posCorrect += 1
       
    if np.argmax(Y_validate[x]) == 0:
        negCount += 1
    else:
        posCount += 1


# bce()
print("Positive Accuracy: %.3f" % (posCorrect/posCount*100), "%")
print("Negative Accuracy: %.3f" % (negCorrect/negCount*100), "%")


1/1 - 0s - 25ms/epoch - 25ms/step
1/1 - 0s - 24ms/epoch - 24ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 24ms/epoch - 24ms/step
1/1 - 0s - 24ms/epoch - 24ms/step
1/1 - 0s - 23ms/epoch - 23ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 24ms/epoch - 24ms/step
1/1 - 0s - 20ms/epoch - 20ms/step
1/1 - 0s - 29ms/epoch - 29ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 25ms/epoch - 25ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 52ms/epoch - 52ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 20m