In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [2]:
# Reading the Data
data = pd.read_csv('Sentiment.csv')

# Keeping only the neccessary columns
data = data[['text','sentiment']]
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [3]:
# Preprocessing the Data
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
data.head()

Unnamed: 0,text,sentiment
0,rt nancyleegrahn how did everyone feel about t...,Neutral
1,rt scottwalker didnt catch the full gopdebate ...,Positive
2,rt tjmshow no mention of tamir rice and the go...,Neutral
3,rt robgeorge that carly fiorina is trending h...,Positive
4,rt danscavino gopdebate w realdonaldtrump deli...,Positive


In [4]:
# Furthter processing of Data
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')
data.head()

Unnamed: 0,text,sentiment
0,nancyleegrahn how did everyone feel about th...,Neutral
1,scottwalker didnt catch the full gopdebate l...,Positive
2,tjmshow no mention of tamir rice and the gop...,Neutral
3,robgeorge that carly fiorina is trending ho...,Positive
4,danscavino gopdebate w realdonaldtrump deliv...,Positive


In [5]:
# Tokenization of Data, Converting to sequences
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
X

array([[   0,    0,    0, ...,   51, 1039,    1],
       [   0,    0,    0, ..., 1577, 1356,  847],
       [   0,    0,    0, ...,   10,  696,  518],
       ...,
       [   0,    0,    0, ...,   68,   62,    3],
       [   0,    0,    0, ..., 1112, 1588,   81],
       [   0,    0,    0, ...,  196,    3,  880]])

In [6]:
# Model Configuration
embed_dim = 128
lstm_out = 196

In [7]:
# Method to create the model
def createmodel():
    model = Sequential()
    model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model

In [8]:
# Applying Label Encoding on Target column
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [9]:
# Model creation & Evaluation
batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2)
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)

291/291 - 22s - loss: 0.8233 - accuracy: 0.6486
144/144 - 1s - loss: 0.7768 - accuracy: 0.6660


In [10]:
# Score and Accutracy
print(score)
print(acc)
print(model.metrics_names)

0.776770830154419
0.6660113334655762
['loss', 'accuracy']


In [11]:
# Saving the model
from keras.models import load_model 
model.save("sentiment_analysis.h5") 

In [12]:
# Loading the saved model and evaluating
loaded_model = load_model("sentiment_analysis.h5") 
loss, accuracy = loaded_model.evaluate(X_test, Y_test)
print("The Loss is ",loss)
print("The Accuracy is ",accuracy)

The Loss is  0.776770830154419
The Accuracy is  0.6660113334655762


# Prediction 

In [24]:
# Processing the input text 
input_text = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"
tokenizer.fit_on_texts(input_text)

#vectorizing the tweet by the pre-fitted tokenizer instance
tweet = tokenizer.texts_to_sequences(input_text)

tweet = pad_sequences(tweet, maxlen=28, dtype='int32', value=0)

In [25]:
# Sentiment Prediction
result = loaded_model.predict(tweet,batch_size=1,verbose = 2)[0]
print(result)
if(np.argmax(result) == 0):
    print("negative")
elif (np.argmax(result) == 1):
    print("positive")
else:
    print("neutral")

122/122 - 1s
[0.20640944 0.42548195 0.36810857]
positive


# GridSerachCV 

In [26]:
# Applying GridSearchCV on model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
model1 = KerasClassifier(build_fn=createmodel, verbose=0)
batch_size = [32, 64]
epochs = [1, 2]
param_grid = dict(batch_size=batch_size, epochs=epochs)

grid = GridSearchCV(estimator=model1, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, Y_train)
# summarize results
print("Best Score : %f using the parameters %s" % (grid_result.best_score_, grid_result.best_params_))

Best Score : 0.679220 using the parameters {'batch_size': 64, 'epochs': 2}
