In [24]:
# importing the required libraries
import pandas as pd                       # for creating dataframes
import numpy as np
import matplotlib.pyplot as plt 
import re                                 # regular expression operations

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential 
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D # for layers in Neural Network
from keras.utils.np_utils import to_categorical

In [25]:
# looading the Sentiment dataset
data = pd.read_csv('Sentiment.csv') 
data = data[['text','sentiment']] # Keeping only the neccessary columns

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))) #only keeping a-z,A-Z,0-9 in the data

In [26]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') # removing Retweets

In [27]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ') # tokenizing the sentence
tokenizer.fit_on_texts(data['text'].values) 
X = tokenizer.texts_to_sequences(data['text'].values)

X = pad_sequences(X) # padding the feature matrix - add zeros for matching the sentence length
embed_dim = 128      # dimension of the Embedded layer
lstm_out = 196       # LSTM ( Long short-term memory ) layer neurons

In [28]:
def createmodel():
    model = Sequential() # Sequential Neural Network
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) 
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
    model.add(Dense(3,activation='softmax')) 
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) 
    return model
# print(model.summary())

In [29]:
labelencoder = LabelEncoder() # conversion of categorical to Numerical
#fitting the model
integer_encoded = labelencoder.fit_transform(data['sentiment']) 
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42) 
batch_size = 32 
model = createmodel() 
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) # more messages for higher verbose
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)

291/291 - 44s - loss: 0.8248 - accuracy: 0.6448
144/144 - 3s - loss: 0.7409 - accuracy: 0.6811
0.7408873438835144
0.6810834407806396


In [30]:
print(model.metrics_names) # model metrics

['loss', 'accuracy']


# **Task_1: Save the model and use the saved model to predict on new text data (ex,“A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)**

In [31]:
# saving the model
model.save('sentimentAnalysis.h5') 

from keras.models import load_model       #importing the package to get the saved model
model= load_model('sentimentAnalysis.h5') #loading the model which is saved

In [32]:
print(integer_encoded)
print(data['sentiment'])                  # get the sentiment analysis

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [34]:
# predicting the new text data

# Processing the input text 
text_in = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"]
text_in = tokenizer.texts_to_sequences(text_in)                        # tokenizing the sentence
text_in = pad_sequences(text_in, maxlen=28, dtype='int32', value=0)    # padding - add zeros to match the sentence length

# standard analyzer defines up to three basic polar emotions (positive, negative, neutral)
res_sentiment = model.predict_classes(text_in,batch_size=1,verbose = 2)[0]  # predicting the sentence text
print(res_sentiment)
if ( res_sentiment < 0 ):
  print("\n Negative")
elif ( res_sentiment == 0 ):
  print("\n Neutral")
elif ( res_sentiment > 0 ):
  print("\n Positive")
else:
  print("not determined")



1/1 - 0s
0

 Neutral


# **Task_2: Apply GridSearchCV on the source code provided in the class**

In [35]:
# importing the required libraries
from keras.wrappers.scikit_learn import KerasClassifier 
from sklearn.model_selection import GridSearchCV 

# applying GridSearchCV on model
model_1 = KerasClassifier(build_fn=createmodel,verbose=2) # applying multiple hyper parameters for model initiation

# hyper parameters
batch_size= [10, 20, 40]     
epochs = [1, 2]             

param_grid= {'batch_size':batch_size, 'epochs':epochs}          # creating dictionary for batch size and no. of epochs
grid  = GridSearchCV(estimator=model_1, param_grid=param_grid)  # applying dictionary with hyper parameters for GridSearchCV
grid_result= grid.fit(X_train,Y_train)                          # fitting the model

# summarizing the results - best score, best hyper parameters
print("Best Score achieved: %f by using the parameters %s" % (grid_result.best_score_, grid_result.best_params_)) 

744/744 - 91s - loss: 0.8243 - accuracy: 0.6512
186/186 - 2s - loss: 0.7577 - accuracy: 0.6681
744/744 - 90s - loss: 0.8261 - accuracy: 0.6442
186/186 - 2s - loss: 0.7957 - accuracy: 0.6616
744/744 - 90s - loss: 0.8257 - accuracy: 0.6427
186/186 - 3s - loss: 0.7666 - accuracy: 0.6799
744/744 - 92s - loss: 0.8203 - accuracy: 0.6480
186/186 - 2s - loss: 0.7703 - accuracy: 0.6744
744/744 - 92s - loss: 0.8180 - accuracy: 0.6518
186/186 - 2s - loss: 0.7835 - accuracy: 0.6744
Epoch 1/2
744/744 - 92s - loss: 0.8208 - accuracy: 0.6507
Epoch 2/2
744/744 - 89s - loss: 0.6802 - accuracy: 0.7132
186/186 - 3s - loss: 0.7399 - accuracy: 0.6853
Epoch 1/2
744/744 - 92s - loss: 0.8216 - accuracy: 0.6453
Epoch 2/2
744/744 - 89s - loss: 0.6769 - accuracy: 0.7143
186/186 - 2s - loss: 0.7564 - accuracy: 0.6880
Epoch 1/2
744/744 - 90s - loss: 0.8234 - accuracy: 0.6453
Epoch 2/2
744/744 - 88s - loss: 0.6791 - accuracy: 0.7096
186/186 - 2s - loss: 0.7547 - accuracy: 0.6869
Epoch 1/2
744/744 - 93s - loss: 0.82