In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
#importing set of libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import re
#Package for splitting the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#Tokenization
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

In [3]:
import pandas as pd

# Loading the dataset as a Pandas DataFrame
dataset = pd.read_csv('/content/gdrive/My Drive/Sentiment.csv')

# Selecting only the necessary columns 'text' and 'sentiment'
mask = dataset.columns.isin(['text', 'sentiment'])
data = dataset.loc[:, mask]

In [4]:
# Keeping only necessary columns
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))


In [5]:
#Removing Re tweets
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

In [6]:
#Maximum words is 2000 to tokenize the sentence
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)


In [7]:
#Padding the feature matrix
X = pad_sequences(X)
embed_dim = 128
#Long short-term memory (LSTM) layer
lstm_out = 196

In [8]:
def createmodel():
    model = Sequential() #Sequential Neural Network
    #input dimension - 2000 Neurons, output dimension-128 Neurons
    model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #Compiling the model
    return model


In [9]:
#Applying label Encoding on the label matrix
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [10]:
batch_size = 32 #Batch size 32
model = createmodel()
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) #evaluating the model
print(score)
print(acc)

291/291 - 61s - loss: 0.8264 - accuracy: 0.6436 - 61s/epoch - 209ms/step
144/144 - 4s - loss: 0.7842 - accuracy: 0.6551 - 4s/epoch - 25ms/step
0.7841821908950806
0.6550895571708679


In [11]:
print(model.metrics_names) #metrics of the model

['loss', 'accuracy']


In [12]:
#1)Save the model and use the saved model to predict on new text data (ex, “A lot of good things are
#happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)

#Saving the model
model.save('sentimentAnalysis.h5')

In [13]:
#Importing the package for importing the saved model
from keras.models import load_model
model= load_model('sentimentAnalysis.h5') #loading the saved model

In [14]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [15]:
# Predicting on the text data
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence) # Tokenizing the sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) # Padding the sentence
sentiment_probs = model.predict(sentence, batch_size=1, verbose=2)[0] # Predicting the sentence text
sentiment = np.argmax(sentiment_probs)

print(sentiment_probs)
if sentiment == 0:
    print("Neutral")
elif sentiment < 0:
    print("Negative")
elif sentiment > 0:
    print("Positive")
else:
    print("Cannot be determined")


1/1 - 2s - 2s/epoch - 2s/step
[0.5035101  0.22521064 0.2712793 ]
Neutral


In [16]:
#2. Apply GridSearchCV on the source code provided in the class

#importing Keras classifier
from keras.wrappers.scikit_learn import KerasClassifier
#importing Grid search CV
from sklearn.model_selection import GridSearchCV
#initiating model to test performance by applying multiple hyper parameters
model = KerasClassifier(build_fn=createmodel,verbose=2)
#batch_size
batch_size= [10, 20, 40]
# no. of epochs
epochs = [1, 2]
#dictionaries
param_grid= {'batch_size':batch_size, 'epochs':epochs}
grid  = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result= grid.fit(X_train,Y_train) #Fitting the model
#summarizing the  results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) #best score, best hyper parameters

  model = KerasClassifier(build_fn=createmodel,verbose=2)


744/744 - 123s - loss: 0.8243 - accuracy: 0.6421 - 123s/epoch - 166ms/step
186/186 - 3s - loss: 0.7384 - accuracy: 0.6772 - 3s/epoch - 17ms/step
744/744 - 124s - loss: 0.8213 - accuracy: 0.6468 - 124s/epoch - 166ms/step
186/186 - 4s - loss: 0.7669 - accuracy: 0.6584 - 4s/epoch - 24ms/step
744/744 - 117s - loss: 0.8253 - accuracy: 0.6423 - 117s/epoch - 157ms/step
186/186 - 3s - loss: 0.7630 - accuracy: 0.6805 - 3s/epoch - 16ms/step
744/744 - 117s - loss: 0.8316 - accuracy: 0.6402 - 117s/epoch - 157ms/step
186/186 - 3s - loss: 0.7527 - accuracy: 0.6706 - 3s/epoch - 17ms/step
186/186 - 3s - loss: 0.7860 - accuracy: 0.6652 - 3s/epoch - 17ms/step
Epoch 1/2
744/744 - 119s - loss: 0.8225 - accuracy: 0.6473 - 119s/epoch - 161ms/step
Epoch 2/2
744/744 - 113s - loss: 0.6799 - accuracy: 0.7093 - 113s/epoch - 152ms/step
186/186 - 4s - loss: 0.7440 - accuracy: 0.6842 - 4s/epoch - 20ms/step
Epoch 1/2
744/744 - 113s - loss: 0.8296 - accuracy: 0.6492 - 113s/epoch - 152ms/step
Epoch 2/2
744/744 - 115s 