In [26]:
import pandas as pd #Basic packages for creating dataframes and loading dataset
import numpy as np
import matplotlib.pyplot as plt #Package for visualization
import re #importing package for Regular expression operations
from sklearn.model_selection import train_test_split #Package for splitting the data
from sklearn.preprocessing import LabelEncoder #Package for conversion of categorical to Numerical
from keras.preprocessing.text import Tokenizer #Tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences #Add zeros or crop based on the length
from keras.models import Sequential #Sequential Neural Network
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D #For layers in Neural Network
from keras.utils import to_categorical

In [6]:
# Load the dataset as a Pandas DataFrame
dataset = pd.read_csv('/content/sample_data/Sentiment.csv')

In [8]:
import pandas as pd

# Select only the necessary columns 'text' and 'sentiment'
mask = dataset.columns.isin(['text', 'sentiment'])
data = dataset.loc[:, mask]

# Keeping only the necessary columns
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))


In [9]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') #Removing Retweets
    max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ') #Maximum words is 2000 to tokenize sentence
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values) #taking values to feature matrix
X = pad_sequences(X) #Padding the feature matrix

embed_dim = 128 #Dimension of the Embedded layer
lstm_out = 196 #Long short-term memory (LSTM) layer neurons
def createmodel():
    model = Sequential() #Sequential Neural Network
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) #input dimension 2000 Neurons, output dimension 128 Neurons
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) #Drop out 20%, 196 output Neurons, recurrent dropout 20%
    model.add(Dense(3,activation='softmax')) #3 output neurons[positive, Neutral, Negative], softmax as activation
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #Compiling the model
    return model
# print(model.summary())
labelencoder = LabelEncoder() #Applying label Encoding on the label matrix
integer_encoded = labelencoder.fit_transform(data['sentiment']) #fitting the model
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42) #67% training data, 33% test data split
batch_size = 32 #Batch size 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) #evaluating the model
print(score)
print(acc)

291/291 - 51s - loss: 0.8225 - accuracy: 0.6467 - 51s/epoch - 174ms/step
144/144 - 3s - loss: 0.7427 - accuracy: 0.6741 - 3s/epoch - 22ms/step
0.7426541447639465
0.67409348487854


In [10]:
print(model.metrics_names) #metrics of the model

['loss', 'accuracy']


In [12]:
#1. Save the model and use the saved model to predict on new text data
#(ex, “A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)
model.save('sentimentAnalysis.h5') #Saving the model

  saving_api.save_model(


In [13]:
from keras.models import load_model #Importing the package for importing the saved model
model= load_model('sentimentAnalysis.h5') #loading the saved model

In [14]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [15]:
# Predicting on the text data
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence) # Tokenizing the sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) # Padding the sentence
sentiment_probs = model.predict(sentence, batch_size=1, verbose=2)[0] # Predicting the sentence text
sentiment = np.argmax(sentiment_probs)

print(sentiment_probs)
if sentiment == 0:
    print("Neutral")
elif sentiment < 0:
    print("Negative")
elif sentiment > 0:
    print("Positive")
else:
    print("Cannot be determined")


1/1 - 0s - 295ms/epoch - 295ms/step
[0.6498836  0.11129723 0.23881917]
Neutral


In [25]:
#2. Apply GridSearchCV on the source code provided in the class
!pip install scikeras #install scikeras package
from scikeras.wrappers import KerasClassifier #importing Keras classifier
from sklearn.model_selection import GridSearchCV #importing Grid search CV

model = KerasClassifier(build_fn=createmodel,verbose=2) #initiating model to test performance by applying multiple hyper parameters
batch_size= [10, 20, 40] #hyper parameter batch_size
epochs = [1, 2] #hyper parameter no. of epochs
param_grid= {'batch_size':batch_size, 'epochs':epochs} #creating dictionary for batch size, no. of epochs
grid  = GridSearchCV(estimator=model, param_grid=param_grid) #Applying dictionary with hyper parameters
grid_result= grid.fit(X_train,Y_train) #Fitting the model
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) #best score, best hyper parameters

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


  X, y = self._initialize(X, y)


744/744 - 104s - loss: 0.8238 - accuracy: 0.6454 - 104s/epoch - 140ms/step
186/186 - 3s - 3s/epoch - 16ms/step


  X, y = self._initialize(X, y)


744/744 - 103s - loss: 0.8247 - accuracy: 0.6501 - 103s/epoch - 138ms/step
186/186 - 3s - 3s/epoch - 17ms/step


  X, y = self._initialize(X, y)


744/744 - 104s - loss: 0.8244 - accuracy: 0.6439 - 104s/epoch - 140ms/step
186/186 - 3s - 3s/epoch - 18ms/step


  X, y = self._initialize(X, y)


744/744 - 107s - loss: 0.8266 - accuracy: 0.6440 - 107s/epoch - 143ms/step
186/186 - 3s - 3s/epoch - 18ms/step


  X, y = self._initialize(X, y)


744/744 - 102s - loss: 0.8238 - accuracy: 0.6406 - 102s/epoch - 138ms/step
186/186 - 3s - 3s/epoch - 15ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
744/744 - 104s - loss: 0.8230 - accuracy: 0.6492 - 104s/epoch - 140ms/step
Epoch 2/2
744/744 - 116s - loss: 0.6782 - accuracy: 0.7121 - 116s/epoch - 156ms/step
186/186 - 3s - 3s/epoch - 17ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
744/744 - 116s - loss: 0.8186 - accuracy: 0.6481 - 116s/epoch - 155ms/step
Epoch 2/2
744/744 - 102s - loss: 0.6784 - accuracy: 0.7129 - 102s/epoch - 137ms/step
186/186 - 3s - 3s/epoch - 16ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
744/744 - 105s - loss: 0.8237 - accuracy: 0.6500 - 105s/epoch - 141ms/step
Epoch 2/2
744/744 - 99s - loss: 0.6788 - accuracy: 0.7172 - 99s/epoch - 133ms/step
186/186 - 3s - 3s/epoch - 15ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
744/744 - 110s - loss: 0.8250 - accuracy: 0.6448 - 110s/epoch - 148ms/step
Epoch 2/2
744/744 - 97s - loss: 0.6740 - accuracy: 0.7103 - 97s/epoch - 131ms/step
186/186 - 3s - 3s/epoch - 17ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
744/744 - 108s - loss: 0.8181 - accuracy: 0.6444 - 108s/epoch - 146ms/step
Epoch 2/2
744/744 - 105s - loss: 0.6706 - accuracy: 0.7145 - 105s/epoch - 141ms/step
186/186 - 3s - 3s/epoch - 16ms/step


  X, y = self._initialize(X, y)


372/372 - 60s - loss: 0.8330 - accuracy: 0.6410 - 60s/epoch - 161ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


372/372 - 59s - loss: 0.8247 - accuracy: 0.6435 - 59s/epoch - 158ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


372/372 - 64s - loss: 0.8339 - accuracy: 0.6410 - 64s/epoch - 172ms/step
93/93 - 2s - 2s/epoch - 21ms/step


  X, y = self._initialize(X, y)


372/372 - 60s - loss: 0.8245 - accuracy: 0.6494 - 60s/epoch - 160ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


372/372 - 63s - loss: 0.8251 - accuracy: 0.6425 - 63s/epoch - 170ms/step
93/93 - 2s - 2s/epoch - 21ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
372/372 - 61s - loss: 0.8335 - accuracy: 0.6458 - 61s/epoch - 164ms/step
Epoch 2/2
372/372 - 55s - loss: 0.6813 - accuracy: 0.7080 - 55s/epoch - 148ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
372/372 - 60s - loss: 0.8275 - accuracy: 0.6430 - 60s/epoch - 161ms/step
Epoch 2/2
372/372 - 56s - loss: 0.6828 - accuracy: 0.7139 - 56s/epoch - 151ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
372/372 - 63s - loss: 0.8308 - accuracy: 0.6414 - 63s/epoch - 170ms/step
Epoch 2/2
372/372 - 57s - loss: 0.6755 - accuracy: 0.7107 - 57s/epoch - 153ms/step
93/93 - 3s - 3s/epoch - 28ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
372/372 - 61s - loss: 0.8376 - accuracy: 0.6418 - 61s/epoch - 164ms/step
Epoch 2/2
372/372 - 56s - loss: 0.6783 - accuracy: 0.7108 - 56s/epoch - 152ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
372/372 - 59s - loss: 0.8276 - accuracy: 0.6386 - 59s/epoch - 158ms/step
Epoch 2/2
372/372 - 56s - loss: 0.6710 - accuracy: 0.7154 - 56s/epoch - 150ms/step
93/93 - 2s - 2s/epoch - 20ms/step


  X, y = self._initialize(X, y)


186/186 - 39s - loss: 0.8395 - accuracy: 0.6410 - 39s/epoch - 212ms/step
47/47 - 1s - 1s/epoch - 29ms/step


  X, y = self._initialize(X, y)


186/186 - 38s - loss: 0.8383 - accuracy: 0.6359 - 38s/epoch - 206ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


186/186 - 39s - loss: 0.8494 - accuracy: 0.6347 - 39s/epoch - 210ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


186/186 - 38s - loss: 0.8472 - accuracy: 0.6367 - 38s/epoch - 204ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


186/186 - 40s - loss: 0.8379 - accuracy: 0.6429 - 40s/epoch - 213ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
186/186 - 41s - loss: 0.8533 - accuracy: 0.6314 - 41s/epoch - 219ms/step
Epoch 2/2
186/186 - 35s - loss: 0.6873 - accuracy: 0.7086 - 35s/epoch - 186ms/step
47/47 - 1s - 1s/epoch - 32ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
186/186 - 40s - loss: 0.8381 - accuracy: 0.6430 - 40s/epoch - 212ms/step
Epoch 2/2
186/186 - 34s - loss: 0.6848 - accuracy: 0.7139 - 34s/epoch - 184ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
186/186 - 38s - loss: 0.8475 - accuracy: 0.6302 - 38s/epoch - 204ms/step
Epoch 2/2
186/186 - 36s - loss: 0.6996 - accuracy: 0.6984 - 36s/epoch - 193ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
186/186 - 40s - loss: 0.8471 - accuracy: 0.6366 - 40s/epoch - 216ms/step
Epoch 2/2
186/186 - 36s - loss: 0.6808 - accuracy: 0.7067 - 36s/epoch - 191ms/step
47/47 - 1s - 1s/epoch - 29ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
186/186 - 39s - loss: 0.8415 - accuracy: 0.6360 - 39s/epoch - 212ms/step
Epoch 2/2
186/186 - 35s - loss: 0.6753 - accuracy: 0.7151 - 35s/epoch - 189ms/step
47/47 - 1s - 1s/epoch - 28ms/step


  X, y = self._initialize(X, y)


Epoch 1/2
930/930 - 131s - loss: 0.8082 - accuracy: 0.6565 - 131s/epoch - 141ms/step
Epoch 2/2
930/930 - 125s - loss: 0.6741 - accuracy: 0.7139 - 125s/epoch - 134ms/step
Best: 0.679650 using {'batch_size': 10, 'epochs': 2}
