In [51]:
import pandas as pd #Basic packages for creating dataframes and loading dataset
import numpy as np
import matplotlib.pyplot as plt #Package for visualization
import re #importing package for Regular expression operations
from sklearn.model_selection import train_test_split #Package for splitting the data
from sklearn.preprocessing import LabelEncoder #Package for conversion of categorical to Numerical
from tensorflow.keras.preprocessing.text import Tokenizer #Tokenization (use from tensorflow.keras)
from tensorflow.keras.preprocessing.sequence import pad_sequences #Add zeros or crop based on the length
from tensorflow.keras.models import Sequential #Sequential Neural Network
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D #For layers in Neural Network
from tensorflow.keras.utils import to_categorical #For converting labels to categorical data


In [52]:
import pandas as pd

# Load the dataset as a Pandas DataFrame
dataset = pd.read_csv('Sentiment.csv')

# Select only the necessary columns 'text' and 'sentiment'
mask = dataset.columns.isin(['text', 'sentiment'])
data = dataset.loc[:, mask]

In [53]:
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))


In [54]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') #Removing Retweets

In [55]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ') #Maximum words is 2000 to tokenize sentence
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values) #taking values to feature matrix

In [56]:
X = pad_sequences(X) #Padding the feature matrix

embed_dim = 128 #Dimension of the Embedded layer
lstm_out = 196 #Long short-term memory (LSTM) layer neurons

In [57]:
def createmodel(optimizer='adam'):
    model = Sequential() #Sequential Neural Network
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) #input dimension 2000 Neurons, output dimension 128 Neurons
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) #Drop out 20%, 196 output Neurons, recurrent dropout 20%
    model.add(Dense(3,activation='softmax')) #3 output neurons[positive, Neutral, Negative], softmax as activation
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #Compiling the model
    return model
# print(model.summary())

In [58]:
labelencoder = LabelEncoder() #Applying label Encoding on the label matrix
integer_encoded = labelencoder.fit_transform(data['sentiment']) #fitting the model
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42) #67% training data, 33% test data split
     

In [59]:
batch_size = 32 #Batch size 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) #evaluating the model
print("The test score is:",score)
print("the test accuracy is:",acc)



291/291 - 35s - 119ms/step - accuracy: 0.6398 - loss: 0.8277
144/144 - 4s - 31ms/step - accuracy: 0.6621 - loss: 0.7793
The test score is: 0.7792977690696716
the test accuracy is: 0.6620795130729675


In [60]:
print(model.metrics_names) #metrics of the model

['loss', 'compile_metrics']


In [61]:
model.save('sentimentAnalysis.h5')



In [62]:
from tensorflow.keras.models import load_model

# Load the saved model
loaded_model = load_model('sentimentAnalysis.h5')



In [63]:
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [65]:
# Predicting on the text data
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence) # Tokenizing the sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) # Padding the sentence
sentiment_probs = model.predict(sentence, batch_size=1, verbose=2)[0] # Predicting the sentence text
sentiment = np.argmax(sentiment_probs)

print(sentiment_probs)
if sentiment == 0:
    print("Neutral")
elif sentiment < 0:
    print("Negative")
elif sentiment > 0:
    print("Positive")
else:
    print("Cannot be determined")

1/1 - 2s - 2s/step
[0.67102534 0.17469758 0.15427707]
Neutral


In [66]:
for optimizer in ['adam', 'rmsprop']:
    print(f"Training with optimizer: {optimizer}")
    model = createmodel(optimizer=optimizer)
    model.fit(X_train, Y_train, epochs=1, batch_size=32, verbose=2)
    score, acc = model.evaluate(X_test, Y_test, verbose=2)
    print(f"Optimizer: {optimizer}, Test Score: {score}, Test Accuracy: {acc}")

Training with optimizer: adam
291/291 - 76s - 262ms/step - accuracy: 0.6391 - loss: 0.8303
144/144 - 4s - 27ms/step - accuracy: 0.6671 - loss: 0.7758
Optimizer: adam, Test Score: 0.7757890224456787, Test Accuracy: 0.6671035289764404
Training with optimizer: rmsprop
291/291 - 29s - 99ms/step - accuracy: 0.6378 - loss: 0.8350
144/144 - 4s - 27ms/step - accuracy: 0.6667 - loss: 0.7537
Optimizer: rmsprop, Test Score: 0.7537410855293274, Test Accuracy: 0.6666666865348816


I tried using the GridSearchCV to tune model's hyperparameters suing below sample code which was mentioned in the class:
model = KerasClassifier(build_fn=model,verbose=0)
batch_size = [10, 20, 40]
epochs = [1, 2, 3]
param_grid = dict(batch_size=batch_size, epochs=epochs)
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, Y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


But faced errors like all 45 fits failed,alueError: Sequential model 'sequential_35' has no defined outputs yet.

so i manually tuned the hyperparameters using adam and rsmprop optimizers
