Sentiment Analysis for Twitter data

In [2]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [3]:
data = pd.read_csv('/content/Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

X = pad_sequences(X)

embed_dim = 128
lstm_out = 196
def createmodel():
    model = Sequential()
    model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model
# print(model.summary())

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2)
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)
print(model.metrics_names)




291/291 - 29s - 98ms/step - accuracy: 0.6383 - loss: 0.8373
144/144 - 3s - 22ms/step - accuracy: 0.6651 - loss: 0.7659
0.7658703923225403
0.6651375889778137
['loss', 'compile_metrics']


In [6]:
model.save('SentimentAnalysis.h5')



In [8]:
import tweepy
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re

In [9]:
# Load the saved model
model = load_model('/content/SentimentAnalysis.h5')

# Define a function for preprocessing text
def preprocess_data(text):
  text = text.lower()
  text = re.sub('[^a-zA-z0-9\s]', '', text)
  return text

new_data = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing"
# Preprocess the new text data
new_data = preprocess_data(new_data)

# Tokenize and pad the new text data
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts([new_data])
X_new = tokenizer.texts_to_sequences([new_data])
X_new = pad_sequences(X_new, maxlen=model.input_shape[1])

# Make predictions
predictions = model.predict(X_new)

# Determine the sentiment based on the prediction
sentiments = ['Negative', 'Neutral', 'Positive']
predicted_sentiment = sentiments[predictions.argmax()]

# Print the result
print("Predicted Sentiment: " + predicted_sentiment)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
Predicted Sentiment: Negative


Apply GridSearchCV on the source code

In [10]:
# !pip install scikeras
from scikeras.wrappers import KerasClassifier

In [33]:

import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from scikeras.wrappers import KerasClassifier

# Assuming the data loading and preprocessing steps are the same
model = load_model('/content/SentimentAnalysis.h5')
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
# Assuming tokenizer fitting and text preprocessing is done here

def createmodel(optimizer='adam'):
    model1 = Sequential()
    model1.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
    model1.add(SpatialDropout1D(0.2))
    model1.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model1.add(Dense(3, activation='softmax'))
    model1.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Define the KerasClassifier with the build_fn as our model creation function
model = KerasClassifier(model, verbose=2)

# Define hyperparameters to tune
param_grid = {
    'batch_size': [32, 64],
    'epochs': [1, 2],
    'optimizer': ['adam', 'rmsprop']
}



In [34]:

# Initialize GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)


In [35]:
# Fit GridSearchCV
grid_result = grid.fit(X_train, Y_train)

# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

  saveable.load_own_variables(weights_store.get(inner_path))


194/194 - 16s - 82ms/step - accuracy: 0.7080 - loss: 0.6942
97/97 - 2s - 25ms/step
194/194 - 16s - 83ms/step - accuracy: 0.7106 - loss: 0.6834
97/97 - 2s - 25ms/step
194/194 - 16s - 80ms/step - accuracy: 0.7079 - loss: 0.6864
97/97 - 2s - 24ms/step
194/194 - 16s - 81ms/step - accuracy: 0.7052 - loss: 0.6961
97/97 - 2s - 25ms/step
194/194 - 16s - 84ms/step - accuracy: 0.7070 - loss: 0.6809
97/97 - 3s - 32ms/step
194/194 - 16s - 82ms/step - accuracy: 0.7055 - loss: 0.6821
97/97 - 4s - 38ms/step
Epoch 1/2
194/194 - 16s - 81ms/step - accuracy: 0.7035 - loss: 0.6947
Epoch 2/2
194/194 - 20s - 105ms/step - accuracy: 0.7406 - loss: 0.6278
97/97 - 4s - 36ms/step
Epoch 1/2
194/194 - 16s - 85ms/step - accuracy: 0.7075 - loss: 0.6839
Epoch 2/2
194/194 - 20s - 104ms/step - accuracy: 0.7422 - loss: 0.6152
97/97 - 3s - 36ms/step
Epoch 1/2
194/194 - 17s - 89ms/step - accuracy: 0.7080 - loss: 0.6787
Epoch 2/2
194/194 - 13s - 68ms/step - accuracy: 0.7456 - loss: 0.6094
97/97 - 3s - 28ms/step
Epoch 1/2
1