# Sentiment (trying to understand NLP)

We need to run this in the HPC as soon as possible.

In [13]:
#Import modules
import pandas as pd
import numpy as np

In [14]:
#Load the sentiment dataset
df = pd.read_csv('sentiment_data.csv')
df.dropna(inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)
df

Unnamed: 0,Comment,Sentiment
0,lets forget apple pay required brand new iphon...,1
1,nz retailers don’t even contactless credit car...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesn’t take apple pay doesn...,0
4,apple pay convenient secure easy use used kore...,2
...,...,...
241140,crores paid neerav modi recovered congress lea...,0
241141,dear rss terrorist payal gawar modi killing pl...,0
241142,cover interaction forum left,1
241143,big project came india modi dream project happ...,1


In [15]:
#Count how many positive and negative reviews we have
df['Sentiment'].value_counts()

Sentiment
2    103046
1     82777
0     55105
Name: count, dtype: int64

We have a bunch more positive phrases than negative; is there enough reason to oversample the minority class? Look into the obtained confusion matrices and metrics.
[30/06/2025]: After running tests 0 and 2 get different scores, so class imbalance is hurting the model a little bit, especially in the case of more complex algorithms. -> SMOTE.

In [16]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X = df['Comment']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [17]:
#First simple approach: CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
#Import random forest classifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier    
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Count the number of different words in the dataset
print(f'Number of training samples: {len(X_train)}')
print(f'Number of unique words in the training dataset: {len(set(" ".join(X_train).split()))}')
#Create a CountVectorizer object
vectorizer = CountVectorizer()
#Fit and transform the training data
X_train_vectorized = vectorizer.fit_transform(X_train)
#Transform the test data
X_test_vectorized = vectorizer.transform(X_test)   
print(f'Number of features: {X_train_vectorized.shape[1]}')
print(f'Number of training samples (from matrix size) from : {X_train_vectorized.shape[0]}')
#SMOTE on training set [30/06/2025]
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_vectorized, y_train_smoted = smote.fit_resample(X_train_vectorized, y_train)


Number of training samples: 216835
Number of unique words in the training dataset: 149124
Number of features: 143649
Number of training samples (from matrix size) from : 216835


We can clearly see that the numbers match, number of unique words is very similar (should be about the same, processing of commas, exclamations aside) as features that each word vector has. Also, the number of training samples is exactly the same as the number of vectors in that vectorized matrix.

## Trying the Naive-Bayes classifier

In [None]:
 
#Create a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
#Fit the classifier on the training data
classifier.fit(X_train_vectorized, y_train_smoted)
#Make predictions on the test data
y_pred = classifier.predict(X_test_vectorized)
#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.69
              precision    recall  f1-score   support

           0       0.58      0.71      0.64      5484
           1       0.75      0.53      0.62      8337
           2       0.71      0.80      0.75     10272

    accuracy                           0.69     24093
   macro avg       0.68      0.68      0.67     24093
weighted avg       0.70      0.69      0.68     24093

[[3920  494 1070]
 [1674 4386 2277]
 [1108  941 8223]]


## Trying a more complex model: XGBoost (Random Forest Classifier in steroids)

In this dataset, we can skip the outlier identification, it should be all correct.

Now train the classifier with the best hyperparams. We applied optuna (put in in a cluster and obtained the next parameter results).

In [29]:
xgb_classifier = XGBClassifier(
    n_estimators=200,
    max_depth=17,
    learning_rate=0.7868971451620371,
    subsample=0.823473166496611,
    colsample_bytree=0.6167464902271917,
    min_child_weight=1,
    gamma=0.6279386777664179,
    reg_alpha=0.015410166345441269,
    reg_lambda=0.3200061281054938,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
xgb_classifier.fit(X_train_vectorized, y_train_smoted)
#Make predictions on the test data
y_pred_xgb = xgb_classifier.predict(X_test_vectorized)
#Evaluate the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'XGBoost Classifier Accuracy: {accuracy_xgb:.2f}')
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Classifier Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.82      0.74      0.78      5484
           1       0.81      0.89      0.84      8337
           2       0.89      0.86      0.87     10272

    accuracy                           0.84     24093
   macro avg       0.84      0.83      0.83     24093
weighted avg       0.84      0.84      0.84     24093

[[4082  801  601]
 [ 396 7409  532]
 [ 486  991 8795]]


## Neural Networks

In [6]:
from sklearn.neural_network import MLPClassifier

# Create and train the neural network
nn = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, random_state=42, verbose = True)
nn.fit(X_train_vectorized, y_train_smoted)

# Predict and evaluate
y_pred_nn = nn.predict(X_test_vectorized)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
print(f"Neural Network Accuracy: {accuracy_nn:.2f}")
print(classification_report(y_test, y_pred_nn))
print(confusion_matrix(y_test, y_pred_nn))

Neural Network Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.74      0.79      0.77      5484
           1       0.85      0.83      0.84      8337
           2       0.88      0.87      0.88     10272

    accuracy                           0.84     24093
   macro avg       0.83      0.83      0.83     24093
weighted avg       0.84      0.84      0.84     24093

[[4330  510  644]
 [ 861 6900  576]
 [ 640  673 8959]]




In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Build LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    LSTM(64),
    Dense(3, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test), verbose=1)

Epoch 1/10
[1m6777/6777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 34ms/step - accuracy: 0.7509 - loss: 0.6133 - val_accuracy: 0.8409 - val_loss: 0.4338
Epoch 2/10
[1m6777/6777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 35ms/step - accuracy: 0.8603 - loss: 0.3890 - val_accuracy: 0.8512 - val_loss: 0.4149
Epoch 3/10
[1m6777/6777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 35ms/step - accuracy: 0.8823 - loss: 0.3260 - val_accuracy: 0.8552 - val_loss: 0.4185
Epoch 4/10
[1m6777/6777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 35ms/step - accuracy: 0.9070 - loss: 0.2621 - val_accuracy: 0.8551 - val_loss: 0.4336
Epoch 5/10
[1m6777/6777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 35ms/step - accuracy: 0.9285 - loss: 0.2052 - val_accuracy: 0.8549 - val_loss: 0.4816
Epoch 6/10
[1m6777/6777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 35ms/step - accuracy: 0.9482 - loss: 0.1537 - val_accuracy: 0.8529 - val_loss: 0.535

<keras.src.callbacks.history.History at 0x29408fb10>

In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict class labels for the test set
y_pred_lstm = model.predict(X_test_pad)
y_pred_lstm_classes = y_pred_lstm.argmax(axis=1)  # Get the class with highest probability

# Evaluate
print("LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm_classes))
print("Classification Report:\n", classification_report(y_test, y_pred_lstm_classes))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm_classes))

[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step
LSTM Accuracy: 0.8489602789191881
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.80      5484
           1       0.85      0.86      0.85      8337
           2       0.87      0.87      0.87     10272

    accuracy                           0.85     24093
   macro avg       0.84      0.84      0.84     24093
weighted avg       0.85      0.85      0.85     24093

Confusion Matrix:
 [[4304  545  635]
 [ 497 7170  670]
 [ 539  753 8980]]


LSTM gave us some pretty amazing results.

# LSTM + XGBoost ensemble

In [30]:
# 1. Train both models (already done)
# xgb_classifier.fit(X_train_vectorized, y_train_smoted)
# model.fit(X_train_pad, y_train, ...)

# 2. Get predicted probabilities for the test set
proba_xgb = xgb_classifier.predict_proba(X_test_vectorized)  # shape: (n_samples, n_classes)
proba_lstm = model.predict(X_test_pad)                       # shape: (n_samples, n_classes)

# 3. Average the probabilities
ensemble_proba = (proba_xgb + proba_lstm) / 2

# 4. Get final predictions
ensemble_pred = np.argmax(ensemble_proba, axis=1)

# 5. Evaluate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Ensemble Accuracy:", accuracy_score(y_test, ensemble_pred))
print("Classification Report:\n", classification_report(y_test, ensemble_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, ensemble_pred))

[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step
Ensemble Accuracy: 0.8659776698626157
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.80      0.82      5484
           1       0.86      0.88      0.87      8337
           2       0.89      0.89      0.89     10272

    accuracy                           0.87     24093
   macro avg       0.86      0.86      0.86     24093
weighted avg       0.87      0.87      0.87     24093

Confusion Matrix:
 [[4395  511  578]
 [ 410 7376  551]
 [ 457  722 9093]]
