In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Load the tab-separated review dataset; lines pandas cannot parse are skipped
# (on_bad_lines='skip') instead of aborting the load.
data = pd.read_csv('FinalDataset.tsv', sep='\t', on_bad_lines='skip')

# Discard rows that lack either of the two columns the rest of the notebook uses.
# A missing review text is read as a float NaN rather than a string, and a missing
# rating is not a valid class label, so neither can be used for training.
data = data.dropna(subset=['review_body', 'star_rating']).reset_index(drop=True)


In [None]:
import re

def clean_text(text):
    """Normalise one review string for tokenisation.

    Steps, in order:
      1. Missing values (None or float NaN) become the empty string.
      2. Any other non-string value is converted with ``str()``.
      3. HTML tags are replaced by a single space, so that words separated
         only by a tag (e.g. ``good<br />bad``) are not fused into one token.
      4. Every character that is not an ASCII letter, digit or whitespace
         is removed.
      5. The result is lower-cased.

    Parameters
    ----------
    text : object
        The raw review text; normally a ``str``.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Replace HTML tags with a space rather than nothing to keep word boundaries.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-alphanumeric characters (whitespace is kept).
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase.
    return text.lower()

# Run every review through clean_text and store the result back in the same column.
data['review_body'] = data['review_body'].map(clean_text)


In [None]:
# Inputs are the review texts; targets are the star ratings.
X = data['review_body']
y = data['star_rating']

# Encode each rating as an integer class id, then expand the ids into one-hot rows
# by indexing an identity matrix with them.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = np.eye(np.unique(y).size)[y]

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The tokenizer is fitted on the training texts only and then applied to both splits.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Both splits are padded/truncated at the end to the same fixed length.
pad_options = dict(maxlen=100, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_seq, **pad_options)
X_test_padded = pad_sequences(X_test_seq, **pad_options)


In [None]:
# The softmax layer must have exactly one unit per column of the one-hot targets.
# Reading the width from y_train (instead of re-counting distinct values in the raw
# star_rating column) guarantees the two cannot disagree.
num_classes = y_train.shape[1]

model = Sequential([
    # Embedding layer: input_dim matches the tokenizer's num_words (5000) and
    # input_length matches the padded sequence length (100).
    Embedding(input_dim=5000, output_dim=64, input_length=100),

    # Convolutional layer: 128 filters of width 5, followed by a max over the
    # sequence axis.
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),

    # Dense layers: a hidden layer, then one softmax unit per class.
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Compile the model; categorical cross-entropy pairs with the one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Train for five epochs. The held-out test split is passed as the validation data,
# so the per-epoch validation metrics are computed on the same rows that are used
# for the final evaluation.
model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=5,
    validation_data=(X_test_padded, y_test),
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bef1f66c670>

In [None]:
# Predictions: one row of class scores per test review.
y_pred = model.predict(X_test_padded)

# Reduce both the predicted scores and the one-hot targets to class indices.
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Function to map original labels to new categories
def map_sentiment(label):
    """Collapse a class index into one of three sentiment categories.

    Indices 0 and 1 map to 'negative', index 2 maps to 'neutral', and every
    other value maps to 'positive'. The indices are assumed to be 0-based
    positions of the original labels.
    """
    if label in (0, 1):
        return 'negative'
    if label == 2:
        return 'neutral'
    return 'positive'

from sklearn.metrics import classification_report, confusion_matrix

# Fixed ordering of the three categories, used for both the report rows and the
# confusion-matrix rows/columns.
sentiment_labels = ['negative', 'neutral', 'positive']

# Translate true and predicted class indices into sentiment categories.
y_test_mapped = list(map(map_sentiment, y_true_classes))
y_pred_mapped = list(map(map_sentiment, y_pred_classes))

# Per-category precision / recall / F1 plus overall accuracy.
print(classification_report(y_test_mapped, y_pred_mapped, labels=sentiment_labels))

# Confusion matrix in the same category order.
sentiment_confusion = confusion_matrix(y_test_mapped, y_pred_mapped, labels=sentiment_labels)
print("\nConfusion Matrix:\n", sentiment_confusion)



              precision    recall  f1-score   support

    negative       0.79      0.67      0.72       717
     neutral       0.32      0.26      0.29       209
    positive       0.91      0.96      0.93      3063

    accuracy                           0.87      3989
   macro avg       0.68      0.63      0.65      3989
weighted avg       0.86      0.87      0.86      3989


Confusion Matrix:
 [[ 478   65  174]
 [  45   55  109]
 [  79   51 2933]]
