In [None]:
!pip install nltk



In [None]:
import json
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages: the stopword corpus (read later through
# nltk.corpus.stopwords) and the 'punkt' tokenizer models.
# NOTE(review): word_tokenize is imported above but not called in the visible
# cells, so 'punkt' may be unnecessary -- confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Reading in data

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in the next
# cells live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the training split on the mounted Drive.
file_path_train = "/content/drive/MyDrive/Colab Notebooks/IE4483/data/train.json"

# Parse the whole JSON document into memory; the handle is closed on exit.
with open(file_path_train, mode='r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

In [None]:
# Location of the test split on the mounted Drive.
file_path_test = "/content/drive/MyDrive/Colab Notebooks/IE4483/data/test.json"

# Parse the whole JSON document into memory; the handle is closed on exit.
with open(file_path_test, mode='r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

In [None]:
# Label encoding:
#   0 -> negative sentiment
#   1 -> positive sentiment

# Build one DataFrame per split from the parsed JSON.
train_df, test_df = (pd.DataFrame(split) for split in (train_data, test_data))

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


# Data processing

In [None]:
# Lowercase the reviews of both splits; astype(str) first so that every entry
# supports the .str accessor.
for frame in (train_df, test_df):
    frame["reviews"] = frame["reviews"].astype(str).str.lower()


In [None]:
#Function to remove punctuation
def remove_punc(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # The third argument of str.maketrans lists the characters to drop.
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)


# Strip punctuation from the reviews of both splits.
for frame in (train_df, test_df):
    frame["reviews"] = frame["reviews"].apply(remove_punc)


In [None]:
# Remove special characters and numbers from the "reviews" column:
# anything that is neither an ASCII letter nor whitespace is deleted.
non_letter_pattern = r'[^A-Za-z\s]'
for frame in (train_df, test_df):
    frame['reviews'] = frame['reviews'].str.replace(non_letter_pattern, '', regex=True)

In [None]:
# Function to remove stopwords
_ENGLISH_STOPWORDS = None  # filled in lazily on first use


def _english_stopwords():
    """Load NLTK's English stopword list once and return it as a frozenset."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stopwords removed and words joined by single spaces.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional container of lowercase words to drop. When
            omitted, NLTK's English stopword list is used.

    Returns:
        The remaining words, in their original order and casing, joined by
        single spaces.
    """
    if stop_words is None:
        # The corpus is read once and cached as a set. The previous version
        # re-read the list and scanned it linearly for every single word.
        stop_words = _english_stopwords()
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(kept_words)

# Apply the remove_stopwords function to the 'reviews' column of both splits
for frame in (train_df, test_df):
    frame['reviews'] = frame['reviews'].apply(remove_stopwords)

In [None]:
# Boolean mask over the training rows: True marks a row that repeats an
# earlier row (the first occurrence itself is not flagged).
findDuplicate = train_df.duplicated(keep='first')
print(findDuplicate)

0       False
1       False
2       False
3       False
4       False
        ...  
7396    False
7397    False
7398    False
7399    False
7400    False
Length: 7401, dtype: bool


In [None]:
# Remove repeated rows from the training data only.
train_df = train_df.drop_duplicates()
print(train_df)

# The test split is intentionally NOT de-duplicated: every test review needs a
# prediction, and dropping rows here would silently shrink the set of inputs
# that get one.
print(test_df)

                                                reviews  sentiments
0            bought belt daughter inlaw christmas loved           1
1               size perfect color looked like web page           1
2     fits feels good esp swim race highly recommend...           1
3     socks absolutely best take pilates classes hot...           1
4     thank much speedy delivery came time rehearsal...           1
...                                                 ...         ...
7396  bought shirts black medium wear daily basis di...           0
7397  first thought scarf might good quality since c...           1
7398  picky comes bras want something support comfor...           1
7399  jacket wind water resistant waterproof soked f...           0
7400  extremely confortable material soft cotton pou...           1

[7178 rows x 2 columns]
                                                reviews
0     bought sleepers sleeper holes arm pit area sle...
1     dare say sexiest things ive ever worn oh 

In [None]:
# Persist both processed splits as CSV files, without the index column.
for frame, csv_name in ((train_df, 'processed_train_data.csv'),
                        (test_df, 'processed_test_data.csv')):
    frame.to_csv(csv_name, index=False)

In [None]:
# Show the processed training frame (pandas abbreviates long frames when printing).
print(train_df)

                                                reviews  sentiments
0            bought belt daughter inlaw christmas loved           1
1               size perfect color looked like web page           1
2     fits feels good esp swim race highly recommend...           1
3     socks absolutely best take pilates classes hot...           1
4     thank much speedy delivery came time rehearsal...           1
...                                                 ...         ...
7396  bought shirts black medium wear daily basis di...           0
7397  first thought scarf might good quality since c...           1
7398  picky comes bras want something support comfor...           1
7399  jacket wind water resistant waterproof soked f...           0
7400  extremely confortable material soft cotton pou...           1

[7178 rows x 2 columns]


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [None]:
# Model inputs (review text) go in x and targets (sentiment labels) go in y,
# both as arrays.
x = train_df['reviews'].values
y = train_df['sentiments'].values

In [None]:
# Tokenization + padding of training data
tok = Tokenizer(lower=True)
# Learn the word -> integer index vocabulary from the training reviews.
tok.fit_on_texts(x)
# Replace each review by its list of word indices.
x_sequence = tok.texts_to_sequences(x)
# Pad at the end (padding='post') so that every row has length 32; this must
# stay in sync with the input length given to the Embedding layer.
x_padding = pad_sequences(x_sequence, maxlen=32, padding='post')


In [None]:
# split data into training and validation set
# 25% of the rows are held out for validation; random_state fixes the shuffle
# so the split is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(x_padding, y, test_size=0.25, random_state=1)

In [None]:
# Building RNN model
vocabulary_size = len(tok.word_counts) + 1  # one extra slot on top of the distinct words seen
max_words = 32  # To match the maxlen used when padding the sequences
embedding_size = 200

# Embedding -> LSTM -> a single sigmoid unit for binary sentiment.
model = Sequential([
    Embedding(vocabulary_size, embedding_size, input_length=max_words),
    LSTM(200),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Training the model
# Three passes over the training split in batches of 32; the validation split
# is evaluated alongside training.
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7b9fbe2efe50>

In [None]:
# Evaluation on the held-out validation split
# model.evaluate returns [loss, accuracy] for the metrics configured in compile().
accuracy = model.evaluate(x_val, y_val, verbose=1)
# This is validation accuracy: x_val / y_val come from the train/validation
# split, not from the separate test file.
print("Validation accuracy is: {:.2%}".format(accuracy[1]))


Test accuracy is: 88.69%


In [None]:
# Tokenization + padding of the test reviews.
# Reuse `tok`, the tokenizer fitted on the training reviews. Fitting a fresh
# Tokenizer on the test text would give words different integer indices from
# the ones the Embedding layer was trained on, making the predictions
# meaningless (and would overwrite the training tokenizer).
x_test_sequence = tok.texts_to_sequences(test_df['reviews'].values)
x_test_padding = pad_sequences(x_test_sequence, maxlen=32, padding='post')

# Make predictions: sigmoid probabilities rounded to hard 0/1 labels.
y_test_prediction_probability = model.predict(x_test_padding)
y_test_prediction = np.round(y_test_prediction_probability).astype(int)




In [None]:
# Display the predictions
# Store the predicted labels next to the reviews they were computed from.
test_df['predicted_sentiment'] = y_test_prediction
print(test_df[['reviews', 'predicted_sentiment']])

                                                reviews  predicted_sentiment
0     bought sleepers sleeper holes arm pit area sle...                    1
1     dare say sexiest things ive ever worn oh ive g...                    1
2     everything transaction price delivery time qua...                    1
3     bad shirt durable matched teams colors perfect...                    1
4     truly wrinkle free longer average womans botto...                    1
...                                                 ...                  ...
1845  im glad got solved issues tootight bands wear ...                    1
1847  bought longsleeved colored shirts child wear r...                    1
1848  really cute sexy make nice valentines day pres...                    1
1849  shoers daughter loves long happy happy adidas ...                    1
1850  umbrellas handle light rain well leave wet pan...                    1

[1829 rows x 2 columns]


In [None]:
# The test split has no 'sentiments' column, so its predictions cannot be
# scored. Comparing the predictions with themselves (as was done before) always
# yields a perfect report. Score the held-out validation split instead, where
# the true labels are known.
ground_truth_labels = y_val
predicted_labels = np.round(model.predict(x_val)).astype(int).ravel()

# Generate classification report
class_report = classification_report(ground_truth_labels, predicted_labels, target_names=['0', '1'])

# Display the classification report
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       162
           1       1.00      1.00      1.00      1667

    accuracy                           1.00      1829
   macro avg       1.00      1.00      1.00      1829
weighted avg       1.00      1.00      1.00      1829

