In [None]:
 %pip install -q nltk



In [None]:
import json
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on: the stopword corpus
# (read through stopwords.words('english') during cleaning) and the Punkt
# tokenizer models (presumably required by nltk.word_tokenize further down).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Reading in data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the training JSON on the mounted Google Drive.
file_path_train = "/content/drive/MyDrive/Colab Notebooks/IE4483/data/train.json"


# NOTE(review): `file_path` is not referenced by any code visible in this
# notebook (only file_path_train is opened below) - confirm whether it is
# still needed.
file_path = "train.json"

# Parse the whole training file into memory as Python objects.
with open(file_path_train, 'r', encoding='utf-8') as file:
    train_data = json.load(file)

In [None]:
# Location of the test JSON on the mounted Google Drive.
file_path_test = "/content/drive/MyDrive/Colab Notebooks/IE4483/data/test.json"

# Parse the whole test file into memory as Python objects.
with open(file_path_test, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

In [None]:
# Label encoding:
#   0 -> negative sentiment
#   1 -> positive sentiment

# Build one DataFrame per split from the parsed JSON, in (train, test) order.
train_df, test_df = (pd.DataFrame(raw_split) for raw_split in (train_data, test_data))

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


# Data processing

In [None]:
#Lowercasing
# Cast to str first so the .str accessor can be applied to every value,
# then lower-case the review text in place.
train_df["reviews"] = train_df["reviews"].astype(str).str.lower()
# Not displayed: a notebook cell only renders its final expression.
train_df.head()

# Same normalisation for the test reviews.
test_df["reviews"] = test_df["reviews"].astype(str).str.lower()
# Final expression of the cell, so this preview is what gets rendered.
test_df.head()

Unnamed: 0,reviews
0,i bought 2 sleepers. sleeper had holes in the...
1,i dare say these are just about the sexiest th...
2,"everything about the transaction (price, deliv..."
3,"not bad for just a shirt. very durable, and m..."
4,these are truly wrinkle free and longer than t...


In [None]:
#Function to remove punctuation
def remove_punc(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    All other characters (letters, digits, whitespace) are left untouched.
    """
    # Translation table whose third argument lists the characters to drop.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)


# Strip punctuation from every training review (element-wise via apply).
train_df["reviews"] = train_df["reviews"].apply(remove_punc)
# Not displayed: a notebook cell only renders its final expression.
train_df.head()

# Same clean-up for the test reviews.
test_df["reviews"] = test_df["reviews"].apply(remove_punc)
# Final expression of the cell, so this preview is what gets rendered.
test_df.head()

Unnamed: 0,reviews
0,i bought 2 sleepers sleeper had holes in the ...
1,i dare say these are just about the sexiest th...
2,everything about the transaction price deliver...
3,not bad for just a shirt very durable and mat...
4,these are truly wrinkle free and longer than t...


In [None]:
# Keep only ASCII letters and whitespace in the "reviews" column of both
# splits; digits and any remaining special characters are deleted.
non_letter_pattern = r'[^A-Za-z\s]'
for frame in (train_df, test_df):
    frame['reviews'] = frame['reviews'].str.replace(non_letter_pattern, '', regex=True)

In [None]:
# Function to remove stopwords
_ENGLISH_STOPWORDS = None  # lazily-built cache of NLTK's English stopword list


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stopwords removed.

    The text is split on whitespace; every word whose lower-cased form is in
    the stopword collection is dropped and the remaining words are re-joined
    with single spaces.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : collection of str, optional
        Words to drop (compared against the lower-cased word). Defaults to
        NLTK's English stopword list.

    Returns
    -------
    str
        The filtered text; an empty string if nothing remains.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was re-evaluated for every
        # single word and searched as a list; the cached frozenset is built
        # once and gives constant-time membership tests.
        stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Run remove_stopwords over the 'reviews' column of both splits.
for split_df in (train_df, test_df):
    split_df['reviews'] = split_df['reviews'].apply(remove_stopwords)

In [None]:
# Boolean Series marking rows that repeat an earlier row. No subset is given,
# so every column of train_df takes part in the comparison.
findDuplicate = train_df.duplicated()
print(findDuplicate)

0       False
1       False
2       False
3       False
4       False
        ...  
7396    False
7397    False
7398    False
7399    False
7400    False
Length: 7401, dtype: bool


In [None]:
# Drop repeated rows from the training data. No subset is given, so all
# columns are compared: the same review text with a different label is kept.
# This runs on the cleaned text, so reviews that only differed in case,
# punctuation, digits or stopwords now count as duplicates too.
# The index is not reset, so the original row labels are preserved (with gaps).
train_df = train_df.drop_duplicates()
print(train_df)

# NOTE(review): this also removes rows from the test set, so the dropped test
# reviews get no prediction later in the notebook - confirm this is intended.
test_df = test_df.drop_duplicates()
print(test_df)

                                                reviews  sentiments
0            bought belt daughter inlaw christmas loved           1
1               size perfect color looked like web page           1
2     fits feels good esp swim race highly recommend...           1
3     socks absolutely best take pilates classes hot...           1
4     thank much speedy delivery came time rehearsal...           1
...                                                 ...         ...
7396  bought shirts black medium wear daily basis di...           0
7397  first thought scarf might good quality since c...           1
7398  picky comes bras want something support comfor...           1
7399  jacket wind water resistant waterproof soked f...           0
7400  extremely confortable material soft cotton pou...           1

[7178 rows x 2 columns]
                                                reviews
0     bought sleepers sleeper holes arm pit area sle...
1     dare say sexiest things ive ever worn oh 

In [None]:
#Save df as csv
train_df.to_csv('processed_train_data.csv', index=False)
test_df.to_csv('processed_test_data.csv', index=False)

In [None]:
# Split each cleaned review into a list of word tokens with NLTK's
# word_tokenize, which is imported from nltk.tokenize at the top of the
# notebook. No local wrapper is defined here: a function named word_tokenize
# would shadow that import while adding nothing to it.
# NOTE: this replaces the string column with lists of tokens, so re-running
# the cell on already-tokenised data is not expected to work - re-run the
# cleaning cells first.
train_df['reviews'] = train_df['reviews'].apply(word_tokenize)

test_df['reviews'] = test_df['reviews'].apply(word_tokenize)

In [None]:
# Show the training frame after tokenisation (each review is now a list of tokens).
print(train_df)

                                                reviews  sentiments
0     [bought, belt, daughter, inlaw, christmas, loved]           1
1       [size, perfect, color, looked, like, web, page]           1
2     [fits, feels, good, esp, swim, race, highly, r...           1
3     [socks, absolutely, best, take, pilates, class...           1
4     [thank, much, speedy, delivery, came, time, re...           1
...                                                 ...         ...
7396  [bought, shirts, black, medium, wear, daily, b...           0
7397  [first, thought, scarf, might, good, quality, ...           1
7398  [picky, comes, bras, want, something, support,...           1
7399  [jacket, wind, water, resistant, waterproof, s...           0
7400  [extremely, confortable, material, soft, cotto...           1

[7178 rows x 2 columns]


In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Seed NumPy and TensorFlow before anything stochastic runs so that weight
# initialisation, dropout and batch shuffling are repeatable between runs.
# (Some accelerator kernels may still introduce small run-to-run differences.)
SEED = 1
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Vocabulary cap for the Keras tokenizer (not a sequence length): only the
# most frequent words keep their own index, the rest map to the OOV token.
max_words = 700

# Hold out 20% of the labelled data for validation. X_test / y_test are
# therefore a slice of train_df, not the contents of test_df.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['reviews'], train_df['sentiments'], test_size=0.2, random_state=SEED
)

# Tokenization: the vocabulary is fitted on the training split only, so the
# held-out reviews do not influence the word index.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Padding: every sequence is brought to the length of the longest training
# sequence; max_length is reused later when encoding other data.
max_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = sequence.pad_sequences(X_train_sequences, maxlen=max_length)
X_test_padded = sequence.pad_sequences(X_test_sequences, maxlen=max_length)

# Convert labels to NumPy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Create the CNN model
embedding_dim = 50  # Adjust based on your preference
# One softmax output unit per distinct label seen in the training split.
num_classes = len(set(y_train))

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model. The sparse loss takes the integer class labels directly,
# so no one-hot encoding of y is needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 64
epochs = 10

model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), batch_size=batch_size, epochs=epochs)

# Evaluate on the held-out split. This is the same data that was passed as
# validation_data above, so the figure is a validation accuracy.
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {test_acc * 100:.2f}%')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 88.02%


In [None]:
from sklearn.metrics import classification_report

# Make predictions on the held-out split (X_test_padded pairs with y_test)
y_pred_probs = model.predict(X_test_padded)

# Convert predicted probabilities to classes (index of the largest softmax output)
y_pred = np.argmax(y_pred_probs, axis=1)

# Display the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.63      0.50      0.56       217
           1       0.91      0.95      0.93      1219

    accuracy                           0.88      1436
   macro avg       0.77      0.72      0.74      1436
weighted avg       0.87      0.88      0.87      1436



In [None]:
import json
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenize and pad the reviews in test_df, using the tokenizer and max_length
# from training. Dedicated names are used here so that the X_test_sequences /
# X_test_padded arrays are not overwritten: those hold the held-out split
# that matches y_test, and replacing them would make any later evaluation
# against y_test run on the wrong data.
test_sequences = tokenizer.texts_to_sequences(test_df['reviews'])
test_padded = pad_sequences(test_sequences, maxlen=max_length)

# Make predictions: class = index of the largest softmax output per review
y_test_pred_probs = model.predict(test_padded)
y_test_pred = np.argmax(y_test_pred_probs, axis=1)



In [None]:
# Display the predictions
# y_test_pred is a NumPy array, so it is attached to test_df by position;
# it has one entry per row because it was predicted from test_df['reviews'].
test_df['predicted_sentiment'] = y_test_pred
print(test_df[['reviews', 'predicted_sentiment']])

                                                reviews  predicted_sentiment
0     [bought, sleepers, sleeper, holes, arm, pit, a...                    0
1     [dare, say, sexiest, things, ive, ever, worn, ...                    1
2     [everything, transaction, price, delivery, tim...                    1
3     [bad, shirt, durable, matched, teams, colors, ...                    1
4     [truly, wrinkle, free, longer, average, womans...                    1
...                                                 ...                  ...
1845  [im, glad, got, solved, issues, tootight, band...                    1
1847  [bought, longsleeved, colored, shirts, child, ...                    0
1848  [really, cute, sexy, make, nice, valentines, d...                    1
1849  [shoers, daughter, loves, long, happy, happy, ...                    1
1850  [umbrellas, handle, light, rain, well, leave, ...                    1

[1829 rows x 2 columns]
