In [1]:
import numpy as np
import pandas as pd

#### Step 1: Load the datasets

In [2]:
#load the preprocessed train data
df_train_preprocessed= pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews_train.csv')


In [3]:
#load the preprocessed test data
df_test_preprocessed = pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews_test.csv')

In [4]:
df_train_preprocessed.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"['im', 'reading', 'lot', 'review', 'saying', '..."
1,2,Amazing!,"['soundtrack', 'favorite', 'music', 'time', 'h..."
2,2,Excellent Soundtrack,"['truly', 'like', 'soundtrack', 'enjoy', 'vide..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","['youve', 'played', 'game', 'know', 'divine', ..."
4,2,an absolute masterpiece,"['quite', 'sure', 'actually', 'taking', 'time'..."


In [19]:
df_test_preprocessed.head()

Unnamed: 0,polarity,title,text
0,2,One of the best game music soundtracks - for a...,"['despite', 'fact', 'played', 'small', 'portio..."
1,1,Batteries died within a year ...,"['bought', 'charger', 'jul', '2003', 'worked',..."
2,2,"works fine, but Maha Energy is better","['check', 'maha', 'energy', 'website', 'powere..."
3,2,Great for the non-audiophile,"['reviewed', 'quite', 'bit', 'combo', 'player'..."
4,1,DVD Player crapped out after one year,"['also', 'began', 'incorrect', 'disc', 'proble..."


#### Step 2: Convert text to numerical features. 
Options are: Bag of word (BoW), Term Frequency-Inverse Document Frequency (TF-IDF), and Word Embeddings (Optional for Deep Learning)

In [6]:
#convert strings in train and test set to list on column 'text'

df_train_preprocessed['text'] = df_train_preprocessed['text'].apply(lambda x: eval(x))
df_test_preprocessed['text'] = df_test_preprocessed['text'].apply(lambda x: eval(x))

#check the type of column 'text'
print(type(df_train_preprocessed['text'].iloc[0]))
print(type(df_test_preprocessed['text'].iloc[0]))

<class 'list'>
<class 'list'>


In [7]:
#join the tokenized words into a single string for each row
df_train_preprocessed['text'] = df_train_preprocessed['text'].apply(lambda x: ' '.join(x))
df_test_preprocessed['text'] = df_test_preprocessed['text'].apply(lambda x: ' '.join(x))

#verify the joined text
print(df_train_preprocessed['text'].head())
print(df_test_preprocessed['text'].head())

0    im reading lot review saying best game soundtr...
1    soundtrack favorite music time hand intense sa...
2    truly like soundtrack enjoy video game music p...
3    youve played game know divine music every sing...
4    quite sure actually taking time read played ga...
Name: text, dtype: object
0    despite fact played small portion game music h...
1    bought charger jul 2003 worked ok design nice ...
2    check maha energy website powerex mhc204f char...
3    reviewed quite bit combo player hesitant due u...
4    also began incorrect disc problem ive read vcr...
Name: text, dtype: object


In [8]:
#Start with the simplest model, Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

#initialize the vectorizer
vectorizer = CountVectorizer()

#Fit and transform the train data
X_train = vectorizer.fit_transform(df_train_preprocessed['text'])

#Transform the test data using the same vocabulary
X_test =vectorizer.transform(df_test_preprocessed['text'])

Step 3: Model training

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#train a logistic regression model
model = LogisticRegression()
model.fit(X_train, df_train_preprocessed['polarity'])

#make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model
accuracy = accuracy_score(df_test_preprocessed['polarity'], y_pred)
print(f"Accuracy: {accuracy}")

#Print a classification report for detailed evaluation
print(classification_report(df_test_preprocessed['polarity'], y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8745421863554659
              precision    recall  f1-score   support

           1       0.88      0.87      0.87    200000
           2       0.87      0.88      0.88    199999

    accuracy                           0.87    399999
   macro avg       0.87      0.87      0.87    399999
weighted avg       0.87      0.87      0.87    399999



#### Time for a more advanced model to train this dataset like Deep Learning Models for NLP: RNNs, LSTMs, and GRUs
- Vanilla RNN: Use for tasks with short sequences or when computational efficiency is critical.
- LSTM: Use for tasks with longer sequences, where retaining long-term dependencies is crucial (e.g., machine translation, long text classification).
- GRU: Use when you want a compromise between LSTM’s performance and RNN’s efficiency.

In [15]:
#load the data again: 
#the preprocessed train data
df_train_preprocessed= pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews_train.csv')


In [16]:
#load the preprocessed test data
df_test_preprocessed = pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews_test.csv')

In [17]:
#Import necessary libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense 

#LSTM model 
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128)) # Embedding layer for word vector
model.add(LSTM(units=128, return_sequences=False)) #LSTM layer
model.add(Dense(units=1, activation='sigmoid')) #output layer for binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [24]:
#preprocess the data
#step 1: fit a tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

#initialize the tokenizer
tokenizer = Tokenizer(num_words=10000) #choose vocab size based on dataset size
tokenizer.fit_on_texts(df_train_preprocessed['text']) #fit on the training data

#Convert text to sequences
X_train = tokenizer.texts_to_sequences(df_train_preprocessed['text'])
X_test = tokenizer.texts_to_sequences(df_test_preprocessed['text'])

In [30]:
#step 2: pad the sequence
#LSTM model expects sequences to be of the same length, so padding is neccesary

from tensorflow.keras.preprocessing.sequence import pad_sequences

#pad sequences to ensure they are the same length
X_train = pad_sequences(X_train, maxlen=100)
X_test = pad_sequences(X_test, maxlen=100)

In [31]:
#determine y_train and y_test
y_train = df_train_preprocessed['polarity']
y_test = df_test_preprocessed['polarity']

In [None]:
#step 3: traint he LSTM model

history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m   658/112500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:13:34[0m 72ms/step - accuracy: 0.4888 - loss: -15.5928