# Import necessary libraries

In [None]:
import pandas as pd
import gensim.downloader as api
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import LinearSVC
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import sys

In [None]:
print("Pandas version:", pd.__version__)
print("Gensim version:", gensim.__version__)
print("NLTK version:", nltk.__version__)
print("NumPy version:", np.__version__)
print("Torch version:", torch.__version__)
print("TorchVision version:", torchvision.__version__)
print("Python version:", sys.version)

Pandas version: 2.1.4
Gensim version: 4.3.2
NLTK version: 3.8.1
NumPy version: 1.26.2
Torch version: 2.1.2
TorchVision version: 0.16.2
Python version: 3.9.12 (main, Apr  5 2022, 01:52:34) 
[Clang 12.0.0 ]


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [None]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyamvora/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/priyamvora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/priyamvora/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# 1. Dataset Generation

In [None]:
# Load the tab-separated reviews file. on_bad_lines='skip' makes pandas drop
# malformed lines instead of raising a parser error.
df = pd.read_csv('data.tsv',on_bad_lines='skip', sep='\t')

  df = pd.read_csv('data.tsv',on_bad_lines='skip', sep='\t')


# Keep review and ratings and assign classes

In [None]:
# Keep only the rating and the review text, and drop rows where either is missing.
df = df[['star_rating', 'review_body']].dropna()

# 'star_rating' holds a mix of strings and numbers, so coerce it to numeric.
# Values that cannot be parsed become NaN and must be dropped *before* the
# integer cast, otherwise astype(int) raises on NaN.
df = (
    df.assign(star_rating=pd.to_numeric(df['star_rating'], errors='coerce'))
      .dropna(subset=['star_rating'])
      .astype({'star_rating': int})
)

# Get counts of reviews for each sentiment class
star_rating = df['star_rating']
reviews_greater_than_3 = int((star_rating > 3).sum())
reviews_less_than_equal_2 = int((star_rating <= 2).sum())
reviews_equal_3 = int((star_rating == 3).sum())

print("Number of Reviews with Rating > 3:", reviews_greater_than_3)
print("Number of Reviews with Rating <= 2:", reviews_less_than_equal_2)
print("Number of Reviews with Rating = 3:", reviews_equal_3)

# Sentiment classes: 0 = positive (rating > 3), 1 = negative (rating <= 2),
# 2 = neutral (rating == 3). Ratings are integers at this point, so every row
# falls into exactly one of the three classes.
df['sentiment'] = np.select([star_rating > 3, star_rating <= 2], [0, 1], default=2).astype(int)

# Draw the same number of reviews for every star rating. Each rating is sampled
# independently with the same seed, so the selection is reproducible.
SAMPLES_PER_RATING = 50000
rating_one, rating_two, rating_three, rating_four, rating_five = (
    df[df['star_rating'] == rating].sample(n=SAMPLES_PER_RATING, random_state=42)
    for rating in range(1, 6)
)

downsized_df = pd.concat([rating_one, rating_two, rating_three, rating_four, rating_five])

Number of Reviews with Rating > 3: 2001052
Number of Reviews with Rating <= 2: 445348
Number of Reviews with Rating = 3: 193680


Sampled 50,000 reviews for each star rating (250,000 reviews in total), with sentiment classes assigned (0 = rating > 3, 1 = rating <= 2, 2 = rating == 3).

# Preprocess and clean

In [None]:
# Contraction -> expansion lookup. Where several expansions are possible they are
# separated by ' / ' and only the first one is used by expand_contractions().
contractions = {
    "ain't": 'am not / is not / are not / has not / have not', "aren't": 'are not',
    "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because',
    "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have',
    "didn't": 'did not', "doesn't": 'does not', "don't": 'do not',
    "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not',
    "he'd": 'he would / he had', "he'd've": 'he would have', "he'll": 'he will',
    "he'll've": 'he will have', "he's": 'he is / he has',
    "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is',
    "I'd": 'I would / I had', "I'd've": 'I would have', "I'll": 'I will',
    "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have',
    "isn't": 'is not',
    "it'd": 'it would / it had', "it'd've": 'it would have', "it'll": 'it will',
    "it'll've": 'it will have', "it's": 'it is / it has',
    "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not',
    "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have',
    "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have',
    "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock',
    "oughtn't": 'ought not', "oughtn't've": 'ought not have',
    "shan't": 'shall not', "sha'n't": 'shall not', "shan't've": 'shall not have',
    "she'd": 'she would / she had', "she'd've": 'she would have', "she'll": 'she will',
    "she'll've": 'she will have', "she's": 'she is / she has',
    "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have',
    "so've": 'so have', "so's": 'so is',
    "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is / that has',
    "there'd": 'there had', "there'd've": 'there would have', "there's": 'there is / there has',
    "they'd": 'they would / they had', "they'd've": 'they would have', "they'll": 'they will',
    "they'll've": 'they will have', "they're": 'they are', "they've": 'they have',
    "to've": 'to have', "wasn't": 'was not',
    "we'd": 'we would / we had', "we'd've": 'we would have', "we'll": 'we will',
    "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', "weren't": 'were not',
    "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are',
    "what's": 'what is / what has', "what've": 'what have',
    "when's": 'when is', "when've": 'when have',
    "where'd": 'where did', "where's": 'where is / where has', "where've": 'where have',
    "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is / who has',
    "who've": 'who have',
    "why's": 'why is', "why've": 'why have',
    "will've": 'will have', "won't": 'will not', "won't've": 'will not have',
    "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have',
    "y'all": 'you all', "y'alls": 'you alls', "y'all'd": 'you all would',
    "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have',
    "you'd": 'you would / you had', "you'd've": 'you would have',
    # Fixed: these two previously expanded to 'you you will ...'.
    "you'll": 'you will', "you'll've": 'you will have',
    "you're": 'you are', "you've": 'you have',
    "who'd": 'who would / who had', "who're": 'who are',
}


def expand_contractions(text):
    """Replace every known contraction in `text` with its first listed expansion.

    Contractions are tried longest-first so that a compound form such as
    "can't've" is expanded as a whole instead of being half-rewritten by its
    shorter prefix "can't". Each key is matched both as written and in lower
    case (with a lower-cased expansion), so keys like "I'm" also match text
    that has already been lower-cased.

    Parameters
    ----------
    text : str
        Text that still contains its apostrophes.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    for contraction in sorted(contractions, key=len, reverse=True):
        # Select the first option when there are multiple choices
        first_option = contractions[contraction].split('/')[0].strip()
        text = text.replace(contraction, first_option)
        text = text.replace(contraction.lower(), first_option.lower())
    return text


In [None]:
# Text cleaning pipeline. The order matters: contractions have to be expanded
# while the apostrophes are still present, i.e. before non-letters are removed.
cleaned_reviews = downsized_df['review_body'].str.lower()
# Strip HTML markup, keeping only the visible text fragments.
cleaned_reviews = cleaned_reviews.apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))
# Remove URLs (raw string so that \S is a regex class, not a string escape).
cleaned_reviews = cleaned_reviews.str.replace(r'http[s]?://\S+', '', regex=True)
cleaned_reviews = cleaned_reviews.apply(expand_contractions)
# Drop everything that is not a letter or a space, then collapse repeated spaces.
cleaned_reviews = cleaned_reviews.str.replace(r'[^a-zA-Z ]', '', regex=True)
cleaned_reviews = cleaned_reviews.str.replace(' +', ' ', regex=True)
downsized_df['review_body'] = cleaned_reviews

  downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))
  downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))


In [None]:
# lemmatizer = WordNetLemmatizer()
# def process_stop_filtered_reviews(review):
#     tokens = word_tokenize(review)
#     pos_tags = pos_tag(tokens)
#     pos_tags_mapped = [(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
#     lemmatized_tokens = [lemmatizer.lemmatize(word, pos=tag) if tag is not None else word for word, tag in pos_tags_mapped]
#     return ' '.join(lemmatized_tokens)

# def get_wordnet_pos(tag):
#     if tag.startswith('N'):
#         return 'n'
#     elif tag.startswith('V'):
#         return 'v'
#     elif tag.startswith('R'):
#         return 'r'
#     elif tag.startswith('J'):
#         return 'a'
#     else:
#         return None
# downsized_df['review_body'] = downsized_df['review_body'].apply(process_stop_filtered_reviews)


# Word Embeddings

In [None]:
# using example link as reference - https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
wv = api.load('word2vec-google-news-300')


In [None]:
print(wv.similarity('cheap', 'inexpensive'))

0.70098954


In [None]:
print(wv.similarity('customer', 'helpful'))

0.08879091


In [None]:
print(wv.similarity('exceed', 'expectation'))

0.2354098


In [None]:
print(wv.similarity('hate', 'dislike'))

0.60132354


In [None]:
print(wv.similarity('product', 'item'))

0.25702554


In [None]:
print(wv.similarity('knife', 'sharp'))

0.2341723


In [None]:
print(wv.most_similar(positive=['woman', 'king'], negative=['man']))

[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


In [None]:
print(wv.most_similar('knife', topn=1))

[('kitchen_knife', 0.8097632527351379)]


In [None]:
wv.most_similar(positive=['daughter', 'man'], negative=['woman'])

[('son', 0.8490633368492126),
 ('nephew', 0.7544960975646973),
 ('father', 0.7490662336349487),
 ('brother', 0.7456980347633362),
 ('grandson', 0.719298243522644),
 ('younger_brother', 0.7111448049545288),
 ('uncle', 0.6908944249153137),
 ('dad', 0.6855338215827942),
 ('sons', 0.6790387630462646),
 ('stepson', 0.6781994700431824)]

In [None]:
class ReviewsCorpus:
    """Re-iterable corpus that yields one tokenised review (a list of str) at a time."""

    def __init__(self, df):
        # Only a reference is stored; tokenisation happens lazily during iteration.
        self.df = df

    def __iter__(self):
        reviews = self.df['review_body']
        yield from map(utils.simple_preprocess, reviews)



In [None]:
# Train our own 300-dimensional Word2Vec model on the cleaned reviews.
corpus = ReviewsCorpus(downsized_df)
wv_custom = gensim.models.Word2Vec(
    sentences=corpus,
    vector_size=300,
    window=11,
    min_count=10,
)


In [None]:
print(wv_custom.wv.similarity('cheap', 'inexpensive'))

0.5449395


In [None]:
print(wv_custom.wv.similarity('customer', 'helpful'))

0.2393541


In [None]:
print(wv_custom.wv.similarity('exceed', 'expectation'))

0.38266268


In [None]:
print(wv_custom.wv.similarity('hate', 'dislike'))

0.4512464


In [None]:
print(wv_custom.wv.similarity('product', 'item'))

0.76116943


In [None]:
print(wv_custom.wv.similarity('knife', 'sharp'))

0.39065558


In [None]:
print(wv_custom.wv.most_similar(positive=['woman', 'king'], negative=['man']))

[('british', 0.4386180639266968), ('noble', 0.4322429895401001), ('david', 0.4279828369617462), ('barnes', 0.42732563614845276), ('mead', 0.41298791766166687), ('kings', 0.4086189270019531), ('architecture', 0.4056771397590637), ('indiana', 0.4024195969104767), ('franklin', 0.3976956605911255), ('doll', 0.39660584926605225)]


In [None]:
print(wv_custom.wv.most_similar('knife', topn=1))

[('blade', 0.7703830003738403)]


In [None]:
print(wv_custom.wv.most_similar(positive=['daughter', 'man'], negative=['woman']))

[('son', 0.7203930020332336), ('niece', 0.6802166104316711), ('grandson', 0.6796866655349731), ('granddaughter', 0.6793904900550842), ('husband', 0.6417809128761292), ('girlfriend', 0.6299337148666382), ('sister', 0.6271196007728577), ('dad', 0.61836177110672), ('daughters', 0.6152809858322144), ('sons', 0.6127235293388367)]


# Conclusion : Custom Word2Vec vs PreTrained Word2Vec
## Note - I tried more than two pairs of words to gain a better understanding
For some word pairs, the Google News model appears to capture semantic similarities better, while for others, the custom Amazon model shows higher similarity scores.
For generic word pairs that can appear in any context, not just in Amazon reviews, such as (cheap, inexpensive) or (hate, dislike), the Google News model gives noticeably higher similarity scores.
For word pairs that are more likely to appear together in a review, such as ('knife', 'sharp') or ('product', 'item'), our custom model gives noticeably higher similarity scores than the pretrained model.

# Simple Model

In [None]:
# take positive and neg sentiments
# Binary task: keep only sentiment classes 0 and 1; rows with sentiment 2 are left out.
simple_df = downsized_df[downsized_df['sentiment'].isin([0, 1])]

In [None]:
def create_X_avg(df, word2vec_model):
    """Build one averaged word-embedding feature vector per review.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review_body' column of strings.
    word2vec_model
        Either an object that supports ``word in model`` and ``model[word]``
        directly, or an object exposing such a lookup through a ``.wv``
        attribute.

    Returns
    -------
    numpy.ndarray
        Array with one row per review. Each row is the mean of the vectors of
        the review's in-vocabulary words, or all zeros when no word of the
        review is in the vocabulary.
    """
    # Accept both the bare vector lookup and a model that wraps it in `.wv`.
    keyed_vectors = getattr(word2vec_model, 'wv', word2vec_model)
    # Fall back to 300 when the lookup does not advertise its dimensionality.
    vector_size = getattr(keyed_vectors, 'vector_size', 300)

    X_avg = []
    # Iterate the column directly; a per-row df.iloc[i] lookup builds a new
    # Series for every row and is far slower.
    for review in df['review_body']:
        tokens = review.replace(',', '').replace('.', '').split()
        word_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

        if word_vectors:
            X_avg.append(np.mean(word_vectors, axis=0))
        else:
            X_avg.append(np.zeros(vector_size, dtype=float))

    # reshape keeps the result 2-D even when `df` has no rows.
    return np.array(X_avg).reshape(-1, vector_size)


In [None]:
X_avg_pretrained = create_X_avg(simple_df, wv)

In [None]:
X_avg_pretrained.shape

(200000, 300)

In [None]:
def create_X_avg_custom(df, word2vec_model):
    """Averaged word-embedding features for a model that keeps its vectors in ``.wv``.

    The averaging is the same as in ``create_X_avg``; the only difference is
    that the word vectors are read from ``word2vec_model.wv``, so this simply
    delegates instead of repeating the loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review_body' column of strings.
    word2vec_model
        Model whose ``.wv`` attribute supports ``word in wv`` and ``wv[word]``.

    Returns
    -------
    numpy.ndarray
        One averaged vector per review (zeros when no word is in the vocabulary).
    """
    return create_X_avg(df, word2vec_model.wv)

In [None]:
X_avg_custom = create_X_avg_custom(simple_df, wv_custom)

In [None]:
# Split two different train test - one for pretrained google word2vec and one for our custom word2vec
# Both calls use the same labels, test_size and random_state; only the feature
# matrix differs (pretrained vs custom embeddings). 20% of the rows are held out.
X_train_pretrained, X_test_pretrained, Y_train_pretrained, Y_test_pretrained = train_test_split(X_avg_pretrained, simple_df['sentiment'], test_size=0.2, random_state=48)
X_train_custom, X_test_custom, Y_train_custom, Y_test_custom = train_test_split(X_avg_custom, simple_df['sentiment'], test_size=0.2, random_state=48)


In [None]:
# Elastic-net regularised perceptron on the averaged pretrained embeddings.
perceptron_model_pretrained = Perceptron(
    penalty='elasticnet',
    alpha=1e-6,
    random_state=200,
)
perceptron_model_pretrained.fit(X_train_pretrained, Y_train_pretrained)

# Score the held-out split.
Y_test_pred_pretrained = perceptron_model_pretrained.predict(X_test_pretrained)
perceptron_pretrained_accuracy = accuracy_score(Y_test_pretrained, Y_test_pred_pretrained)

print('Perceptron Results on pretrained model: ')
print("\nAccuracy on Testing data:", round(perceptron_pretrained_accuracy, 2))

Perceptron Results on pretrained model: 

Accuracy on Testing data: 0.72


In [None]:
# Elastic-net regularised perceptron on the averaged custom embeddings.
perceptron_model_custom = Perceptron(
    penalty='elasticnet',
    alpha=1e-6,
    random_state=200,
)
perceptron_model_custom.fit(X_train_custom, Y_train_custom)

# Score the held-out split.
Y_test_pred_custom = perceptron_model_custom.predict(X_test_custom)
perceptron_custom_accuracy = accuracy_score(Y_test_custom, Y_test_pred_custom)

print('Perceptron Results on custom model: ')
print("\nAccuracy on Testing data:", round(perceptron_custom_accuracy, 2))

Perceptron Results on custom model: 

Accuracy on Testing data: 0.8


In [None]:
# L1-penalised linear SVM on the averaged pretrained embeddings.
svc_pretrained = LinearSVC(
    penalty='l1',
    dual='auto',
    C=0.2,
)
svc_pretrained = svc_pretrained.fit(X_train_pretrained, Y_train_pretrained)

# Score the held-out split.
Y_test_pred_svc_pretrained = svc_pretrained.predict(X_test_pretrained)
svc_pretrained_accuracy = accuracy_score(Y_test_pretrained, Y_test_pred_svc_pretrained)

print('SVM Results pretrained: ')
print("\nAccuracy on Testing data:", round(svc_pretrained_accuracy, 2))

SVM Results pretrained: 

Accuracy on Testing data: 0.83




In [None]:
# L1-penalised linear SVM on the averaged custom embeddings.
# Fit the *new* estimator: the previous code called svc_pretrained.fit(...) here,
# which silently re-trained (and aliased) the pretrained-embedding model.
svc_custom = LinearSVC(penalty='l1',dual='auto', C=0.2)
svc_custom = svc_custom.fit(X_train_custom, Y_train_custom)

Y_test_pred_svc_custom = svc_custom.predict(X_test_custom)

print('SVM Results custom: ')


# Print accuracy
print("\nAccuracy on Testing data:", round(accuracy_score(Y_test_custom, Y_test_pred_svc_custom), 2))



SVM Results custom: 

Accuracy on Testing data: 0.86


# Perceptron
Accuracy when using TF-IDF (from HW1) - 0.87 \
Accuracy when using Google pretrained Word2Vec - ~0.72 \
Accuracy when using custom Word2Vec - ~0.80

# SVM
Accuracy when using TF-IDF (from HW1) - 0.91 \
Accuracy when using Google pretrained Word2Vec - 0.83 \
Accuracy when using custom Word2Vec - 0.86

# Conclusion
Our custom Word2Vec outperforms Google's pretrained model for both the perceptron and the SVM. This is expected: even though Google's training set is much larger, it is more general and does not consist only of reviews, whereas our dataset is purely reviews, is much more specialised, and can therefore capture the word semantics of reviews much better. \
TF-IDF performs much better than Word2Vec (both custom and pretrained) for both models. This is because a lot of words in reviews tend to be similar or even the same, and TF-IDF captures this particular information (how important a word is in a document) really well compared to Word2Vec.

# FFNN using Google Word2Vec Model (Binary)

In [None]:
# Following mnist tutorial. Splitting the raw data first
# Split the raw review text (not precomputed features) 80/20; the Dataset classes
# below turn each review into an averaged embedding on access.
X_train_raw_binary, X_test_raw_binary, Y_train_raw_binary, Y_test_raw_binary = train_test_split(simple_df['review_body'], simple_df['sentiment'], test_size=0.2, random_state=48)


In [None]:
class TrainReview(Dataset):
    """Dataset that maps each review to the mean of its word embeddings.

    Parameters
    ----------
    reviews : pandas.Series
        Review strings; accessed positionally through ``.iloc``.
    sentiment : pandas.Series
        Labels, positionally aligned with ``reviews``.
    word2vec_model
        For ``type == "google"``: an object supporting ``word in model`` and
        ``model[word]``. For ``type == "custom"``: an object exposing such a
        lookup through its ``.wv`` attribute.
    type : str
        Either ``"google"`` or ``"custom"``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values. (Previously an unknown
        type silently produced all-zero features for every review.)
    """

    _SUPPORTED_TYPES = ("google", "custom")

    def __init__(self, reviews, sentiment, word2vec_model, type):
        if type not in self._SUPPORTED_TYPES:
            raise ValueError(
                "type must be one of {}, got {!r}".format(self._SUPPORTED_TYPES, type)
            )
        self.reviews = reviews
        self.sentiment = sentiment
        self.word2vec_model = word2vec_model
        self.type = type
        # Resolve the vector lookup once instead of branching on every item.
        self._vectors = word2vec_model if type == "google" else word2vec_model.wv
        # Fall back to 300 when the lookup does not advertise its dimensionality.
        self._vector_size = getattr(self._vectors, 'vector_size', 300)

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        """Return ``(feature_tensor, label)`` for the review at position ``index``."""
        tokens = self.reviews.iloc[index].replace(',', '').replace('.', '').split()
        word_vectors = [self._vectors[token] for token in tokens if token in self._vectors]

        # Always produce float32 so the feature dtype does not depend on
        # whether a review has any in-vocabulary word.
        if word_vectors:
            curr_vect = np.asarray(np.mean(word_vectors, axis=0), dtype=np.float32)
        else:
            curr_vect = np.zeros(self._vector_size, dtype=np.float32)

        # Convert to pytorch tensor
        curr_vect = torch.from_numpy(curr_vect)
        sentiment = self.sentiment.iloc[index]

        return curr_vect, sentiment



In [None]:
class TestReview(Dataset):
    """Dataset for the held-out reviews: each review becomes the mean of its word embeddings.

    Parameters
    ----------
    reviews : pandas.Series
        Review strings; accessed positionally through ``.iloc``.
    sentiment : pandas.Series
        Labels, positionally aligned with ``reviews``.
    word2vec_model
        For ``type == "google"``: an object supporting ``word in model`` and
        ``model[word]``. For ``type == "custom"``: an object exposing such a
        lookup through its ``.wv`` attribute.
    type : str
        Either ``"google"`` or ``"custom"``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values. (Previously an unknown
        type silently produced all-zero features for every review.)
    """

    _SUPPORTED_TYPES = ("google", "custom")

    def __init__(self, reviews, sentiment, word2vec_model, type):
        if type not in self._SUPPORTED_TYPES:
            raise ValueError(
                "type must be one of {}, got {!r}".format(self._SUPPORTED_TYPES, type)
            )
        self.reviews = reviews
        self.sentiment = sentiment
        self.word2vec_model = word2vec_model
        self.type = type
        # Resolve the vector lookup once instead of branching on every item.
        self._vectors = word2vec_model if type == "google" else word2vec_model.wv
        # Fall back to 300 when the lookup does not advertise its dimensionality.
        self._vector_size = getattr(self._vectors, 'vector_size', 300)

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        """Return ``(feature_tensor, label)`` for the review at position ``index``."""
        tokens = self.reviews.iloc[index].replace(',', '').replace('.', '').split()
        word_vectors = [self._vectors[token] for token in tokens if token in self._vectors]

        # Always produce float32 so the feature dtype does not depend on
        # whether a review has any in-vocabulary word.
        if word_vectors:
            curr_vect = np.asarray(np.mean(word_vectors, axis=0), dtype=np.float32)
        else:
            curr_vect = np.zeros(self._vector_size, dtype=np.float32)

        # Convert to pytorch tensor
        curr_vect = torch.from_numpy(curr_vect)
        sentiment = self.sentiment.iloc[index]

        return curr_vect, sentiment


In [None]:
train_data_avg_google_binary = TrainReview(X_train_raw_binary, Y_train_raw_binary, wv, "google")
test_data_avg_google_binary = TestReview(X_test_raw_binary, Y_test_raw_binary, wv, "google")

In [None]:
# Batching / hold-out configuration.
batch_size = 100   # samples per batch
valid_size = 0.2   # fraction of the training set used for validation

# Shuffle the training positions and carve the first `split` of them off for validation.
num_train = len(train_data_avg_google_binary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# Samplers draw (in random order) only from their own index subset.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Training and validation loaders share the training dataset; the test set has its own loader.
train_loader = DataLoader(
    train_data_avg_google_binary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_avg_google_binary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_avg_google_binary, batch_size=batch_size)


In [None]:
class FFNetBinary(nn.Module):
    """Feed-forward classifier: input -> hidden_1 -> hidden_2 -> class logits.

    Parameters
    ----------
    input_dim : int, default 300
        Size of each input feature vector.
    hidden_1 : int, default 50
        Width of the first hidden layer.
    hidden_2 : int, default 10
        Width of the second hidden layer.
    num_classes : int, default 2
        Number of output logits.
    dropout : float, default 0.2
        Dropout probability applied after each hidden activation.

    The defaults reproduce the original fixed 300-50-10-2 architecture, so
    ``FFNetBinary()`` and previously saved state dicts keep working.
    """

    def __init__(self, input_dim=300, hidden_1=50, hidden_2=10, num_classes=2, dropout=0.2):
        super(FFNetBinary, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_1)

        self.fc2 = nn.Linear(hidden_1, hidden_2)

        self.fc3 = nn.Linear(hidden_2, num_classes)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch of feature vectors."""
        # Inputs may arrive as float64 (numpy default); the layers hold float32 weights.
        x = x.to(torch.float32)

        # First hidden layer with activation and dropout
        x = self.dropout(F.relu(self.fc1(x)))

        # Second hidden layer with activation and dropout
        x = self.dropout(F.relu(self.fc2(x)))

        # Output layer: logits only, no softmax is applied here
        return self.fc3(x)



In [None]:
FFNetGoogleBinaryModel = FFNetBinary()
print(FFNetGoogleBinaryModel)

FFNetBinary(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(FFNetGoogleBinaryModel.parameters(), lr=0.007)


In [None]:
n_epochs = 50

# initialize tracker for minimum validation loss
valid_loss_min = np.inf  # set initial "min" to infinity (np.Inf was removed in NumPy 2.0)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetGoogleBinaryModel.train()  # prep model for training
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetGoogleBinaryModel(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        train_loss += loss.item() * data.size(0)

    FFNetGoogleBinaryModel.eval()  # prep model for evaluation
    # No gradients are needed for validation; skipping them saves memory and time.
    with torch.no_grad():
        for data, target in valid_loader:
            data = data.float()
            output = FFNetGoogleBinaryModel(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average over the number of samples actually drawn. len(loader) * batch_size
    # over-counts whenever the last batch is smaller than batch_size.
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # Checkpoint whenever the validation loss improves.
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetGoogleBinaryModel.state_dict(), 'model.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.425156 	Validation Loss: 0.378977
Validation loss decreased (inf --> 0.378977).  Saving model ...
Epoch: 2 	Training Loss: 0.392802 	Validation Loss: 0.377574
Validation loss decreased (0.378977 --> 0.377574).  Saving model ...
Epoch: 3 	Training Loss: 0.385032 	Validation Loss: 0.360155
Validation loss decreased (0.377574 --> 0.360155).  Saving model ...
Epoch: 4 	Training Loss: 0.377194 	Validation Loss: 0.359571
Validation loss decreased (0.360155 --> 0.359571).  Saving model ...
Epoch: 5 	Training Loss: 0.371030 	Validation Loss: 0.353485
Validation loss decreased (0.359571 --> 0.353485).  Saving model ...
Epoch: 6 	Training Loss: 0.368516 	Validation Loss: 0.358350
Epoch: 7 	Training Loss: 0.367582 	Validation Loss: 0.354510
Epoch: 8 	Training Loss: 0.363359 	Validation Loss: 0.366122
Epoch: 9 	Training Loss: 0.360222 	Validation Loss: 0.363110
Epoch: 10 	Training Loss: 0.357364 	Validation Loss: 0.345815
Validation loss decreased (0.353485 --> 0.345815)

In [None]:
FFNetGoogleBinaryModel.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
test_loader = torch.utils.data.DataLoader(test_data_avg_google_binary, batch_size=1)


In [None]:
def predict(model, dataloader):
    """Compute classification accuracy of `model` over every sample in `dataloader`.

    Works for any batch size. (The previous version compared only the first
    element of each batch, so it was correct only with ``batch_size=1``.)
    Inputs are moved to the device the model's parameters live on, so model
    and data can never end up on different devices.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier returning one row of class scores per input row.
    dataloader : iterable
        Yields ``(inputs, targets)`` batches.

    Returns
    -------
    float
        Fraction of samples whose highest-scoring class equals the target.

    Raises
    ------
    ValueError
        If the dataloader yields no samples.
    """
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Evaluate with dropout disabled, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs = inputs.float().to(model_device)
                targets = torch.as_tensor(targets).to(model_device)
                predicted = model(inputs).argmax(dim=1)
                correct += int((predicted == targets).sum())
                total += targets.numel()
    finally:
        if was_training:
            model.train()

    if total == 0:
        raise ValueError("dataloader yielded no samples; accuracy is undefined")

    return float(correct) / total

In [None]:
print('Accuracy of FNN using average Google Word2Vec vectors (Binary) :',str(predict(FFNetGoogleBinaryModel, test_loader)))


Accuracy of FNN using average Google Word2Vec vectors (Binary) : 0.8548


# FFNN Using Custom Word2Vec Model (Binary)

In [None]:
train_data_avg_custom_binary = TrainReview(X_train_raw_binary, Y_train_raw_binary, wv_custom, "custom")
test_data_avg_custom_binary = TestReview(X_test_raw_binary, Y_test_raw_binary, wv_custom, "custom")

In [None]:
# Batching / hold-out configuration.
batch_size = 100   # samples per batch
valid_size = 0.2   # fraction of the training set used for validation

# Shuffle the training positions and carve the first `split` of them off for validation.
num_train = len(train_data_avg_custom_binary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# Samplers draw (in random order) only from their own index subset.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Training and validation loaders share the training dataset; the test set has its own loader.
train_loader = DataLoader(
    train_data_avg_custom_binary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_avg_custom_binary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_avg_custom_binary, batch_size=batch_size)


In [None]:
FFNetCustomBinary = FFNetBinary()
print(FFNetCustomBinary)

FFNetBinary(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetCustomBinary.parameters(), lr=0.005)


In [None]:
n_epochs = 50

# initialize tracker for minimum validation loss
valid_loss_min = np.inf  # set initial "min" to infinity (np.Inf was removed in NumPy 2.0)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetCustomBinary.train()  # prep model for training
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetCustomBinary(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        train_loss += loss.item() * data.size(0)

    FFNetCustomBinary.eval()  # prep model for evaluation
    # No gradients are needed for validation; skipping them saves memory and time.
    with torch.no_grad():
        for data, target in valid_loader:
            data = data.float()
            output = FFNetCustomBinary(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average over the number of samples actually drawn. len(loader) * batch_size
    # over-counts whenever the last batch is smaller than batch_size.
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # Checkpoint whenever the validation loss improves.
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetCustomBinary.state_dict(), 'FFNetCustomBinary.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.662923 	Validation Loss: 0.609698
Validation loss decreased (inf --> 0.609698).  Saving model ...
Epoch: 2 	Training Loss: 0.561073 	Validation Loss: 0.485508
Validation loss decreased (0.609698 --> 0.485508).  Saving model ...
Epoch: 3 	Training Loss: 0.474919 	Validation Loss: 0.417222
Validation loss decreased (0.485508 --> 0.417222).  Saving model ...
Epoch: 4 	Training Loss: 0.429091 	Validation Loss: 0.384521
Validation loss decreased (0.417222 --> 0.384521).  Saving model ...
Epoch: 5 	Training Loss: 0.404059 	Validation Loss: 0.366905
Validation loss decreased (0.384521 --> 0.366905).  Saving model ...
Epoch: 6 	Training Loss: 0.388166 	Validation Loss: 0.355093
Validation loss decreased (0.366905 --> 0.355093).  Saving model ...
Epoch: 7 	Training Loss: 0.377824 	Validation Loss: 0.347460
Validation loss decreased (0.355093 --> 0.347460).  Saving model ...
Epoch: 8 	Training Loss: 0.370564 	Validation Loss: 0.341855
Validation loss decreased (0.34746

In [None]:
FFNetCustomBinary.load_state_dict(torch.load('FFNetCustomBinary.pt'))

<All keys matched successfully>

In [None]:
test_loader = torch.utils.data.DataLoader(test_data_avg_custom_binary, batch_size=1)


In [None]:
print('Accuracy of FNN using average custom Word2Vec vectors (Binary) :',str(predict(FFNetCustomBinary, test_loader)))


Accuracy of FNN using average custom Word2Vec vectors (Binary) : 0.8758


# FFNN using Google Word2Vec Model (Ternary):

In [None]:
# Ternary task: split the full downsized frame (all three sentiment classes) 80/20.
X_train_raw_ternary, X_test_raw_ternary, Y_train_raw_ternary, Y_test_raw_ternary = train_test_split(downsized_df['review_body'], downsized_df['sentiment'], test_size=0.2, random_state=48)


In [None]:
train_data_avg_google_ternary = TrainReview(X_train_raw_ternary, Y_train_raw_ternary, wv, "google")
test_data_avg_google_ternary = TestReview(X_test_raw_ternary, Y_test_raw_ternary, wv, "google")

In [None]:
# Batching / hold-out configuration.
batch_size = 100   # samples per batch
valid_size = 0.2   # fraction of the training set used for validation

# Shuffle the training positions and carve the first `split` of them off for validation.
num_train = len(train_data_avg_google_ternary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# Samplers draw (in random order) only from their own index subset.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Training and validation loaders share the training dataset; the test set has its own loader.
train_loader = DataLoader(
    train_data_avg_google_ternary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_avg_google_ternary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_avg_google_ternary, batch_size=batch_size)


In [None]:
class FFNetTernary(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers with dropout, then a linear output.

    With the default arguments the network is 300 -> 50 -> 10 -> 3, exactly as
    before. The sizes are keyword arguments so the same class can be reused
    for other input widths or label counts.

    Args:
        input_dim: Number of features per sample.
        hidden_1: Width of the first hidden layer.
        hidden_2: Width of the second hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=300, hidden_1=50, hidden_2=10, num_classes=3, dropout=0.2):
        super().__init__()
        # The attribute names fc1/fc2/fc3 become the state_dict keys stored in
        # the saved checkpoints, so they must not be renamed.
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, num_classes)
        # dropout regularizes the hidden activations while training
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw class logits (no softmax) for a batch of feature vectors."""
        # Only the dtype is changed here; the input is not reshaped.
        x = x.to(torch.float32)

        # hidden layer 1: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc1(x)))
        # hidden layer 2: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc2(x)))

        return self.fc3(x)



In [None]:
# Fresh, untrained ternary network; printing it lists the layer shapes.
FFNetGoogleTernary = FFNetTernary()
print(FFNetGoogleTernary)

FFNetTernary(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Cross-entropy over the raw logits; Adam updates this model's parameters with lr=0.0007.
# NOTE(review): the other models in this part of the notebook use SGD - confirm Adam is intended here.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(FFNetGoogleTernary.parameters(), lr=0.0007)


In [None]:
n_epochs = 50

# Lowest validation loss seen so far; +inf guarantees the first epoch checkpoints.
# (np.inf rather than the np.Inf alias, which NumPy 2.0 removed.)
valid_loss_min = np.inf

# Samples each loader yields per epoch (the length of its index sampler).
# Dividing by these gives a true per-sample mean even if the last batch is short.
n_train_samples = len(train_loader.sampler)
n_valid_samples = len(valid_loader.sampler)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetGoogleTernary.train()  # training mode: dropout active
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetGoogleTernary(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # scale the batch-level loss by the batch size to accumulate a running sum
        train_loss += loss.item() * data.size(0)

    FFNetGoogleTernary.eval()  # evaluation mode: dropout disabled
    with torch.no_grad():  # validation needs no gradients
        for data, target in valid_loader:
            data = data.float()
            output = FFNetGoogleTernary(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / n_train_samples
    valid_loss = valid_loss / n_valid_samples

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss matches or beats the best so far
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetGoogleTernary.state_dict(), 'FFNetGoogleTernary.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.850719 	Validation Loss: 0.777041
Validation loss decreased (inf --> 0.777041).  Saving model ...
Epoch: 2 	Training Loss: 0.788402 	Validation Loss: 0.760509
Validation loss decreased (0.777041 --> 0.760509).  Saving model ...
Epoch: 3 	Training Loss: 0.773484 	Validation Loss: 0.750814
Validation loss decreased (0.760509 --> 0.750814).  Saving model ...
Epoch: 4 	Training Loss: 0.766543 	Validation Loss: 0.747825
Validation loss decreased (0.750814 --> 0.747825).  Saving model ...
Epoch: 5 	Training Loss: 0.759825 	Validation Loss: 0.740753
Validation loss decreased (0.747825 --> 0.740753).  Saving model ...
Epoch: 6 	Training Loss: 0.753545 	Validation Loss: 0.747729
Epoch: 7 	Training Loss: 0.750422 	Validation Loss: 0.731484
Validation loss decreased (0.740753 --> 0.731484).  Saving model ...
Epoch: 8 	Training Loss: 0.745416 	Validation Loss: 0.728270
Validation loss decreased (0.731484 --> 0.728270).  Saving model ...
Epoch: 9 	Training Loss: 0.743404 

In [None]:
# Restore the best checkpoint written by the training loop. weights_only=True
# limits unpickling to tensors and plain containers, which is all a state_dict holds.
FFNetGoogleTernary.load_state_dict(torch.load('FFNetGoogleTernary.pt', weights_only=True))

<All keys matched successfully>

In [None]:
# Rebuild the test loader with one review per batch for the accuracy evaluation below.
# NOTE(review): predict() is defined outside this chunk - confirm it expects batch_size=1.
test_loader = torch.utils.data.DataLoader(test_data_avg_google_ternary, batch_size=1)


In [None]:
# print() stringifies each argument itself, so the score is passed directly.
print(
    'Accuracy of FNN using average Google Word2Vec vectors (Ternary) :',
    predict(FFNetGoogleTernary, test_loader),
)


Accuracy of FNN using average Google Word2Vec vectors (Ternary) : 0.69978


# FFNN using Custom Word2Vec Model (Ternary)

In [None]:
# Wrap the ternary train/test splits as datasets in "custom" mode.
# NOTE(review): TrainReview/TestReview and `wv_custom` are defined outside this chunk;
# presumably `wv_custom` is the Word2Vec model trained on the reviews - confirm.
train_data_avg_custom_ternary = TrainReview(X_train_raw_ternary, Y_train_raw_ternary, wv_custom, "custom")
test_data_avg_custom_ternary = TestReview(X_test_raw_ternary, Y_test_raw_ternary, wv_custom, "custom")

In [None]:
# how many samples per batch to load
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2
# fixed seed for the train/validation partition: re-running this cell now
# reproduces the same split instead of reshuffling from the global NumPy RNG
split_seed = 48

# shuffle the training indices once, then carve off the first `split` for validation
num_train = len(train_data_avg_custom_ternary)
indices = np.random.default_rng(split_seed).permutation(num_train).tolist()
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# each sampler draws only from its own (disjoint) index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders; train and validation share one dataset but not indices
train_loader = torch.utils.data.DataLoader(train_data_avg_custom_ternary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_avg_custom_ternary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_avg_custom_ternary, batch_size=batch_size)


In [None]:
# Fresh, untrained ternary network; printing it lists the layer shapes.
FFNetCustomTernary = FFNetTernary()
print(FFNetCustomTernary)

FFNetTernary(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Cross-entropy over the raw logits; plain SGD updates this model's parameters with lr=0.005.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetCustomTernary.parameters(), lr=0.005)


In [None]:
n_epochs = 50

# Lowest validation loss seen so far; +inf guarantees the first epoch checkpoints.
# (np.inf rather than the np.Inf alias, which NumPy 2.0 removed.)
valid_loss_min = np.inf

# Samples each loader yields per epoch (the length of its index sampler).
# Dividing by these gives a true per-sample mean even if the last batch is short.
n_train_samples = len(train_loader.sampler)
n_valid_samples = len(valid_loader.sampler)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetCustomTernary.train()  # training mode: dropout active
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetCustomTernary(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # scale the batch-level loss by the batch size to accumulate a running sum
        train_loss += loss.item() * data.size(0)

    FFNetCustomTernary.eval()  # evaluation mode: dropout disabled
    with torch.no_grad():  # validation needs no gradients
        for data, target in valid_loader:
            data = data.float()
            output = FFNetCustomTernary(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / n_train_samples
    valid_loss = valid_loss / n_valid_samples

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss matches or beats the best so far
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetCustomTernary.state_dict(), 'FFNetCustomTernary.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 1.027750 	Validation Loss: 0.971297
Validation loss decreased (inf --> 0.971297).  Saving model ...
Epoch: 2 	Training Loss: 0.925748 	Validation Loss: 0.857305
Validation loss decreased (0.971297 --> 0.857305).  Saving model ...
Epoch: 3 	Training Loss: 0.853395 	Validation Loss: 0.802508
Validation loss decreased (0.857305 --> 0.802508).  Saving model ...
Epoch: 4 	Training Loss: 0.816546 	Validation Loss: 0.771337
Validation loss decreased (0.802508 --> 0.771337).  Saving model ...
Epoch: 5 	Training Loss: 0.794933 	Validation Loss: 0.754074
Validation loss decreased (0.771337 --> 0.754074).  Saving model ...
Epoch: 6 	Training Loss: 0.781182 	Validation Loss: 0.742689
Validation loss decreased (0.754074 --> 0.742689).  Saving model ...
Epoch: 7 	Training Loss: 0.769515 	Validation Loss: 0.734337
Validation loss decreased (0.742689 --> 0.734337).  Saving model ...
Epoch: 8 	Training Loss: 0.763547 	Validation Loss: 0.727912
Validation loss decreased (0.73433

In [None]:
# Restore the best checkpoint written by the training loop. weights_only=True
# limits unpickling to tensors and plain containers, which is all a state_dict holds.
FFNetCustomTernary.load_state_dict(torch.load('FFNetCustomTernary.pt', weights_only=True))

<All keys matched successfully>

In [None]:
# Rebuild the test loader with one review per batch for the accuracy evaluation below.
# NOTE(review): predict() is defined outside this chunk - confirm it expects batch_size=1.
test_loader = torch.utils.data.DataLoader(test_data_avg_custom_ternary, batch_size=1)


In [None]:
# print() stringifies each argument itself, so the score is passed directly.
print(
    'Accuracy of FNN using average custom Word2Vec vectors (Ternary) :',
    predict(FFNetCustomTernary, test_loader),
)


Accuracy of FNN using average custom Word2Vec vectors (Ternary) : 0.71816


# FFNN Concat Google Vectors - (Binary)

In [None]:
# 80/20 train/test split of the reviews in simple_df; random_state pins the partition.
(
    X_train_raw_binary,
    X_test_raw_binary,
    Y_train_raw_binary,
    Y_test_raw_binary,
) = train_test_split(
    simple_df['review_body'],
    simple_df['sentiment'],
    test_size=0.2,
    random_state=48,
)


In [None]:
class TrainReviewConcatenation(Dataset):
    """Dataset of reviews encoded as the concatenated vectors of their first known words.

    Each item is ``(features, label)`` where ``features`` is a flat float32
    tensor of length ``max_length * vector_size`` (3000 with the defaults):
    the vectors of the first ``max_length`` in-vocabulary words, in order,
    zero-padded when the review has fewer such words.

    Args:
        reviews: pandas Series of review strings (read positionally via ``.iloc``).
        sentiment: pandas Series of labels aligned with ``reviews``.
        word2vec_model: For ``type="google"``, an object supporting
            ``word in model`` and ``model[word]``; for ``type="custom"``, an
            object exposing the same operations through its ``.wv`` attribute.
        type: ``"google"`` or ``"custom"``; selects how vectors are looked up.
        max_length: Number of word vectors concatenated per review.
        vector_size: Dimensionality of a single word vector.

    Raises:
        ValueError: If ``type`` is not ``"google"`` or ``"custom"``.
    """

    def __init__(self, reviews, sentiment, word2vec_model, type, max_length=10, vector_size=300):
        # An unknown mode used to produce all-zero features silently; fail fast instead.
        if type not in ("google", "custom"):
            raise ValueError('type must be "google" or "custom", got {!r}'.format(type))
        self.reviews = reviews
        self.sentiment = sentiment
        self.word2vec_model = word2vec_model
        self.type = type
        self.max_length = max_length
        self.vector_size = vector_size

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the review at positional ``index``."""
        # Tokenize: strip commas and periods, then split on whitespace.
        tokens = self.reviews.iloc[index].replace(',', '').replace('.', '').split()
        vectors = self.word2vec_model if self.type == "google" else self.word2vec_model.wv

        # Pre-allocated float32 buffer: unfilled rows are the zero padding, and
        # every item gets the same dtype whether or not it needed padding.
        features = np.zeros((self.max_length, self.vector_size), dtype=np.float32)
        filled = 0
        for word in tokens:
            if filled == self.max_length:
                break
            if word in vectors:
                features[filled] = vectors[word]
                filled += 1

        # Convert to pytorch tensor (rows laid end to end)
        curr_vect = torch.from_numpy(features.reshape(-1))
        sentiment = self.sentiment.iloc[index]

        return curr_vect, sentiment



In [None]:
class TestReviewConcatenation(Dataset):
    """Test-split dataset of reviews encoded as concatenated word vectors.

    Each item is ``(features, label)`` where ``features`` is a flat float32
    tensor of length ``max_length * vector_size`` (3000 with the defaults):
    the vectors of the first ``max_length`` in-vocabulary words, in order,
    zero-padded when the review has fewer such words.

    NOTE(review): the encoding is the same as TrainReviewConcatenation's;
    consider reusing a single class for both splits.

    Args:
        reviews: pandas Series of review strings (read positionally via ``.iloc``).
        sentiment: pandas Series of labels aligned with ``reviews``.
        word2vec_model: For ``type="google"``, an object supporting
            ``word in model`` and ``model[word]``; for ``type="custom"``, an
            object exposing the same operations through its ``.wv`` attribute.
        type: ``"google"`` or ``"custom"``; selects how vectors are looked up.
        max_length: Number of word vectors concatenated per review.
        vector_size: Dimensionality of a single word vector.

    Raises:
        ValueError: If ``type`` is not ``"google"`` or ``"custom"``.
    """

    def __init__(self, reviews, sentiment, word2vec_model, type, max_length=10, vector_size=300):
        # An unknown mode used to produce all-zero features silently; fail fast instead.
        if type not in ("google", "custom"):
            raise ValueError('type must be "google" or "custom", got {!r}'.format(type))
        self.reviews = reviews
        self.sentiment = sentiment
        self.word2vec_model = word2vec_model
        self.type = type
        self.max_length = max_length
        self.vector_size = vector_size

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the review at positional ``index``."""
        # Tokenize: strip commas and periods, then split on whitespace.
        tokens = self.reviews.iloc[index].replace(',', '').replace('.', '').split()
        vectors = self.word2vec_model if self.type == "google" else self.word2vec_model.wv

        # Pre-allocated float32 buffer: unfilled rows are the zero padding, and
        # every item gets the same dtype whether or not it needed padding.
        features = np.zeros((self.max_length, self.vector_size), dtype=np.float32)
        filled = 0
        for word in tokens:
            if filled == self.max_length:
                break
            if word in vectors:
                features[filled] = vectors[word]
                filled += 1

        # Convert to pytorch tensor (rows laid end to end)
        curr_vect = torch.from_numpy(features.reshape(-1))
        sentiment = self.sentiment.iloc[index]

        return curr_vect, sentiment


In [None]:
# Wrap the binary train/test splits as concatenated-vector datasets in "google" mode.
# NOTE(review): `wv` is defined outside this chunk; presumably the pretrained Google vectors - confirm.
train_data_concat_google_binary = TrainReviewConcatenation(X_train_raw_binary, Y_train_raw_binary, wv, "google")
test_data_concat_google_binary = TestReviewConcatenation(X_test_raw_binary, Y_test_raw_binary, wv, "google")

In [None]:
# Spot-check a single training item at a hard-coded position in the training split.
sample_index = 12454
sample, label = train_data_concat_google_binary[sample_index]

# Print the shape of the sample and its label
print("Sample shape:", sample.shape)
print("Label:", label)
print("Sample content:", sample)

# Disabled check: iterate over the whole dataset and collect the distinct labels.
# unique_labels = set()
# for _, label in train_data_concat_google_binary:
#     unique_labels.add(label)

# # Print unique labels
# print("Unique Labels:", unique_labels)

Sample shape: torch.Size([3000])
Label: 0
Sample content: tensor([0.0801, 0.1050, 0.0498,  ..., 0.0325, 0.2793, 0.2451])


In [None]:
# how many samples per batch to load
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2
# fixed seed for the train/validation partition: re-running this cell now
# reproduces the same split instead of reshuffling from the global NumPy RNG
split_seed = 48

# shuffle the training indices once, then carve off the first `split` for validation
num_train = len(train_data_concat_google_binary)
indices = np.random.default_rng(split_seed).permutation(num_train).tolist()
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# each sampler draws only from its own (disjoint) index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders; train and validation share one dataset but not indices
train_loader = torch.utils.data.DataLoader(train_data_concat_google_binary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_concat_google_binary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_concat_google_binary, batch_size=batch_size)


In [None]:
class FFNetConcatBinary(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers with dropout, then a linear output.

    With the default arguments the network is 3000 -> 50 -> 10 -> 2, exactly as
    before. The sizes are keyword arguments so the same class can be reused
    for other input widths or label counts.

    Args:
        input_dim: Number of features per sample.
        hidden_1: Width of the first hidden layer.
        hidden_2: Width of the second hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=3000, hidden_1=50, hidden_2=10, num_classes=2, dropout=0.2):
        super().__init__()
        # The attribute names fc1/fc2/fc3 become the state_dict keys stored in
        # the saved checkpoints, so they must not be renamed.
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, num_classes)
        # dropout regularizes the hidden activations while training
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw class logits (no softmax) for a batch of feature vectors."""
        # Only the dtype is changed here; the input is not reshaped.
        x = x.to(torch.float32)

        # hidden layer 1: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc1(x)))
        # hidden layer 2: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc2(x)))

        # output layer: one logit per class
        return self.fc3(x)



In [None]:
# Fresh, untrained binary network for concatenated inputs; printing it lists the layer shapes.
FFNetGoogleBinaryConcatModel = FFNetConcatBinary()
print(FFNetGoogleBinaryConcatModel)

FFNetConcatBinary(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Cross-entropy over the raw logits; plain SGD updates this model's parameters with lr=0.007.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetGoogleBinaryConcatModel.parameters(), lr=0.007)


In [None]:
n_epochs = 50

# Lowest validation loss seen so far; +inf guarantees the first epoch checkpoints.
# (np.inf rather than the np.Inf alias, which NumPy 2.0 removed.)
valid_loss_min = np.inf

# Samples each loader yields per epoch (the length of its index sampler).
# Dividing by these gives a true per-sample mean even if the last batch is short.
n_train_samples = len(train_loader.sampler)
n_valid_samples = len(valid_loader.sampler)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetGoogleBinaryConcatModel.train()  # training mode: dropout active
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetGoogleBinaryConcatModel(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # scale the batch-level loss by the batch size to accumulate a running sum
        train_loss += loss.item() * data.size(0)

    FFNetGoogleBinaryConcatModel.eval()  # evaluation mode: dropout disabled
    with torch.no_grad():  # validation needs no gradients
        for data, target in valid_loader:
            data = data.float()
            output = FFNetGoogleBinaryConcatModel(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / n_train_samples
    valid_loss = valid_loss / n_valid_samples

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss matches or beats the best so far
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetGoogleBinaryConcatModel.state_dict(), 'FFNetGoogleBinaryConcatModel.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.436369 	Validation Loss: 0.469405
Validation loss decreased (inf --> 0.469405).  Saving model ...
Epoch: 2 	Training Loss: 0.435617 	Validation Loss: 0.469731
Epoch: 3 	Training Loss: 0.433587 	Validation Loss: 0.471212
Epoch: 4 	Training Loss: 0.433345 	Validation Loss: 0.469882
Epoch: 5 	Training Loss: 0.432315 	Validation Loss: 0.467921
Validation loss decreased (0.469405 --> 0.467921).  Saving model ...
Epoch: 6 	Training Loss: 0.432433 	Validation Loss: 0.468019
Epoch: 7 	Training Loss: 0.430994 	Validation Loss: 0.467667
Validation loss decreased (0.467921 --> 0.467667).  Saving model ...
Epoch: 8 	Training Loss: 0.431140 	Validation Loss: 0.467769
Epoch: 9 	Training Loss: 0.430080 	Validation Loss: 0.467743
Epoch: 10 	Training Loss: 0.430776 	Validation Loss: 0.468133
Epoch: 11 	Training Loss: 0.430516 	Validation Loss: 0.467035
Validation loss decreased (0.467667 --> 0.467035).  Saving model ...
Epoch: 12 	Training Loss: 0.428352 	Validation Loss: 0.4

In [None]:
# Restore the best checkpoint written by the training loop. weights_only=True
# limits unpickling to tensors and plain containers, which is all a state_dict holds.
FFNetGoogleBinaryConcatModel.load_state_dict(torch.load('FFNetGoogleBinaryConcatModel.pt', weights_only=True))

<All keys matched successfully>

In [None]:
# Rebuild the test loader with one review per batch for the accuracy evaluation below.
# NOTE(review): predict() is defined outside this chunk - confirm it expects batch_size=1.
test_loader = torch.utils.data.DataLoader(test_data_concat_google_binary, batch_size=1)


In [None]:
# print() stringifies each argument itself, so the score is passed directly.
print(
    'Accuracy of FNN using concatened Google Word2Vec vectors (Binary) :',
    predict(FFNetGoogleBinaryConcatModel, test_loader),
)


Accuracy of FNN using concatened Google Word2Vec vectors (Binary) : 0.772575


# FFNN using concatenated Custom Word2Vec vectors (Binary)

In [None]:
# Wrap the binary train/test splits as concatenated-vector datasets in "custom" mode.
# NOTE(review): `wv_custom` is defined outside this chunk; presumably the Word2Vec model
# trained on the reviews - confirm.
train_data_concat_custom_binary = TrainReviewConcatenation(X_train_raw_binary, Y_train_raw_binary, wv_custom, "custom")
test_data_concat_custom_binary = TestReviewConcatenation(X_test_raw_binary, Y_test_raw_binary, wv_custom, "custom")

In [None]:
# how many samples per batch to load
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2
# fixed seed for the train/validation partition: re-running this cell now
# reproduces the same split instead of reshuffling from the global NumPy RNG
split_seed = 48

# shuffle the training indices once, then carve off the first `split` for validation
num_train = len(train_data_concat_custom_binary)
indices = np.random.default_rng(split_seed).permutation(num_train).tolist()
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# each sampler draws only from its own (disjoint) index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders; train and validation share one dataset but not indices
train_loader = torch.utils.data.DataLoader(train_data_concat_custom_binary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_concat_custom_binary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_concat_custom_binary, batch_size=batch_size)


In [None]:
# Fresh, untrained binary network for concatenated inputs; printing it lists the layer shapes.
FFNetCustomBinaryConcatModel = FFNetConcatBinary()
print(FFNetCustomBinaryConcatModel)

FFNetConcatBinary(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Cross-entropy over the raw logits; plain SGD updates this model's parameters with lr=0.0007.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetCustomBinaryConcatModel.parameters(), lr=0.0007)


In [None]:
n_epochs = 50

# Lowest validation loss seen so far; +inf guarantees the first epoch checkpoints.
# (np.inf rather than the np.Inf alias, which NumPy 2.0 removed.)
valid_loss_min = np.inf

# Samples each loader yields per epoch (the length of its index sampler).
# Dividing by these gives a true per-sample mean even if the last batch is short.
n_train_samples = len(train_loader.sampler)
n_valid_samples = len(valid_loader.sampler)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetCustomBinaryConcatModel.train()  # training mode: dropout active
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetCustomBinaryConcatModel(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # scale the batch-level loss by the batch size to accumulate a running sum
        train_loss += loss.item() * data.size(0)

    FFNetCustomBinaryConcatModel.eval()  # evaluation mode: dropout disabled
    with torch.no_grad():  # validation needs no gradients
        for data, target in valid_loader:
            data = data.float()
            output = FFNetCustomBinaryConcatModel(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / n_train_samples
    valid_loss = valid_loss / n_valid_samples

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss matches or beats the best so far
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetCustomBinaryConcatModel.state_dict(), 'FFNetCustomBinaryConcatModel.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.360061 	Validation Loss: 0.465044
Validation loss decreased (inf --> 0.465044).  Saving model ...
Epoch: 2 	Training Loss: 0.358535 	Validation Loss: 0.464150
Validation loss decreased (0.465044 --> 0.464150).  Saving model ...
Epoch: 3 	Training Loss: 0.359751 	Validation Loss: 0.466547
Epoch: 4 	Training Loss: 0.359735 	Validation Loss: 0.465934
Epoch: 5 	Training Loss: 0.360517 	Validation Loss: 0.463393
Validation loss decreased (0.464150 --> 0.463393).  Saving model ...
Epoch: 6 	Training Loss: 0.358409 	Validation Loss: 0.465918
Epoch: 7 	Training Loss: 0.359137 	Validation Loss: 0.466010
Epoch: 8 	Training Loss: 0.358698 	Validation Loss: 0.464017
Epoch: 9 	Training Loss: 0.358866 	Validation Loss: 0.465587
Epoch: 10 	Training Loss: 0.358853 	Validation Loss: 0.462787
Validation loss decreased (0.463393 --> 0.462787).  Saving model ...
Epoch: 11 	Training Loss: 0.359252 	Validation Loss: 0.462414
Validation loss decreased (0.462787 --> 0.462414).  Savi

In [None]:
# Restore the best checkpoint written by the training loop. weights_only=True
# limits unpickling to tensors and plain containers, which is all a state_dict holds.
FFNetCustomBinaryConcatModel.load_state_dict(torch.load('FFNetCustomBinaryConcatModel.pt', weights_only=True))

<All keys matched successfully>

In [None]:
# Rebuild the test loader with one review per batch for the accuracy evaluation below.
# NOTE(review): predict() is defined outside this chunk - confirm it expects batch_size=1.
test_loader = torch.utils.data.DataLoader(test_data_concat_custom_binary, batch_size=1)


In [None]:
# print() stringifies each argument itself, so the score is passed directly.
print(
    'Accuracy of FNN using concatened Custom Word2Vec vectors (Binary) :',
    predict(FFNetCustomBinaryConcatModel, test_loader),
)


Accuracy of FNN using concatened Custom Word2Vec vectors (Binary) : 0.7847


# FFNN using Google Concat vectors (Ternary)

In [None]:
# 80/20 train/test split of the reviews in downsized_df; random_state pins the partition.
(
    X_train_raw_ternary,
    X_test_raw_ternary,
    Y_train_raw_ternary,
    Y_test_raw_ternary,
) = train_test_split(
    downsized_df['review_body'],
    downsized_df['sentiment'],
    test_size=0.2,
    random_state=48,
)


In [None]:
# Wrap the ternary train/test splits as concatenated-vector datasets in "google" mode.
# NOTE(review): `wv` is defined outside this chunk; presumably the pretrained Google vectors - confirm.
train_data_concat_google_ternary = TrainReviewConcatenation(X_train_raw_ternary, Y_train_raw_ternary, wv, "google")
test_data_concat_google_ternary = TestReviewConcatenation(X_test_raw_ternary, Y_test_raw_ternary, wv, "google")

In [None]:
# how many samples per batch to load
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2
# fixed seed for the train/validation partition: re-running this cell now
# reproduces the same split instead of reshuffling from the global NumPy RNG
split_seed = 48

# shuffle the training indices once, then carve off the first `split` for validation
num_train = len(train_data_concat_google_ternary)
indices = np.random.default_rng(split_seed).permutation(num_train).tolist()
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# each sampler draws only from its own (disjoint) index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders; train and validation share one dataset but not indices
train_loader = torch.utils.data.DataLoader(train_data_concat_google_ternary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_concat_google_ternary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_concat_google_ternary, batch_size=batch_size)


In [None]:
class FFNetConcatTernary(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers with dropout, then a linear output.

    With the default arguments the network is 3000 -> 50 -> 10 -> 3, exactly as
    before. The sizes are keyword arguments so the same class can be reused
    for other input widths or label counts.

    Args:
        input_dim: Number of features per sample.
        hidden_1: Width of the first hidden layer.
        hidden_2: Width of the second hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=3000, hidden_1=50, hidden_2=10, num_classes=3, dropout=0.2):
        super().__init__()
        # The attribute names fc1/fc2/fc3 become the state_dict keys stored in
        # the saved checkpoints, so they must not be renamed.
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, num_classes)
        # dropout regularizes the hidden activations while training
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw class logits (no softmax) for a batch of feature vectors."""
        # Only the dtype is changed here; the input is not reshaped.
        x = x.to(torch.float32)

        # hidden layer 1: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc1(x)))
        # hidden layer 2: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc2(x)))

        return self.fc3(x)



In [None]:
# Fresh, untrained ternary network for concatenated inputs; printing it lists the layer shapes.
FFNetGoogleTernaryConcatModel = FFNetConcatTernary()
print(FFNetGoogleTernaryConcatModel)

FFNetConcatTernary(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Cross-entropy over the raw logits; plain SGD updates this model's parameters with lr=0.005.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetGoogleTernaryConcatModel.parameters(), lr=0.005)


In [None]:
n_epochs = 50

# Lowest validation loss seen so far; +inf guarantees the first epoch checkpoints.
# (np.inf rather than the np.Inf alias, which NumPy 2.0 removed.)
valid_loss_min = np.inf

# Samples each loader yields per epoch (the length of its index sampler).
# Dividing by these gives a true per-sample mean even if the last batch is short.
n_train_samples = len(train_loader.sampler)
n_valid_samples = len(valid_loader.sampler)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetGoogleTernaryConcatModel.train()  # training mode: dropout active
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetGoogleTernaryConcatModel(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # scale the batch-level loss by the batch size to accumulate a running sum
        train_loss += loss.item() * data.size(0)

    FFNetGoogleTernaryConcatModel.eval()  # evaluation mode: dropout disabled
    with torch.no_grad():  # validation needs no gradients
        for data, target in valid_loader:
            data = data.float()
            output = FFNetGoogleTernaryConcatModel(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / n_train_samples
    valid_loss = valid_loss / n_valid_samples

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss matches or beats the best so far
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetGoogleTernaryConcatModel.state_dict(), 'FFNetGoogleTernaryConcatModel.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.803996 	Validation Loss: 0.842168
Validation loss decreased (inf --> 0.842168).  Saving model ...
Epoch: 2 	Training Loss: 0.800964 	Validation Loss: 0.841476
Validation loss decreased (0.842168 --> 0.841476).  Saving model ...
Epoch: 3 	Training Loss: 0.800026 	Validation Loss: 0.841482
Epoch: 4 	Training Loss: 0.798978 	Validation Loss: 0.842239
Epoch: 5 	Training Loss: 0.798609 	Validation Loss: 0.840493
Validation loss decreased (0.841476 --> 0.840493).  Saving model ...
Epoch: 6 	Training Loss: 0.797656 	Validation Loss: 0.841151
Epoch: 7 	Training Loss: 0.797754 	Validation Loss: 0.841166
Epoch: 8 	Training Loss: 0.796667 	Validation Loss: 0.840628
Epoch: 9 	Training Loss: 0.796081 	Validation Loss: 0.840361
Validation loss decreased (0.840493 --> 0.840361).  Saving model ...
Epoch: 10 	Training Loss: 0.796908 	Validation Loss: 0.840928
Epoch: 11 	Training Loss: 0.796157 	Validation Loss: 0.840847
Epoch: 12 	Training Loss: 0.796553 	Validation Loss: 0.8

In [None]:
# Restore the best checkpoint written by the training loop. weights_only=True
# limits unpickling to tensors and plain containers, which is all a state_dict holds.
FFNetGoogleTernaryConcatModel.load_state_dict(torch.load('FFNetGoogleTernaryConcatModel.pt', weights_only=True))

<All keys matched successfully>

In [None]:
# Rebuild the test loader with one review per batch for the accuracy evaluation below.
# NOTE(review): predict() is defined outside this chunk - confirm it expects batch_size=1.
test_loader = torch.utils.data.DataLoader(test_data_concat_google_ternary, batch_size=1)

In [None]:
# print() stringifies each argument itself, so the score is passed directly.
print(
    'Accuracy of FNN using concatened Google Word2Vec vectors (Ternary) :',
    predict(FFNetGoogleTernaryConcatModel, test_loader),
)


Accuracy of FNN using concatened Google Word2Vec vectors (Ternary) : 0.62866


# FFNN using Custom Concat vectors (Ternary)

In [None]:
# Wrap the ternary train/test splits as concatenated-vector datasets in "custom" mode.
# NOTE(review): `wv_custom` is defined outside this chunk; presumably the Word2Vec model
# trained on the reviews - confirm.
train_data_concat_custom_ternary = TrainReviewConcatenation(X_train_raw_ternary, Y_train_raw_ternary, wv_custom, "custom")
test_data_concat_custom_ternary = TestReviewConcatenation(X_test_raw_ternary, Y_test_raw_ternary, wv_custom, "custom")

In [None]:
# how many samples per batch to load
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2
# fixed seed for the train/validation partition: re-running this cell now
# reproduces the same split instead of reshuffling from the global NumPy RNG
split_seed = 48

# shuffle the training indices once, then carve off the first `split` for validation
num_train = len(train_data_concat_custom_ternary)
indices = np.random.default_rng(split_seed).permutation(num_train).tolist()
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# each sampler draws only from its own (disjoint) index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders; train and validation share one dataset but not indices
train_loader = torch.utils.data.DataLoader(train_data_concat_custom_ternary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_concat_custom_ternary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_concat_custom_ternary, batch_size=batch_size)


In [None]:
# Fresh, untrained ternary network for concatenated inputs; printing it lists the layer shapes.
FFNetCustomTernaryConcatModel = FFNetConcatTernary()
print(FFNetCustomTernaryConcatModel)

FFNetConcatTernary(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Cross-entropy over the raw logits; plain SGD updates this model's parameters with lr=0.005.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetCustomTernaryConcatModel.parameters(), lr=0.005)


In [None]:
n_epochs = 50

# Lowest validation loss seen so far; +inf guarantees the first epoch checkpoints.
# (np.inf rather than the np.Inf alias, which NumPy 2.0 removed.)
valid_loss_min = np.inf

# Samples each loader yields per epoch (the length of its index sampler).
# Dividing by these gives a true per-sample mean even if the last batch is short.
n_train_samples = len(train_loader.sampler)
n_valid_samples = len(valid_loader.sampler)

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetCustomTernaryConcatModel.train()  # training mode: dropout active
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        output = FFNetCustomTernaryConcatModel(data)
        target = target.long()  # CrossEntropyLoss expects integer class indices
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # scale the batch-level loss by the batch size to accumulate a running sum
        train_loss += loss.item() * data.size(0)

    FFNetCustomTernaryConcatModel.eval()  # evaluation mode: dropout disabled
    with torch.no_grad():  # validation needs no gradients
        for data, target in valid_loader:
            data = data.float()
            output = FFNetCustomTernaryConcatModel(data)
            target = target.long()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / n_train_samples
    valid_loss = valid_loss / n_valid_samples

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss matches or beats the best so far
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetCustomTernaryConcatModel.state_dict(), 'FFNetCustomTernaryConcatModel.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.781682 	Validation Loss: 0.820052
Validation loss decreased (inf --> 0.820052).  Saving model ...
Epoch: 2 	Training Loss: 0.777908 	Validation Loss: 0.820021
Validation loss decreased (0.820052 --> 0.820021).  Saving model ...
Epoch: 3 	Training Loss: 0.776842 	Validation Loss: 0.822191
Epoch: 4 	Training Loss: 0.775477 	Validation Loss: 0.817134
Validation loss decreased (0.820021 --> 0.817134).  Saving model ...
Epoch: 5 	Training Loss: 0.775500 	Validation Loss: 0.819469
Epoch: 6 	Training Loss: 0.776160 	Validation Loss: 0.816090
Validation loss decreased (0.817134 --> 0.816090).  Saving model ...
Epoch: 7 	Training Loss: 0.773364 	Validation Loss: 0.816122
Epoch: 8 	Training Loss: 0.772598 	Validation Loss: 0.816022
Validation loss decreased (0.816090 --> 0.816022).  Saving model ...
Epoch: 9 	Training Loss: 0.772099 	Validation Loss: 0.816231
Epoch: 10 	Training Loss: 0.772981 	Validation Loss: 0.817808
Epoch: 11 	Training Loss: 0.772320 	Validation Lo

In [None]:
# Restore the best checkpoint written by the training loop. weights_only=True
# limits unpickling to tensors and plain containers, which is all a state_dict holds.
FFNetCustomTernaryConcatModel.load_state_dict(torch.load('FFNetCustomTernaryConcatModel.pt', weights_only=True))

<All keys matched successfully>

In [None]:
# Rebuild the test loader with one review per batch for the accuracy evaluation below.
# NOTE(review): predict() is defined outside this chunk - confirm it expects batch_size=1.
test_loader = torch.utils.data.DataLoader(test_data_concat_custom_ternary, batch_size=1)

In [None]:
# print() stringifies each argument itself, so the score is passed directly.
print(
    'Accuracy of FNN using concatened Custom Word2Vec vectors (Ternary) :',
    predict(FFNetCustomTernaryConcatModel, test_loader),
)


Accuracy of FNN using concatened Custom Word2Vec vectors (Ternary) : 0.6348


# Conclusion
We trained eight FFNN models in this section (two embedding sources × two feature constructions × two label sets).
As expected, the binary models performed better than the ternary models.
In each configuration, the model using our own Word2Vec embeddings performed somewhat better than the one using Google's pretrained embeddings. This is expected: Google's vectors were trained on a generic corpus rather than on reviews, whereas our embeddings were trained on review text and are therefore better suited to the review-classification task.

## Comparison of Binary Models with Simple Models
### When taking average vectors
1. The FFNNs trained on Google vectors and on our own vectors performed much better than the Perceptron and SVM trained on the same vectors. This is likely because the hidden layers give the network more capacity, which lets it learn a better classifier over the training epochs.
   
### When taking concatenated vectors
1. The FFNNs trained on Google vectors and on our own vectors performed almost the same as the Perceptron and worse than the SVM trained on the same vectors. This could be because the first 10 words that are concatenated do not necessarily carry all the information needed to determine the sentiment of the review.


# CNN Google Vectors (Binary)

In [None]:
# 80/20 train/test split of the reviews in simple_df; random_state pins the partition.
(
    X_train_raw_binary,
    X_test_raw_binary,
    Y_train_raw_binary,
    Y_test_raw_binary,
) = train_test_split(
    simple_df['review_body'],
    simple_df['sentiment'],
    test_size=0.2,
    random_state=48,
)


In [None]:
class TrainReviewCNN(Dataset):
    """Dataset that maps each raw review to a fixed-size word-vector matrix for a 1-D CNN.

    Each item is a pair ``(features, label)`` where ``features`` is a float64
    tensor of shape ``(vector_size, max_length)``: one column per in-vocabulary
    word (in review order, at most ``max_length`` of them), zero-padded on the
    right when the review has fewer known words.

    Args:
        reviews: pandas Series of review strings (accessed positionally via ``.iloc``).
        sentiment: pandas Series of labels aligned positionally with ``reviews``.
        word2vec_model: embedding lookup. For ``type="google"`` it is queried
            directly (``word in model`` / ``model[word]``); for ``type="custom"``
            the lookup goes through its ``.wv`` attribute.
        type: ``"google"`` or ``"custom"``; selects how ``word2vec_model`` is queried.
        max_length: number of word slots kept per review.
        vector_size: dimensionality of each word vector.

    Raises:
        ValueError: if ``type`` is not ``"google"`` or ``"custom"`` (previously
            such a value silently produced all-zero features).
    """

    def __init__(self, reviews, sentiment, word2vec_model, type, max_length=50, vector_size=300):
        if type not in ("google", "custom"):
            raise ValueError(
                "type must be 'google' or 'custom', got {!r}".format(type)
            )
        self.reviews = reviews
        self.sentiment = sentiment
        self.word2vec_model = word2vec_model
        self.type = type
        self.max_length = max_length
        self.vector_size = vector_size

    def __len__(self):
        return len(self.reviews)

    def _keyed_vectors(self):
        """Return the object that supports ``word in kv`` and ``kv[word]``."""
        if self.type == "custom":
            return self.word2vec_model.wv
        return self.word2vec_model

    def __getitem__(self, index):
        # Strip commas/periods, then tokenize on whitespace.
        tokens = self.reviews.iloc[index].replace(',', '').replace('.', '').split()
        keyed_vectors = self._keyed_vectors()

        # Pre-allocating the padded matrix keeps the dtype float64 for every
        # sample; building it from a list yielded float32 whenever a review
        # filled all slots and float64 otherwise.
        matrix = np.zeros((self.max_length, self.vector_size), dtype=float)
        filled = 0
        for word in tokens:
            if filled == self.max_length:
                break
            # Out-of-vocabulary words are skipped and do not consume a slot.
            if word in keyed_vectors:
                matrix[filled] = keyed_vectors[word]
                filled += 1

        # Conv1d expects (channels, length), i.e. (vector_size, max_length).
        features = torch.from_numpy(np.transpose(matrix))
        label = self.sentiment.iloc[index]

        return features, label


In [None]:
class TestReviewCNN(Dataset):
    """Dataset that maps each raw test review to a fixed-size word-vector matrix for a 1-D CNN.

    Each item is a pair ``(features, label)`` where ``features`` is a float64
    tensor of shape ``(vector_size, max_length)``: one column per in-vocabulary
    word (in review order, at most ``max_length`` of them), zero-padded on the
    right when the review has fewer known words.

    Args:
        reviews: pandas Series of review strings (accessed positionally via ``.iloc``).
        sentiment: pandas Series of labels aligned positionally with ``reviews``.
        word2vec_model: embedding lookup. For ``type="google"`` it is queried
            directly (``word in model`` / ``model[word]``); for ``type="custom"``
            the lookup goes through its ``.wv`` attribute.
        type: ``"google"`` or ``"custom"``; selects how ``word2vec_model`` is queried.
        max_length: number of word slots kept per review.
        vector_size: dimensionality of each word vector.

    Raises:
        ValueError: if ``type`` is not ``"google"`` or ``"custom"`` (previously
            such a value silently produced all-zero features).
    """

    def __init__(self, reviews, sentiment, word2vec_model, type, max_length=50, vector_size=300):
        if type not in ("google", "custom"):
            raise ValueError(
                "type must be 'google' or 'custom', got {!r}".format(type)
            )
        self.reviews = reviews
        self.sentiment = sentiment
        self.word2vec_model = word2vec_model
        self.type = type
        self.max_length = max_length
        self.vector_size = vector_size

    def __len__(self):
        return len(self.reviews)

    def _keyed_vectors(self):
        """Return the object that supports ``word in kv`` and ``kv[word]``."""
        if self.type == "custom":
            return self.word2vec_model.wv
        return self.word2vec_model

    def __getitem__(self, index):
        # Strip commas/periods, then tokenize on whitespace.
        tokens = self.reviews.iloc[index].replace(',', '').replace('.', '').split()
        keyed_vectors = self._keyed_vectors()

        # Pre-allocating the padded matrix keeps the dtype float64 for every
        # sample; building it from a list yielded float32 whenever a review
        # filled all slots and float64 otherwise.
        matrix = np.zeros((self.max_length, self.vector_size), dtype=float)
        filled = 0
        for word in tokens:
            if filled == self.max_length:
                break
            # Out-of-vocabulary words are skipped and do not consume a slot.
            if word in keyed_vectors:
                matrix[filled] = keyed_vectors[word]
                filled += 1

        # Conv1d expects (channels, length), i.e. (vector_size, max_length).
        features = torch.from_numpy(np.transpose(matrix))
        label = self.sentiment.iloc[index]

        return features, label


In [None]:
# Wrap the raw binary split in CNN datasets backed by the Google word vectors
train_data_cnn_google_binary = TrainReviewCNN(
    reviews=X_train_raw_binary,
    sentiment=Y_train_raw_binary,
    word2vec_model=wv,
    type="google",
)
test_data_cnn_google_binary = TestReviewCNN(
    reviews=X_test_raw_binary,
    sentiment=Y_test_raw_binary,
    word2vec_model=wv,
    type="google",
)

In [None]:
# Sanity check: pull one arbitrary training item and look at what the dataset returns
sample_index = 12454
sample, label = train_data_cnn_google_binary[sample_index]

# Show the feature tensor's shape, its label, and its values
print(f"Sample shape: {sample.shape}")
print(f"Label: {label}")
print("Sample content:", sample)


Sample shape: torch.Size([50, 300])
Label: 0
Sample content: tensor([[ 0.0801,  0.1050,  0.0498,  ...,  0.0037,  0.0476, -0.0688],
        [-0.1021, -0.0603, -0.1123,  ...,  0.0216, -0.0095, -0.1523],
        [-0.0199, -0.0237,  0.0767,  ..., -0.0786,  0.0952, -0.2451],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)


In [None]:
# mini-batch size shared by all three loaders
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2

# shuffle the training indices, then carve off the leading share for validation
num_train = len(train_data_cnn_google_binary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# samplers draw (in random order) only from their own index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# train and validation loaders share one dataset; the samplers keep them disjoint
train_loader = DataLoader(
    train_data_cnn_google_binary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_cnn_google_binary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_cnn_google_binary, batch_size=batch_size)


In [None]:
class BinaryCNN(nn.Module):
    """Two-layer 1-D CNN that emits two class logits per review.

    Input is a float tensor of shape ``(batch, vector_size, max_length)``.
    Each conv layer (kernel 3, padding 1) preserves the sequence length and is
    followed by a max-pool that halves it, so the flattened feature size is
    ``output_channels2 * (max_length // 2 // 2)`` (120 with the defaults).

    Args:
        output_channels1: number of filters in the first convolution.
        output_channels2: number of filters in the second convolution.
        max_length: sequence length (word slots) of the input.
        vector_size: embedding dimension, used as the input channel count.

    Returns (from ``forward``):
        Raw, unnormalized logits of shape ``(batch, 2)`` suitable for
        ``nn.CrossEntropyLoss``.
    """

    def __init__(self, output_channels1=50, output_channels2=10, max_length=50, vector_size=300):
        super(BinaryCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=vector_size, out_channels=output_channels1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=output_channels1, out_channels=output_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)

        # Derive the flattened size from the constructor arguments instead of
        # hard-coding 120, so non-default max_length / channel counts work.
        pooled_length = (max_length // 2) // 2
        self.fc1 = nn.Linear(output_channels2 * pooled_length, 2)

        self.dropout1 = nn.Dropout(0.3)
        # NOTE(review): dropout2 is created but never applied in forward();
        # kept so the module's printed structure is unchanged.
        self.dropout2 = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.pool(self.conv1(x)))
        x = self.dropout1(x)
        x = F.relu(self.pool(self.conv2(x)))
        x = torch.flatten(x, 1)
        # Return raw logits: a ReLU here would clamp them to be non-negative
        # and zero the gradient whenever every logit is negative, which stalls
        # training under CrossEntropyLoss.
        return self.fc1(x)




In [None]:
# Build the binary CNN with default hyper-parameters, then move it to the selected device
GoogleBinaryCNN = BinaryCNN()
GoogleBinaryCNN = GoogleBinaryCNN.to(device)
print(GoogleBinaryCNN)

BinaryCNN(
  (conv1): Conv1d(300, 50, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(50, 10, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=120, out_features=2, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dropout2): Dropout(p=0.3, inplace=False)
)


In [None]:
# Cross-entropy over the class logits, optimized with plain SGD
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=GoogleBinaryCNN.parameters(), lr=0.005)


In [None]:
n_epochs = 10

# initialize tracker for minimum validation loss
valid_loss_min = np.inf  # np.inf instead of np.Inf: the capitalized alias was removed in NumPy 2.0

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    GoogleBinaryCNN.train()
    for data, target in train_loader:
        data = data.float().to(device)
        target = target.long().to(device)
        optimizer.zero_grad()
        output = GoogleBinaryCNN(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size to accumulate a sum
        train_loss += loss.item() * data.size(0)

    # ---- validation pass (no gradients needed, so skip building the autograd graph) ----
    GoogleBinaryCNN.eval()
    with torch.no_grad():
        for data, target in valid_loader:
            data = data.float().to(device)
            target = target.long().to(device)
            output = GoogleBinaryCNN(data)
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average over the number of samples actually seen; len(loader) * batch_size
    # over-counts whenever the last batch is smaller than batch_size.
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss does not get worse
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(GoogleBinaryCNN.state_dict(), 'GoogleBinaryCNN.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.692319 	Validation Loss: 0.690934
Validation loss decreased (inf --> 0.690934).  Saving model ...
Epoch: 2 	Training Loss: 0.688618 	Validation Loss: 0.686410
Validation loss decreased (0.690934 --> 0.686410).  Saving model ...
Epoch: 3 	Training Loss: 0.684175 	Validation Loss: 0.682329
Validation loss decreased (0.686410 --> 0.682329).  Saving model ...
Epoch: 4 	Training Loss: 0.680013 	Validation Loss: 0.677336
Validation loss decreased (0.682329 --> 0.677336).  Saving model ...
Epoch: 5 	Training Loss: 0.672881 	Validation Loss: 0.666637
Validation loss decreased (0.677336 --> 0.666637).  Saving model ...
Epoch: 6 	Training Loss: 0.654480 	Validation Loss: 0.623516
Validation loss decreased (0.666637 --> 0.623516).  Saving model ...
Epoch: 7 	Training Loss: 0.570193 	Validation Loss: 0.515572
Validation loss decreased (0.623516 --> 0.515572).  Saving model ...
Epoch: 8 	Training Loss: 0.492142 	Validation Loss: 0.460515
Validation loss decreased (0.51557

In [None]:
# Reload the best checkpoint written during training. map_location remaps the
# stored tensors onto the active device so a checkpoint saved on GPU still loads
# on a CPU-only machine.
GoogleBinaryCNN.load_state_dict(torch.load('GoogleBinaryCNN.pt', map_location=device))


<All keys matched successfully>

In [None]:
# Evaluate one review at a time
test_loader = DataLoader(test_data_cnn_google_binary, batch_size=1)


In [None]:
# print() already stringifies its arguments, so the accuracy is passed directly
print(
    'Accuracy of CNN using  Google Word2Vec vectors (Binary) :',
    predict(GoogleBinaryCNN, test_loader),
)


Accuracy of CNN using  Google Word2Vec vectors (Binary) : 0.81615


# CNN binary custom model

In [None]:
# Same raw binary split, now embedded with the custom Word2Vec model (looked up through .wv)
train_data_cnn_custom_binary = TrainReviewCNN(
    reviews=X_train_raw_binary,
    sentiment=Y_train_raw_binary,
    word2vec_model=wv_custom,
    type="custom",
)
test_data_cnn_custom_binary = TestReviewCNN(
    reviews=X_test_raw_binary,
    sentiment=Y_test_raw_binary,
    word2vec_model=wv_custom,
    type="custom",
)

In [None]:
# mini-batch size shared by all three loaders
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2

# shuffle the training indices, then carve off the leading share for validation
num_train = len(train_data_cnn_custom_binary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# samplers draw (in random order) only from their own index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# train and validation loaders share one dataset; the samplers keep them disjoint
train_loader = DataLoader(
    train_data_cnn_custom_binary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_cnn_custom_binary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_cnn_custom_binary, batch_size=batch_size)


In [None]:
# Fresh binary CNN for the custom-vector experiment, moved to the selected device
CustomBinaryCNN = BinaryCNN()
CustomBinaryCNN = CustomBinaryCNN.to(device)
print(CustomBinaryCNN)

BinaryCNN(
  (conv1): Conv1d(300, 50, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(50, 10, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=120, out_features=2, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dropout2): Dropout(p=0.3, inplace=False)
)


In [None]:
# Cross-entropy over the class logits, optimized with plain SGD
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=CustomBinaryCNN.parameters(), lr=0.005)


In [None]:
n_epochs = 10

# initialize tracker for minimum validation loss
valid_loss_min = np.inf  # np.inf instead of np.Inf: the capitalized alias was removed in NumPy 2.0

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    CustomBinaryCNN.train()
    for data, target in train_loader:
        data = data.float().to(device)
        target = target.long().to(device)
        optimizer.zero_grad()
        output = CustomBinaryCNN(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size to accumulate a sum
        train_loss += loss.item() * data.size(0)

    # ---- validation pass (no gradients needed, so skip building the autograd graph) ----
    CustomBinaryCNN.eval()
    with torch.no_grad():
        for data, target in valid_loader:
            data = data.float().to(device)
            target = target.long().to(device)
            output = CustomBinaryCNN(data)
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average over the number of samples actually seen; len(loader) * batch_size
    # over-counts whenever the last batch is smaller than batch_size.
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss does not get worse
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(CustomBinaryCNN.state_dict(), 'CustomBinaryCNN.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.688606 	Validation Loss: 0.663523
Validation loss decreased (inf --> 0.663523).  Saving model ...
Epoch: 2 	Training Loss: 0.526122 	Validation Loss: 0.418610
Validation loss decreased (0.663523 --> 0.418610).  Saving model ...
Epoch: 3 	Training Loss: 0.394990 	Validation Loss: 0.364230
Validation loss decreased (0.418610 --> 0.364230).  Saving model ...
Epoch: 4 	Training Loss: 0.358007 	Validation Loss: 0.337522
Validation loss decreased (0.364230 --> 0.337522).  Saving model ...
Epoch: 5 	Training Loss: 0.336285 	Validation Loss: 0.323360
Validation loss decreased (0.337522 --> 0.323360).  Saving model ...
Epoch: 6 	Training Loss: 0.322404 	Validation Loss: 0.312290
Validation loss decreased (0.323360 --> 0.312290).  Saving model ...
Epoch: 7 	Training Loss: 0.312077 	Validation Loss: 0.304687
Validation loss decreased (0.312290 --> 0.304687).  Saving model ...
Epoch: 8 	Training Loss: 0.305302 	Validation Loss: 0.299005
Validation loss decreased (0.30468

In [None]:
# Reload the best checkpoint written during training. map_location remaps the
# stored tensors onto the active device so a checkpoint saved on GPU still loads
# on a CPU-only machine.
CustomBinaryCNN.load_state_dict(torch.load('CustomBinaryCNN.pt', map_location=device))


<All keys matched successfully>

In [None]:
# Evaluate one review at a time
test_loader = DataLoader(test_data_cnn_custom_binary, batch_size=1)


In [None]:
# print() already stringifies its arguments, so the accuracy is passed directly
print(
    'Accuracy of CNN using  Custom Word2Vec vectors (Binary) :',
    predict(CustomBinaryCNN, test_loader),
)


Accuracy of CNN using  Custom Word2Vec vectors (Binary) : 0.878325


# CNN Ternary Google model

In [None]:
# 80/20 train/test split of the raw ternary-sentiment reviews (fixed seed for repeatability)
(
    X_train_raw_ternary,
    X_test_raw_ternary,
    Y_train_raw_ternary,
    Y_test_raw_ternary,
) = train_test_split(
    downsized_df['review_body'],
    downsized_df['sentiment'],
    test_size=0.2,
    random_state=48,
)


In [None]:
# Wrap the raw ternary split in CNN datasets backed by the Google word vectors
train_data_cnn_google_ternary = TrainReviewCNN(
    reviews=X_train_raw_ternary,
    sentiment=Y_train_raw_ternary,
    word2vec_model=wv,
    type="google",
)
test_data_cnn_google_ternary = TestReviewCNN(
    reviews=X_test_raw_ternary,
    sentiment=Y_test_raw_ternary,
    word2vec_model=wv,
    type="google",
)

In [None]:
# mini-batch size shared by all three loaders
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2

# shuffle the training indices, then carve off the leading share for validation
num_train = len(train_data_cnn_google_ternary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# samplers draw (in random order) only from their own index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# train and validation loaders share one dataset; the samplers keep them disjoint
train_loader = DataLoader(
    train_data_cnn_google_ternary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_cnn_google_ternary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_cnn_google_ternary, batch_size=batch_size)


In [None]:
class TernaryCNN(nn.Module):
    """Two-layer 1-D CNN that emits three class logits per review.

    Input is a float tensor of shape ``(batch, vector_size, max_length)``.
    Each conv layer (kernel 3, padding 1) preserves the sequence length and is
    followed by a max-pool that halves it, so the flattened feature size is
    ``output_channels2 * (max_length // 2 // 2)`` (120 with the defaults).

    Args:
        output_channels1: number of filters in the first convolution.
        output_channels2: number of filters in the second convolution.
        max_length: sequence length (word slots) of the input.
        vector_size: embedding dimension, used as the input channel count.

    Returns (from ``forward``):
        Raw, unnormalized logits of shape ``(batch, 3)`` suitable for
        ``nn.CrossEntropyLoss``.
    """

    def __init__(self, output_channels1=50, output_channels2=10, max_length=50, vector_size=300):
        super(TernaryCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=vector_size, out_channels=output_channels1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=output_channels1, out_channels=output_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)

        # Derive the flattened size from the constructor arguments instead of
        # hard-coding 120, so non-default max_length / channel counts work.
        pooled_length = (max_length // 2) // 2
        self.fc1 = nn.Linear(output_channels2 * pooled_length, 3)

        self.dropout1 = nn.Dropout(0.3)
        # NOTE(review): dropout2 is created but never applied in forward();
        # kept so the module's printed structure is unchanged.
        self.dropout2 = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.pool(self.conv1(x)))
        x = self.dropout1(x)
        x = F.relu(self.pool(self.conv2(x)))
        x = torch.flatten(x, 1)
        # Return raw logits: a ReLU here would clamp them to be non-negative
        # and zero the gradient whenever every logit is negative, which stalls
        # training under CrossEntropyLoss.
        return self.fc1(x)



In [None]:
# Build the ternary CNN with default hyper-parameters, then move it to the selected device
GoogleTernaryModel = TernaryCNN()
GoogleTernaryModel = GoogleTernaryModel.to(device)
print(GoogleTernaryModel)

TernaryCNN(
  (conv1): Conv1d(300, 50, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(50, 10, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=120, out_features=3, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dropout2): Dropout(p=0.3, inplace=False)
)


In [None]:
# Cross-entropy over the class logits, optimized with plain SGD
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=GoogleTernaryModel.parameters(), lr=0.005)


In [None]:
n_epochs = 15

# initialize tracker for minimum validation loss
valid_loss_min = np.inf  # np.inf instead of np.Inf: the capitalized alias was removed in NumPy 2.0

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    GoogleTernaryModel.train()
    for data, target in train_loader:
        data = data.float().to(device)
        target = target.long().to(device)
        optimizer.zero_grad()
        output = GoogleTernaryModel(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size to accumulate a sum
        train_loss += loss.item() * data.size(0)

    # ---- validation pass (no gradients needed, so skip building the autograd graph) ----
    GoogleTernaryModel.eval()
    with torch.no_grad():
        for data, target in valid_loader:
            data = data.float().to(device)
            target = target.long().to(device)
            output = GoogleTernaryModel(data)
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average over the number of samples actually seen; len(loader) * batch_size
    # over-counts whenever the last batch is smaller than batch_size.
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss does not get worse
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(GoogleTernaryModel.state_dict(), 'GoogleTernaryModel.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 1.091424 	Validation Loss: 1.086575
Validation loss decreased (inf --> 1.086575).  Saving model ...
Epoch: 2 	Training Loss: 1.083366 	Validation Loss: 1.080107
Validation loss decreased (1.086575 --> 1.080107).  Saving model ...
Epoch: 3 	Training Loss: 1.063947 	Validation Loss: 1.037872
Validation loss decreased (1.080107 --> 1.037872).  Saving model ...
Epoch: 4 	Training Loss: 1.022200 	Validation Loss: 1.001029
Validation loss decreased (1.037872 --> 1.001029).  Saving model ...
Epoch: 5 	Training Loss: 0.972040 	Validation Loss: 0.936014
Validation loss decreased (1.001029 --> 0.936014).  Saving model ...
Epoch: 6 	Training Loss: 0.906739 	Validation Loss: 0.875824
Validation loss decreased (0.936014 --> 0.875824).  Saving model ...
Epoch: 7 	Training Loss: 0.860525 	Validation Loss: 0.837321
Validation loss decreased (0.875824 --> 0.837321).  Saving model ...
Epoch: 8 	Training Loss: 0.832828 	Validation Loss: 0.816177
Validation loss decreased (0.83732

In [None]:
# Reload the best checkpoint written during training. map_location remaps the
# stored tensors onto the active device so a checkpoint saved on GPU still loads
# on a CPU-only machine.
GoogleTernaryModel.load_state_dict(torch.load('GoogleTernaryModel.pt', map_location=device))


<All keys matched successfully>

In [None]:
# Evaluate one review at a time
test_loader = DataLoader(test_data_cnn_google_ternary, batch_size=1)


In [None]:
# print() already stringifies its arguments, so the accuracy is passed directly
print(
    'Accuracy of CNN using  Google Word2Vec vectors (Ternary) :',
    predict(GoogleTernaryModel, test_loader),
)


Accuracy of CNN using  Google Word2Vec vectors (Ternary) : 0.6844


# CNN Ternary Custom Model

In [None]:
# Same raw ternary split, now embedded with the custom Word2Vec model (looked up through .wv)
train_data_cnn_custom_ternary = TrainReviewCNN(
    reviews=X_train_raw_ternary,
    sentiment=Y_train_raw_ternary,
    word2vec_model=wv_custom,
    type="custom",
)
test_data_cnn_custom_ternary = TestReviewCNN(
    reviews=X_test_raw_ternary,
    sentiment=Y_test_raw_ternary,
    word2vec_model=wv_custom,
    type="custom",
)

In [None]:
# mini-batch size shared by all three loaders
batch_size = 100
# fraction of the training set held out for validation
valid_size = 0.2

# shuffle the training indices, then carve off the leading share for validation
num_train = len(train_data_cnn_custom_ternary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# samplers draw (in random order) only from their own index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# train and validation loaders share one dataset; the samplers keep them disjoint
train_loader = DataLoader(
    train_data_cnn_custom_ternary, batch_size=batch_size, sampler=train_sampler
)
valid_loader = DataLoader(
    train_data_cnn_custom_ternary, batch_size=batch_size, sampler=valid_sampler
)
test_loader = DataLoader(test_data_cnn_custom_ternary, batch_size=batch_size)


In [None]:
# Fresh ternary CNN for the custom-vector experiment, moved to the selected device
CustomTernaryModel = TernaryCNN()
CustomTernaryModel = CustomTernaryModel.to(device)
print(CustomTernaryModel)

TernaryCNN(
  (conv1): Conv1d(300, 50, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(50, 10, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=120, out_features=3, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dropout2): Dropout(p=0.3, inplace=False)
)


In [None]:
# Cross-entropy over the class logits, optimized with plain SGD
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=CustomTernaryModel.parameters(), lr=0.005)


In [None]:
n_epochs = 15

# initialize tracker for minimum validation loss
valid_loss_min = np.inf  # np.inf instead of np.Inf: the capitalized alias was removed in NumPy 2.0

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    CustomTernaryModel.train()
    for data, target in train_loader:
        data = data.float().to(device)
        target = target.long().to(device)
        optimizer.zero_grad()
        output = CustomTernaryModel(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size to accumulate a sum
        train_loss += loss.item() * data.size(0)

    # ---- validation pass (no gradients needed, so skip building the autograd graph) ----
    CustomTernaryModel.eval()
    with torch.no_grad():
        for data, target in valid_loader:
            data = data.float().to(device)
            target = target.long().to(device)
            output = CustomTernaryModel(data)
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average over the number of samples actually seen; len(loader) * batch_size
    # over-counts whenever the last batch is smaller than batch_size.
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    # checkpoint whenever validation loss does not get worse
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(CustomTernaryModel.state_dict(), 'CustomTernaryModel.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 0.990258 	Validation Loss: 0.846398
Validation loss decreased (inf --> 0.846398).  Saving model ...
Epoch: 2 	Training Loss: 0.810289 	Validation Loss: 0.759780
Validation loss decreased (0.846398 --> 0.759780).  Saving model ...
Epoch: 3 	Training Loss: 0.751358 	Validation Loss: 0.719146
Validation loss decreased (0.759780 --> 0.719146).  Saving model ...
Epoch: 4 	Training Loss: 0.721840 	Validation Loss: 0.700711
Validation loss decreased (0.719146 --> 0.700711).  Saving model ...
Epoch: 5 	Training Loss: 0.706094 	Validation Loss: 0.688481
Validation loss decreased (0.700711 --> 0.688481).  Saving model ...
Epoch: 6 	Training Loss: 0.694702 	Validation Loss: 0.679873
Validation loss decreased (0.688481 --> 0.679873).  Saving model ...
Epoch: 7 	Training Loss: 0.685925 	Validation Loss: 0.675176
Validation loss decreased (0.679873 --> 0.675176).  Saving model ...
Epoch: 8 	Training Loss: 0.678519 	Validation Loss: 0.667899
Validation loss decreased (0.67517

In [None]:
# Reload the best checkpoint written during training. map_location remaps the
# stored tensors onto the active device so a checkpoint saved on GPU still loads
# on a CPU-only machine.
CustomTernaryModel.load_state_dict(torch.load('CustomTernaryModel.pt', map_location=device))


<All keys matched successfully>

In [None]:
# Evaluate one review at a time
test_loader = DataLoader(test_data_cnn_custom_ternary, batch_size=1)


In [None]:
# print() already stringifies its arguments, so the accuracy is passed directly
print(
    'Accuracy of CNN using  Custom Word2Vec vectors (Ternary) :',
    predict(CustomTernaryModel, test_loader),
)


Accuracy of CNN using  Custom Word2Vec vectors (Ternary) : 0.7288
