1. Import the required libraries.

In [32]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

2. Load the dataset containing a set of 1,000 product reviews from Amazon.

In [6]:
df = pd.read_csv("AmazonReview.csv")

df

Unnamed: 0,Review,Label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
995,The screen does get smudged easily because it ...,0
996,What a piece of junk.. I lose more calls on th...,0
997,Item Does Not Match Picture.,0
998,The only thing that disappoint me is the infra...,0


3. Separate the data into two variables: one containing the reviews and the other containing
the labels. Remove the punctuation from the reviews.

In [7]:
reviews = df['Review'].tolist()
labels = df['Label'].tolist()

print(reviews)



In [8]:

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

reviews = [remove_punctuation(review) for review in reviews]

reviews


['So there is no way for me to plug it in here in the US unless I go by a converter',
 'Good case Excellent value',
 'Great for the jawbone',
 'Tied to charger for conversations lasting more than 45 minutesMAJOR PROBLEMS',
 'The mic is great',
 'I have to jiggle the plug to get it to line up right to get decent volume',
 'If you have several dozen or several hundred contacts then imagine the fun of sending each of them one by one',
 'If you are Razr owneryou must have this',
 'Needless to say I wasted my money',
 'What a waste of money and time',
 'And the sound quality is great',
 'He was very impressed when going from the original battery to the extended battery',
 'If the two were seperated by a mere 5 ft I started to notice excessive static and garbled sound from the headset',
 'Very good quality though',
 'The design is very odd as the ear clip is not very comfortable at all',
 'Highly recommend for any one who has a blue tooth phone',
 'I advise EVERYONE DO NOT BE FOOLED',
 'So F

4. Create a variable containing the vocabulary of the entire set of reviews.

In [10]:
vocab = set(' '.join(reviews).split())

vocab

{'Verizons',
 '10',
 'optimal',
 'flush',
 'Europe',
 'wiping',
 'knock',
 'keypads',
 'calls',
 'Signal',
 'software',
 'setup',
 'thought',
 'laughing',
 'capability',
 'AGGRAVATING',
 'customer',
 'hit',
 'flash',
 'joy',
 'just',
 'side',
 'shine',
 'freeway',
 'purpose',
 'window',
 'sent',
 'SOS',
 'Also',
 'memory',
 'Made',
 'promised',
 'work',
 'earphone',
 'if',
 'Due',
 'options',
 'usable',
 'cables',
 'ProblemVery',
 'earpad',
 'etc',
 'features',
 'drawback',
 'number',
 'kits',
 'going',
 'out',
 'ordered',
 'hoursTHe',
 'that',
 'choice',
 'seperated',
 'pull',
 'Too',
 'incrediable',
 'apart',
 'constructed',
 'wild',
 'toilet',
 'sharp',
 'removing',
 'self',
 'AC',
 'refuse',
 'sides',
 'network',
 '6',
 'fine',
 'HATED',
 'defective',
 'short',
 'seat',
 'They',
 'highest',
 '375',
 'appears',
 'looks',
 'verizon',
 'walked',
 'possesed',
 'Angeles',
 'charger',
 'Lasted',
 'V3i',
 'found',
 'Comfort',
 'Overall',
 'needed',
 'BT',
 'support',
 'except',
 'Tools',


5. Additionally, create a dictionary that maps each word to an integer, where the words will
be the keys and the integers will be the values.

In [12]:
dic = {}

for review in reviews:
    words = review.split()
    for word in words:
        if word not in dic:
            dic[word] = len(dic)

print(dic)



6. Encode the review data by replacing each word in a review for its paired integer.

In [19]:
encoded_reviews = []

for review in reviews:
    words = review.split()

    encoded_review = []

    for word in words:
        if word in vocab:
            encoded_review.append(dic[word])
        else:
            pass

    encoded_reviews.append(encoded_review)


print(encoded_reviews)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 15, 16, 17, 18, 19], [20, 21, 22, 23], [24, 5, 12, 25], [26, 7, 27, 5, 28, 29, 30, 31, 32, 33, 34], [35, 36, 2, 37], [15, 38, 7, 39, 12, 8, 7, 40, 9, 7, 41, 42, 43, 7, 40, 44, 45], [46, 47, 38, 48, 49, 50, 48, 51, 52, 53, 54, 12, 55, 56, 57, 58, 56, 59, 60, 17, 60], [46, 47, 61, 62, 63, 64, 38, 65], [66, 7, 67, 15, 68, 69, 70], [71, 18, 72, 56, 70, 73, 74], [75, 12, 76, 77, 2, 37], [78, 79, 80, 81, 82, 83, 84, 12, 85, 86, 7, 12, 87, 86], [46, 12, 88, 89, 90, 17, 18, 91, 92, 93, 15, 94, 7, 95, 96, 97, 73, 98, 76, 84, 12, 99], [100, 101, 77, 102], [35, 103, 2, 80, 104, 105, 12, 106, 107, 2, 108, 80, 109, 110, 111], [112, 113, 5, 114, 60, 115, 116, 18, 117, 118, 119], [15, 120, 121, 122, 123, 124, 125], [0, 126, 0, 20], [127, 37], [128, 129, 130, 131, 10, 18, 4, 132, 133, 47, 134, 135, 136, 132, 137, 138, 139], [15, 140, 141, 142, 143, 73, 144, 111, 145, 146, 147, 108, 40, 9, 7, 148, 149], [15, 150, 65, 7, 151, 152, 69, 153, 154, 73,

7. Create a class containing the architecture of the network.
8. Make sure that you include an embedding layer. Initialize the model using 64 embedding
dimensions and 128 neurons for 3 LSTM layers.

In [24]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentAnalysisModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=3)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1, :, :])

9. Define the loss function, an optimization algorithm, and the number of epochs to train for.
For example, you can use binary cross-entropy loss as the loss function, the Adam
optimizer, and train for 10 epochs.

In [31]:
vocab_size = len(vocab)
embedding_dim = 64
hidden_dim = 128
output_dim = 1
model = SentimentAnalysisModel(vocab_size, embedding_dim, hidden_dim, output_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())
epochs = 10