In [None]:
pip install torchtext==0.10.0

Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 13.3 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.6 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.9.0 which is incompatible.
torchaudio 0.11.0+cu

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn.modules.activation import Sigmoid

In [None]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#preprocessing

from nltk.corpus import stopwords
from nltk import word_tokenize
stopWords = set(stopwords.words('english'))

In [None]:
def normalize_whitespace(sentence):
  return " ".join(sentence.split())

In [None]:
def preprocessing(text):
  clean_text= []
  text= normalize_whitespace(text)
  for t in text:
    t = t.lower()
    t= t.replace("<br /><br />", "")
    clean_text.append(t)
  return clean_text

In [None]:
import torch
from torchtext.legacy import data

SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm', stop_words = stopWords)
LABEL = data.LabelField(dtype = torch.float)

In [None]:
fields = [('text', TEXT), ('label', LABEL)]

train_data = data.TabularDataset(
    path="/content/IMDB Dataset.csv",
    format='csv',
    fields=fields,
    skip_header= False,
)

In [None]:
import random
train_data, test_data = train_data.split(split_ratio=0.7, random_state = random.seed(SEED))

In [None]:
print(vars(train_data[1]))

{'text': ['Harvey', 'Keital', "'s", 'best', 'performance', 'far', 'new', 'century', '.', 'Very', 'nicely', 'photographed', ',', 'beautiful', 'snap', '-', 'shot', 'pre', '-', 'Castro', 'Cuba', '.', 'The', 'story', 'revolves', 'around', 'nephew', 'local', 'minor', 'crime', 'boss', 'develops', 'friendship', 'American', 'Hollywood', 'connections', '.', 'It', "'s", 'really', 'moment', 'boy', 'awakens', 'fact', 'small', 'circle', 'people', 'knows', 'actually', 'live', 'much', 'larger', ',', 'much', 'complex', 'world', "n't", 'yet', 'understand.the', 'script', 'strong', 'filled', 'humor', ',', 'direction', 'crisp', '.', 'Over', ',', 'really', 'professional', 'job', 'fits', 'well', 'tradition', 'Latin', 'American', 'cinema', '.', 'The', 'one', 'weakness', 'decision', 'shoot', 'sync', '-', 'sound', 'English', 'rather', 'Spanish', '-', 'probably', 'improve', 'sales', 'US', '.', 'Unfortunately', ',', 'makes', 'film', 'little', 'less', 'convincing', '.', 'But', 'see', 'beyond', ',', 'find', 'heart

In [None]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 35001
Number of testing examples: 15000


In [None]:
#validation dataset

import random

train_data, valid_data = train_data.split(random_state = random.seed(42))

In [None]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 24501
Number of validation examples: 10500
Number of testing examples: 15000


In [None]:
#Build vocab
TEXT.build_vocab(train_data, max_size = 25_000)
LABEL.build_vocab(train_data)

In [None]:
print(TEXT.vocab.itos[:14])

['<unk>', '<pad>', ',', '.', 'I', '"', "'s", '-', '/><br', 'movie', 'film', 'The', '(', "n't"]


In [None]:
#to use the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True
    )

#Model


Single layer NN.
Only use the Embedding layer to convert to dense and an unique Linear Layer

In [None]:
INPUT_DIM = len(TEXT.vocab) #size of the vocabulary
EMBEDDING_DIM = 100 #size of dense vectors
HIDDEN_DIM = 256 # size of hidden sates
OUTPUT_DIM = 1 #number of classes
DROPOUT = 0.5


In [None]:
import torch.optim as optim

In [None]:
## Calculate accuracy

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds)) # Rounding predictions: 0.75 --> 1 0.4 --> 0
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

Train

In [None]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train() #Turn on the training mode
    for batch in iterator:  
        optimizer.zero_grad() #return gradients to 0 each batch
        predictions = model(batch.text).squeeze(1) 
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step() 
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

Evaluate

In [None]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval() #Turn on evaluation mode
    with torch.no_grad():
        for batch in iterator:
          predictions = model(batch.text).squeeze(1) 
          loss = criterion(predictions, batch.label)
          acc = binary_accuracy(predictions, batch.label)
          epoch_loss += loss.item()
          epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#TEST


In [None]:
'''
One Layer 
RNN and Linear
Name: "model2" 
optimizer= "Adam"
'''

import torch.nn as nn

class MultilayerPerceptron(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout):
        super(MultilayerPerceptron, self).__init__() #to call the functions in the superclass
        self.input_size = input_dim 
        self.hidden_size = hidden_dim
        self.emb_size= embedding_dim
        self.output_size= output_dim
        self.dropout= dropout
        self.embedding= nn.Embedding(self.input_size, self.emb_size)
        self.rnn= nn.RNN(self.emb_size, self.hidden_size)
        self.fc = nn.Linear(hidden_dim, output_dim)
 
        
    def forward(self, text): 
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden= F.sigmoid(hidden)
        hidden= self.fc(hidden.squeeze(0))
        return hidden

In [None]:

model2 = MultilayerPerceptron(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)

#Optimizer and loss functions

criterion = nn.BCEWithLogitsLoss() #Calculate loss using binary cross entrophy
optimizer2 = optim.Adam(model2.parameters())#optim.SGD(model.parameters(), lr = 0.01) #

#Send model and loss to GPU

model2 = model2.to(device)


In [None]:
'''
Model after implementing "Sequential" :"model2"
Adam optimizer
With dropout
'''
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model2, train_iterator, optimizer2, criterion)
    valid_loss, valid_acc = evaluate(model2, valid_iterator, criterion)
    
    print('Training Loss: ', train_loss)
    print('Validation Loss: ', valid_loss)



Training Loss:  0.25738182492729267
Validation Loss:  0.2662168355602207
Training Loss:  0.2533072130098044
Validation Loss:  0.24980272433974526
Training Loss:  0.25335908469114227
Validation Loss:  0.2506401788104664
Training Loss:  0.2522907587395636
Validation Loss:  0.2507451344620098
Training Loss:  0.25220233952874616
Validation Loss:  0.2585072394573327


In [None]:
test_loss, test_acc = evaluate(model2, test_iterator, criterion)
print('Loss: ', test_loss , ' Acc: ', test_acc*100,'%')



Loss:  0.25851642907933986  Acc:  50.25044327086591 %


In [None]:
'''
It has two layers RNN and Linear and another Linear.
Name: "model"
Optimizer= "Adam"
'''

import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__() #to call the functions in the superclass
        self.embedding = nn.Embedding(input_dim, embedding_dim) #Embedding layer to create dense vector instead of sparse matrix
        self.rnn = nn.RNN(embedding_dim, hidden_dim) 
        self.hidden_fc = nn.Linear(hidden_dim,hidden_dim)
        self.out_fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid= nn.Sigmoid()
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded) 
        hidden = self.dropout(hidden[-1,:,:])
        hidden= self.sigmoid(hidden)
        #hidden= F.relu(hidden)  
        hidden= self.sigmoid(hidden)
        #hidden = F.relu(self.hidden_fc(hidden))
        return self.out_fc(hidden)

In [None]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(model.parameters()) #to update parameters of the module
criterion = nn.MSELoss()#nn.BCEWithLogitsLoss() #Calculate loss using binary cross entrophy
model = model.to(device)

In [None]:
'''
Model = "model"
'''
#without dropout
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print('Training Loss: ', train_loss)
    print('Validation Loss: ', valid_loss)

Training Loss:  0.255820761733827
Validation Loss:  0.24958872731887932
Training Loss:  0.2550050773835369
Validation Loss:  0.2502760166471655
Training Loss:  0.25460219441599385
Validation Loss:  0.2530812394438368
Training Loss:  0.2544392233339987
Validation Loss:  0.25091444949309033
Training Loss:  0.2549429510576607
Validation Loss:  0.2497423057303284


In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print('Loss: ', test_loss , ' Acc: ', test_acc*100,'%')

Loss:  0.24972468075600077  Acc:  50.25044327086591 %


#Regular Machine Learning methods

In [None]:
import pandas as pd
df= pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
#Split Dataset
from sklearn.model_selection import train_test_split
X = df["review"]
y = df['sentiment']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 40)

In [None]:
#Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tf_en_vectorizer= TfidfVectorizer(stop_words="english")

X_train_features= tf_en_vectorizer.fit_transform(X_train)

X_test_features= tf_en_vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clfs = {
    "RandomForest":RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
}
for clf_name in clfs:
  clf = clfs[clf_name]
  clf.fit(X_train_features, y_train)
  y_pred = clf.predict(X_test_features)
  print(clf_name, accuracy_score(y_test, y_pred))

RandomForest 0.8556
Logistic Regression 0.8912


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
confusion_matrix(y_test, y_pred)

array([[4505,  595],
       [ 493, 4407]])