## BUDT758B Lab 1 
#### Shashank Puthanveedu (116836982) 
##### Section 3

In [1]:
# Import pandas and numpy libraries 
import pandas as pd
import numpy as np

# Load the data
fname = 'facebook_comments.csv'
df_train = pd.read_csv(fname, header = None, names = ['text', 'sentiment'], encoding = 'iso-8859-1', lineterminator = '\n')

# Set labels to create levels in sentiments
sent = {'positive':2, 'neutral':1,'negative':0}
df_train['labels'] = df_train['sentiment'].str.strip().map(sent)

# Get texts and labels
training_texts = df_train.text.values
labels = df_train.labels.values

# See a sample data
print(type(training_texts), type(labels))
df_train.head()

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


In [2]:
# Import the TfidfVectorizer packahe from sklearn to vectorize the texts.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words = 'english', max_features=500, ngram_range=(1,1))
instances = vectorizer.fit_transform(training_texts)

X = instances.toarray()
Y = labels


print(X.shape,',',Y.shape)
print(Y[:10])
print(X[0,:50])

(1999, 500) , (1999,)
[1 1 1 0 2 2 2 0 2 0]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.28915636 0.         0.         0.
 0.         0.         0.2971592  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


In [3]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# Set the configuration for kfold cross-validation
kfold = KFold(n_splits=10, shuffle = True, random_state=2020)
# Set the configuration for a Random Forest model
rf_model = RandomForestClassifier(criterion='entropy',max_depth = 2, random_state = 2020)
rf_cvscores = []

# Train the model and calculate the accuracy
for train_idx, val_idx in kfold.split(X):
  rf_model.fit(X[train_idx], Y[train_idx])
  acc = rf_model.score(X[val_idx], Y[val_idx])
  rf_cvscores.append(acc)

print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)"%(np.mean(rf_cvscores)*100,np.std(rf_cvscores)*100))

Random Forest - mean: 64.1332% (std: +/- 2.0919%)


In [4]:
# Import the required pytorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import torch.optim as optim

In [5]:
# Set the configuration/ tunable parameters of the model
epochs = 50
lr = 0.01
indim = X.shape[1]
outdim = 3
drate = 0.7
batch_size = 36

# Create tensors
X_tensor = torch.from_numpy(X)
Y_tensor = torch.from_numpy(Y)

# Create a Dataset
dataset = TensorDataset(X_tensor, Y_tensor)
train_size = int(0.8*len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Split the dataset into train and validation dataset
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = batch_size)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size = batch_size)




In [6]:
# Build the Model Network Structure: Linear -> Relu -> Batch Normalization -> Dropout -> ... -> Log_Softmax
class SentimentNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate):
    super(SentimentNetwork, self).__init__()
    self.fc1 = nn.Linear(input_dim, 100, bias = True)
    self.fc2 = nn.Linear(100, 50, bias = True)
    self.fc3 = nn.Linear(50, output_dim, bias = True)
    self.do1 = nn.Dropout(dropout_rate)
    self.bn1 = nn.BatchNorm1d(100)
    self.bn2 = nn.BatchNorm1d(50)
  def forward(self,x):
    x = F.relu(self.fc1(x))
    x = self.bn1(x)
    x = self.do1(x)
    x = F.relu(self.fc2(x))
    x = self.bn2(x)
    x = self.do1(x)
    x = F.log_softmax(self.fc3( x))
    return x

model = SentimentNetwork(indim, outdim, drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
  (do1): Dropout(p=0.7, inplace=False)
  (bn1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [7]:
# Using an Adam optimizer for adaptive learning rate

#optimizer = torch.optim.SGD(model.parameters(),lr = lr, momentum=0.9)
optimizer = optim.Adam(model.parameters(),lr = lr)

# Set CrossEntropyLoss as the loss criterion
criterion = nn.CrossEntropyLoss()

# Function to calculate the accuracy for each batch within each epoch
def accuracy(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    return acc

In [8]:
# define a training process for the model
def train(model, train_loader, optimizer, criterion):
  epoch_loss, epoch_acc = 0.0,0.0
  loss,acc=0,0
  model.train()
 
  for batch_x, batch_y in train_loader:
    #zero gradient
    optimizer.zero_grad()
    #predictions= calculates the predicted output for the current batch batch_x
    model_out=model(batch_x.float())
    #loss= loass for the current batch using predicions ans batch_y
    loss=criterion(model_out,batch_y)
    #acc= calculates the acc using predictions (batch_size X output_dim) and batch_y (batch_size X 1)
    train_acc = accuracy(model_out, batch_y)
    #backpropgate 
    loss.backward()
    optimizer.step()

    # Calculate the epoch loss and accuracy
    epoch_loss += loss.item()
    epoch_acc += train_acc.item()
  return epoch_loss/len(train_loader), epoch_acc/len(train_loader)
 
# define a validation/evaluation process function
def evaluate(model, val_loader, criterion):
  epoch_loss, epoch_acc = 0.0,0.0
  loss,acc=0,0
  model.eval()

  with torch.no_grad():
    for batch_x, batch_y in val_loader:
      #predictions= calculates the predicted output for the current batch batch_x
      model_out=model(batch_x.float())
      #loss= loass for the current batch using predicions ans batch_y
      loss=criterion(model_out,batch_y)
      #acc= calculates the acc using predictions (batch_size X output_dim) and batch_y (batch_size X 1)
      val_acc = accuracy(model_out, batch_y)

      # Calculate the epoch loss and accuracy    
      epoch_loss += loss.item()
      epoch_acc += val_acc.item()
  return epoch_loss/len(val_loader), epoch_acc/len(val_loader)
 


In [9]:
# Train and validate the model
for epoch in range(epochs):
  best_valid_loss = 9999
  # Calculate the training loss and accuracy
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  # Calculate the validation loss and accuracy
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)

  # Save the weights with least loss for future use
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'saved_weights.pt')

  # Print the loss and accuracy values  
  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
  print(f'\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc:.4f}')



Epoch: 01
	Train Loss: 0.9618 | Train Acc: 0.5763
	 Val. Loss: 0.7352 |  Val. Acc: 0.6389
Epoch: 02
	Train Loss: 0.6613 | Train Acc: 0.7243
	 Val. Loss: 0.4773 |  Val. Acc: 0.8125
Epoch: 03
	Train Loss: 0.5341 | Train Acc: 0.8049
	 Val. Loss: 0.4661 |  Val. Acc: 0.8009
Epoch: 04
	Train Loss: 0.4966 | Train Acc: 0.8023
	 Val. Loss: 0.4016 |  Val. Acc: 0.8611
Epoch: 05
	Train Loss: 0.4822 | Train Acc: 0.8169
	 Val. Loss: 0.3821 |  Val. Acc: 0.8449
Epoch: 06
	Train Loss: 0.4205 | Train Acc: 0.8441
	 Val. Loss: 0.3738 |  Val. Acc: 0.8403
Epoch: 07
	Train Loss: 0.3938 | Train Acc: 0.8499
	 Val. Loss: 0.2998 |  Val. Acc: 0.8819
Epoch: 08
	Train Loss: 0.3741 | Train Acc: 0.8553
	 Val. Loss: 0.3007 |  Val. Acc: 0.8866
Epoch: 09
	Train Loss: 0.3430 | Train Acc: 0.8805
	 Val. Loss: 0.3416 |  Val. Acc: 0.8681
Epoch: 10
	Train Loss: 0.3144 | Train Acc: 0.8865
	 Val. Loss: 0.2857 |  Val. Acc: 0.8981
Epoch: 11
	Train Loss: 0.2980 | Train Acc: 0.8933
	 Val. Loss: 0.2651 |  Val. Acc: 0.9120
Epoch: 12
