In [1]:
import pandas as pd
import numpy as np
fname = 'facebook_comments.csv'
df_train = pd.read_csv(fname, header= None, names=['text','sentiment'],encoding='iso-8859-1',lineterminator='\n')
df_train.head()

Unnamed: 0,text,sentiment
0,Heres a single to add to Kindle. Just read t...,neutral
1,If you tire of Non-Fiction.. Check out http://...,neutral
2,Ghost of Round Island is supposedly nonfiction.,neutral
3,Why is Barnes and Nobles version of the Kindle...,negative
4,@Maria: Do you mean the Nook? Be careful bo...,positive


In [2]:
sent = {'positive':2, 'neutral':1,'negative':0}
df_train['labels'] = df_train['sentiment'].str.strip().map(sent)  #mapping the sentiments of comments to numerical values

In [3]:
training_texts = df_train.text.values  #creating numpy array for the training data
labels = df_train.labels.values   #creating numpy array for the training labels

In [4]:
df_train.head()

Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


In [5]:
#Preprocess Data 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words ='english', max_features=500, ngram_range= (1,2))  #creating the Tfidf object                           #use 1000 or 2000 here
instances = vectorizer.fit_transform(training_texts)    #using raw data to create the Tfidf object which will return a matrix

X = instances.toarray()  #after Tfidf what we get is a sparse marix, so converting it to an array here
Y = np.array(labels)

print(X.shape, ',', Y.shape) #X is a matrix, Y is a vector

(1999, 500) , (1999,)


In [6]:
#Traditional Machine Learning: Random Forest

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

kfold = KFold(n_splits= 10, shuffle= True, random_state= 2020)
rf_model = RandomForestClassifier(random_state= 2020, max_depth=2, criterion='entropy')
rf_cvscores = []

for train,test in kfold.split(X):
  rf_model.fit(X[train], Y[train])
  rf_acc = rf_model.score(X[test], Y[test])
  rf_cvscores.append(rf_acc)

print("RF - mean: %.4f%% (std: +/- %.4f%%)" % (np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100)  )

RF - mean: 64.1332% (std: +/- 2.0919%)


In [7]:
#Fully Connected Feedforward Network

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import torch.optim as optim

In [8]:
#Hyper parameter

epochs = 15  #the model learns better with every epoch
lr = 1e-3  #with a smaller learning rate the weight adjustments after backpropogation do not overshoot and learning is slow
indim = X.shape[1]
outdim = 3
drate = 0.6   #the model does not show overfitting traits and is not complex so the dropout rate is small
batch_size = 16   #based on the instances in the training dataset, a batch size of 16 provides enough batches to perform the parameter updation tasks

X_tensor = torch.from_numpy(X)  #convert the X numpy array to tensors
Y_tensor = torch.from_numpy(Y)  #convert the Y numpy array to tensors

dataset = TensorDataset(X_tensor, Y_tensor)   #vertically concatanate the tensors to form the dataset with text values and labels
train_size = int(0.8*len(dataset))         
val_size = len(dataset) - train_size        
train_dataset, val_dataset = torch.utils.data.random_split(dataset,[train_size, val_size])  #random split of dataset into training and validation 

train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle= True)  #training loader created by allowing shuffle of instances in epochs
val_loader = DataLoader(val_dataset, batch_size= batch_size, shuffle= True)     


In [9]:
#Creating the network

class SentimentNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate):

    super(SentimentNetwork, self).__init__()

    self.fc1 = nn.Linear(input_dim,100)
    self.do1 = nn.Dropout(dropout_rate)
    self.fc2 = nn.Linear(100, 50)
    self.fc3 = nn.Linear(50, output_dim)
    
# one dropout layer to avoid overfitting situations
  
  def forward(self,x):
    x = F.relu(self.fc1(x))
    x = self.do1(x)
    x = F.relu(self.fc2(x))
    return F.log_softmax(self.fc3(x))

#creating model
model = SentimentNetwork(indim, outdim, drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (do1): Dropout(p=0.6, inplace=False)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
)


In [10]:
# define a training process function
# Define the Loss function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = lr)  #suitable with a small learning rate


In [11]:
#define a training process function for one epoch
def train(model, train_loader, optimizer, criterion):
  epoch_loss, epoch_acc = 0.0, 0.0

  model.train()  #for dropout, system needs to be told that this is the training mode
  acc = 0
  for batch_x, batch_y in train_loader:
    
    optimizer.zero_grad()  #settings gradient for all weights and biases to zero 
    net_out = model(batch_x.float())  
    
    error = criterion(net_out, batch_y)  #tensor returned
    loss = error.item() #calculate the loss using predicted output and truth and only get values here

    #acc = calculate the accuracy using predictions (batch_size x 3) and batch_y (batch_size x 1)
    pred = net_out.argmax(1)      
    acc = pred.eq(batch_y).sum().item()  #to get the number of correct predictions in a batch
                                                          
    #backpropogate
    error.backward()
    optimizer.step()
    
    epoch_loss += loss   
    epoch_acc += acc      #total number of correct predictions in an epoch

  #avg epoch_loss and avg epoch_acc
  avg_epoch_loss = epoch_loss/len(train_dataset)   #
  avg_epoch_acc = epoch_acc/len(train_dataset)
  return avg_epoch_loss, avg_epoch_acc


In [12]:
#define a validation process function for one epoch
def evaluate(model, val_loader, criterion):
  epoch_loss, epoch_acc = 0.0, 0.0

  acc = 0
  model.eval()
  with torch.no_grad():
    for batch_x, batch_y in val_loader:
      
      net_out = model(batch_x.float())
      error = criterion(net_out, batch_y)
      loss = error.item()

      #acc = calculate the accuracy using predictions (batch_size x 3) and batch_y (batch_size x 1)
      pred = net_out.data.argmax(1) #(batch_size x 1)
      acc = pred.eq(batch_y).sum().item()

      epoch_loss += loss
      epoch_acc += acc

  #avg epoch_loss and avg epoch_acc
  avg_epoch_loss = epoch_loss/len(val_dataset)
  avg_epoch_acc = epoch_acc/len(val_dataset)
  return avg_epoch_loss, avg_epoch_acc


In [13]:
# real training and evaluation process
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)
  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
  print(f'\t Val. Loss: {valid_loss:.4f} | Val. Acc: {valid_acc:.4f}')



Epoch: 01
	Train Loss: 0.0544 | Train Acc: 0.5991
	 Val. Loss: 0.0477 | Val. Acc: 0.6175
Epoch: 02
	Train Loss: 0.0409 | Train Acc: 0.6660
	 Val. Loss: 0.0366 | Val. Acc: 0.7175
Epoch: 03
	Train Loss: 0.0303 | Train Acc: 0.8118
	 Val. Loss: 0.0298 | Val. Acc: 0.7925
Epoch: 04
	Train Loss: 0.0244 | Train Acc: 0.8524
	 Val. Loss: 0.0260 | Val. Acc: 0.8275
Epoch: 05
	Train Loss: 0.0205 | Train Acc: 0.8774
	 Val. Loss: 0.0277 | Val. Acc: 0.8175
Epoch: 06
	Train Loss: 0.0172 | Train Acc: 0.8874
	 Val. Loss: 0.0245 | Val. Acc: 0.8450
Epoch: 07
	Train Loss: 0.0144 | Train Acc: 0.9081
	 Val. Loss: 0.0229 | Val. Acc: 0.8650
Epoch: 08
	Train Loss: 0.0119 | Train Acc: 0.9412
	 Val. Loss: 0.0212 | Val. Acc: 0.8775
Epoch: 09
	Train Loss: 0.0096 | Train Acc: 0.9581
	 Val. Loss: 0.0211 | Val. Acc: 0.8925
Epoch: 10
	Train Loss: 0.0082 | Train Acc: 0.9656
	 Val. Loss: 0.0228 | Val. Acc: 0.8975
Epoch: 11
	Train Loss: 0.0073 | Train Acc: 0.9737
	 Val. Loss: 0.0209 | Val. Acc: 0.9100
Epoch: 12
	Train Loss