<a href="https://colab.research.google.com/github/shwetamalla14/FirstNeuralNetwork/blob/main/Shweta_Malla_Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# load text data and convert the label/sentiment into corresponding numeric values: '
# possible packages you might need are: pandas, numpy
import pandas as pd
import numpy as np

# read the training data
df_train=pd.read_csv('facebook_comments.csv',header=None,names=['text','sentiment'], encoding='iso-8859-1',lineterminator='\n')
#we now create a new column in the DF
sent= {'positive':2, 'neutral':1, 'negative':0}
#map every element in the series in the sentiment column:
df_train['labels']=df_train['sentiment'].str.strip().map(sent) 

# get texts and labels
training_texts = df_train.text.values
labels = df_train.labels.values

#checking data type of texts and labels(they are now numpy arrays)
print(type(training_texts),type(labels))
df_train.head()


<class 'numpy.ndarray'> <class 'numpy.ndarray'>


Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


## **PREPROCESS DATA **

In [2]:
# preprocess the loaded textual data, including removing stopwords, stemming, and tok
# represent each document (i.e., comment) using TF-IDF strategy. The features are the
# possible packages you might need are: scikit-learn, numpy

#For the features, we are using uni-grams (We can use bi-grams and tri-grams too)

from sklearn.feature_extraction.text import TfidfVectorizer
# tokenize and create a document-feature matrix X and a label vector Y
vectorizer = TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1,1))
instances = vectorizer.fit_transform(training_texts)
X= instances.toarray()
Y= labels

# print out the shape of X and Y
print(X.shape,',',Y.shape)

#first 10 records of Y
print(Y[:10])
print(X[0,:50]) #first 50 columns These are the TF-IDF values

(1999, 500) , (1999,)
[1 1 1 0 2 2 2 0 2 0]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.28915636 0.         0.         0.
 0.         0.         0.2971592  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


### Traditional Machine Learning Models: Random Forest

In [3]:
# using 10-fold cross-validation to show the prediction accuracy
# possible packages you might need are: scikit-learn, numpy

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

#We want to fix random generator so we set random_state to a random number
kfold= KFold(n_splits=10, shuffle=True,random_state=2020)
#estimator is the number of trees by default we have 100
rf_model = RandomForestClassifier(criterion ='entropy', max_depth=2,random_state=2020) 
rf_cvscores=[]

#We train the model, get the accuracy on the validation set
for train_idx, val_idx in kfold.split(X):
  rf_model.fit(X[train_idx], Y[train_idx])
  acc= rf_model.score(X[val_idx],Y[val_idx])
  rf_cvscores.append(acc)

print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)" % (np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100))
#We get the mean accuracy of around 64%, which is not that great

Random Forest - mean: 64.1332% (std: +/- 2.0919%)


### Fully connected feedforward Neural Network

In [4]:
# Design your own network with the following requirements:
# 1. Having dropout
# 2. Separate the dataset into training and validation (80-20%)
# 3. The prediction accuracy on the validation set should be at least 50% for this

In [5]:
# possible packages you might need are: scikit-learn, numpy, torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

In [32]:
# convert your numpy array to TensorDataset and create a data loader for training and
# some hyperparameters: input dimension, output dimension, batch size, number of epoc

epochs = 50
lr = 0.01 #Learning rate  
indim = X.shape[1] #input dimensionality, for each instance it will be 500 in our case
outdim = 3 #output dimensionality, 3 categories - Neatural, psoitive, negative 
drate = 0.7
batch_size = 40


X_tensor = torch.from_numpy(X)
Y_tensor = torch.from_numpy(Y)

dataset = TensorDataset(X_tensor,Y_tensor)
train_size = int(0.8*len(dataset)) #80% for training set data 
val_size = len(dataset) - train_size

#We do a random split provided by Torch , length we can have a tuple or array
train_dataset, val_dataset = torch.utils.data.random_split(dataset, lengths=[train_size, val_size])

train_loader = DataLoader(train_dataset, shuffle=True,batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=True,batch_size=batch_size)

## Build the network 

In [33]:
class SentimentNetwork(nn.Module):
  def __init__(self, indim, outdim, drate):
    super(SentimentNetwork,self).__init__()
    self.fc1 = nn.Linear(indim,100,bias = True)
    self.fc2 = nn.Linear(100, 50,bias = True)
    self.fc3 = nn.Linear(50, outdim,bias = True)    #creating 2 hidden layers for our model with dropout rate of 0.7
    self.do1 = nn.Dropout(0.7)
    self.do2 = nn.Dropout(0.7) 
    self.bn1 = nn.BatchNorm1d(100)
    self.bn2 = nn.BatchNorm1d(50)
     
  def forward(self,x):
    x = F.relu(self.fc1(x))
    x = self.bn1(x)
    x = self.do1(x)
    x = F.relu(self.fc2(x))
    x = self.bn2(x)
    x = self.do2(x)
    
    return F.log_softmax(self.fc3(x))
    #return x

# create a model
model = SentimentNetwork(indim,outdim,drate)
#model = SentimentNetwork(input_dim, output_dim, dropout_rate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
  (do1): Dropout(p=0.7, inplace=False)
  (do2): Dropout(p=0.7, inplace=False)
  (bn1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


## Create a training function to train the model and an evaluation function to evaluate the performance on the separate validation set

In [34]:
#define a training process function

def calculate_accuracy(y_pred, y_true):
  predicted= np.argmax(y_pred.detach().numpy(), axis=1)
  #predicted = torch.argmax(y_pred, dim=1)
  
  #predicted = torch.max(y_pred, 1)
  return np.sum(y_true.detach().numpy() == predicted)/batch_size
  #return (y_true == predicted).sum.mean()


# creating losss function 
criterion = torch.nn.CrossEntropyLoss()
#creating optimizer 
#optimizer = torch.optim.Adam(model.parameters(), lr = lr)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

def train(model, train_loader, optimizer, criterion):
  epoch_loss, epoch_acc = 0.0,0.0 # the loss and accuracy for each epoch

  model.train()

  for batch_x, batch_y in train_loader:
    y_pred = model(batch_x.float())   #predictions 
    loss = criterion(y_pred, batch_y) #calculating the loss 
    optimizer.zero_grad()       # Intialize the hidden weight to all zeros
    loss.backward()           # Backward pass to compute the weight
    optimizer.step()          ## Optimizer to update the weights of hidden nodes
    
    train_acc=calculate_accuracy(y_pred,batch_y)  #calculating the accuraxy from the function 
    epoch_loss += loss.item()
    epoch_acc += train_acc

    #calculate avg epoch loss and accuracy
    tl= len(train_loader)
    avg_acc_train = epoch_acc/tl
    avg_epoch_train = epoch_acc/tl

  return avg_epoch_train, avg_acc_train

In [35]:
# define a validation/evaluation process function


def evaluate(model, val_loader, criterion):
  epoch_loss, epoch_acc = 0.0,0.0 # the loss and accuracy for each epoch

  model.eval()

  with torch.no_grad():
    for batch_x, batch_y in val_loader:
      y_pred = model(batch_x.float())     #predictions 
      loss = criterion(y_pred, batch_y)   #calculating the loss 
      val_acc = calculate_accuracy(y_pred, batch_y)     #calculating the accuraxy from the function
      epoch_loss += loss.item()
      epoch_acc += val_acc



    #calculate avg epoch loss and accuracy 
    vl= len(val_loader)
    avg_acc_val = epoch_acc/vl
    avg_epoch_val = epoch_loss/vl
      
    return avg_epoch_val, avg_acc_val

## Main starting point: train the model and evaluate the model

In [36]:
# real training and evaluation process
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)
  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
  print(f'\t Val. Loss: {valid_loss:.4f} | Val. Acc: {valid_acc:.4f}')

  



Epoch: 01
	Train Loss: 0.5419 | Train Acc: 0.5419
	 Val. Loss: 0.8111 | Val. Acc: 0.6750
Epoch: 02
	Train Loss: 0.6994 | Train Acc: 0.6994
	 Val. Loss: 0.5711 | Val. Acc: 0.7825
Epoch: 03
	Train Loss: 0.7244 | Train Acc: 0.7244
	 Val. Loss: 0.5147 | Val. Acc: 0.8050
Epoch: 04
	Train Loss: 0.7525 | Train Acc: 0.7525
	 Val. Loss: 0.4862 | Val. Acc: 0.8250
Epoch: 05
	Train Loss: 0.7850 | Train Acc: 0.7850
	 Val. Loss: 0.4666 | Val. Acc: 0.8225
Epoch: 06
	Train Loss: 0.7981 | Train Acc: 0.7981
	 Val. Loss: 0.4633 | Val. Acc: 0.8100
Epoch: 07
	Train Loss: 0.8106 | Train Acc: 0.8106
	 Val. Loss: 0.4336 | Val. Acc: 0.8250
Epoch: 08
	Train Loss: 0.8325 | Train Acc: 0.8325
	 Val. Loss: 0.4523 | Val. Acc: 0.8275
Epoch: 09
	Train Loss: 0.8306 | Train Acc: 0.8306
	 Val. Loss: 0.4162 | Val. Acc: 0.8425
Epoch: 10
	Train Loss: 0.8175 | Train Acc: 0.8175
	 Val. Loss: 0.4154 | Val. Acc: 0.8450
Epoch: 11
	Train Loss: 0.8375 | Train Acc: 0.8375
	 Val. Loss: 0.3966 | Val. Acc: 0.8675
Epoch: 12
	Train Loss

We have achieved an accuracy of approx 92% on training set and accuracy of around 66% on validation set. We can improve this by adding more hidden layers and changing the number of neurons. 
Even with 6 hidden layers and reduced and increased learning rate I was able to achieve only 65% accuracy. 
