Notebook Link: https://colab.research.google.com/drive/1oBxzMsdIsYLikfExtrFd7T9vOvaCLaa_?usp=sharing

codalab username: shashwat.pandey

Student ID: spandey7

## **Importing Libraries**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset,DataLoader
import torch
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np


In [None]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else torch.device("cpu")

## **Loading Dataset**

In [None]:
# Read the train and test CSVs; the first column of each file is used as the index.
data = pd.read_csv('/content/drive/MyDrive/hw1_train-1.csv', index_col=0)
test_data = pd.read_csv('/content/drive/MyDrive/hw1_test-2.csv', index_col=0)

# Give the columns fixed names regardless of the headers stored in the files.
data.columns = ["text", "labels"]
test_data.columns = ["text"]

# Blank out the literal "none" marker and any missing values, so that such
# rows end up with an empty label list below.
data["labels"] = data["labels"].str.replace("none", "").replace(np.nan, "")

text = data['text']
# One list of whitespace-separated label tokens per training row.
labels = [str(row_labels).split() for row_labels in data['labels']]

## **Dataset Preparation**

In [None]:
# Vectorise every utterance with TF-IDF, keeping at most 3000 features.
tfidfvectorizer = TfidfVectorizer(max_features=3000)
x_tfidf = tfidfvectorizer.fit_transform(text).toarray()

# Encode each label list as a multi-hot row (one column per distinct label).
mlb = MultiLabelBinarizer()
Y = mlb.fit(labels).transform(labels)

# Hold out 1% of the rows; the split is random and not seeded.
train_x, test_x, train_y, test_y = train_test_split(x_tfidf, Y, test_size=0.01)

# Widths of the feature and label vectors, passed to the model constructor.
n_ip_features = len(train_x[0])
n_op_features = len(Y[0])

In [None]:
class MovieDataset(Dataset):
    """Pairs feature rows with label rows and serves them as torch tensors."""

    def __init__(self, X, y):
        """Copy the array-like ``X`` (features) and ``y`` (labels) into tensors."""
        self.X = torch.tensor(X)
        self.y = torch.tensor(y)

    def __len__(self):
        """Return the number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(features, labels)`` pair stored at ``index``."""
        features = self.X[index]
        targets = self.y[index]
        return features, targets

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 16

# Wrap the training and held-out arrays as tensor datasets.
train_ds = MovieDataset(train_x, train_y)
test_ds = MovieDataset(test_x, test_y)

# Both loaders reshuffle their samples on every pass.
dataloader_train = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(test_ds, batch_size=batch_size, shuffle=True)

## **Model and Training**

In [None]:
class MLP(nn.Module):
    """A multi-layer perceptron classifier with a single hidden layer.

    The network is ``Linear -> ReLU -> Linear`` and returns one raw score
    (logit) per output label unless asked to squash them.
    """

    def __init__(self, num_features, out_features, hidden_features=64):
        """
        Args:
            num_features (int): the size of the input feature vector
            out_features (int): the number of output labels (one logit each)
            hidden_features (int): the width of the hidden layer
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=hidden_features)
        print("num f:", num_features)
        self.fc2 = nn.Linear(in_features=hidden_features, out_features=out_features)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier

        Args:
            x_in (torch.Tensor): an input data tensor.
                x_in.shape should be (batch, num_features)
            apply_softmax (bool): when True, an element-wise sigmoid is
                applied to the outputs so they become per-label probabilities
                (the name is historical; no softmax is involved).
                Must stay False when the output is fed to a loss that expects
                logits, such as ``nn.BCEWithLogitsLoss``.
        Returns:
            the resulting tensor. tensor.shape is (batch, out_features)
        """
        y_out = torch.relu(self.fc1(x_in))
        y_out = self.fc2(y_out)
        if apply_softmax:
            # Previously the flag was accepted but silently ignored.
            y_out = torch.sigmoid(y_out)
        return y_out

In [None]:
# Hyper-parameters.
learning_rate = 0.01
num_epochs = 100

# Per-epoch history of the (rounded) training / validation metrics.
epoch_loss_list = []
epoch_acc_list = []
val_epoch_acc_list = []
val_epoch_loss_list = []

model = MLP(n_ip_features, n_op_features)
model.to(device)
# The model emits raw logits, so the sigmoid is folded into the loss.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Number of mini-batches per training epoch (printed for information only).
n_iter = math.ceil(len(train_ds) / batch_size)
print(n_iter)

losses = []
from sklearn.metrics import accuracy_score


def _count_exact_matches(logits, targets):
    """Return how many rows of `logits` predict every label of `targets` correctly.

    A label counts as predicted when its logit is >= 0. A row only counts as
    correct when all of its labels match (exact-match accuracy).
    """
    predicted = logits.detach() >= 0
    return int((predicted == targets.bool()).all(dim=1).sum().item())


for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for X, y in dataloader_train:
        # step 1. load the data
        X = X.to(device)
        y = y.to(device)
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = model(x_in=X.float())
        epoch_acc += _count_exact_matches(y_pred, y)

        # step 3. compute the loss
        # The targets already have the same (batch, n_labels) shape as the
        # logits; squeezing dim 1 would break the single-label case.
        loss = loss_func(y_pred, y.float())
        epoch_loss += loss.item()

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
    # Loss is averaged over batches, accuracy over samples.
    epoch_loss = round(epoch_loss / max(len(dataloader_train), 1), 3)
    epoch_loss_list.append(epoch_loss)
    epoch_acc = round(epoch_acc / max(len(train_ds), 1), 3)
    epoch_acc_list.append(epoch_acc)

    # ---------------- validation pass ----------------
    model.eval()
    val_epoch_loss = 0
    val_epoch_acc = 0
    # No gradients are needed here; skipping them saves memory and time.
    with torch.no_grad():
        for X, y in dataloader_test:
            X = X.to(device)
            y = y.to(device)
            y_pred = model(x_in=X.float())
            val_epoch_acc += _count_exact_matches(y_pred, y)
            val_epoch_loss += loss_func(y_pred, y.float()).item()
    val_epoch_acc = round(val_epoch_acc / max(len(test_ds), 1), 3)
    val_epoch_acc_list.append(val_epoch_acc)
    val_epoch_loss = round(val_epoch_loss / max(len(dataloader_test), 1), 3)
    val_epoch_loss_list.append(val_epoch_loss)

    print('epoch : ' + str(epoch+1)+'/'+str(num_epochs))
    print("-"*40)
    print('loss : ' + str(epoch_loss)+ ' \t val loss : '+ str(val_epoch_loss)+ '\nacc :' + str(epoch_acc)+ ' \t val acc :' + str(val_epoch_acc))
    print("+"*40)  # -----------------------------------------
    losses.append(epoch_loss)

In [None]:
# Serialise the entire trained model object to the file 'MLP-TFIDF'.
torch.save(obj=model, f='MLP-TFIDF')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_graph(plot_var,train_plot_list,val_plot_list):
    """Plot a per-epoch training curve against its validation curve and save it.

    Args:
        plot_var (str): "accuracy" or "loss". Selects the title, the y-axis
            label and the output file ('Train_Val_accuracy.png' or
            'Train_Val_loss.png'). Any other value still draws the curves
            but sets no title and saves nothing.
        train_plot_list (list): one training value per epoch.
        val_plot_list (list): one validation value per epoch; must have the
            same length as `train_plot_list`.

    Returns:
        None. The figure is left open so a notebook can still display it.
    """
    is_accuracy = plot_var == "accuracy"
    is_loss = plot_var == "loss"
    # Epochs are numbered from 1 on the x-axis.
    epoch_numbers = list(range(1, len(train_plot_list) + 1))

    plt.figure(figsize=(8,6))
    if is_accuracy:
        plt.title("Train/Validation Accuracy")
    elif is_loss:
        plt.title("Train/Validation Loss")
    plt.plot(epoch_numbers, train_plot_list, label='train')
    plt.plot(epoch_numbers, val_plot_list, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    # The y-axis used to read 'loss' even on the accuracy plot.
    plt.ylabel('accuracy' if is_accuracy else 'loss', fontsize=12)
    plt.legend(loc='best')
    if is_accuracy:
        plt.savefig('Train_Val_accuracy.png')
    elif is_loss:
        plt.savefig("Train_Val_loss.png")
    return

In [None]:
    # Draw and save the accuracy curves first, then the loss curves.
    plot_graph(plot_var="accuracy", train_plot_list=epoch_acc_list, val_plot_list=val_epoch_acc_list)
    plot_graph(plot_var="loss", train_plot_list=epoch_loss_list, val_plot_list=val_epoch_loss_list)

## **Predictions**

In [None]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints that you created yourself.
# NOTE: recent PyTorch releases default to weights_only=True, which rejects a
# fully pickled module; pass weights_only=False there if this call fails.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
model = torch.load('MLP-TFIDF', map_location="cuda" if torch.cuda.is_available() else "cpu")
def multilabel_predict(x, threshold=0.9):
    """Predict the labels of a single utterance.

    Args:
        x (str): the raw utterance text.
        threshold (float): a label is predicted when its raw model output
            (a logit, not a probability) is >= this value. Note that the
            training loop measures accuracy with a logit cut-off of 0.

    Returns:
        tuple: the predicted label names; empty when no logit reaches
        `threshold`.
    """
    features = tfidfvectorizer.transform([x]).toarray()
    # Run on whichever device the model lives on instead of assuming CUDA.
    model_device = next(model.parameters()).device
    inputs = torch.tensor(features, dtype=torch.float32, device=model_device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pred = model(x_in=inputs)
    y_1 = (pred.cpu().numpy() >= threshold)*1
    y_1 = mlb.inverse_transform(y_1)
    return y_1[0]

In [None]:
test_utterences = test_data['text']
# One space-separated, alphabetically sorted label string per test utterance.
predicted_relations = []
for utterance in test_utterences:
    test_pred = list(multilabel_predict(utterance))
    # 'none' only makes sense on its own; drop it when real relations were
    # predicted alongside it.
    if len(test_pred) > 1 and 'none' in test_pred:
        test_pred.remove('none')
    # Always append, using an empty string when nothing was predicted, so
    # that entry i keeps describing utterance i. Skipping empty predictions
    # would shift every following row.
    predicted_relations.append((' ').join(sorted(test_pred)))


In [None]:
# One sequential ID per prediction, starting at 0.
id_list = list(range(len(predicted_relations)))
final = pd.DataFrame({'ID': id_list, 'CORE RELATIONS': predicted_relations})

# Remove the literal "none" marker and any missing values from the relations column.
final["CORE RELATIONS"] = final["CORE RELATIONS"].str.replace("none", "").replace(np.nan, "")

# Write the submission file without the DataFrame index.
final.to_csv('submission.csv', index=None)