In [None]:
import numpy as np
import pandas as pd
import markov
import sys
util_path = './utils'
sys.path.insert(0, util_path)
import util
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score, f1_score, log_loss, recall_score, precision_score 
! pip install scikit-plot
import scikitplot as skplt
import re
import os

from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'

from torch.utils.data import Dataset,DataLoader
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn as nn
import torch.optim as optim
import math

In [None]:
# Read the master dataset and replace every missing value with a single-space string.
data1 = (
    pd.read_csv("./Datasets/master_dataset.csv")
    .fillna(' ')
)

In [None]:
# Build the working dataset: every 'sexist' row plus a random subsample of
# at most 2000 'not sexist' rows.
df = data1[data1.label_sexist == 'sexist']
ndf = data1[data1.label_sexist == 'not sexist']
# A fixed random_state makes the subsample reproducible across kernel restarts;
# capping n at len(ndf) avoids the ValueError pandas raises when asked for
# more rows than exist (sampling without replacement).
add_df = ndf.sample(n=min(2000, len(ndf)), random_state=42)
frames = [add_df,df]
data = pd.concat(frames)

In [None]:
# Add a 'processed_text' column: util.process_text(<text>, model=2) applied to
# each value of the 'text' column.
data["processed_text"] = data["text"].apply(
    lambda raw_text: util.process_text(raw_text, model=2)
)
data.head()

In [None]:
# Hold out 25% of the rows, stratified on the label column. The fixed
# random_state keeps the held-out set identical between runs so that reported
# metrics are comparable.
train_X, test_X, train_Y, test_Y = train_test_split(
    data['processed_text'],
    data['label_sexist'],
    stratify=data['label_sexist'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Each label becomes a list of parts (split on '/'), the list-of-lists form
# passed to the label binarizer later on.
labels = list(map(lambda raw_label: str(raw_label).split('/'), train_Y))
text = train_X

In [None]:
# Fit a TF-IDF vectorizer (minimum document frequency of 5) on the training
# text, fit a multilabel binarizer on the training labels, and split the
# resulting feature/target arrays 80/20 into the sets used for fitting and
# per-epoch validation.

tfidfvectorizer = TfidfVectorizer(min_df = 5) #max_features=3000
# Dense array because it is later converted with torch.tensor.
x_tfidf = tfidfvectorizer.fit_transform(text).toarray()
mlb = MultiLabelBinarizer()
mlb.fit(labels)
Y = mlb.transform(labels)
# Read the widths from .shape instead of indexing row 0.
n_op_features = Y.shape[1]
# Fixed random_state keeps the validation subset stable between runs.
train_x,test_x,train_y,test_y = train_test_split(x_tfidf,Y,test_size=0.2,random_state=42)
n_ip_features = train_x.shape[1]

In [None]:
class TextDataset(Dataset):
  '''Dataset that pairs a feature array with a label array.

  Both inputs are converted to torch tensors once, at construction time;
  indexing returns the (features, label) pair at that position.
  '''

  def __init__(self, X, y):
    # torch.tensor builds new tensors from the given array-likes.
    self.X = torch.tensor(X)
    self.y = torch.tensor(y)

  def __len__(self):
    # The sample count is taken from the label tensor.
    return len(self.y)

  def __getitem__(self,index):
    features = self.X[index]
    target = self.y[index]
    return features, target

In [None]:
# Wrap the train/validation arrays in datasets and batch them (128 per batch,
# reshuffled on every pass) for the training loop.
batch_size = 128
train_ds = TextDataset(train_x, train_y)
test_ds = TextDataset(test_x, test_y)
dataloader_train = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(test_ds, batch_size=batch_size, shuffle=True)

In [None]:
class MLP(nn.Module):
  '''Multi-layered perceptron based classifier.

  Three fully connected layers: num_features -> 64 -> 32 -> out_features.
  '''
  def __init__(self, num_features,out_features):
    """
    Args:
        num_features (int): the size of the input feature vector
        out_features (int): the number of output units (one logit per label)
    """
    super(MLP, self).__init__()
    self.fc1 = nn.Linear(in_features=num_features, out_features=64)
    print("num f:", num_features)
    self.fc2 = nn.Linear(in_features=64,out_features=32)
    self.fc3 = nn.Linear(in_features=32,out_features=out_features)

  def forward(self, x_in, apply_softmax=False):
    """The forward pass of the classifier

    Args:
        x_in (torch.Tensor): an input data tensor.
            x_in.shape should be (batch, num_features)
        apply_softmax (bool): when True, a softmax over the last dimension
            is applied to the logits before returning them. Leave it False
            when the output is fed to a loss that expects raw logits
            (e.g. BCEWithLogitsLoss or the Cross Entropy losses).
    Returns:
        the resulting tensor. tensor.shape should be (batch, out_features)
    """
    y_out_1 = torch.relu(self.fc1(x_in))
    # NOTE(review): there is no non-linearity between fc2 and fc3, so these two
    # layers compose into a single linear map. Left unchanged to keep the
    # trained behaviour; confirm whether an activation was intended here.
    y_out_2 = self.fc2(y_out_1)
    y_out = self.fc3(y_out_2)
    # The flag used to be accepted but silently ignored; honour it now.
    if apply_softmax:
      y_out = torch.softmax(y_out, dim=-1)
    return y_out

In [None]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from markov import Project, ProjectScope

# MarkovML project under which this notebook's experiment is recorded.
# Only `name` is required here; description and scope are optional
# (the scope is public by default).
my_project = Project(
    name="Final Project AI4SG",
    description="Visualizing the model-1 and model-2",
    project_scope=ProjectScope.PUBLIC,
)

In [None]:
# Training configuration, model/optimizer construction and experiment recorder.
learning_rate=0.0001
num_epochs=45
# Single source of truth: derived from the variables above so the values
# recorded with the experiment cannot drift from the ones used for training.
hyper_parameters = {"learning_rate":learning_rate,"num_epochs":num_epochs}

# Per-epoch metric histories, filled by the training loop and plotted afterwards.
epoch_loss_list=[]
epoch_acc_list=[]
val_epoch_acc_list=[]
val_epoch_loss_list=[]

model = MLP(n_ip_features,n_op_features)
model.to(device)
# The model returns raw logits, so use the loss that applies the sigmoid itself.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Number of training batches per epoch.
n_iter=math.ceil(len(train_ds)/batch_size)


# Use the ExperimentRecorder constructor provided by the MarkovML SDK
# to create a new experiment recorder
recorder = markov.ExperimentRecorder(
    # Name of the experiment recording
    name="Model 2 TFIDF MLP experiment",
    # Project associated with the experiment
    project_id=my_project.project_id,
    # Hyper-parameters used for model training
    hyper_parameters=hyper_parameters,
    model_class=markov.ModelClass.TAGGING,
    # Additional notes (optional)
    notes="This is a experiment describing the model-2"
)

In [None]:
# Train for num_epochs epochs. Each epoch runs one optimisation pass over
# dataloader_train and one evaluation pass over dataloader_test, then prints
# and records the rounded loss and exact-match accuracy of both passes.
losses = []
with recorder:
    for epoch in range(num_epochs):
      epoch_loss = 0
      epoch_acc=0
      val_epoch_loss=0
      val_epoch_acc=0

      # ---- training pass ----
      model.train()
      for k,(X,y) in enumerate(dataloader_train):
        # step 1. load the data
        X = X.to(device)
        y = y.to(device)
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = model(x_in=X.float())
        # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
        y_1 = (y_pred.to('cpu').detach().numpy() >= 0)*1
        y_0 = y.to('cpu').detach().numpy()
        # A sample counts as correct only if every label matches.
        acc = (y_0 == y_1).all(axis=1).sum()
        epoch_acc+= acc

        # step 3. compute the loss
        loss = loss_func(y_pred, y.squeeze(1).float())
        epoch_loss+= loss.item()

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()

      # Loss is averaged over batches, accuracy over samples.
      epoch_loss = round(epoch_loss/(k+1),3)
      epoch_loss_list.append(epoch_loss)
      epoch_acc = round(epoch_acc/len(train_ds),3)
      epoch_acc_list.append(epoch_acc)

      # ---- validation pass ----
      # No parameters are updated here, so switch to eval mode and disable
      # autograd to avoid building a graph for every validation batch.
      model.eval()
      with torch.no_grad():
        for k,(X,y) in enumerate(dataloader_test):
          X = X.to(device)
          y = y.to(device)
          y_pred = model(x_in=X.float())
          y_1 = (y_pred.to('cpu').detach().numpy() >= 0)*1
          y_0 = y.to('cpu').detach().numpy()
          val_acc = (y_0 == y_1).all(axis=1).sum()
          val_epoch_acc+=val_acc
          loss = loss_func(y_pred, y.squeeze(1).float())
          val_epoch_loss+= loss.item()

      val_epoch_acc=round(val_epoch_acc/len(test_ds),3)
      val_epoch_acc_list.append(val_epoch_acc)
      val_epoch_loss = round(val_epoch_loss/(k+1),3)
      val_epoch_loss_list.append(val_epoch_loss)
      print('epoch : ' + str(epoch+1)+'/'+str(num_epochs))
      print("-"*40)
      print('loss : ' + str(epoch_loss)+ ' \t val loss : '+ str(val_epoch_loss)+ '\nacc :' + str(epoch_acc)+ ' \t val acc :' + str(val_epoch_acc))
      print("+"*40)  # -----------------------------------------
      losses.append(epoch_loss)
      recorder.add_record({"loss": epoch_loss})
      recorder.add_record({"val_loss": val_epoch_loss})
      recorder.add_record({"accuracy": epoch_acc})
      recorder.add_record({"val_accuracy": val_epoch_acc})

In [None]:
def plot_graph(plot_var,train_plot_list,val_plot_list):
  '''Plot a per-epoch training curve against its validation curve.

  Args:
      plot_var: name of the plotted metric. "accuracy" and "loss" get a
          title and are saved as PNG files under ./result/model2/; any other
          value is plotted without a title and is not saved.
      train_plot_list: one training value per epoch.
      val_plot_list: one validation value per epoch.

  Returns:
      None
  '''
  import os  # local import keeps the function self-contained

  titles = {
      "accuracy": "Train/Validation Accuracy",
      "loss": "Train/Validation Loss",
  }
  save_paths = {
      "accuracy": "./result/model2/task_a_train_Val_accuracy.png",
      "loss": "./result/model2/task_a_train_Val_loss.png",
  }

  # Epochs are numbered from 1 on the x axis.
  epoch_numbers = list(np.arange(len(train_plot_list)) + 1)

  plt.figure(figsize=(8,6))
  if plot_var in titles:
    plt.title(titles[plot_var])
  plt.plot(epoch_numbers, train_plot_list, label='train')
  plt.plot(epoch_numbers, val_plot_list, label='validation')
  plt.xlabel('num_epochs', fontsize=12)
  # Label the y axis with the plotted metric (it used to read 'loss' even
  # on the accuracy figure).
  plt.ylabel(str(plot_var), fontsize=12)
  plt.legend(loc='best')
  if plot_var in save_paths:
    target = save_paths[plot_var]
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    plt.savefig(target)
  return

In [None]:
# Draw (and save) the train-vs-validation curves for both recorded metrics.
plot_graph(
    plot_var="accuracy",
    train_plot_list=epoch_acc_list,
    val_plot_list=val_epoch_acc_list,
)
plot_graph(
    plot_var="loss",
    train_plot_list=epoch_loss_list,
    val_plot_list=val_epoch_loss_list,
)

In [None]:
# Convert the validation features to a torch tensor and run one forward pass
# to compare output and input shapes.
test_x = torch.Tensor(test_x)
# Inference only: skip autograd so no graph is built over the whole set.
with torch.no_grad():
    y_pred = model(test_x.to(device))
print(y_pred.shape, test_x.shape)

In [None]:
def multilabel_predict(x):
  '''Predict the label of a single text.

  The text is vectorised with the fitted TF-IDF vectorizer, passed through
  the model, and the highest-scoring output is mapped back to its label
  name via the fitted multilabel binarizer.

  Args:
      x (str): one (already pre-processed) text.

  Returns:
      list: the result of mlb.inverse_transform for the one-hot prediction,
      i.e. a one-element list holding a tuple with the predicted label.
  '''
  features = tfidfvectorizer.transform([x]).toarray()
  inputs = torch.tensor(features, dtype=torch.float32).to(device)
  # Inference only: no gradients are needed.
  with torch.no_grad():
    pred = model(x_in=inputs)
  scores = pred.to('cpu').numpy()
  # One-hot row for the arg-max class, sized from the model output rather
  # than assuming exactly two classes.
  one_hot = np.zeros(scores.shape, dtype=int)
  one_hot[0, scores[0].argmax()] = 1
  return mlb.inverse_transform(one_hot)

In [None]:
# Predict a label for every held-out text (used for the confusion matrix and
# the evaluation report).
test_utterences= test_X
predicted_relations=[]
for utterence in test_utterences:
  test_pred=multilabel_predict(utterence)
  # multilabel_predict returns [(label,)] for a single text; unwrap it to the
  # plain label string so predictions have the same form as the true labels
  # (and are written to CSV as e.g. sexist rather than "('sexist',)").
  predicted_relations.append(test_pred[0][0])
print(len(predicted_relations))

In [None]:
# Confusion matrix of the true held-out labels against the predicted ones,
# saved as a PNG.
y_true = list(test_Y)
y_pred = list(predicted_relations)
skplt.metrics.plot_confusion_matrix(
    y_true,
    y_pred,
    figsize=(8,8),
    x_tick_rotation=90,
)
plt.savefig("./result/model2/task_a_confusion_matrix.png")

In [None]:
# Print the text classification report, then store the same report as a
# table (one row per report entry) in a CSV file.
print(classification_report(y_true, y_pred))
clsf_report = pd.DataFrame(
    classification_report(y_true, y_pred, output_dict=True)
).T
clsf_report.to_csv(
    "./result/model2/task_a_classification_report.csv",
    index=True,
)

In [None]:
# Collect true and predicted labels side by side and write them to the CSV
# that is read back when uploading evaluation records.
dict_data = {
    'actual': y_true,
    'predicted': y_pred,
}
df_markov = pd.DataFrame(dict_data)
df_markov.to_csv("./result/model2/recording_model2.csv")
df_markov.head()

# TODO: map 'not sexist' to 0 and 'sexist' to 1.

In [None]:
# Persist the model's state dict (its parameters) to disk.
torch.save(
    obj=model.state_dict(),
    f="./result/model2/trained_model2.pth",
)

In [None]:
# Restore the saved parameters. map_location places the loaded tensors on the
# device chosen for this session, so a checkpoint written on a GPU machine
# can still be loaded where CUDA is unavailable.
model.load_state_dict(
    torch.load('./result/model2/trained_model2.pth', map_location=device)
)

In [None]:
def predict_sexism(x):
  '''Predict the label of a single text using the saved checkpoint.

  The model parameters are reloaded from
  ./result/model2/trained_model2.pth on every call, then the text is
  vectorised with the fitted TF-IDF vectorizer and the highest-scoring
  output is mapped back to its label name.

  Args:
      x (str): one text.

  Returns:
      list: the result of mlb.inverse_transform for the one-hot prediction,
      i.e. a one-element list holding a tuple with the predicted label.
  '''
  features = tfidfvectorizer.transform([x]).toarray()
  inputs = torch.tensor(features, dtype=torch.float32).to(device)
  # map_location keeps the load working when the checkpoint was written on a
  # different device than the one used now.
  # NOTE(review): reloading from disk on every call is slow when predicting
  # in a loop; kept because callers may rely on the checkpoint being used.
  model.load_state_dict(
      torch.load('./result/model2/trained_model2.pth', map_location=device)
  )
  # Inference only: no gradients are needed.
  with torch.no_grad():
    pred = model(x_in=inputs)
  scores = pred.to('cpu').numpy()
  # One-hot row for the arg-max class, sized from the model output rather
  # than assuming exactly two classes.
  one_hot = np.zeros(scores.shape, dtype=int)
  one_hot[0, scores[0].argmax()] = 1
  return mlb.inverse_transform(one_hot)

In [None]:
# Example input for a single-text prediction check.
t1 = 'women are weak and inferior'

In [None]:
# Run the saved model on the example text and show the predicted label.
test_pred = predict_sexism(x=t1)
print(test_pred)

In [None]:
from markov import EvaluationRecorder

# Evaluation recorder tied to the model of the experiment recorded above.
evaluation_recorder = EvaluationRecorder(
    name="Evaluating {}".format(recorder.name),
    notes="Evaluation of model2 using MarkovML",
    model_id=recorder.model_id,
)

evaluation_recorder.register()

In [None]:
import ast
import csv
import os
import uuid

from markov.api.schemas.model_recording import SingleTagInferenceRecord

# Read the actual/predicted CSV back and add one inference record per row to
# the evaluation recorder.
filepath = os.path.join('./result/model2/recording_model2.csv')
with open(filepath, newline='') as f:
    # csv.reader handles quoted fields that contain commas, which a plain
    # split(',') does not.
    reader = csv.reader(f)
    for row_number, tokens in enumerate(reader):
        # The file is written by DataFrame.to_csv with a header row
        # (',actual,predicted'); skip it, since float('') would raise.
        if row_number == 0 and tokens[1:3] == ['actual', 'predicted']:
            continue
        # Ignore blank or truncated lines.
        if len(tokens) < 3:
            continue

        inferred = tokens[2]
        # Predictions may have been stored as the repr of a 1-tuple, e.g.
        # "('sexist',)"; unwrap that to the plain label.
        if inferred.startswith('('):
            try:
                parsed = ast.literal_eval(inferred)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, tuple) and parsed:
                inferred = str(parsed[0])

        # Assign a unique identifier for individual records
        record_id = str(uuid.uuid4())
        record = SingleTagInferenceRecord(
            urid=record_id,
            inferred=inferred,
            actual=tokens[1],
            # NOTE(review): the first CSV column is the DataFrame row index,
            # not a model confidence; the file has no score column. Kept as
            # in the original; replace with a real score if one is available.
            score=float(tokens[0])
        )
        evaluation_recorder.add_record(record)
outcome = evaluation_recorder.finish()
print(outcome)