#### Deep learning with a PyTorch MLP to classify whether a comment is toxic

The area under the ROC curve (AUC) is used to measure the performance of the deep-learning model. AUC is a binary-classification metric that remains informative when the classes are imbalanced. The two classes are "toxic" and "not toxic" comments.


In [None]:
import sys
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.autonotebook import tqdm
from sklearn.metrics import roc_auc_score

In [None]:
# Get the data: `!` runs wget as a shell command from the notebook, and
# -O stores the downloaded archive as train.csv.zip (read by the next cell).
!wget -O train.csv.zip "https://www.dropbox.com/s/xei6z41mfrcnxcd/train.csv.zip?dl=1"

In [None]:
# Read the first 10,000 rows of the zipped CSV, keeping only the comment text
# and its toxicity score.
df = pd.read_csv(
    'train.csv.zip',
    compression='zip',
    usecols=['comment_text', 'target'],
    dtype={'comment_text': str},
    nrows=10000,
)

# Binarise the score: 1 when it is strictly greater than 0.5, otherwise 0.
# NOTE(review): a score of exactly 0.5 becomes 0 here - confirm this matches
# the labelling convention of the dataset.
df['target'] = df['target'].gt(0.5).astype(int)

print("Number of (rows, columns) : {}".format(df.shape))
df.head(5)

In [None]:
# Sequential (unshuffled) 65%/35% split: the leading rows form the training
# set and the remaining rows form the validation set.
indx = 0.65
n_rows = df.shape[0]
split_at = int(indx * n_rows)
train_idx = range(0, split_at)
val_idx = range(split_at, n_rows)

# Comments stay pandas Series (fed to the vectorizer); labels become NumPy arrays.
comments = df['comment_text']
labels = df['target'].values
x_train = comments.iloc[:split_at]
x_valid = comments.iloc[split_at:]
y_train = labels[:split_at]
y_valid = labels[split_at:]

In [None]:
# Bag-of-words features over unigrams and bigrams, capped at 100,000 features.
# The token pattern is a raw string so that `\w` reaches the regex engine
# unchanged instead of being parsed as an (invalid) string escape.
vectorizer = CountVectorizer(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))
# Fit the vocabulary on the training comments only, then reuse it for validation.
X_train_s = vectorizer.fit_transform(x_train)
X_val_s = vectorizer.transform(x_valid)

In [None]:
# Creating a Deep Learning model using MLP with multiple layers
class MyDeepLearning(torch.nn.Module):
    """Multi-layer perceptron: input_dim -> 512 -> 256 -> output_dim.

    Every layer is followed by a sigmoid, so the outputs lie in (0, 1).
    Dropout (p=0.3) is applied after the two hidden activations; it is only
    active in training mode and is the identity after ``model.eval()``.

    Args:
        input_dim: Number of input features per example.
        output_dim: Number of output units per example.
    """

    def __init__(self, input_dim=100000, output_dim=1):
        super().__init__()
        self.layer1 = torch.nn.Linear(input_dim, 512)
        self.act1 = torch.nn.Sigmoid()

        self.layer2 = torch.nn.Linear(512, 256)
        self.act2 = torch.nn.Sigmoid()

        self.layer3 = torch.nn.Linear(256, output_dim)
        self.act3 = torch.nn.Sigmoid()

        # Regularisation to reduce overfitting; shared by both hidden layers.
        self.dropout = torch.nn.Dropout(p=0.3)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network on a batch and return the outputs as a 1-D tensor.

        Args:
            x: Float tensor whose last dimension equals ``input_dim``.

        Returns:
            The sigmoid outputs flattened to one dimension (one value per
            example when ``output_dim`` is 1).
        """
        x = self.dropout(self.act1(self.layer1(x)))
        x = self.dropout(self.act2(self.layer2(x)))
        # No dropout on the output layer: its activations are the predictions.
        x = self.act3(self.layer3(x))
        return x.flatten()

In [None]:
# Training configuration: mini-batch size, the network, its loss and its optimizer.
batch_size = 512

# The input width is taken from the number of columns of the training matrix.
model = MyDeepLearning(input_dim=X_train_s.shape[1])

# Binary cross-entropy between the model outputs and the 0/1 targets.
criterion = torch.nn.BCELoss()

# Adam over all model parameters with a learning rate of 5e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

In [None]:
# defining a custom dataloader
def dataset(x_data, y_data, batch_size, idx):
    """Return mini-batch number ``idx`` as a pair of float tensors.

    The batch boundaries depend only on the arguments, so the function works
    for any data set (training, validation, a single sample).

    Args:
        x_data: Row-sliceable feature matrix; a sparse matrix exposing
            ``toarray()`` is densified, anything else is used as is.
        y_data: Row-sliceable targets aligned with ``x_data``.
        batch_size: Maximum number of rows per batch.
        idx: Zero-based batch index.

    Returns:
        Tuple ``(x, y)`` of float32 tensors holding rows
        ``idx * batch_size`` up to (not including) ``(idx + 1) * batch_size``.
        The last batch may be shorter, and an index past the end yields
        empty tensors.
    """
    start = idx * batch_size
    # Slices clamp at the end of the data, so the final partial batch needs
    # no special case.
    batch_idx = slice(start, start + batch_size)

    rows = x_data[batch_idx]
    if hasattr(rows, "toarray"):
        # Dense ndarray (rather than the np.matrix returned by todense()).
        rows = rows.toarray()

    # Convert to tensor
    x = torch.tensor(rows).float()
    y = torch.tensor(y_data[batch_idx]).float()

    return x, y

In [None]:
# Mini-batches per pass, using ceiling division: when the row count is an
# exact multiple of batch_size this avoids a trailing empty batch (the mean
# loss over zero elements is NaN).
n_train_batches = (X_train_s.shape[0] + batch_size - 1) // batch_size
n_val_batches = (X_val_s.shape[0] + batch_size - 1) // batch_size

for epoch in range(5):
    # Training step
    model.train()
    train_losses = []
    pbar_train = tqdm(range(n_train_batches), total=n_train_batches)
    for step in pbar_train:
        optimizer.zero_grad()  # Zero gradients each step
        # Run through the model, get the loss and make a gradient descent step
        x, y = dataset(X_train_s, y_train, batch_size, step)
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        # Running mean of the per-batch losses seen so far in this epoch
        pbar_train.set_description(f'Epoch {epoch} train loss: {sum(train_losses)/(1+step) :.4f}')

    # Validation step
    model.eval()
    val_losses = []
    preds = []
    pbar_val = tqdm(range(n_val_batches), total=n_val_batches)
    for step in pbar_val:
        x, y = dataset(X_val_s, y_valid, batch_size, step)
        # No gradients are needed for inference or for the validation loss
        with torch.no_grad():
            out = model(x)
            loss = criterion(out, y)
        # Store plain NumPy arrays so they can be concatenated for the AUC
        preds.append(out.cpu().numpy())
        val_losses.append(loss.item())
        pbar_val.set_description(f'Epoch {epoch} valid loss: {sum(val_losses)/(1+step) :.4f}')

    # AUC over the whole validation set, using the predicted probabilities
    y_pred = np.concatenate(preds)
    print(f'Epoch {epoch} valid auc : {roc_auc_score(y_valid, y_pred):.4f}')

In [None]:
# Sanity check on a single comment. Note that it is taken from the training
# split, so this is not a measure of generalisation.
sample_comment = x_train.iloc[14:15]
sample_comment_target = y_train[14:15]
print('sample comment:', sample_comment)

# Vectorise with the already-fitted vocabulary and build a batch of size one
sample_comment_s = vectorizer.transform(sample_comment)
(x, y) = dataset(sample_comment_s, sample_comment_target, 1, 0)

# Put the model in inference mode explicitly instead of relying on the state
# left behind by whichever cell ran last.
model.eval()
with torch.no_grad():
    out = model(x)
print('predicted target:', out)
print('actual target:', sample_comment_target)