In [None]:
import re
import nltk
from argparse import Namespace
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
import os
from google.colab import drive
from argparse import Namespace
import torch
import torch.optim as optim
from tqdm import tqdm_notebook, tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import torch.nn.functional as F
import math

# Mount Google Drive at /content/drive (Colab-only: `drive` is imported from google.colab).
drive.mount('/content/drive')
# Location of the CSV, relative to the current working directory.
# Preprocessing.__init__ reads this module-level name.
path = 'drive/My Drive/blogtext_preprocessed.csv'
# blog_data = pd.read_csv(path).iloc[:10000,]
# Fetch the NLTK 'punkt' package (skipped by NLTK when already up to date).
nltk.download('punkt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Define Functions and Classes

In [None]:
class Preprocessing:
    """Load the blog CSV, split it into train/test and turn text into padded id sequences.

    Required fields on ``args``:
        max_len:   length every token sequence is padded/truncated to.
        max_words: vocabulary cap passed to the Keras ``Tokenizer``.

    Optional fields on ``args`` (the defaults reproduce the previous hard-coded behaviour):
        data_path: CSV location; falls back to the module-level ``path``.
        n_rows:    number of leading rows to read (default 10000, ``None`` reads everything).
        test_size: fraction held out for testing (default 0.25).
        seed:      ``random_state`` for the split (default ``None``, i.e. numpy's global RNG).
    """

    def __init__(self, args):
        data_path = getattr(args, 'data_path', None)
        # Only touch the notebook-level global when no explicit path was supplied.
        self.data = path if data_path is None else data_path
        self.max_len = args.max_len
        self.max_words = args.max_words
        self.test_size = getattr(args, 'test_size', 0.25)
        self.n_rows = getattr(args, 'n_rows', 10000)
        self.seed = getattr(args, 'seed', None)

    def load_data(self):
        """Read the CSV and populate ``x_train``, ``x_test``, ``y_train``, ``y_test``."""
        # nrows stops parsing early instead of loading the full file and slicing it afterwards.
        df = pd.read_csv(self.data, nrows=self.n_rows)
        df = (
            df
            # The saved index column is dropped when present; its absence is not an error.
            .drop(columns=['Unnamed: 0'], errors='ignore')
            .dropna()
            .reset_index(drop=True)
            # Encode the label: male -> 1, female -> 0 (other values are left untouched).
            .replace({'gender': {'male': 1, 'female': 0}})
        )

        X = df['text'].values
        Y = df['gender'].values

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            X, Y, test_size=self.test_size, random_state=self.seed
        )

    def prepare_tokens(self):
        """Fit the tokenizer on the training texts only; requires ``load_data`` first."""
        self.tokens = Tokenizer(num_words=self.max_words)
        self.tokens.fit_on_texts(self.x_train)

    def sequence_to_token(self, x):
        """Convert texts ``x`` to id sequences padded/truncated to ``max_len``."""
        sequences = self.tokens.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, maxlen=self.max_len)
  
class DatasetMaper(Dataset):
    """Pair up samples and labels so a ``DataLoader`` can index and batch them."""

    def __init__(self, x, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.x, self.y = x, y

    def __len__(self):
        # The dataset size is the number of samples in ``x``.
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        # Return the (sample, label) pair at position ``idx``.
        sample = self.x[idx]
        label = self.y[idx]
        return sample, label

class TextClassifier(nn.Module):
  """Binary text classifier: embedding -> four parallel Conv1d/MaxPool1d branches -> linear -> sigmoid.

  ``params`` must provide ``max_len``, ``max_words``, ``embedding_size``,
  ``out_size`` and ``stride``.

  Each ``Conv1d`` is built with ``in_channels = max_len``, so the sequence
  positions act as channels and the kernels slide along the embedding axis.
  This is why ``in_features_fc`` starts from ``embedding_size``.
  """

  def __init__(self, params):
    super(TextClassifier, self).__init__()

    # Parameters regarding text preprocessing
    self.seq_len = params.max_len
    self.num_words = params.max_words
    self.embedding_size = params.embedding_size

    # Dropout definition
    self.dropout = nn.Dropout(0.25)

    # Kernel sizes of the four parallel branches
    self.kernel_1 = 2
    self.kernel_2 = 3
    self.kernel_3 = 4
    self.kernel_4 = 5

    # Number of output channels for each convolution
    self.out_size = params.out_size
    # Stride shared by every convolution and pooling layer
    self.stride = params.stride

    # Embedding layer; index 0 is reserved for padding
    self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)

    # Convolution layers
    self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
    self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
    self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
    self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

    # Max pooling layers (same kernel size and stride as the matching convolution)
    self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
    self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
    self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
    self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

    # Fully connected layer producing a single logit
    self.fc = nn.Linear(self.in_features_fc(), 1)

  def _pooled_length(self, kernel):
    """Length left after one convolution and one max-pool with ``kernel`` (no padding, dilation 1)."""
    conv_len = (self.embedding_size - (kernel - 1) - 1) // self.stride + 1
    return (conv_len - (kernel - 1) - 1) // self.stride + 1

  def in_features_fc(self):
    """Return the size of the flattened, concatenated branch outputs fed to ``fc``."""
    kernels = (self.kernel_1, self.kernel_2, self.kernel_3, self.kernel_4)
    return sum(self._pooled_length(k) for k in kernels) * self.out_size

  def forward(self, x):
    """Return one probability per sample, shape ``(batch,)``.

    ``x`` holds integer token ids; it is expected to be ``(batch, max_len)``
    so that the embedded tensor matches the convolutions' ``in_channels``.
    """
    # Token ids -> embedding vectors
    x = self.embedding(x)

    # Every branch sees the same embedded input: convolution -> ReLU -> max-pool
    branches = (
      (self.conv_1, self.pool_1),
      (self.conv_2, self.pool_2),
      (self.conv_3, self.pool_3),
      (self.conv_4, self.pool_4),
    )
    pooled = [pool(torch.relu(conv(x))) for conv, pool in branches]

    # Concatenate the branches along the length axis and flatten per sample
    union = torch.cat(pooled, 2)
    union = union.reshape(union.size(0), -1)

    # Regularise the features, not the logit: dropping the single output unit
    # would randomly force the prediction to sigmoid(0) = 0.5 during training.
    union = self.dropout(union)

    # Linear layer followed by the sigmoid activation
    out = torch.sigmoid(self.fc(union))

    # Drop only the trailing unit dimension so a batch of one keeps shape (1,)
    return out.squeeze(-1)

def train(model, x_train, y_train, x_test, y_test, params):
    """Train ``model`` with RMSprop and binary cross-entropy, printing metrics every epoch.

    Args:
        model: classifier returning one probability per sample.
        x_train, x_test: padded token-id arrays.
        y_train, y_test: 0/1 labels aligned with the corresponding ``x`` array.
        params: object exposing ``batch_size``, ``learning_rate`` and ``epochs``.

    Returns:
        None; the model is updated in place.
    """
    # Wrap the arrays (local names chosen so they do not shadow this function)
    train_set = DatasetMaper(x_train, y_train)
    test_set = DatasetMaper(x_test, y_test)

    # No shuffling: the training predictions must stay aligned with y_train
    loader_train = DataLoader(train_set, batch_size=params.batch_size)
    loader_test = DataLoader(test_set, batch_size=params.batch_size)

    # Define optimizer
    optimizer = optim.RMSprop(model.parameters(), lr=params.learning_rate)

    for epoch in range(params.epochs):
        # Set model in training mode
        model.train()
        predictions = []

        for x_batch, y_batch in loader_train:
            # The embedding layer expects integer (long) indices
            x_batch = x_batch.type(torch.LongTensor)
            y_batch = y_batch.type(torch.FloatTensor)

            # Forward pass; reshape keeps a 1-D output even for a batch of one
            y_pred = model(x_batch).reshape(-1)

            # Loss calculation
            loss = F.binary_cross_entropy(y_pred, y_batch)

            # Reset, compute and apply gradients
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save predictions
            predictions.extend(y_pred.detach().tolist())

        # Evaluation phase; flatten in case predictions come back grouped per batch
        test_predictions = [
            float(value)
            for batch_pred in evaluation(model, loader_test)
            for value in np.atleast_1d(batch_pred)
        ]

        # Metrics calculation (the reported loss is that of the last training batch)
        train_accuary = calculate_accuray(y_train, predictions)
        test_accuracy = calculate_accuray(y_test, test_predictions)
        print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))

def calculate_accuray(grand_truth, predictions):
    """Return the fraction of correct predictions at a 0.5 decision threshold.

    A prediction is correct when it is ``>= 0.5`` and the label is 1, or
    ``< 0.5`` and the label is 0. Any other label value counts as incorrect.

    Args:
        grand_truth: sequence of 0/1 labels.
        predictions: sequence of probabilities aligned with ``grand_truth``.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when ``grand_truth`` is empty
        (previously a ``ZeroDivisionError``).
    """
    total = len(grand_truth)
    if total == 0:
        return 0.0

    # True positives and true negatives counted in a single pass
    correct = sum(
        1
        for true, pred in zip(grand_truth, predictions)
        if (pred >= 0.5 and true == 1) or (pred < 0.5 and true == 0)
    )
    return correct / total

def evaluation(model, loader_test):
    """Run ``model`` over ``loader_test`` and return its predictions.

    Args:
        model: module returning one probability per sample.
        loader_test: iterable of ``(x_batch, y_batch)`` pairs; labels are ignored.

    Returns:
        A flat list of Python floats, one per sample in loader order, whatever
        the batch size. Earlier, one array per batch was appended, which only
        lined up with the labels when the batch size was 1.
    """
    # Set the model in evaluation mode (disables dropout)
    model.eval()
    predictions = []

    # No gradients are needed while evaluating
    with torch.no_grad():
        for x_batch, _ in loader_test:
            # Feed the long-cast tensor; the original cast it but passed the uncast batch
            x = x_batch.type(torch.LongTensor)
            y_pred = model(x)
            # reshape(-1) also covers a 0-dim output produced by a batch of one
            predictions.extend(y_pred.detach().reshape(-1).tolist())
    return predictions

Modelling

In [None]:
args = Namespace(
    max_len = 1000,
    max_words = 300,
    learning_rate = 0.001,
    batch_size = 64,
    embedding_size = 128,
    out_size = 32,
    stride = 2,
    epochs = 4
)

# Fix the random state so that Restart & Run All reproduces the same numbers:
# train_test_split draws from numpy's global RNG when no random_state is given,
# and torch's RNG drives weight initialisation and dropout.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the data, fit the tokenizer on the training texts and build padded id sequences
preprocessing = Preprocessing(args)
preprocessing.load_data()
preprocessing.prepare_tokens()
x_train = preprocessing.sequence_to_token(preprocessing.x_train)
x_test = preprocessing.sequence_to_token(preprocessing.x_test)
y_train = preprocessing.y_train
y_test = preprocessing.y_test

training_set = DatasetMaper(x_train, y_train)
test_set = DatasetMaper(x_test, y_test)
# No shuffling: the collected training predictions must stay aligned with y_train
loader_train = DataLoader(training_set, batch_size=args.batch_size)
# Default batch size of 1, so evaluation yields one prediction per test label
loader_test = DataLoader(test_set)

model = TextClassifier(args)

# Define optimizer
optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate)

# Training phase
for epoch in tqdm(range(args.epochs)):
    # Set model in training mode
    model.train()
    predictions = []

    for x_batch, y_batch in loader_train:
        x_batch = x_batch.type(torch.LongTensor)
        y_batch = y_batch.type(torch.FloatTensor)

        # Feed the model
        y_pred = model(x_batch)

        # Loss calculation
        loss = F.binary_cross_entropy(y_pred, y_batch)

        # Reset gradients
        optimizer.zero_grad()

        # Gradients calculation
        loss.backward()

        # Parameter update
        optimizer.step()

        # Save predictions
        predictions += list(y_pred.detach().numpy())

    # Evaluation phase
    test_predictions = evaluation(model, loader_test)

    # Metrics calculation (the reported loss is that of the last training batch)
    train_accuary = calculate_accuray(y_train, predictions)
    test_accuracy = calculate_accuray(y_test, test_predictions)
    print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))

 25%|██▌       | 1/4 [01:09<03:27, 69.14s/it]

Epoch: 1, loss: 0.66120, Train accuracy: 0.59196, Test accuracy: 0.60983


 50%|█████     | 2/4 [02:18<02:18, 69.18s/it]

Epoch: 2, loss: 0.52192, Train accuracy: 0.67790, Test accuracy: 0.66910


 75%|███████▌  | 3/4 [03:28<01:09, 69.51s/it]

Epoch: 3, loss: 0.65159, Train accuracy: 0.72243, Test accuracy: 0.64515


100%|██████████| 4/4 [04:38<00:00, 69.67s/it]

Epoch: 4, loss: 0.47873, Train accuracy: 0.75572, Test accuracy: 0.67073





Hyperparameter Tuning

In [None]:
import itertools

# Same seed for every configuration: without it each run gets a different random
# train/test split and weight initialisation, so the accuracies are not comparable.
SEED = 42

best_acc = 0
best_param_combination = []

# Grid over [max_len, max_words, embedding_size]
parameter_grid = [list(combo) for combo in itertools.product([300,500,1000],[100,200,300],[32,64,128])]

# NOTE(review): the winning configuration is chosen on test accuracy, so the reported
# best score is optimistic; a separate validation split would avoid that.
for param_combination in tqdm(parameter_grid):
    max_len, max_words, embedding_size = param_combination
    args = Namespace(
        max_len = max_len,
        max_words = max_words,
        learning_rate = 0.001,
        batch_size = 64,
        embedding_size = embedding_size,
        out_size = 32,
        stride = 2,
        epochs = 4
    )

    # train_test_split falls back to numpy's global RNG; torch's RNG drives init and dropout
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    preprocessing = Preprocessing(args)
    preprocessing.load_data()
    preprocessing.prepare_tokens()
    x_train = preprocessing.sequence_to_token(preprocessing.x_train)
    x_test = preprocessing.sequence_to_token(preprocessing.x_test)
    y_train = preprocessing.y_train
    y_test = preprocessing.y_test

    training_set = DatasetMaper(x_train, y_train)
    test_set = DatasetMaper(x_test, y_test)
    # No shuffling: the collected training predictions must stay aligned with y_train
    loader_train = DataLoader(training_set, batch_size=args.batch_size)
    # Default batch size of 1, so evaluation yields one prediction per test label
    loader_test = DataLoader(test_set)

    model = TextClassifier(args)

    # Define optimizer
    optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate)

    # Training phase
    for epoch in range(args.epochs):
        # Set model in training mode
        model.train()
        predictions = []

        for x_batch, y_batch in loader_train:
            x_batch = x_batch.type(torch.LongTensor)
            y_batch = y_batch.type(torch.FloatTensor)

            # Feed the model
            y_pred = model(x_batch)

            # Loss calculation
            loss = F.binary_cross_entropy(y_pred, y_batch)

            # Reset gradients
            optimizer.zero_grad()

            # Gradients calculation
            loss.backward()

            # Parameter update
            optimizer.step()

            # Save predictions
            predictions += list(y_pred.detach().numpy())

    # Only the final epoch's metrics take part in model selection, so the
    # test set is evaluated once per configuration instead of once per epoch.
    test_predictions = evaluation(model, loader_test)
    train_accuary = calculate_accuray(y_train, predictions)
    test_accuracy = calculate_accuray(y_test, test_predictions)

    if test_accuracy>best_acc:
        best_acc = test_accuracy
        best_param_combination = param_combination
print('best acc is {} with hyperparameters {}'.format(best_acc,best_param_combination))

100%|██████████| 27/27 [51:05<00:00, 113.53s/it]

best acc is 0.6605765326837191 with hyperparameters [1000, 300, 128]



