In [None]:
import os
from google.colab import drive
from argparse import Namespace
import numpy as np
import pandas as pd
import httpimport
import torch
import torch.optim as optim
from tqdm import tqdm_notebook, tqdm
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Mount Google Drive so the preprocessed dataset is reachable from the Colab VM.
drive.mount('/content/drive')

# Location of the preprocessed blog corpus (relative to the working directory).
path = 'drive/My Drive/blogtext_preprocessed.csv'

# Pick the compute device once; every later cell moves tensors/models to `device`.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
  print('gpu')
else:
  # Without this assignment `device` stays undefined on CPU-only runtimes and
  # every later `.to(device)` call raises NameError.
  device = torch.device('cpu')
  print('cpu')

Mounted at /content/drive
gpu


Define Functions and Classes

In [None]:
class Preprocessing:
    """Load the blog CSV, split it into train/test and turn text into padded token ids.

    Args:
        args: namespace-like object providing ``max_len`` (padded sequence length)
            and ``max_words`` (passed to the tokenizer as ``num_words``). If it
            also has a ``random_state`` attribute, that value seeds the
            train/test split; otherwise the split is unseeded, as before.

    Attributes set by ``load_data``: ``x_train``, ``x_test``, ``y_train``, ``y_test``.
    Attribute set by ``prepare_tokens``: ``tokens``.
    """

    def __init__(self, args):
        # CSV location comes from the notebook-level ``path`` variable.
        self.data = path
        self.max_len = args.max_len
        self.max_words = args.max_words
        self.test_size = 0.25
        # Optional seed so repeated runs can be compared on the same split.
        self.random_state = getattr(args, 'random_state', None)

    def load_data(self):
        """Read the first 10000 rows of the CSV, clean them and create the split."""
        raw = pd.read_csv(self.data).iloc[:10000, :]
        cleaned = (
            raw
            # Tolerate files that were saved without the index column.
            .drop(columns=['Unnamed: 0'], errors='ignore')
            .dropna()
            .reset_index(drop=True)
            # Encode the label column: male -> 1, female -> 0.
            .replace({'gender': {'male': 1, 'female': 0}})
        )

        X = cleaned['text'].values
        Y = cleaned['gender'].values

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            X, Y, test_size=self.test_size, random_state=self.random_state
        )

    def prepare_tokens(self):
        """Fit the tokenizer on the training texts only."""
        self.tokens = Tokenizer(num_words=self.max_words)
        self.tokens.fit_on_texts(self.x_train)

    def sequence_to_token(self, x):
        """Convert texts to token-id sequences padded/truncated to ``max_len``."""
        sequences = self.tokens.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

class TextClassifier(nn.ModuleList):
    """Binary text classifier: embedding -> LSTM -> two dense layers -> sigmoid.

    Args:
        args: namespace-like object providing ``batch_size``, ``hidden_dim``,
            ``lstm_layers`` and ``max_words`` (used as the embedding vocabulary size).

    NOTE(review): the base class is kept as ``nn.ModuleList`` for compatibility;
    ``nn.Module`` would be the conventional choice for a model like this.
    """

    def __init__(self, args):
        super(TextClassifier, self).__init__()

        # Hyperparameters
        self.batch_size = args.batch_size
        self.hidden_dim = args.hidden_dim
        self.LSTM_layers = args.lstm_layers
        self.input_size = args.max_words

        self.dropout = nn.Dropout(0.5)
        # Index 0 is the padding token and maps to a zero vector.
        self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=self.hidden_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.LSTM_layers,
            batch_first=True,
        )
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim * 2)
        self.fc2 = nn.Linear(self.hidden_dim * 2, 1)

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: integer tensor of token ids, indexed as (batch, sequence).

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        # Create the initial states next to the model's own weights instead of
        # relying on a notebook-level ``device`` variable.
        weight = self.embedding.weight
        state_shape = (self.LSTM_layers, x.size(0), self.hidden_dim)
        h = torch.zeros(state_shape, device=weight.device, dtype=weight.dtype)
        c = torch.zeros(state_shape, device=weight.device, dtype=weight.dtype)

        # Random initial states act as training noise only. xavier_normal_ derives
        # its std from the tensor shape (which includes the batch size), so applying
        # it at inference made predictions nondeterministic and batch-size dependent.
        if self.training:
            torch.nn.init.xavier_normal_(h)
            torch.nn.init.xavier_normal_(c)

        # Each sequence "x" is passed through an embedding layer
        out = self.embedding(x)
        # Feed the LSTM; only the per-step outputs are needed below
        out, _ = self.lstm(out, (h, c))
        # Keep the output at the last time step, then regularize
        out = self.dropout(out[:, -1, :])
        out = torch.relu(self.fc1(out))
        out = self.dropout(out)
        return torch.sigmoid(self.fc2(out))

class DatasetMaper(Dataset):
    """Index-aligned pairing of inputs ``x`` and labels ``y`` for use with a DataLoader."""

    def __init__(self, x, y):
        # Both containers are stored as given; no copy or conversion happens here.
        self.x, self.y = x, y

    def __len__(self):
        # The dataset size is defined by the inputs.
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        # Return the (input, label) pair stored at position ``idx``.
        features = self.x[idx]
        label = self.y[idx]
        return features, label

def evaluation(net=None, loader=None):
  """Run a model over a data loader and collect its raw predictions.

  Args:
    net: model to evaluate. Defaults to the notebook-level ``model``.
    loader: iterable of ``(x_batch, y_batch)`` pairs. Defaults to the
      notebook-level ``loader_test``. Labels are ignored here.

  Returns:
    List with one numpy array per sample (one row of the model output).
  """
  # Fall back to the notebook globals so existing ``evaluation()`` calls keep working.
  net = model if net is None else net
  loader = loader_test if loader is None else loader

  # Send inputs to wherever the network's weights live.
  first_param = next(net.parameters(), None)
  target = first_param.device if first_param is not None else torch.device('cpu')

  predictions = []

  # The model is turned in evaluation mode
  net.eval()

  # Skipping gradients update
  with torch.no_grad():
    for x_batch, _ in loader:
      x = x_batch.to(device=target, dtype=torch.long)
      y_pred = net(x)
      # One entry per row of the batch output
      predictions.extend(y_pred.detach().cpu().numpy())

  return predictions

def calculate_accuray(grand_truth, predictions):
  """Return the fraction of samples whose thresholded prediction equals the label.

  Args:
    grand_truth: sequence of true labels; only labels equal to 1 or 0 can match.
    predictions: sequence of scores. A score above 0.5 is read as class 1,
      anything else (including exactly 0.5) as class 0.

  Returns:
    Accuracy as a float in [0, 1]; 0.0 when ``grand_truth`` is empty.
  """
  total = len(grand_truth)
  if total == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0

  correct = 0
  for true, pred in zip(grand_truth, predictions):
    # Previously a score of exactly 0.5 fell through both branches and was
    # counted as wrong whatever the label was.
    predicted_label = 1 if pred > 0.5 else 0
    if true == predicted_label:
      correct += 1

  return correct / total

Modelling

In [None]:
# Hyperparameters for the baseline run.
args = Namespace(
    epochs = 8,
    hidden_dim = 64,
    lstm_layers = 1,
    max_len = 300,
    max_words = 300,
    device = device,
    learning_rate = 0.001,
    batch_size = 64,
    early_stopping_criteria = 5
)

# Load the data, fit the tokenizer on the training texts and encode both splits.
preprocessing = Preprocessing(args)
preprocessing.load_data()
preprocessing.prepare_tokens()
x_train = preprocessing.sequence_to_token(preprocessing.x_train)
x_test = preprocessing.sequence_to_token(preprocessing.x_test)
y_train = preprocessing.y_train
y_test = preprocessing.y_test

training_set = DatasetMaper(x_train, y_train)
test_set = DatasetMaper(x_test, y_test)

model = TextClassifier(args).to(device)

# Reshuffle the training samples every epoch; evaluate in batches rather than
# one sample at a time.
loader_training = DataLoader(training_set, batch_size=model.batch_size, shuffle=True)
loader_test = DataLoader(test_set, batch_size=model.batch_size)

# Defines a RMSprop optimizer to update the parameters
optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate)

for epoch in tqdm(range(args.epochs)):

  # model in training mode
  model.train()

  for x_batch, y_batch in loader_training:

    x = x_batch.type(torch.LongTensor).to(device)
    # Labels as a float column vector so they line up with the model output
    y = torch.reshape(y_batch.type(torch.FloatTensor), (len(y_batch), 1)).to(device)

    # Start every step from clean gradients
    optimizer.zero_grad()

    # Feed the model and get output "y_pred"
    y_pred = model(x)

    # Calculate loss
    loss = F.binary_cross_entropy(y_pred, y)

    # Back-propagate and update the parameters
    loss.backward()
    optimizer.step()

# Score the trained model on the held-out split.
raw_predictions = evaluation()
predictions = [row[0] for row in raw_predictions]

print('Accuracy is {}'.format(calculate_accuray(y_test, predictions)))

100%|██████████| 8/8 [00:13<00:00,  1.75s/it]


Accuracy is 0.6946812829882257


Hyperparameter Tuning

In [None]:
import itertools

# Search space: every combination of the listed values is trained once.
# Each entry is [epochs, hidden_dim, lstm_layers, max_len, max_words].
parameter_grid = [
    list(combination)
    for combination in itertools.product([8], [64, 128, 256], [1], [100, 300], [300])
]

best_acc = 0
best_param_combination = []

# Read and split the data ONCE so every configuration is trained and scored on
# the same samples; re-splitting per configuration made the accuracies
# incomparable (and re-read the CSV each time).
# NOTE(review): the winner is still selected on the test split, so the reported
# best accuracy is optimistic; a separate validation split would fix that.
preprocessing = Preprocessing(
    Namespace(max_len=parameter_grid[0][3], max_words=parameter_grid[0][4])
)
preprocessing.load_data()
y_train = preprocessing.y_train
y_test = preprocessing.y_test

for param_combination in tqdm(parameter_grid):
  args = Namespace(
      epochs = param_combination[0],
      hidden_dim = param_combination[1],
      lstm_layers = param_combination[2],
      max_len = param_combination[3],
      max_words = param_combination[4],
      device = device,
      learning_rate = 0.001,
      batch_size = 64,
      early_stopping_criteria = 5
  )

  # Tokenization depends on max_words / max_len, so redo it per configuration.
  preprocessing.max_len = args.max_len
  preprocessing.max_words = args.max_words
  preprocessing.prepare_tokens()
  x_train = preprocessing.sequence_to_token(preprocessing.x_train)
  x_test = preprocessing.sequence_to_token(preprocessing.x_test)

  training_set = DatasetMaper(x_train, y_train)
  test_set = DatasetMaper(x_test, y_test)

  # `model` and `loader_test` are read by evaluation() as notebook globals.
  model = TextClassifier(args).to(device)
  loader_training = DataLoader(training_set, batch_size=model.batch_size, shuffle=True)
  loader_test = DataLoader(test_set, batch_size=model.batch_size)

  # Defines a RMSprop optimizer to update the parameters
  optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate)

  # leave=False keeps the nested per-epoch bar from flooding the output
  for epoch in tqdm(range(args.epochs), leave=False):

    # model in training mode
    model.train()

    for x_batch, y_batch in loader_training:

      x = x_batch.type(torch.LongTensor).to(device)
      y = torch.reshape(y_batch.type(torch.FloatTensor), (len(y_batch), 1)).to(device)

      optimizer.zero_grad()
      y_pred = model(x)
      loss = F.binary_cross_entropy(y_pred, y)
      loss.backward()
      optimizer.step()

  raw_predictions = evaluation()
  predictions = [row[0] for row in raw_predictions]

  # Score once and reuse the value for both the comparison and the record.
  accuracy = calculate_accuray(y_test, predictions)
  if accuracy > best_acc:
    best_acc = accuracy
    best_param_combination = param_combination

print('best acc is {} with hyperparameters {}'.format(best_acc,best_param_combination))


  0%|          | 0/6 [00:00<?, ?it/s][A

  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [00:00<00:04,  1.47it/s][A[A

 25%|██▌       | 2/8 [00:01<00:04,  1.48it/s][A[A

 38%|███▊      | 3/8 [00:02<00:03,  1.49it/s][A[A

 50%|█████     | 4/8 [00:02<00:02,  1.50it/s][A[A

 62%|██████▎   | 5/8 [00:03<00:01,  1.50it/s][A[A

 75%|███████▌  | 6/8 [00:03<00:01,  1.50it/s][A[A

 88%|████████▊ | 7/8 [00:04<00:00,  1.50it/s][A[A

100%|██████████| 8/8 [00:05<00:00,  1.50it/s]

 17%|█▋        | 1/6 [00:13<01:06, 13.23s/it][A

  0%|          | 0/8 [00:00<?, ?it/s][A[A

 12%|█▎        | 1/8 [00:01<00:10,  1.55s/it][A[A

 25%|██▌       | 2/8 [00:03<00:09,  1.55s/it][A[A

 38%|███▊      | 3/8 [00:04<00:07,  1.55s/it][A[A

 50%|█████     | 4/8 [00:06<00:06,  1.55s/it][A[A

 62%|██████▎   | 5/8 [00:07<00:04,  1.54s/it][A[A

 75%|███████▌  | 6/8 [00:09<00:03,  1.54s/it][A[A

 88%|████████▊ | 7/8 [00:10<00:01,  1.54s/it][A[A

100%|██████████| 8/8 [00:12<

best acc is 0.6999593991067804 with hyperparameters [8, 64, 1, 300, 300]



