In [None]:
import re
import pandas as pd
import numpy as np

## Plotting Libraries
import seaborn as sns
import matplotlib.pyplot as plt

## Pytorch Imports
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim

## NLP Libraries
import spacy
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords
# Load spaCy's English pipeline. NOTE(review): `spacy_en` is not referenced by any
# later cell visible in this notebook.
spacy_en = spacy.load('en')
# Fetch the NLTK stop-word corpus read by transformText via stopwords.words("english").
download('stopwords')

In [None]:
## Checking if GPU is available
# Later cells build torch.cuda.* tensors and call .cuda(), so they need this to be True.
torch.cuda.is_available()

In [None]:
# Smoke test: allocate a one-element tensor on the GPU and print the type of the
# tensor and of its first element.
a = torch.cuda.FloatTensor([1])
print("{} - {}".format(type(a),type(a[0])))

## 1. Reading Dataset

In [None]:
# Labelled training data; later cells read its 'text' and 'author' columns.
train = pd.read_csv('train.csv')
print(len(train))
train.tail()

In [None]:
# Data to predict on; later cells read its 'id' and 'text' columns.
test = pd.read_csv('test.csv')
print(len(test))
test.tail()

In [None]:
## Checking dataset imbalance

In [None]:
# Rows of `train` whose author code is 'EAP'. reset_index() renumbers the rows and
# keeps the previous row labels in a new 'index' column.
EAP = train.loc[train['author'].eq('EAP')].reset_index()
EAP_size = EAP.shape[0]
print(EAP_size)
EAP.tail()

In [None]:
# Rows of `train` whose author code is 'HPL'. reset_index() renumbers the rows and
# keeps the previous row labels in a new 'index' column.
HPL = train.loc[train['author'].eq('HPL')].reset_index()
HPL_size = HPL.shape[0]
print(HPL_size)
HPL.tail()

In [None]:
# Rows of `train` whose author code is 'MWS'. reset_index() renumbers the rows and
# keeps the previous row labels in a new 'index' column.
MWS = train.loc[train['author'].eq('MWS')].reset_index()
MWS_size = MWS.shape[0]
print(MWS_size)
MWS.tail()

In [None]:
# Preview the last rows of EAP once it is truncated to the first HPL_size rows.
EAP[0:HPL_size].tail()

In [None]:
# Build a class-balanced training frame: keep the first HPL_size rows of EAP and MWS,
# all of HPL, drop the 'index' column left behind by reset_index(), then shuffle.
# NOTE(review): this assumes HPL is the smallest class - confirm against the printed sizes.
# The shuffle is unseeded, so the row order differs from run to run.
train_undersampled = (
    pd.concat([EAP.iloc[:HPL_size], HPL, MWS.iloc[:HPL_size]], ignore_index=True)
      .drop(['index'], axis=1)
      .sample(frac=1)
      .reset_index(drop=True)
)
train_undersampled.tail()

In [None]:
# Row count of the full (not undersampled) training frame, for comparison.
(len(train))

## 3. Preprocessing

In [None]:
def transformText(text, do_stop=False, do_stem=False):
    """Normalise a raw sentence into a cleaned, single-space-separated string.

    Steps, in order: lower-case; replace every non-ASCII character with a space;
    collapse whitespace; optionally drop English stop words; drop tokens shorter
    than 2 characters; strip punctuation; strip digits; collapse whitespace again;
    optionally stem.

    Args:
        text: Raw input string.
        do_stop: When true, remove NLTK English stop words.
        do_stem: When true, run gensim's ``stem_text`` on the cleaned string.

    Returns:
        The cleaned string.
    """
    # Convert text to lower
    text = text.lower()

    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    filtered_words = text.split()
    if do_stop:
        # Build the stop-word set only when it is actually used; it was previously
        # rebuilt on every call, i.e. once per row under DataFrame.apply.
        stops = set(stopwords.words("english"))
        filtered_words = [word for word in filtered_words if word not in stops]

    # Removing all the tokens shorter than 2 characters (minsize=2)
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=2)

    # Re-join the surviving tokens
    text = " ".join(filtered_words)

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)

    # Punctuation/digit removal can leave runs of spaces behind, so collapse again
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    if do_stem:
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [None]:
# Clean every training sentence (stop words kept, no stemming) into a new column.
train_undersampled['phrase_preprocessed']=train_undersampled['text'].apply(lambda x: transformText(x,do_stop=False, do_stem=False))
train_undersampled.head()

In [None]:
# Author code -> integer class index.
label_to_ix = { "EAP": 0, "HPL": 1, "MWS": 2 }
# NOTE(review): the 'label' column is added to `train`, whereas the split below is
# taken from `train_undersampled`; no later visible cell reads train['label'].
train['label']=[label_to_ix[a] for a in train.author]
train[0:10]

In [None]:
# Apply the same cleaning (stop words kept, no stemming) to the test sentences.
test['phrase_preprocessed']=test['text'].apply(lambda x: transformText(x,do_stop=False, do_stem=False))
test.head()

## 4. Train/Validation split, Vocab

In [None]:
# Hold out 20% of the balanced data for validation. Inputs are the cleaned sentences,
# targets are the author codes (strings).
# NOTE(review): no random_state is passed, so the split changes between runs.
x_train, x_valid, y_train, y_valid = train_test_split(train_undersampled['phrase_preprocessed'],
                                                      train_undersampled['author'], 
                                                      test_size=0.2)

In [None]:
# Cleaned test sentences as a NumPy array; used below when building the vocabulary.
x_test = np.array(test['phrase_preprocessed'])
x_test

In [None]:
## Build Vocabulary
word_to_ix = {}
for sent in list(x_train) + list(x_valid) + list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [None]:
# Number of distinct tokens across train, validation and test sentences.
print("Vocabulary size: {}".format(len(word_to_ix)))

In [None]:
# Author code -> class index (same mapping as defined in the preprocessing section).
label_to_ix = { "EAP": 0, "HPL": 1, "MWS": 2 }

In [None]:
# Sizes of the embedding table and of the output layer, respectively.
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)
VOCAB_SIZE, NUM_LABELS

## 5. Making dataset iterable

In [None]:
# List of (cleaned sentence, author code) pairs consumed by the training loop.
train_data=list(zip(x_train,y_train))
train_data[0:5]

In [None]:
# List of (cleaned sentence, author code) pairs used for validation.
valid_data=list(zip(x_valid,y_valid))
valid_data[0:5]

In [None]:
def make_context_vector(seq, to_ix):
    """Turn a whitespace-tokenised sentence into a 1-D LongTensor of vocabulary ids.

    Args:
        seq: Sentence string; tokens are obtained with ``seq.split()``.
        to_ix: Mapping from token to integer id.

    Returns:
        A LongTensor with one id per token, placed on the GPU when CUDA is
        available and on the CPU otherwise.

    Raises:
        KeyError: If a token of ``seq`` is missing from ``to_ix``.
    """
    idxs = [to_ix[w] for w in seq.split()]
    tensor = torch.LongTensor(idxs)
    # Only move to the GPU when there is one, instead of hard-coding a CUDA tensor
    # type that fails on CPU-only machines.
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor

In [None]:
def make_target(label, label_to_idx):
    """Wrap the class index of ``label`` in a one-element LongTensor.

    Args:
        label: Class name, a key of ``label_to_idx``.
        label_to_idx: Mapping from class name to integer index.

    Returns:
        A LongTensor of shape (1,), placed on the GPU when CUDA is available and
        on the CPU otherwise.

    Raises:
        KeyError: If ``label`` is not a key of ``label_to_idx``.
    """
    target = torch.LongTensor([label_to_idx[label]])
    # Only move to the GPU when there is one, instead of hard-coding a CUDA tensor type.
    if torch.cuda.is_available():
        target = target.cuda()
    return target

## 6. Model - GRU Classifier with pre-trained GloVe embeddings

In [None]:
# Shell escape: list the directory that glove_path (next cell) points into.
!ls ../../vectors/

In [None]:
# Location of the GloVe text file, relative to the notebook's working directory.
glove_path = '../../vectors/glove.42B.300d.txt'

In [None]:
def loadGloveModel(gloveFile):
    """Load word vectors from a GloVe-format text file.

    Every non-blank line is parsed as ``<token> <v1> <v2> ...`` with whitespace
    separators.

    Args:
        gloveFile: Path to the text file, read as UTF-8.

    Returns:
        dict mapping each token to a 1-D float64 NumPy array of its values.

    Raises:
        ValueError: If a value on a line cannot be converted to float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the handle even if parsing fails; the explicit
    # encoding avoids depending on the platform default.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [None]:
# Parse the whole GloVe file into an in-memory dict (token -> vector).
glove_vector = loadGloveModel(glove_path)

In [None]:
#glove_vector['start'][0:100]

In [None]:
#from gensim.models import KeyedVectors

In [None]:
#w2v = KeyedVectors.load_word2vec_format('../../vectors/GoogleNews-vectors-negative300.bin', binary = True)

In [None]:
# Embedding width, and a random float32 starting table with one row per vocabulary id.
W2V_DIM = 300
## standard deviation to use
sd = 1/np.sqrt(W2V_DIM)
## Random initialization, drawn from N(0, sd)
weights = np.random.normal(loc=0, scale=sd, size=(VOCAB_SIZE, W2V_DIM)).astype(np.float32)

In [None]:
# Overwrite the randomly initialised rows with pre-trained GloVe vectors.
# Words absent from GloVe keep the random row drawn when `weights` was created.
# Only the "word not present" case is tolerated: any other problem (for example a
# vector whose length differs from W2V_DIM) now raises instead of being hidden by a
# bare `except`.
for word, ix in word_to_ix.items():
    vector = glove_vector.get(word)
    if vector is not None:
        #weights[ix]=w2v.wv.word_vec(word)
        weights[ix] = vector

In [None]:
# Displays the entire token -> id dictionary (one entry per vocabulary word).
word_to_ix

In [None]:
# Spot check: first 50 components of the GloVe vector for "confessed".
glove_vector["confessed"][0:50]
#w2v.wv.word_vec("confessed")[0:50]

In [None]:
# Vocabulary id of the same word, used to index `weights` in the next cell.
idx=word_to_ix['confessed']

In [None]:
# Should match the GloVe components shown above, up to the float32 cast of `weights`.
weights[idx][0:50]

In [None]:
# Model hyper-parameters passed to GruClassifierW2vec below.
W2V_DIM = 300       # embedding width; same value as set when `weights` was created
HIDDEN_DIM = 60     # GRU hidden size
NUM_LAYERS = 5      # number of stacked GRU layers
DROPOUT = 0.8       # dropout argument given to nn.GRU

In [None]:
class GruClassifierW2vec(nn.Module):
    """Sentence classifier: pre-trained embeddings -> stacked GRU -> linear -> log-softmax.

    The model processes one sentence per call (batch size 1). The recurrent state is
    kept on ``self.hidden`` between calls; assign ``model.hidden = model.init_hidden()``
    before a forward pass to start from an all-zero state.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, label_size, pre_trained_weights, dropout):
        """Build the layers and load the pre-trained embedding table.

        Args:
            embedding_dim: Width of each word vector (GRU input size).
            hidden_dim: GRU hidden size.
            num_layers: Number of stacked GRU layers.
            vocab_size: Number of rows in the embedding table.
            label_size: Number of output classes.
            pre_trained_weights: Array-like of shape (vocab_size, embedding_dim)
                used to initialise the embedding table.
            dropout: Dropout argument forwarded to ``nn.GRU``.
        """
        super(GruClassifierW2vec, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # copy_ (instead of rebinding .data) fails loudly when the supplied table does
        # not have the (vocab_size, embedding_dim) shape of the embedding layer.
        self.word_embeddings.weight.data.copy_(torch.Tensor(pre_trained_weights))
        self.gru = nn.GRU(input_size = embedding_dim,
                            hidden_size = hidden_dim,
                            num_layers = num_layers,
                            dropout = dropout)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return an all-zero hidden state of shape (num_layers, 1, hidden_dim).

        The state is created on the GPU when the embedding weights live on the GPU,
        and on the CPU otherwise.
        """
        hidden = Variable(torch.zeros(self.num_layers, 1, self.hidden_dim))
        if self.word_embeddings.weight.is_cuda:
            hidden = hidden.cuda()
        return hidden

    def forward(self, sentence):
        """Return log-probabilities of shape (1, label_size) for one sentence.

        Args:
            sentence: 1-D LongTensor of vocabulary ids.
        """
        embeds = self.word_embeddings(sentence)
        # (seq_len, batch=1, embedding_dim), the layout nn.GRU expects by default
        x = embeds.view(len(sentence), 1, -1)

        hidden = self.hidden
        # The stored state may have been created before the module was moved to
        # another device; bring it to wherever the input is.
        if hidden.is_cuda != x.is_cuda:
            hidden = hidden.cuda() if x.is_cuda else hidden.cpu()

        # nn.GRU already runs all `num_layers` stacked layers in a single call, so it
        # is invoked exactly once (it used to be re-run num_layers times on the same x).
        gru_out, hidden = self.gru(x, hidden)
        # Keep the state for callers that read model.hidden, but detach it so autograd
        # graphs do not chain across calls when the caller does not reset it.
        self.hidden = hidden.detach()

        y  = self.hidden2label(gru_out[-1])
        # y is (1, label_size): normalise over the class dimension explicitly
        log_probs = F.log_softmax(y, dim=1)
        return log_probs

In [None]:
# Instantiate the classifier with the hyper-parameters above and the GloVe-initialised
# embedding table.
model = GruClassifierW2vec(embedding_dim=W2V_DIM,
                            hidden_dim=HIDDEN_DIM,
                            num_layers=NUM_LAYERS,
                            vocab_size=VOCAB_SIZE,
                            label_size=NUM_LABELS,
                            pre_trained_weights = weights,
                            dropout = DROPOUT)

In [None]:
# Move all model parameters to the GPU (requires CUDA).
model.cuda()

In [None]:
# Mean number of rows per author, truncated to an int.
avg=int((EAP_size+HPL_size+MWS_size)/3)
avg

In [None]:
# Ratio of the mean class size to each class size (candidate per-class loss weights).
print(float(avg/EAP_size))
print(float(avg/HPL_size))
print(float(avg/MWS_size))

In [None]:
# Sanity check: class size times its weight; 0.826 is the EAP entry of `mask` below.
EAP_size*0.826

In [None]:
# Sanity check: class size times its weight; 1.158 is the HPL entry of `mask` below.
HPL_size*1.158

In [None]:
# Sanity check: class size times its weight; 1.079 is the MWS entry of `mask` below.
MWS_size*1.079

In [None]:
# Show the class order so the weights in `mask` can be lined up with it.
label_to_ix

In [None]:
## Loss function with mask to compensate class inbalance
mask=torch.cuda.FloatTensor((0.826,1.158,1.079))
#loss_function = nn.CrossEntropyLoss(weight=mask)

In [None]:
# Unweighted cross-entropy loss and an Adam optimiser over all model parameters.
loss_function = nn.CrossEntropyLoss()
#print(loss_function.weight)
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(),lr = learning_rate)

In [None]:
# Pick the sentence of the third training pair for a quick forward-pass check.
sample=train_data[2][0]
sample

In [None]:
# Encode the sample sentence as a tensor of vocabulary ids on the GPU.
sample_context=Variable(make_context_vector(sample,word_to_ix)).cuda()
sample_context

In [None]:
# Forward pass on the untrained model; displays the log-probabilities it returns.
out=model(sample_context)
out

In [None]:
# Derive the number of epochs from a target iteration count.
# NOTE(review): the training loop below feeds one sentence per optimiser step, so
# batch_size only influences this epoch computation, not the actual step size.
batch_size = 20
n_iters = 3000
num_epochs = n_iters/(len(x_train) / batch_size)
num_epochs = int(num_epochs)
num_epochs

In [None]:
# Train one sentence per optimiser step; every 1000 steps report validation accuracy.
step = 0  # global step counter (named `step` so the builtin `iter` is not shadowed)
for epoch in range(num_epochs):
    for (sent, label) in train_data:
        # Step 1 - training mode (dropout on), clear gradients, start from a zero state
        model.train()
        optimizer.zero_grad()
        model.hidden = model.init_hidden()

        ## Step 2 - Prepare input and label
        # make_context_vector / make_target place their tensors on the GPU
        context_vec = Variable(make_context_vector(sent, word_to_ix))
        target = Variable(make_target(label, label_to_ix))

        # Step 3 - Run forward pass
        output = model(context_vec)

        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()

        step += 1
        ## Periodic validation accuracy
        if step % 1000 == 0:
            # Evaluation mode switches dropout off so the accuracy is not noisy
            model.eval()
            correct = 0
            total = 0
            for (valid_sent, valid_label) in valid_data:
                # Reset the recurrent state so sentences do not influence each other
                model.hidden = model.init_hidden()
                valid_vec = Variable(make_context_vector(valid_sent, word_to_ix))
                valid_out = model(valid_vec)
                _, predicted = torch.max(valid_out.data, 1)
                total += 1
                # Compare plain Python ints so `correct` and `accuracy` are numbers
                correct += int(predicted.cpu().numpy().item() == label_to_ix[valid_label])
            accuracy = 100.0 * correct / total
            # .numpy().item() reads the scalar loss for both 0-dim and 1-element tensors
            print('Iterations: {}. Loss: {}. Accuracy: {}'.format(step, loss.data.cpu().numpy().item(), accuracy))

In [None]:
## Making predictions on Test set

In [None]:
# Inspect the prediction for a single validation example.
n=3
bow_vec = Variable(make_context_vector(valid_data[n][0], word_to_ix))
print("-"*20 + " INPUT "+"-"*30)
print("TRUE LABEL = {}".format(valid_data[n][1]))
print("SENTENCE = {}".format(valid_data[n][0]))
print("-"*20 + " PREDICTION "+"-"*30)
# Evaluate with dropout off and a fresh recurrent state, then restore the model's mode
was_training = model.training
model.eval()
model.hidden = model.init_hidden()
log_probs = model(bow_vec)
model.train(was_training)
_,predicted = torch.max(log_probs.data,1)
# Plain int class index, so it can be used as a dictionary key
pred_ix = predicted.cpu().numpy().item()
ix_to_label = {ix: name for name, ix in label_to_ix.items()}
print("PRED = {}".format(pred_ix))
print("PRED = {}".format(ix_to_label[pred_ix]))
##print("LOG_PROB = {}".format(log_probs))
# log_probs is (1, n_classes): normalise over the class dimension explicitly
print("PROBS = {}".format(F.softmax(log_probs, dim=1)))

In [None]:
from sklearn.metrics import log_loss

In [None]:
# Encode the 11th validation sentence. NOTE(review): this global is not read by the
# cells below; calculate_log_loss builds its own local `bow_vec`.
bow_vec = Variable(make_context_vector(valid_data[10][0], word_to_ix))

In [None]:
def calculate_log_loss(valid_data, model, label_to_ix, word_to_ix):
    """Compute the multi-class log loss of ``model`` on ``valid_data``.

    Args:
        valid_data: Sequence of (sentence, label) pairs.
        model: Callable module mapping an id tensor to scores of shape (1, n_classes).
        label_to_ix: Mapping from label name to class index. The indices are assumed
            to be 0..n_classes-1 and to match the model's output columns.
        word_to_ix: Mapping from token to vocabulary id.

    Returns:
        The value returned by ``sklearn.metrics.log_loss``.
    """
    num_classes = len(label_to_ix)
    true_label = np.zeros(len(valid_data), dtype=int)
    results_valid = np.zeros((len(valid_data), num_classes))

    # Score in evaluation mode (dropout off) and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        for i, (sent, label) in enumerate(valid_data):
            # Start every sentence from a zero recurrent state when the model has one,
            # so results do not depend on the order of the validation examples.
            if hasattr(model, 'init_hidden'):
                model.hidden = model.init_hidden()
            bow_vec = Variable(make_context_vector(sent, word_to_ix))
            log_probs = model(bow_vec)
            results_valid[i] = F.softmax(log_probs, dim=1).data.cpu().numpy()
            true_label[i] = label_to_ix[label]
    finally:
        model.train(was_training)

    # Passing `labels` keeps log_loss well-defined even if some class does not occur
    # in `valid_data`.
    return log_loss(true_label, results_valid, labels=list(range(num_classes)))

In [None]:
# Log loss of the trained model on the held-out validation pairs.
calculate_log_loss(valid_data, model, label_to_ix, word_to_ix)

In [None]:
def make_preds(model,test):
    """Predict class probabilities for every row of ``test``.

    Relies on the module-level ``word_to_ix`` vocabulary and on
    ``make_context_vector``.

    Args:
        model: Callable module mapping an id tensor to scores of shape (1, 3).
        test: DataFrame with 'id' and 'phrase_preprocessed' columns.

    Returns:
        DataFrame with columns ['id', 'EAP', 'HPL', 'MWS'], one row per test row in
        order; the probability columns hold Python floats taken from output
        columns 0, 1 and 2.
    """
    rows = []
    # Predict in evaluation mode (dropout off) and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        for sample_id, sample in zip(test['id'], test['phrase_preprocessed']):
            # Start every sentence from a zero recurrent state when the model has one.
            if hasattr(model, 'init_hidden'):
                model.hidden = model.init_hidden()
            sample_context = Variable(make_context_vector(sample, word_to_ix))
            log_prob = model(sample_context)
            probs = F.softmax(log_prob, dim=1).data.cpu().numpy()[0]
            rows.append([sample_id, float(probs[0]), float(probs[1]), float(probs[2])])
    finally:
        model.train(was_training)
    # Build the frame once from the collected rows instead of growing it with .loc,
    # and pass the columns as an ordered list rather than a set.
    return pd.DataFrame(rows, columns=['id', 'EAP', 'HPL', 'MWS'])

In [None]:
# Empty frame with the submission layout. The columns are given as an ordered list:
# a set has no defined order and is rejected by recent pandas versions.
my_sub = pd.DataFrame(columns=['id', 'EAP', 'HPL', 'MWS'])
my_sub

In [None]:
# Display the (still empty) submission frame.
my_sub

In [None]:
# Run the trained model over every test sentence.
preds=make_preds(model,test)

In [None]:
# Inspect the last few predicted rows before writing them out.
preds.tail()

In [None]:
# Write the predictions to a CSV file in the working directory, without the row index.
preds.to_csv('roberto_new_12.csv',index=False)