In [1]:
import re
import pandas as pd
import numpy as np

## Plotting Libraries
import seaborn as sns
import matplotlib.pyplot as plt

## Pytorch Imports
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim

## NLP Libraries
import spacy
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords
spacy_en = spacy.load('en')
download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
## Checking if GPU is available
torch.cuda.is_available()

True

## 1. Reading Dataset

In [3]:
train = pd.read_csv('train.csv')
print(len(train))
train.tail()

19579


Unnamed: 0,id,text,author
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP
19578,id00393,"He laid a gnarled claw on my shoulder, and it ...",HPL


In [4]:
test = pd.read_csv('test.csv')
print(len(test))
test.tail()

8392


Unnamed: 0,id,text
8387,id11749,All this is now the fitter for my purpose.
8388,id10526,I fixed myself on a wide solitude.
8389,id13477,It is easily understood that what might improv...
8390,id13761,"Be this as it may, I now began to feel the ins..."
8391,id04282,"Long winded, statistical, and drearily genealo..."


## 2. Preprocessing

In [5]:
def transformText(text, do_stop=False, do_stem=False):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]

    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=2)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [6]:
train['phrase_preprocessed']=train['text'].apply(lambda x: transformText(x,do_stop=True, do_stem=True))
train.head()

Unnamed: 0,id,text,author,phrase_preprocessed
0,id26305,"This process, however, afforded me no means of...",EAP,process howev afford mean ascertain dimens dun...
1,id17569,It never once occurred to me that the fumbling...,HPL,never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,left hand gold snuff box which caper hill cut ...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,love spring look windsor terrac sixteen fertil...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,find noth els even gold superintend abandon at...


In [7]:
test['phrase_preprocessed']=test['text'].apply(lambda x: transformText(x,do_stop=True, do_stem=True))
test.head()

Unnamed: 0,id,text,phrase_preprocessed
0,id02310,"Still, as I urged our leaving Ireland with suc...",still urg leav ireland inquietud impati father...
1,id24541,"If a fire wanted fanning, it could readily be ...",fire want fan could readili fan newspap govern...
2,id00134,And when they had broken down the frail door t...,broken frail door found thi two cleanli pick h...
3,id27757,While I was thinking how I should possibly man...,think possibl manag without them on actual tum...
4,id04081,I am not sure to what limit his knowledge may ...,sure limit knowledg mai extend


## 3. Train/test split

In [8]:
x_train, x_valid, y_train, y_valid = train_test_split(train['phrase_preprocessed'],
                                                      train['author'], 
                                                      test_size=0.2)

In [9]:
x_test = np.array(test['phrase_preprocessed'])
x_test

array(['still urg leav ireland inquietud impati father thought best yield',
       'fire want fan could readili fan newspap govern grew weaker doubt leather iron acquir durabl proport for short time pair bellow rotterdam ever stood need stitch requir assist hammer',
       'broken frail door found thi two cleanli pick human skeleton earthen floor number singular beetl crawl shadowi corner',
       ...,
       'easili understood might improv close scrutin detail mai time injur gener distantli observ effect',
       'mai began feel inspir burn hope length nurtur secret thought stern desper resolut would submit longer enslav',
       'long wind statist drearili genealog matter wa ran continu thread brood tenaci horror preternatur malevol impress even impress good doctor'], dtype=object)

In [10]:
## Build Vocabulary
word_to_ix = {}
for sent in list(x_train) + list(x_valid) + list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [11]:
print("Vocabulary size: {}".format(len(word_to_ix)))

Vocabulary size: 17364


In [12]:
label_to_ix = { "EAP": 0, "HPL": 1, "MWS": 2 }

In [13]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)
VOCAB_SIZE, NUM_LABELS

(17364, 3)

## 4. Making iterable dataset

In [14]:
train_data=list(zip(x_train,y_train))
train_data[0:5]

[('agit reflect threw made friend dread danger relaps', 'MWS'),
 ('thu isol thrown upon resourc spent hour childhood pore ancient tome fill shadow haunt librari chateau roam without aim purpos perpetu dusk spectral wood cloth side hill near foot',
  'HPL'),
 ('delici word letter concern me i cannot tell you said how ardent desir see mathilda',
  'MWS'),
 ('account might properli belong former period life present moment lead far afield',
  'MWS'),
 ('bear mind argument urg thicket scene applic chief part scene outrag commit singl individu',
  'EAP')]

In [15]:
valid_data=list(zip(x_valid,y_valid))
valid_data[0:5]

[('see morrow meantim adieu rose walk room paus door lean it busi thought taken power support herself said lord raymond probabl return',
  'MWS'),
 ('act then ever must impuls', 'MWS'),
 ('bring right arm across breast actuat littl machineri necessari guid left arm finger figur',
  'EAP'),
 ('di could found record murder whose cruel act might compar hi', 'MWS'),
 ('none effort plausibl detail voyag itself', 'EAP')]

## 5. Model - BoW Classifier

In [16]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        
        ## Defining parameters for linear model
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        ## do the foward pass and implement non-linearity
        return F.log_softmax(self.linear(bow_vec))

In [17]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence.split():
        vec[word_to_ix[word]] += 1
    return vec.view(1,-1)

In [18]:
def make_target(label, label_to_idx):
    return torch.LongTensor([label_to_idx[label]])

In [19]:
n=5118
sample_phrase=make_bow_vector(x_train[n],word_to_ix)
print(">> SENTENCE: {}".format(x_train[n]))
print(">> SENTIMENT: {}".format(y_train[n]))
print(">> INPUT SIZE: {}".format(sample_phrase.size()))
sample_phrase

>> SENTENCE: we kept smack cove five mile higher coast thi practic fine weather take advantag fifteen minut slack push across main channel mosko str m far pool drop upon anchorag somewher near otterholm sandflesen eddi violent elsewher
>> SENTIMENT: EAP
>> INPUT SIZE: torch.Size([1, 17364])



    0     0     0  ...      0     0     0
[torch.FloatTensor of size 1x17364]

## 6. Training

In [20]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)
VOCAB_SIZE, NUM_LABELS

(17364, 3)

In [21]:
model = BoWClassifier(NUM_LABELS,VOCAB_SIZE)

In [22]:
model.cuda()

BoWClassifier(
  (linear): Linear(in_features=17364, out_features=3)
)

In [23]:
mask=torch.cuda.FloatTensor((0.826,1.158,1.079))
loss_function = nn.CrossEntropyLoss(weight=mask)

In [38]:
#loss_function = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = optim.SGD(params = model.parameters(), lr = learning_rate)

In [57]:
batch_size = 50
n_iters = 7000
num_epochs = n_iters/(len(x_train) / batch_size)
num_epochs = int(num_epochs)
num_epochs

22

In [58]:
iter = 0
for epoch in range(num_epochs):
    for (sent,label) in train_data:
        # Step 1 - clear the gradients
        model.zero_grad()
        optimizer.zero_grad()
        
        ## Step 2- Prepare input and label
        bow_vec = Variable(make_bow_vector(sent, word_to_ix)).cuda()
        target = Variable(make_target(label, label_to_ix)).cuda()
        
        # Step 3 - Run forward pass
        output = model(bow_vec)
        #print("Log probabilities - {}".format(log_probs))
        
        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
        iter+=1      
        ## Calculate final accuracy
        if iter % 1000 ==0:
            correct = 0
            total = 0
            for (sent,label) in valid_data:
                bow_vec = Variable(make_bow_vector(sent, word_to_ix)).cuda()
                target = Variable(make_target(label, label_to_ix)).cuda()
                output = model(bow_vec)
                _,predicted = torch.max(output.data,1)
                total += target.size(0)
                correct += (predicted[0] == make_target(label, label_to_ix)).sum()
            accuracy = 100 * correct/total
            print('Iterations: {}. Loss: {}. Accuracy: {}'.format(iter,loss.data[0],accuracy))

  # Remove the CWD from sys.path while we load stuff.


Iterations: 1000. Loss: 0.006390612106770277. Accuracy: 82.073544433095
Iterations: 2000. Loss: 0.0002262336201965809. Accuracy: 81.86925434116445
Iterations: 3000. Loss: 0.008876388892531395. Accuracy: 82.15015321756894
Iterations: 4000. Loss: 5.924526340095326e-05. Accuracy: 81.84371807967314
Iterations: 5000. Loss: 0.37198179960250854. Accuracy: 81.7926455566905
Iterations: 6000. Loss: 0.5646800994873047. Accuracy: 81.97139938712972
Iterations: 7000. Loss: 0.1864282637834549. Accuracy: 82.2267620020429
Iterations: 8000. Loss: 0.3021472692489624. Accuracy: 81.92032686414709
Iterations: 9000. Loss: 0.7592883706092834. Accuracy: 82.25229826353421
Iterations: 10000. Loss: 0.19229216873645782. Accuracy: 81.89479060265577
Iterations: 11000. Loss: 0.006421408616006374. Accuracy: 81.66496424923392
Iterations: 12000. Loss: 0.2754216194152832. Accuracy: 81.94586312563841
Iterations: 13000. Loss: 0.01569185219705105. Accuracy: 81.69050051072523
Iterations: 14000. Loss: 0.2403641641139984. Accu

Iterations: 113000. Loss: 0.18517456948757172. Accuracy: 81.92032686414709
Iterations: 114000. Loss: 0.03184659406542778. Accuracy: 82.02247191011236
Iterations: 115000. Loss: 0.08688245713710785. Accuracy: 82.04800817160368
Iterations: 116000. Loss: 0.012554424814879894. Accuracy: 81.94586312563841
Iterations: 117000. Loss: 0.19424781203269958. Accuracy: 81.94586312563841
Iterations: 118000. Loss: 0.40765267610549927. Accuracy: 81.7926455566905
Iterations: 119000. Loss: 1.8756237030029297. Accuracy: 81.7926455566905
Iterations: 120000. Loss: 0.14633987843990326. Accuracy: 81.7926455566905
Iterations: 121000. Loss: 2.2617061138153076. Accuracy: 81.89479060265577
Iterations: 122000. Loss: 0.02996932342648506. Accuracy: 81.76710929519918
Iterations: 123000. Loss: 1.2018389701843262. Accuracy: 81.6394279877426
Iterations: 124000. Loss: 0.19091542065143585. Accuracy: 81.76710929519918
Iterations: 125000. Loss: 0.000780635280534625. Accuracy: 81.89479060265577
Iterations: 126000. Loss: 0.39

Iterations: 223000. Loss: 0.029759787023067474. Accuracy: 81.69050051072523
Iterations: 224000. Loss: 0.4507604241371155. Accuracy: 81.89479060265577
Iterations: 225000. Loss: 0.01650993898510933. Accuracy: 81.84371807967314
Iterations: 226000. Loss: 0.5342569947242737. Accuracy: 81.69050051072523
Iterations: 227000. Loss: 0.47353145480155945. Accuracy: 81.69050051072523
Iterations: 228000. Loss: 0.07962946593761444. Accuracy: 81.61389172625128
Iterations: 229000. Loss: 0.6731911301612854. Accuracy: 81.69050051072523
Iterations: 230000. Loss: 0.2767869234085083. Accuracy: 81.7926455566905
Iterations: 231000. Loss: 0.1019599661231041. Accuracy: 81.89479060265577
Iterations: 232000. Loss: 2.3841855067985307e-07. Accuracy: 81.61389172625128
Iterations: 233000. Loss: 0.4327751398086548. Accuracy: 81.69050051072523
Iterations: 234000. Loss: 0.06536513566970825. Accuracy: 81.69050051072523
Iterations: 235000. Loss: 0.014691618271172047. Accuracy: 81.84371807967314
Iterations: 236000. Loss: 0

Iterations: 333000. Loss: 0.03390335664153099. Accuracy: 81.86925434116445
Iterations: 334000. Loss: 0.45538032054901123. Accuracy: 81.94586312563841
Iterations: 335000. Loss: 0.00046754872892051935. Accuracy: 81.74157303370787
Iterations: 336000. Loss: 0.3920731842517853. Accuracy: 81.6394279877426
Iterations: 337000. Loss: 0.010014409199357033. Accuracy: 81.53728294177732
Iterations: 338000. Loss: 4.410734163684538e-06. Accuracy: 81.48621041879468
Iterations: 339000. Loss: 0.0009205871028825641. Accuracy: 81.48621041879468
Iterations: 340000. Loss: 0.33416634798049927. Accuracy: 81.61389172625128
Iterations: 341000. Loss: 0.0319293849170208. Accuracy: 81.43513789581205
Iterations: 342000. Loss: 1.319566249847412. Accuracy: 81.51174668028601
Iterations: 343000. Loss: 0.0891244038939476. Accuracy: 81.46067415730337
Iterations: 344000. Loss: 0.2714027762413025. Accuracy: 81.51174668028601


In [59]:
from sklearn.metrics import log_loss

In [60]:
def calculate_log_loss(valid_data, model, label_to_ix, word_to_ix):
    true_label = np.zeros((len(valid_data),1))
    results_valid = np.zeros((len(valid_data),len(label_to_ix)))
    for i in range(len(valid_data)):
        bow_vec = Variable(make_bow_vector(valid_data[i][0], word_to_ix)).cuda()
        log_probs = model(bow_vec)
        pred = F.softmax(log_probs,dim=1).data.cpu().numpy()
        results_valid[i]=pred
        true_label[i]=label_to_ix[valid_data[i][1]]
    return log_loss(true_label,results_valid)

In [61]:
calculate_log_loss(valid_data, model, label_to_ix, word_to_ix)

  # Remove the CWD from sys.path while we load stuff.


0.46161438170183672

In [62]:
def make_preds(model,test):
    my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
    my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
    for i in range(len(test['phrase_preprocessed'])):
        sample=test['phrase_preprocessed'][i]
        #print(sample)
        sample_context=Variable(make_bow_vector(sample,word_to_ix)).cuda()
        log_prob=model(sample_context)
        probs=F.softmax(log_prob)
        my_sub.loc[i] = [test['id'][i], probs.data[0][0],probs.data[0][1],probs.data[0][2]]
    return my_sub

In [63]:
my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
my_sub

Unnamed: 0,id,EAP,HPL,MWS


In [64]:
preds=make_preds(model,test)

  # Remove the CWD from sys.path while we load stuff.
  if __name__ == '__main__':


In [65]:
preds.tail()

Unnamed: 0,id,EAP,HPL,MWS
8387,id11749,0.644002,0.057733,0.298265
8388,id10526,0.082567,0.039702,0.877731
8389,id13477,0.390211,0.007943,0.601846
8390,id13761,0.376146,0.018901,0.604954
8391,id04282,0.008989,0.990818,0.000193


In [66]:
preds.to_csv('roberto_new_14.csv',index=False)