In [1]:
import re
import pandas as pd
import numpy as np

## Plotting Libraries
import seaborn as sns
import matplotlib.pyplot as plt

## Pytorch Imports
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim

## NLP Libraries
import spacy
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords
spacy_en = spacy.load('en')
download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
## Checking if GPU is available
torch.cuda.is_available()

True

## 1. Reading Dataset

In [3]:
train = pd.read_csv('train.csv')
print(len(train))
train.tail()

19579


Unnamed: 0,id,text,author
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP
19578,id00393,"He laid a gnarled claw on my shoulder, and it ...",HPL


In [4]:
test = pd.read_csv('test.csv')
print(len(test))
test.tail()

8392


Unnamed: 0,id,text
8387,id11749,All this is now the fitter for my purpose.
8388,id10526,I fixed myself on a wide solitude.
8389,id13477,It is easily understood that what might improv...
8390,id13761,"Be this as it may, I now began to feel the ins..."
8391,id04282,"Long winded, statistical, and drearily genealo..."


## 2. Preprocessing

In [5]:
def transformText(text, do_stop=False, do_stem=False):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]

    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=2)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [6]:
train['phrase_preprocessed']=train['text'].apply(lambda x: transformText(x,do_stop=True, do_stem=True))
train.head()

Unnamed: 0,id,text,author,phrase_preprocessed
0,id26305,"This process, however, afforded me no means of...",EAP,process howev afford mean ascertain dimens dun...
1,id17569,It never once occurred to me that the fumbling...,HPL,never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,left hand gold snuff box which caper hill cut ...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,love spring look windsor terrac sixteen fertil...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,find noth els even gold superintend abandon at...


In [7]:
test['phrase_preprocessed']=test['text'].apply(lambda x: transformText(x,do_stop=True, do_stem=True))
test.head()

Unnamed: 0,id,text,phrase_preprocessed
0,id02310,"Still, as I urged our leaving Ireland with suc...",still urg leav ireland inquietud impati father...
1,id24541,"If a fire wanted fanning, it could readily be ...",fire want fan could readili fan newspap govern...
2,id00134,And when they had broken down the frail door t...,broken frail door found thi two cleanli pick h...
3,id27757,While I was thinking how I should possibly man...,think possibl manag without them on actual tum...
4,id04081,I am not sure to what limit his knowledge may ...,sure limit knowledg mai extend


## 3. Train/test split

In [8]:
x_train, x_valid, y_train, y_valid = train_test_split(train['phrase_preprocessed'],
                                                      train['author'], 
                                                      test_size=0.2)

In [9]:
x_test = np.array(test['phrase_preprocessed'])
x_test

array(['still urg leav ireland inquietud impati father thought best yield',
       'fire want fan could readili fan newspap govern grew weaker doubt leather iron acquir durabl proport for short time pair bellow rotterdam ever stood need stitch requir assist hammer',
       'broken frail door found thi two cleanli pick human skeleton earthen floor number singular beetl crawl shadowi corner',
       ...,
       'easili understood might improv close scrutin detail mai time injur gener distantli observ effect',
       'mai began feel inspir burn hope length nurtur secret thought stern desper resolut would submit longer enslav',
       'long wind statist drearili genealog matter wa ran continu thread brood tenaci horror preternatur malevol impress even impress good doctor'], dtype=object)

In [10]:
## Build Vocabulary
word_to_ix = {}
for sent in list(x_train) + list(x_valid) + list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [11]:
print("Vocabulary size: {}".format(len(word_to_ix)))

Vocabulary size: 17364


In [12]:
label_to_ix = { "EAP": 0, "HPL": 1, "MWS": 2 }

In [13]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)
VOCAB_SIZE, NUM_LABELS

(17364, 3)

## 4. Making iterable dataset

In [14]:
train_data=list(zip(x_train,y_train))
train_data[0:5]

[('agit reflect threw made friend dread danger relaps', 'MWS'),
 ('thu isol thrown upon resourc spent hour childhood pore ancient tome fill shadow haunt librari chateau roam without aim purpos perpetu dusk spectral wood cloth side hill near foot',
  'HPL'),
 ('delici word letter concern me i cannot tell you said how ardent desir see mathilda',
  'MWS'),
 ('account might properli belong former period life present moment lead far afield',
  'MWS'),
 ('bear mind argument urg thicket scene applic chief part scene outrag commit singl individu',
  'EAP')]

In [15]:
valid_data=list(zip(x_valid,y_valid))
valid_data[0:5]

[('see morrow meantim adieu rose walk room paus door lean it busi thought taken power support herself said lord raymond probabl return',
  'MWS'),
 ('act then ever must impuls', 'MWS'),
 ('bring right arm across breast actuat littl machineri necessari guid left arm finger figur',
  'EAP'),
 ('di could found record murder whose cruel act might compar hi', 'MWS'),
 ('none effort plausibl detail voyag itself', 'EAP')]

## 5. Model - BoW Classifier

In [16]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        
        ## Defining parameters for linear model
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        ## do the foward pass and implement non-linearity
        return F.log_softmax(self.linear(bow_vec))

In [17]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence.split():
        vec[word_to_ix[word]] += 1
    return vec.view(1,-1)

In [18]:
def make_target(label, label_to_idx):
    return torch.LongTensor([label_to_idx[label]])

In [19]:
n=5118
sample_phrase=make_bow_vector(x_train[n],word_to_ix)
print(">> SENTENCE: {}".format(x_train[n]))
print(">> SENTIMENT: {}".format(y_train[n]))
print(">> INPUT SIZE: {}".format(sample_phrase.size()))
sample_phrase

>> SENTENCE: we kept smack cove five mile higher coast thi practic fine weather take advantag fifteen minut slack push across main channel mosko str m far pool drop upon anchorag somewher near otterholm sandflesen eddi violent elsewher
>> SENTIMENT: EAP
>> INPUT SIZE: torch.Size([1, 17364])



    0     0     0  ...      0     0     0
[torch.FloatTensor of size 1x17364]

## 6. Training

In [24]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)
VOCAB_SIZE, NUM_LABELS

(17364, 3)

In [26]:
model = BoWClassifier(NUM_LABELS,VOCAB_SIZE)

In [27]:
model.cuda()

BoWClassifier(
  (linear): Linear(in_features=17364, out_features=3)
)

In [28]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = optim.SGD(params = model.parameters(), lr = learning_rate)

In [30]:
batch_size = 50
n_iters = 3000
num_epochs = n_iters/(len(x_train) / batch_size)
num_epochs = int(num_epochs)

In [31]:
iter = 0
for epoch in range(num_epochs):
    for (sent,label) in train_data:
        # Step 1 - clear the gradients
        model.zero_grad()
        optimizer.zero_grad()
        
        ## Step 2- Prepare input and label
        bow_vec = Variable(make_bow_vector(sent, word_to_ix)).cuda()
        target = Variable(make_target(label, label_to_ix)).cuda()
        
        # Step 3 - Run forward pass
        output = model(bow_vec)
        #print("Log probabilities - {}".format(log_probs))
        
        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
        iter+=1      
        ## Calculate final accuracy
        if iter % 100 ==0:
            correct = 0
            total = 0
            for (sent,label) in valid_data:
                bow_vec = Variable(make_bow_vector(sent, word_to_ix)).cuda()
                target = Variable(make_target(label, label_to_ix)).cuda()
                output = model(bow_vec)
                _,predicted = torch.max(output.data,1)
                total += target.size(0)
                correct += (predicted[0] == make_target(label, label_to_ix)).sum()
            accuracy = 100 * correct/total
            print('Iterations: {}. Loss: {}. Accuracy: {}'.format(iter,loss.data[0],accuracy))

  # Remove the CWD from sys.path while we load stuff.


Iterations: 100. Loss: 1.1417611837387085. Accuracy: 41.215526046986724
Iterations: 200. Loss: 1.2148423194885254. Accuracy: 41.29213483146067
Iterations: 300. Loss: 1.1584199666976929. Accuracy: 46.220633299284984
Iterations: 400. Loss: 0.9859713315963745. Accuracy: 46.016343207354446
Iterations: 500. Loss: 1.0582222938537598. Accuracy: 54.03472931562819
Iterations: 600. Loss: 1.0999226570129395. Accuracy: 59.6782431052094
Iterations: 700. Loss: 1.138018250465393. Accuracy: 53.983656792645554
Iterations: 800. Loss: 0.8971432447433472. Accuracy: 54.13687436159346
Iterations: 900. Loss: 1.0161608457565308. Accuracy: 51.40449438202247
Iterations: 1000. Loss: 1.114119529724121. Accuracy: 46.88457609805924
Iterations: 1100. Loss: 1.2058254480361938. Accuracy: 48.34014300306435
Iterations: 1200. Loss: 1.1601815223693848. Accuracy: 49.84678243105209
Iterations: 1300. Loss: 0.893349289894104. Accuracy: 48.92747701736466
Iterations: 1400. Loss: 0.9689179062843323. Accuracy: 51.22574055158325
I

Iterations: 11500. Loss: 0.736420750617981. Accuracy: 74.00408580183861
Iterations: 11600. Loss: 0.7553919553756714. Accuracy: 74.46373850868233
Iterations: 11700. Loss: 0.8663567304611206. Accuracy: 73.92747701736465
Iterations: 11800. Loss: 1.1619982719421387. Accuracy: 73.79979570990807
Iterations: 11900. Loss: 1.4577738046646118. Accuracy: 73.31460674157303
Iterations: 12000. Loss: 0.797548770904541. Accuracy: 72.6762002042901
Iterations: 12100. Loss: 1.5051548480987549. Accuracy: 72.85495403472932
Iterations: 12200. Loss: 0.19910919666290283. Accuracy: 72.57405515832482
Iterations: 12300. Loss: 0.1526978313922882. Accuracy: 73.4167517875383
Iterations: 12400. Loss: 0.6748184561729431. Accuracy: 73.23799795709908
Iterations: 12500. Loss: 0.03355148062109947. Accuracy: 72.9826353421859
Iterations: 12600. Loss: 0.6780412793159485. Accuracy: 73.4167517875383
Iterations: 12700. Loss: 1.0964689254760742. Accuracy: 73.23799795709908
Iterations: 12800. Loss: 0.5165092945098877. Accuracy: 

Iterations: 22800. Loss: 0.3892107307910919. Accuracy: 76.35342185903984
Iterations: 22900. Loss: 0.5325114130973816. Accuracy: 76.45556690500511
Iterations: 23000. Loss: 0.6042300462722778. Accuracy: 76.32788559754852
Iterations: 23100. Loss: 0.46884605288505554. Accuracy: 75.66394279877426
Iterations: 23200. Loss: 0.1807669997215271. Accuracy: 75.8682328907048
Iterations: 23300. Loss: 0.16387897729873657. Accuracy: 76.3023493360572
Iterations: 23400. Loss: 0.5086198449134827. Accuracy: 75.53626149131767
Iterations: 23500. Loss: 1.1658060550689697. Accuracy: 76.02145045965271
Iterations: 23600. Loss: 1.6550219058990479. Accuracy: 76.40449438202248
Iterations: 23700. Loss: 1.4432082176208496. Accuracy: 76.04698672114402
Iterations: 23800. Loss: 0.4843158721923828. Accuracy: 75.66394279877426
Iterations: 23900. Loss: 0.28227996826171875. Accuracy: 75.74055158324822
Iterations: 24000. Loss: 0.3798034191131592. Accuracy: 76.3023493360572
Iterations: 24100. Loss: 1.0071837902069092. Accura

Iterations: 34100. Loss: 0.6508767604827881. Accuracy: 77.47701736465781
Iterations: 34200. Loss: 0.12920980155467987. Accuracy: 77.40040858018386
Iterations: 34300. Loss: 0.11926945298910141. Accuracy: 77.7579162410623
Iterations: 34400. Loss: 0.32661619782447815. Accuracy: 77.27272727272727
Iterations: 34500. Loss: 0.15161071717739105. Accuracy: 76.94075587334014
Iterations: 34600. Loss: 0.35220301151275635. Accuracy: 77.06843718079674
Iterations: 34700. Loss: 0.23493121564388275. Accuracy: 77.24719101123596
Iterations: 34800. Loss: 0.48224541544914246. Accuracy: 76.5832482124617
Iterations: 34900. Loss: 0.43967947363853455. Accuracy: 76.71092951991828
Iterations: 35000. Loss: 8.272782724816352e-05. Accuracy: 76.76200204290092
Iterations: 35100. Loss: 0.1238052248954773. Accuracy: 77.27272727272727
Iterations: 35200. Loss: 0.23099704086780548. Accuracy: 77.19611848825332
Iterations: 35300. Loss: 0.04567185416817665. Accuracy: 77.57916241062308
Iterations: 35400. Loss: 0.7615048289299

Iterations: 45300. Loss: 1.397005319595337. Accuracy: 78.4729315628192
Iterations: 45400. Loss: 1.0767698287963867. Accuracy: 78.72829417773238
Iterations: 45500. Loss: 0.3772626221179962. Accuracy: 78.75383043922369
Iterations: 45600. Loss: 0.07551827281713486. Accuracy: 78.57507660878447
Iterations: 45700. Loss: 0.1961425393819809. Accuracy: 78.24310520939734
Iterations: 45800. Loss: 0.5706614851951599. Accuracy: 78.62614913176711
Iterations: 45900. Loss: 0.06385918706655502. Accuracy: 78.70275791624107
Iterations: 46000. Loss: 1.094172716140747. Accuracy: 78.83043922369765
Iterations: 46100. Loss: 0.17372111976146698. Accuracy: 79.16241062308478
Iterations: 46200. Loss: 0.7532579302787781. Accuracy: 79.11133810010215
Iterations: 46300. Loss: 0.8604118824005127. Accuracy: 79.06026557711951
Iterations: 46400. Loss: 0.0690145343542099. Accuracy: 78.65168539325843
Iterations: 46500. Loss: 0.04409441351890564. Accuracy: 78.54954034729316
Iterations: 46600. Loss: 0.7188194990158081. Accur

Iterations: 56500. Loss: 0.627971351146698. Accuracy: 79.00919305413687
Iterations: 56600. Loss: 0.5657660961151123. Accuracy: 78.77936670071502
Iterations: 56700. Loss: 0.5492665767669678. Accuracy: 78.75383043922369
Iterations: 56800. Loss: 0.9710679650306702. Accuracy: 78.57507660878447
Iterations: 56900. Loss: 0.9073539972305298. Accuracy: 78.49846782431052
Iterations: 57000. Loss: 1.0832033157348633. Accuracy: 78.57507660878447
Iterations: 57100. Loss: 0.2338031679391861. Accuracy: 78.39632277834525
Iterations: 57200. Loss: 0.09776128083467484. Accuracy: 78.4729315628192
Iterations: 57300. Loss: 0.892398476600647. Accuracy: 78.77936670071502
Iterations: 57400. Loss: 0.1207452043890953. Accuracy: 78.98365679264556
Iterations: 57500. Loss: 0.11471226066350937. Accuracy: 78.62614913176711
Iterations: 57600. Loss: 1.2504477500915527. Accuracy: 78.65168539325843
Iterations: 57700. Loss: 0.646189272403717. Accuracy: 78.83043922369765
Iterations: 57800. Loss: 0.11183533072471619. Accurac

Iterations: 67700. Loss: 0.903321385383606. Accuracy: 79.41777323799796
Iterations: 67800. Loss: 0.6091587543487549. Accuracy: 79.54545454545455
Iterations: 67900. Loss: 0.35062333941459656. Accuracy: 79.64759959141982
Iterations: 68000. Loss: 0.1939026117324829. Accuracy: 79.67313585291113
Iterations: 68100. Loss: 0.22905077040195465. Accuracy: 79.67313585291113
Iterations: 68200. Loss: 0.947113037109375. Accuracy: 79.69867211440246
Iterations: 68300. Loss: 1.0327365398406982. Accuracy: 79.72420837589377
Iterations: 68400. Loss: 0.10101097822189331. Accuracy: 79.67313585291113
Iterations: 68500. Loss: 0.010985961183905602. Accuracy: 79.72420837589377
Iterations: 68600. Loss: 0.6440916061401367. Accuracy: 79.80081716036773
Iterations: 68700. Loss: 0.09878183156251907. Accuracy: 79.67313585291113
Iterations: 68800. Loss: 0.4615565836429596. Accuracy: 79.57099080694586
Iterations: 68900. Loss: 0.5988982319831848. Accuracy: 79.41777323799796
Iterations: 69000. Loss: 0.2642790973186493. Ac

Iterations: 78900. Loss: 0.069785937666893. Accuracy: 79.95403472931562
Iterations: 79000. Loss: 0.04743032157421112. Accuracy: 79.97957099080695
Iterations: 79100. Loss: 0.5923871397972107. Accuracy: 79.90296220633299
Iterations: 79200. Loss: 0.6699829697608948. Accuracy: 79.92849846782431
Iterations: 79300. Loss: 0.6128202676773071. Accuracy: 79.72420837589377
Iterations: 79400. Loss: 0.4848283529281616. Accuracy: 79.7752808988764
Iterations: 79500. Loss: 0.4096032679080963. Accuracy: 80.08171603677222
Iterations: 79600. Loss: 0.5159483551979065. Accuracy: 79.92849846782431
Iterations: 79700. Loss: 0.6278679370880127. Accuracy: 79.80081716036773
Iterations: 79800. Loss: 0.5300508737564087. Accuracy: 79.7752808988764
Iterations: 79900. Loss: 0.5431957244873047. Accuracy: 79.87742594484168
Iterations: 80000. Loss: 0.35971346497535706. Accuracy: 80.03064351378958
Iterations: 80100. Loss: 0.513616144657135. Accuracy: 79.97957099080695
Iterations: 80200. Loss: 1.6058317422866821. Accuracy

Iterations: 90100. Loss: 0.1501089632511139. Accuracy: 80.43922369765066
Iterations: 90200. Loss: 0.6971719264984131. Accuracy: 80.15832482124617
Iterations: 90300. Loss: 0.007646576501429081. Accuracy: 80.1838610827375
Iterations: 90400. Loss: 0.10152721405029297. Accuracy: 80.15832482124617
Iterations: 90500. Loss: 0.28267908096313477. Accuracy: 80.13278855975486
Iterations: 90600. Loss: 0.0024097710847854614. Accuracy: 80.26046986721144
Iterations: 90700. Loss: 0.1164155900478363. Accuracy: 80.28600612870275
Iterations: 90800. Loss: 0.08662788569927216. Accuracy: 79.82635342185904
Iterations: 90900. Loss: 0.5297797322273254. Accuracy: 80.15832482124617
Iterations: 91000. Loss: 0.39254483580589294. Accuracy: 79.95403472931562
Iterations: 91100. Loss: 0.21599853038787842. Accuracy: 79.97957099080695
Iterations: 91200. Loss: 0.07648103684186935. Accuracy: 79.90296220633299
Iterations: 91300. Loss: 0.10194629430770874. Accuracy: 79.82635342185904
Iterations: 91400. Loss: 0.0726398006081

Iterations: 101300. Loss: 1.8248226642608643. Accuracy: 80.26046986721144
Iterations: 101400. Loss: 0.35014134645462036. Accuracy: 80.54136874361593
Iterations: 101500. Loss: 0.8813201189041138. Accuracy: 80.46475995914199
Iterations: 101600. Loss: 0.5973511338233948. Accuracy: 80.51583248212462
Iterations: 101700. Loss: 0.10547460615634918. Accuracy: 80.15832482124617
Iterations: 101800. Loss: 0.8265315890312195. Accuracy: 80.54136874361593
Iterations: 101900. Loss: 0.043905843049287796. Accuracy: 80.4902962206333
Iterations: 102000. Loss: 0.68660569190979. Accuracy: 80.59244126659857
Iterations: 102100. Loss: 0.5183790922164917. Accuracy: 80.4902962206333
Iterations: 102200. Loss: 2.1698238849639893. Accuracy: 80.36261491317671
Iterations: 102300. Loss: 0.2803918123245239. Accuracy: 80.56690500510726
Iterations: 102400. Loss: 0.8007376194000244. Accuracy: 80.56690500510726
Iterations: 102500. Loss: 1.148036003112793. Accuracy: 80.59244126659857
Iterations: 102600. Loss: 0.32918494939

Iterations: 112400. Loss: 0.017694825306534767. Accuracy: 80.36261491317671
Iterations: 112500. Loss: 0.004854203201830387. Accuracy: 80.31154239019408
Iterations: 112600. Loss: 0.018152184784412384. Accuracy: 80.6435137895812
Iterations: 112700. Loss: 0.10087509453296661. Accuracy: 80.54136874361593
Iterations: 112800. Loss: 0.1386578530073166. Accuracy: 80.38815117466802
Iterations: 112900. Loss: 0.07147546112537384. Accuracy: 80.43922369765066
Iterations: 113000. Loss: 0.1946001648902893. Accuracy: 80.38815117466802
Iterations: 113100. Loss: 0.4194938838481903. Accuracy: 80.4902962206333
Iterations: 113200. Loss: 0.05008877068758011. Accuracy: 80.46475995914199
Iterations: 113300. Loss: 0.23806820809841156. Accuracy: 80.36261491317671
Iterations: 113400. Loss: 0.6461167931556702. Accuracy: 80.31154239019408
Iterations: 113500. Loss: 0.0892098993062973. Accuracy: 80.41368743615935
Iterations: 113600. Loss: 0.2098618447780609. Accuracy: 80.31154239019408
Iterations: 113700. Loss: 1.16

KeyboardInterrupt: 

In [32]:
from sklearn.metrics import log_loss

In [35]:
def calculate_log_loss(valid_data, model, label_to_ix, word_to_ix):
    true_label = np.zeros((len(valid_data),1))
    results_valid = np.zeros((len(valid_data),len(label_to_ix)))
    for i in range(len(valid_data)):
        bow_vec = Variable(make_bow_vector(valid_data[i][0], word_to_ix)).cuda()
        log_probs = model(bow_vec)
        pred = F.softmax(log_probs,dim=1).data.cpu().numpy()
        results_valid[i]=pred
        true_label[i]=label_to_ix[valid_data[i][1]]
    return log_loss(true_label,results_valid)

In [36]:
calculate_log_loss(valid_data, model, label_to_ix, word_to_ix)

  # Remove the CWD from sys.path while we load stuff.


0.51034577431219419

In [37]:
def make_preds(model,test):
    my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
    my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
    for i in range(len(test['phrase_preprocessed'])):
        sample=test['phrase_preprocessed'][i]
        #print(sample)
        sample_context=Variable(make_bow_vector(sample,word_to_ix)).cuda()
        log_prob=model(sample_context)
        probs=F.softmax(log_prob)
        my_sub.loc[i] = [test['id'][i], probs.data[0][0],probs.data[0][1],probs.data[0][2]]
    return my_sub

In [38]:
my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
my_sub

Unnamed: 0,id,EAP,HPL,MWS


In [39]:
preds=make_preds(model,test)

  # Remove the CWD from sys.path while we load stuff.
  if __name__ == '__main__':


In [40]:
preds.tail()

Unnamed: 0,id,EAP,HPL,MWS
8387,id11749,0.606491,0.1371,0.256408
8388,id10526,0.235632,0.120189,0.644178
8389,id13477,0.515651,0.05646,0.427889
8390,id13761,0.19965,0.051406,0.748944
8391,id04282,0.444649,0.529615,0.025737


In [42]:
preds.to_csv('roberto_new_13.csv',index=False)