In [2]:
# Torch, Sklearn imports
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
## AllenNLP
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from allennlp.modules.token_embedders import BertEmbedder

print("PyTorch: {}".format(torch.__version__))
print("AllenNLP: {}".format(allennlp.__version__))

PyTorch: 0.4.1
AllenNLP: 0.8.1


In [4]:
## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

## Sklearn imports
from sklearn.datasets import fetch_20newsgroups

## General libs
import numpy as np
import pandas as pd
from string import punctuation
import os, re, sys, json, requests, pickle

02/06/2019 17:11:16 - INFO - gensim.summarization.textcleaner -   'pattern' package not found; tag filters are not available for English


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [6]:
## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):        
    sent = []
    doc = spacy_en(text)
    for word in doc:
        if word.lemma_ == "-PRON-":
            sent.append(word.text)
        else:
            sent.append(word.lemma_)
    return " ".join(sent)

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

In [7]:
def transformText(text, do_stop=True, do_stem=False, do_lema = False):
    stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    
    # Removing E-mails  
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    text = re.sub(r'\S*@\S*\s?', r' ', text)
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Removing Newline
    text = text.rstrip()
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Stemming
    if (do_stem==True):
        text = gensim.parsing.preprocessing.stem_text(text)
    
    # Lemmatization
    if (do_lema==True):
        text = do_lemmatization(text)   
        
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    return text

In [8]:
idx = 2
x=newsgroups_train.data[idx]
x

'From: twillis@ec.ecn.purdue.edu (Thomas E Willis)\nSubject: PB questions...\nOrganization: Purdue University Engineering Computer Network\nDistribution: usa\nLines: 36\n\nwell folks, my mac plus finally gave up the ghost this weekend after\nstarting life as a 512k way back in 1985.  sooo, i\'m in the market for a\nnew machine a bit sooner than i intended to be...\n\ni\'m looking into picking up a powerbook 160 or maybe 180 and have a bunch\nof questions that (hopefully) somebody can answer:\n\n* does anybody know any dirt on when the next round of powerbook\nintroductions are expected?  i\'d heard the 185c was supposed to make an\nappearence "this summer" but haven\'t heard anymore on it - and since i\ndon\'t have access to macleak, i was wondering if anybody out there had\nmore info...\n\n* has anybody heard rumors about price drops to the powerbook line like the\nones the duo\'s just went through recently?\n\n* what\'s the impression of the display on the 180?  i could probably swin

In [11]:
transformText(x, do_stop=True)

'from thomas e willis subject pb questions organization purdue university engineering computer network distribution usa lines 36 well folks mac plus finally gave ghost weekend starting life 512k way back 1985 sooo i m market new machine bit sooner intended be i m looking picking powerbook 160 maybe 180 bunch questions hopefully somebody answer anybody know dirt next round powerbook introductions expected i d heard 185c supposed make appearence this summer heard anymore since access macleak wondering anybody info anybody heard rumors price drops powerbook line like ones duo s went recently what s impression display 180 could probably swing 180 got 80mb disk rather 120 really feel much better display yea looks great store wow really good could solicit opinions people use 160 180 day to day worth taking disk size money hit get active display i realize real subjective question i ve played around machines computer store breifly figured opinions somebody actually uses machine daily might pro

In [12]:
train_dataset = pd.DataFrame({'phrase': newsgroups_train.data, 'intent': newsgroups_train.target})
test_dataset = pd.DataFrame({'phrase': newsgroups_test.data, 'intent': newsgroups_test.target})

In [13]:
train_dataset.tail()

Unnamed: 0,intent,phrase
11309,13,From: jim.zisfein@factory.com (Jim Zisfein) \n...
11310,4,From: ebodin@pearl.tufts.edu\nSubject: Screen ...
11311,3,From: westes@netcom.com (Will Estes)\nSubject:...
11312,1,From: steve@hcrlgw (Steven Collins)\nSubject: ...
11313,8,From: gunning@cco.caltech.edu (Kevin J. Gunnin...


In [14]:
test_dataset.tail()

Unnamed: 0,intent,phrase
7527,14,From: richmond@spiff.Princeton.EDU (Stupendous...
7528,4,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...
7529,9,From: hhenderson@vax.clarku.edu\nSubject: RE: ...
7530,6,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...
7531,15,From: adamsj@gtewd.mtv.gtegsc.com\nSubject: Re...


In [16]:
## Pruning training dataset for a given number of sentences

In [17]:
pruned_training = pd.DataFrame(columns=['phrase','intent'])
pruned_valid = pd.DataFrame(columns=['phrase','intent'])
pruned_training

Unnamed: 0,phrase,intent


In [18]:
pruning_size = 100

In [19]:
pruning_size = 100
for i in list(set(train_dataset.intent)):
    print("LABEL {}".format(i))
    sample_training = train_dataset[train_dataset.intent == i][0:pruning_size]
    sample_valid = train_dataset[train_dataset.intent == i][pruning_size:]
    print(" --- # Training_Phrases = {}".format(len(sample_training)))
    print(" --- # Validing_Phrases = {}".format(len(sample_valid))) 
    pruned_training = pd.concat([pruned_training, sample_training]).reset_index(drop=True)
    pruned_valid = pd.concat([pruned_valid, sample_valid]).reset_index(drop=True)

LABEL 0
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 380
LABEL 1
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 484
LABEL 2
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 491
LABEL 3
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 490
LABEL 4
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 478
LABEL 5
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 493
LABEL 6
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 485
LABEL 7
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 494
LABEL 8
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 498
LABEL 9
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 497
LABEL 10
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 500
LABEL 11
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 495
LABEL 12
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 491
LABEL 13
 --- # Training_Phrases = 100
 --- # Validing_Phrases = 494
LABEL 14
 --- # Training_Phrases = 100
 --- 

In [20]:
pruned_training.tail()

Unnamed: 0,intent,phrase
1995,19,From: royc@rbdc.wsnc.org (Roy Crabtree)\nSubje...
1996,19,From: bskendig@netcom.com (Brian Kendig)\nSubj...
1997,19,From: mccullou@snake2.cs.wisc.edu (Mark McCull...
1998,19,From: kilroy@gboro.rowan.edu (Dr Nancy's Sweet...
1999,19,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...


In [21]:
pruned_valid.tail()

Unnamed: 0,intent,phrase
9309,19,From: system@kalki33.lakes.trenton.sc.us (Kalk...
9310,19,From: clavazzi@nyx.cs.du.edu (The_Doge)\nSubje...
9311,19,From: hudson@athena.cs.uga.edu (Paul Hudson Jr...
9312,19,From: jmeritt@mental.MITRE.ORG (Jim Meritt - S...
9313,19,From: Pegasus@aaa.uoregon.edu (Pegasus)\nSubje...


In [22]:
## Clean Dataset (only stopword removal, punct, ascii - no lemma, stemm)
pruned_training['clean_text']=pruned_training['phrase'].apply(lambda x: transformText(x))
pruned_valid['clean_text']=pruned_valid['phrase'].apply(lambda x: transformText(x))

In [23]:
pruned_training.phrase[0]

'From: mathew <mathew@mantis.co.uk>\nSubject: Re: <Political Atheists?\nOrganization: Mantis Consultants, Cambridge. UK.\nX-Newsreader: rusnews v1.01\nLines: 22\n\nkmr4@po.CWRU.edu (Keith M. Ryan) writes:\n> ( I am almost sure that Zyklon-B is immediate and painless method of \n> death. If not, insert soem other form. )\n> \n>         And, ethnic and minority groups have been killed, mutilated and \n> exterminated through out history, so I guess it was not unusual.\n> \n>         So, you would agree that the holocost would be allowed under the US \n> Constitution?  [ in so far, the punishment. I doubt they recieved what would \n> be considered a "fair" trial by US standards.\n\nDon\'t be so sure.  Look what happened to Japanese citizens in the US during\nWorld War II.  If you\'re prepared to say "Let\'s round these people up and\nstick them in a concentration camp without trial", it\'s only a short step to\ngassing them without trial.  After all, it seems that the Nazis originally\nonl

In [24]:
pruned_training.clean_text[0]

'from mathew subject re political atheists organization mantis consultants cambridge uk x newsreader rusnews v1 01 lines 22 keith m ryan writes almost sure zyklon b immediate painless method death not insert soem form and ethnic minority groups killed mutilated exterminated history guess unusual so would agree holocost would allowed us constitution far punishment doubt recieved would considered fair trial us standards sure look happened japanese citizens us world war ii prepared say let s round people stick concentration camp without trial short step gassing without trial all seems nazis originally intended imprison jews final solution dreamt partly afford run camps devastation caused goering s total war gassed generally died malnutrition disease mathew'

In [25]:
pruned_valid.phrase[200]

'From: healta@saturn.wwc.edu (Tammy R Healy)\nSubject: Re: who are we to judge, Bobby?\nLines: 38\nOrganization: Walla Walla College\nLines: 38\n\nIn article <1993Apr14.213356.22176@ultb.isc.rit.edu> snm6394@ultb.isc.rit.edu (S.N. Mozumder ) writes:\n>From: snm6394@ultb.isc.rit.edu (S.N. Mozumder )\n>Subject: Re: who are we to judge, Bobby?\n>Date: Wed, 14 Apr 1993 21:33:56 GMT\n>In article <healta.56.734556346@saturn.wwc.edu> healta@saturn.wwc.edu (TAMMY R HEALY) writes:\n>>Bobby,\n>>\n>>I would like to take the liberty to quote from a Christian writer named \n>>Ellen G. White.  I hope that what she said will help you to edit your \n>>remarks in this group in the future.\n>>\n>>"Do not set yourself as a standard.  Do not make your opinions, your views \n>>of duty, your interpretations of scripture, a criterion for others and in \n>>your heart condemn them if they do not come up to your ideal."\n>>                         Thoughts Fromthe Mount of Blessing p. 124\n>>\n>>I hope quoting 

In [26]:
pruned_valid.clean_text[200]

'from tammy r healy subject re judge bobby lines 38 organization walla walla college lines 38 article s n mozumder writes from s n mozumder subject re judge bobby date wed 14 apr 1993 21 33 56 gmt in article tammy r healy writes bobby i would like take liberty quote christian writer named ellen g white hope said help edit remarks group future do set standard make opinions views of duty interpretations scripture criterion others your heart condemn come ideal thoughts fromthe mount blessing p 124 i hope quoting make atheists gag think ellen white put better could tammy point peace bobby mozumder point set views way believe saying eveil world caused atheism ridiculous counterproductive dialogue newsgroups see posts spirit condemnation atheists newsgroup bacause don believe exactly do try convert atheists here failing miserably wants position constantly defending agaist insulting attacks like seem like do i m sorry blind get messgae quote everyone else seemed to tammy'

In [27]:
## ELMo
#elmo_weights_key_path = '../../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
#elmo_config_key_path = '../../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

## S3 Configs for SageMaker
bucket = 'adp-e-ml-notebooks-sagemaker'             
prefix = 'vectors'   
elmo_weights_key = '{}/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'.format(prefix)
elmo_weights_key_path = os.path.join('s3://', bucket, elmo_weights_key)
elmo_config_key = '{}/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'.format(prefix)
elmo_config_key_path = os.path.join('s3://', bucket, elmo_config_key)

### Elmo Instance
elmo = Elmo(elmo_config_key_path, 
            elmo_weights_key_path, 
            num_output_representations = 1, 
            dropout=0.3,
            requires_grad = False)
if torch.cuda.is_available():
    elmo = elmo.cuda()

02/06/2019 17:12:02 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [30]:
def get_elmo(sent):
    elmo.eval()
    sent = [sent.split()]
    character_ids = batch_to_ids(sent)
    if torch.cuda.is_available():
        character_ids = character_ids.cuda()
    embeddings = elmo(character_ids)
    rep = embeddings['elmo_representations'][0]
    rep = rep.squeeze(dim=0)
    avg = rep.mean(dim=0)
    return avg

In [31]:
get_elmo("testing this")

tensor([-0.3639,  0.1719,  0.0151,  ...,  0.5625, -0.5524, -0.0254],
       device='cuda:0', grad_fn=<MeanBackward0>)

In [32]:
len(pruned_training), len(pruned_valid)

(2000, 9314)

In [33]:
## Data Loading Class
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        phrase = self.data.clean_text[index]
        X = get_elmo(phrase)
        y = self.data.intent[index]
        return X, y
    
    def __len__(self):
        return self.len

In [34]:
train_set = Intents(pruned_training)
valid_set = Intents(pruned_valid)

In [35]:
train_set.__len__(), valid_set.__len__()

(2000, 9314)

In [36]:
train_set.__getitem__(1)

(tensor([-0.2079, -0.0499,  0.0184,  ...,  0.0531, -0.0010, -0.1197],
        device='cuda:0', grad_fn=<MeanBackward0>), 0)

In [37]:
valid_set.__getitem__(2000)

(tensor([-0.2893, -0.0725,  0.1083,  ..., -0.0051, -0.0633, -0.0463],
        device='cuda:0', grad_fn=<MeanBackward0>), 4)

## Simple MLP Classifier

In [38]:
class SimpleMLP(nn.Module):
    def __init__(self, inputdim, 
                        nclasses, 
                        nhidden, 
                        dropout = 0):
        super(SimpleMLP, self).__init__()
        """
        PARAMETERS:
        -dropout:    dropout for MLP
        """
        
        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, nhidden),
            nn.Dropout(p=self.dropout),
            nn.ReLU(),
            nn.Linear(nhidden, self.nclasses),
            )
        if torch.cuda.is_available():
            self.model = self.model.cuda()
    def forward(self, x):
        log_probs = self.model(x)
        return log_probs

In [39]:
len(newsgroups_train.target_names), len(newsgroups_test.target_names)

(20, 20)

In [40]:
INP_DIM = elmo.get_output_dim()
NUM_LABELS = len(newsgroups_train.target_names)
NHIDDEN = 64
DROPOUT = 0

In [41]:
model = SimpleMLP(inputdim = INP_DIM ,
              nhidden = NHIDDEN,
              nclasses = NUM_LABELS,
              dropout = DROPOUT)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    model = model.to(device)
    model.cuda()

## Training

In [44]:
# Dataloaders Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
train_loader = DataLoader(train_set, **params)
valid_loader = DataLoader(valid_set, **params)
# Hyperparams
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001 
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [45]:
max_epochs = 5
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(train_loader):
        
        ## Step 1 - Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()
        
        ## Step 2 - Run forward pass
        output = model.forward(sent)
        
        ## Step 3 - Compute loss
        loss = loss_function(output, label)
        loss.backward()
        
        ## Step 4 = Update parameters
        optimizer.step()
        if i%50 == 0:
            
            # Calculate Accuracy         
            correct = 0
            total = 0
            
            for sent, label in valid_loader:      
                if torch.cuda.is_available():
                    sent = sent.cuda()
                    label = label.cuda()
                
                # Forward pass only to get logits/output
                output = model.forward(sent)
                
                # Get predictions from the maximum value
                _, predicted = torch.max(output.data, 1)
                
                # Total number of labels
                total += label.size(0)

                # Total correct predictions
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            # Print Loss
            print('LOSS: {}. VALID ACCURACY: {}%'.format(loss.data, accuracy))

EPOCH -- 0
LOSS: 3.0083396434783936. VALID ACCURACY: 6.098346575048314%
EPOCH -- 1


RuntimeError: DataLoader worker (pid 23757) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.

In [None]:
def get_reply(phrase):
    x = get_elmo(phrase)
    logits_out = model.forward(x)
    softmax_out = F.softmax(logits_out, dim=0).cpu()
    _, pred_label = torch.max(softmax_out.data, 0)
    prediction=list(label_to_ix.keys())[pred_label]
    return prediction

In [None]:
get_reply("change this music")

In [None]:
get_reply("weather in Porto Alegre")

## Checking test error

In [None]:
correct = 0
total = 0
errors = []
accuracy = 0
for i in range(len(test_dataset)):
    msg = str(test_dataset['clean_text'][i])
    lbl = str(test_dataset['intent'][i])
    pred = get_reply(msg)
    total +=1
    if pred == lbl:
        correct += 1
    else:
        errors.append((msg,lbl))
test_accuracy = 100.00 * correct / total

In [None]:
print("TEST ACCURACY  -- {}".format(test_accuracy))