In [1]:
import sys
import pdb

import spacy
from torchtext import data
from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



#### Sentence Tokenization

In [2]:
spacy_en = spacy.load('en')
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]


#### Prepare Data
debug functions for data

~~~~

train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), batch_sizes=(256,256,30), device=0, repeat=False)
print(train[0].__dict__['tweet'])

for i,x in enumerate(train):
    try:
        x.__dict__['tweet']
        x.__dict__['author']
    except:
        print(i)
~~~~

In [3]:
TEXT = data.ReversibleField(sequential=True,tokenize=tokenizer, lower=True, fix_length=40)
LABEL = data.Field(sequential=False)

tweet_datafields = [('tweet', TEXT), ('author', LABEL)]

train, val =  data.TabularDataset.splits(
        path='./data/', train='train.csv',
        validation='val.csv', format='tsv',
        fields= tweet_datafields)

train_iter = data.Iterator(train, batch_size=256, device=0, repeat=False)
val_iter = data.Iterator(val, batch_size=64, device=0, sort=False, sort_within_batch=False, repeat=False)

#sort_key=lambda x: len(x.text)
TEXT.build_vocab(train, vectors="glove.6B.200d")
LABEL.build_vocab(train)


In [4]:
vocab = TEXT.vocab
class Yoon_Text(nn.Module):
    
    def __init__(self, x_label, y_label):
        super(Yoon_Text, self).__init__()
        self.x_label = x_label
        self.y_label = y_label
        self.embed = nn.Embedding(len(x_label.vocab), 200)
        self.embed.weight.data.copy_(vocab.vectors)
    
        self.conv1 = nn.Conv2d(1, 100, (3, 200))
        self.conv2 = nn.Conv2d(1, 100, (4, 200))
        self.conv3 = nn.Conv2d(1, 100, (5, 200))

        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(300, len(y_label.vocab) - 1)

    def forward(self, x):
        x = self.embed(x)  
        x = x.unsqueeze(1) 

        x1 = F.relu(self.conv1(x)).squeeze(3) 
        x1 = F.max_pool1d(x1, x1.size(2)).squeeze(2)

        x2 = F.relu(self.conv2(x)).squeeze(3)
        x2 = F.max_pool1d(x2, x2.size(2)).squeeze(2)

        x3 = F.relu(self.conv2(x)).squeeze(3)
        x3 = F.max_pool1d(x3, x3.size(2)).squeeze(2)

        x = torch.cat((x1, x2, x3), 1)
        x = self.dropout(x) 
        logit = self.fc1(x) 
        return logit
    
    def answer(self, text):
        text = self.x_label.preprocess(text)
        text = [[self.x_label.vocab.stoi[x] for x in text]]
        x = self.x_label.tensor_type(text)
        x = autograd.Variable(x, volatile=True).cuda()
        output = self.forward(x)
        _, predicted = torch.max(output, 1)
        return (self.y_label.vocab.itos[predicted.data[0]+1])

In [5]:
model = Yoon_Text(TEXT, LABEL)
model.cuda()

opt = torch.optim.Adam(model.parameters(), lr=0.001)

model.cuda()
epochs = 5
 
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    
    # Training 
    model.train() 
    for batch in tqdm(train_iter, total=len(train_iter), desc="  Train:"):
        opt.zero_grad()
        feature, target = batch.tweet, batch.author
        feature.data.t_(), target.data.sub_(1)  # batch first, index align
        feature.cuda(), target.cuda()
 
        preds = model(feature)
        loss = F.cross_entropy(preds, target)
        loss.backward()
        opt.step()
 
        running_loss += loss.data[0] * feature.size(0)
 
    epoch_loss = running_loss / len(train_iter)
 
    # Validation
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for batch in tqdm(val_iter, total=len(val_iter), desc="  Eval:"):
        feature, target = batch.tweet, batch.author
        feature.data.t_(), target.data.sub_(1)  # batch first, index align
        feature.cuda(), target.cuda()
        preds = model(feature)
        loss = F.cross_entropy(preds, target)
        val_loss += loss.data[0] * feature.size(0)
 
    val_loss /= len(val_iter)
    tqdm.write('Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

  Train:: 100%|██████████| 87/87 [00:04<00:00, 18.17it/s]
  Eval:: 100%|██████████| 17/17 [00:00<00:00, 271.34it/s]
  Train::   2%|▏         | 2/87 [00:00<00:04, 19.76it/s]

Training Loss: 1, Validation Loss: 288.924731429966


  Train:: 100%|██████████| 87/87 [00:03<00:00, 26.41it/s]
  Eval:: 100%|██████████| 17/17 [00:00<00:00, 286.74it/s]
  Train::   2%|▏         | 2/87 [00:00<00:04, 19.65it/s]

Training Loss: 2, Validation Loss: 123.3918200196891


  Train:: 100%|██████████| 87/87 [00:03<00:00, 26.48it/s]
  Eval:: 100%|██████████| 17/17 [00:00<00:00, 283.05it/s]
  Train::   2%|▏         | 2/87 [00:00<00:04, 19.43it/s]

Training Loss: 3, Validation Loss: 72.7104907556512


  Train:: 100%|██████████| 87/87 [00:03<00:00, 26.51it/s]
  Eval:: 100%|██████████| 17/17 [00:00<00:00, 282.06it/s]
  Train::   2%|▏         | 2/87 [00:00<00:04, 19.90it/s]

Training Loss: 4, Validation Loss: 44.52194872464257


  Train:: 100%|██████████| 87/87 [00:03<00:00, 26.54it/s]
  Eval:: 100%|██████████| 17/17 [00:00<00:00, 275.45it/s]

Training Loss: 5, Validation Loss: 28.119983608352726





In [6]:
text = """@naterob2222 Earth’s rotation rate relative to the stars: 23h 56m 04s. 
Relative to the Sun: 24h, but with a leap second occasionaly added"""
print(model.answer(text))

neiltyson


In [7]:
text = 'The silver line is on fire'
print(model.answer(text))

wmata
