In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

import pandas as pd
import torch.nn.functional as F

In [3]:
df = pd.read_csv('Tweets.csv')

#visualise the data
df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
sent_to_ix = {'negative':0, 'neutral':1, 'positive': 2}
ix_to_sent = { 0:'negative', 1:'neutral', 2:'positive'}

df['text'] = df['text'].apply(lambda sentence: sentence.replace("!", " ! "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("%", " % "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("^", " ^ "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("/", " / "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("#", " # "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace(":", " : "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace(";", " ; "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace(".", " . "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace(",", " , "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("@", " @ "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("?", " ? "))
df['text'] = df['text'].apply(lambda sentence: sentence.replace("'", " ' "))
df['text'] = df['text'].apply(lambda sentence: sentence.lower().split())
df['airline_sentiment'] = df['airline_sentiment'].apply(lambda sent: sent_to_ix[sent])

#visualise again
df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,1,1.0,,,Virgin America,,cairdin,,0,"[@, virginamerica, what, @, dhepburn, said, .]",,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,2,0.3486,,0.0,Virgin America,,jnardino,,0,"[@, virginamerica, plus, you, ', ve, added, co...",,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,1,0.6837,,,Virgin America,,yvonnalynn,,0,"[@, virginamerica, i, didn, ', t, today, ., .,...",,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,0,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"[@, virginamerica, it, ', s, really, aggressiv...",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,0,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,"[@, virginamerica, and, it, ', s, a, really, b...",,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
vocab = {}
N_df = len(df)

for i in xrange(N_df):
    for word in df['text'][i]:
        if word not in vocab:
            vocab[word] = len(vocab)
            
vocab_size = len(vocab)
print 'Total distinct words: {}'.format(vocab_size)
print 'Created dictionary: '
print vocab

Total distinct words: 17209
Created dictionary: 


In [6]:
dtype = torch.FloatTensor
dtype_long = torch.LongTensor

if torch.cuda.is_available():
    print 'Cuda is available'
    dtype = torch.cuda.FloatTensor
    dtype_long = torch.cuda.LongTensor

Cuda is available


In [7]:
def prepare_sequence(sentence, vocab):
    ixs = []
    for word in sentence:
        if word not in vocab:
            sentence.remove(word)
    ixs = map(lambda word: vocab[word], sentence)
    ixs_var = Variable(dtype_long(ixs))        
    return ixs_var

#Let's test it out
sentence = 'hello sneha'.split()
print sentence
print 'Code for the above sentence is'
print prepare_sequence(sentence, vocab)

['hello', 'sneha']
Code for the above sentence is
Variable containing:
 5072
[torch.cuda.LongTensor of size 1 (GPU 0)]



In [8]:
print len(df)

df_train = df[:14000]
df_val = df[14000:]

14640


In [9]:
class LSTMClassifier(nn.Module):
    
    def __init__(self, vocab_size, embedded_dim, hidden_dim):
        super(LSTMClassifier, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedded_dim)
        self.lstm = nn.LSTM(embedded_dim, hidden_dim, 1)
        self.linear = nn.Linear(hidden_dim, 3)
        self.soft = nn.Softmax()
        
    def forward(self, input):
        embedded_sentence = self.embed(input).view((len(input), 1, embedded_dim))
        _, out = self.lstm(embedded_sentence)
        out = out[-1]
        out  = F.relu(out.view((1, -1)))
        out = self.linear(out)
        return self.soft(out)

In [11]:
N_epochs = 15
embedded_dim = 10
hidden_dim = 10

model = LSTMClassifier(vocab_size, embedded_dim, hidden_dim).type(dtype)
loss_fn = nn.CrossEntropyLoss().type(dtype)
optimizer = optim.Adam(model.parameters())

In [12]:
N_train = len(df_train)
N_val = len(df_val)
N_epochs = 15

for epoch in xrange(N_epochs):
    train_epoch_loss = 0
    val_epoch_loss = 0
    for i in xrange(N_train):
        model.train()
        sentence, sentiment = df_train['text'][i], df_train['airline_sentiment'][i]
        
        sentence_var = prepare_sequence(sentence, vocab)
        scores = model(sentence_var)
        loss = loss_fn(scores, Variable(dtype_long([sentiment])))   
        train_epoch_loss += loss.data[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print 'Training Loss for epoch {} is {}'.format(epoch, train_epoch_loss)
    
    for i in xrange(N_val):
        model.eval()
        sentence, author = df_val['text'][14000 + i], df_val['airline_sentiment'][14000 + i]
        
        sentence_var = prepare_sequence(sentence, vocab)
        scores = model(sentence_var)
        loss = loss_fn(scores, Variable(dtype_long([author])))   
        val_epoch_loss += loss.data[0]
        
    print 'Validation Loss for epoch {} is {}'.format(epoch, val_epoch_loss)

Training Loss for epoch 0 is 12303.2538344
Validation Loss for epoch 0 is 479.991138399
Training Loss for epoch 1 is 11573.0011764
Validation Loss for epoch 1 is 471.557220817
Training Loss for epoch 2 is 11010.9647085
Validation Loss for epoch 2 is 470.25041616
Training Loss for epoch 3 is 10603.9650126
Validation Loss for epoch 3 is 456.01934725
Training Loss for epoch 4 is 10302.4494565
Validation Loss for epoch 4 is 453.122085154
Training Loss for epoch 5 is 10079.9606791
Validation Loss for epoch 5 is 451.631572306
Training Loss for epoch 6 is 9858.55616796
Validation Loss for epoch 6 is 447.503376782
Training Loss for epoch 7 is 9699.27742261
Validation Loss for epoch 7 is 454.562984109
Training Loss for epoch 8 is 9537.40374041
Validation Loss for epoch 8 is 446.742719948
Training Loss for epoch 9 is 9408.97753096
Validation Loss for epoch 9 is 446.117323399
Training Loss for epoch 10 is 9302.04875076
Validation Loss for epoch 10 is 448.315862298
Training Loss for epoch 11 is 91

In [17]:
def test_sentence(sentence):
    sentence_var = prepare_sequence(sentence, vocab)
    scores = model(sentence_var)
    ix = torch.max(scores, 1)[1].data.cpu().numpy()[0][0]
    sentiment = ix_to_sent[ix]
    return sentiment

sentiment = test_sentence("I had a very good flight ! Very courteous staff .".lower().split())
print 'Beep bop; tickety dop; I think that sentence is {}'.format(sentiment)

Beep bop; tickety dop; I think that sentence is positive


In [14]:
PATH = 'saved_model.pth'

In [15]:
#Save the model
torch.save(model.state_dict(), PATH)

In [16]:
#Load
model = LSTMClassifier(vocab_size, embedded_dim, hidden_dim).type(dtype)
model.load_state_dict(torch.load(PATH))