# Twitter Sentiment Classification - Version 2

In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gensim
import Cython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset, DataLoader

from torchtext import data

from helpers import *
from data import create_csv_submission

## Step 1: Load tweets

In [2]:
DATA_PATH = '../twitter-datasets/'
MODEL_PATH = '../models/'

TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg.txt') # 100'000 negative tweets
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos.txt') # 100'000 positive tweets
#TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt') # 2'500'000 negative tweets
#TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt') # 2'500'000 positive tweets
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [3]:
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)

In [4]:
x_text_test = load_test_data(TEST_PATH)

## Step 2: Build word2vec vocabulary

In [7]:
vector_length = 100

In [6]:
w2v_model = gensim.models.Word2Vec(x_text_train + x_text_test, min_count=1, workers=4, size=vector_length)

KeyboardInterrupt: 

In [None]:
print(w2v_model)

In [None]:
w2v_model.save(MODEL_PATH + 'twitter_w2v_50.bin')

In [8]:
w2v_model = gensim.models.Word2Vec.load(MODEL_PATH + 'twitter_w2v.bin')

In [9]:
# When training finished delete the training model but retain the word vectors:
word_vectors = w2v_model.wv
del w2v_model

In [10]:
word_vectors.wv['computer'].shape

(100,)

In [11]:
word_vectors.most_similar('computer')

[('internet', 0.708969235420227),
 ('desktop', 0.6651554107666016),
 ('calculator', 0.6596795320510864),
 ('settings', 0.6567734479904175),
 ('laptop', 0.6565127372741699),
 ('phone', 0.6472017168998718),
 ('browser', 0.635584831237793),
 ('portfolio', 0.6346469521522522),
 ('wifi', 0.6323057413101196),
 ('desk', 0.6210508346557617)]

## Step 3: Convert tweets into sentences of vectors

In [12]:
# Compute the number of words of the longest tweet to get the maximal sentence length
sequence_length_train = max(len(x) for x in x_text_train)
sequence_length_test = max(len(x) for x in x_text_test)
sequence_length = max(sequence_length_train, sequence_length_test)
print('Maximum sequence length of train and test data:', sequence_length)

x_text_train_pad = pad_sentences(x_text_train, padding_word="<PAD/>", sequence_length=sequence_length)
x_text_test_pad = pad_sentences(x_text_test, padding_word="<PAD/>", sequence_length=sequence_length)

del x_text_train
del x_text_test

Maximum sequence length of train and test data: 50


In [13]:
# Split into training and validation data
x_train, x_val, y_train, y_val = train_test_split(x_text_train_pad, y_train_full, test_size=0.01, random_state=42)

In [14]:
len(x_train), len(x_train[0])

(198000, 50)

In [15]:
len(x_val), len(x_val[0])

(2000, 50)

In [16]:
# TODO: SLOW!

def get_tweets_tensor(tweets, indices=[], verbose=False):
    '''Mapping every word to a vector from word2vec
    Padding words are mapped to zero
    Leave indices empty to map every tweet in tweets
    '''

    nb_tweets = len(tweets) if len(indices)==0 else len(indices)
    tweets_vec = np.zeros((nb_tweets, len(tweets[0]), vector_length), dtype=np.float32)
    
    if indices == []:
        for idx_t, tweet in enumerate(tweets):
            for idx_w, word in enumerate(tweet):
                if word != '<PAD/>':
                    tweets_vec[idx_t, idx_w] = word_vectors.wv[word]  
            if verbose and idx_t % 100000 == 0:
                print('Transformed {}/{} tweets'.format(idx_t+1, (len(x_text_train_pad))))
    else:
        for idx_t, orig_idx in enumerate(indices):
            for idx_w, word in enumerate(tweets[orig_idx]):
                if word != '<PAD/>':
                    tweets_vec[idx_t, idx_w] = word_vectors.wv[word]  
            if verbose and idx_t % 100000 == 0:
                print('Transformed {}/{} tweets'.format(idx_t+1, (len(x_text_train_pad))))
    
    return torch.from_numpy(tweets_vec)

## Step 4: Classification

In [36]:
# Hyper Parameters
num_epochs = 1
batch_size = 100
learning_rate = 0.001

In [37]:
class ListDataset(Dataset):
    """Dataset wrapping data and target lists.

    Each sample will be retrieved by indexing both lists along the first
    dimension.

    Arguments:
        data_list (python list): contains sample data.
        target_list (python list): contains sample targets (labels).
    """

    def __init__(self, data_list, target_list):
        assert len(data_list) == len(target_list)
        self.data_list = data_list
        self.target_list = target_list

    def __getitem__(self, index):
        return self.data_list[index], self.target_list[index]

    def __len__(self):
        return len(self.data_list)

In [38]:
train_dataset = ListDataset(x_train, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)

In [57]:
# Network hyperparameters
N = len(train_loader.dataset.data_list)      # Number of tweets (eg 200000)
S = len(train_loader.dataset.data_list[0])   # Number of words in one sentence (eg 50)
V = vector_length                            # Length of word vectors (eg 100)
K = 5                                        # Kernel width (K*V)
C = 128                                      # Number of convolutional filters
F = 2                                        # Number of output neurons in fully connected layer

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(             # input shape (1, S, V)
                in_channels=1,              # input channels
                out_channels=C,             # number of filters
                kernel_size=(K,V),          # filter size
                padding=(K//2,0)               # to keep size S
        )                                   # output shape (C, S, 1)
        self.relu = nn.ReLU()               # ReLU activation
        self.bn1 = nn.BatchNorm1d(C)
        self.max_pool1 = nn.MaxPool1d(2)    # max-pool each filter into S/2 output   
        self.conv2 = nn.Conv1d(             # input shape (C, S/2)
                in_channels=C,              # input channels
                out_channels=C,             # number of filters (one for each input channel)
                kernel_size=K,              # filter size
                padding=K//2                   # to keep size S/2
        )                                   # output shape (C, S/2)
        self.bn2 = nn.BatchNorm1d(C)
        self.max_pool2 = nn.MaxPool1d(int(S/2)) # max pool each filter into 1 output
        self.dropout= nn.Dropout(p=0.2)
        self.out = nn.Linear(C, F)          # fully connected layer, output F classes
        self.softmax = nn.Softmax()

    def forward(self, x):
        #print(x.data.shape) #torch.Size([100, 74, 100])
        
        out = x.unsqueeze(1)
        #print(out.data.shape) #torch.Size([100, 1, 74, 100])
        
        out = self.conv1(out)
        #print(out.data.shape) #torch.Size([100, 128, 74, 1])
        
        out = self.bn1(out)
        
        out = self.relu(out).squeeze(3)
        #print(out.data.shape) #torch.Size([100, 128, 74])
        
        out = self.dropout(out)
        
        out = self.max_pool1(out)
        #print(out.data.shape) #torch.Size([100, 128, 37])
        
        out = self.conv2(out)
        #print(out.data.shape) #torch.Size([100, 128, 37])
        
        out = self.bn2(out)
        
        out = self.relu(out)
        #print(out.data.shape) #torch.Size([100, 128, 37])
        
        out = self.dropout(out)
        
        out = self.max_pool2(out).squeeze(2).float()
        #print(out.data.shape) #torch.Size([100, 128])
        
        out = self.out(out)
        #print(out.data.shape) #torch.Size([100, 2])
        
        out = self.softmax(out)
        
        return out

In [58]:
cnn = CNN()
print(cnn)  # net architecture

CNN (
  (conv1): Conv2d(1, 128, kernel_size=(5, 100), stride=(1, 1), padding=(2, 0))
  (relu): ReLU ()
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
  (max_pool1): MaxPool1d (size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
  (max_pool2): MaxPool1d (size=25, stride=25, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout (p = 0.2)
  (out): Linear (128 -> 2)
  (softmax): Softmax ()
)


In [59]:
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)   # optimize all cnn parameters
loss_func = nn.CrossEntropyLoss()                                  # the target label is not one-hotted

In [73]:
cnn.train()

for epoch in range(num_epochs):  # loop over the dataset multiple times 
    for i, batch_indices in enumerate(train_loader.batch_sampler):   # iterate over mini-batches  
        # Converting tweets to vectors and storing it in a variable
        sentences = get_tweets_tensor(train_loader.dataset.data_list, batch_indices, verbose=False)
        x = Variable(sentences)
        
        # Converting labels to a variable
        labels = torch.from_numpy(train_loader.dataset.target_list[batch_indices])
        y = Variable(labels, requires_grad=False)
        
        # Forward + Backward + Optimize
        optimizer.zero_grad() # reset gradient
        outputs = cnn(x) # cnn output
        loss = loss_func(outputs, y) # clear gradients for this training step
        loss.backward() # backpropagation, compute gradients
        optimizer.step() # apply gradients
        
        if (i+1) % 100 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' 
                  %(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))

print('Finished Training')

Epoch [1/1], Iter [100/1980] Loss: 0.4019
Epoch [1/1], Iter [200/1980] Loss: 0.4350
Epoch [1/1], Iter [300/1980] Loss: 0.4045
Epoch [1/1], Iter [400/1980] Loss: 0.4456
Epoch [1/1], Iter [500/1980] Loss: 0.3887
Epoch [1/1], Iter [600/1980] Loss: 0.4602
Epoch [1/1], Iter [700/1980] Loss: 0.4448
Epoch [1/1], Iter [800/1980] Loss: 0.4055
Epoch [1/1], Iter [900/1980] Loss: 0.4345
Epoch [1/1], Iter [1000/1980] Loss: 0.4931
Epoch [1/1], Iter [1100/1980] Loss: 0.4891
Epoch [1/1], Iter [1200/1980] Loss: 0.4663
Epoch [1/1], Iter [1300/1980] Loss: 0.4513
Epoch [1/1], Iter [1400/1980] Loss: 0.4597
Epoch [1/1], Iter [1500/1980] Loss: 0.4089
Epoch [1/1], Iter [1600/1980] Loss: 0.4553
Epoch [1/1], Iter [1700/1980] Loss: 0.4316
Epoch [1/1], Iter [1800/1980] Loss: 0.4593
Epoch [1/1], Iter [1900/1980] Loss: 0.4417
Finished Training


In [74]:
# Evaluate accuracy of predictions from validation data
cnn.eval()
val_output = cnn(Variable(get_tweets_tensor(x_val)))
y_val_pred = torch.max(val_output, 1)[1].data.numpy().squeeze()

print('Validation accuracy:', accuracy_score(y_val, y_val_pred))

Validation accuracy: 0.8305


## Step 5: Make predictions for test data and save

In [None]:
test_output = cnn(Variable(get_tweets_tensor(x_text_test_pad, verbose=False)))
y_pred = torch.max(test_output, 1)[1].data.numpy().squeeze()

In [None]:
y_pred[y_pred == 0] = -1
y_pred

In [None]:
ids = np.arange(len(y_pred)+1)[1:]
ids

In [None]:
create_csv_submission(ids, y_pred, 'kaggle_submission.csv')