# Twitter Sentiment Classification - Version 2

In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gensim
import Cython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset, DataLoader

from torchtext import data

from helpers import *
from data import create_csv_submission

## Step 1: Load tweets

In [2]:
# Root folders, relative to the notebook location
DATA_PATH = '../twitter-datasets/'
MODEL_PATH = '../models/'

# Tweets to classify for the submission
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

# Small training set: 100'000 tweets per class
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos.txt')
TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg.txt')

# Full training set: 2'500'000 tweets per class (uncomment to use it instead)
#TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt')
#TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt')

In [3]:
# Load the positive and negative training tweets together with their labels.
# load_data_and_labels is not defined in this notebook; presumably it comes from
# the star import of `helpers` and returns tokenized tweets — confirm there.
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)

In [4]:
# Load the unlabeled test tweets (load_test_data presumably comes from `helpers` — confirm)
x_text_test = load_test_data(TEST_PATH)

## Step 2: Build word2vec vocabulary

In [5]:
# Dimensionality of every word2vec embedding (passed as `size` when training the model)
vector_length = 100

In [5]:
# Train word2vec on the training and test tweets together. min_count=1 keeps every
# token in the vocabulary, so the per-word lookups done later never hit an unknown word.
w2v_model = gensim.models.Word2Vec(x_text_train + x_text_test, min_count=1, workers=4, size=vector_length)

In [6]:
# Show the model summary (vocabulary size, vector size, learning rate)
print(w2v_model)

Word2Vec(vocab=496335, size=100, alpha=0.025)


In [9]:
# Persist the trained model so later sessions can skip training.
# os.path.join (as used for the data paths) keeps working if MODEL_PATH has no trailing slash.
w2v_model.save(os.path.join(MODEL_PATH, 'twitter_w2v.bin'))

In [6]:
# Reload the previously saved model (alternative to retraining it above).
# os.path.join (as used for the data paths) keeps working if MODEL_PATH has no trailing slash.
w2v_model = gensim.models.Word2Vec.load(os.path.join(MODEL_PATH, 'twitter_w2v.bin'))

In [7]:
# Training is finished: keep only the word vectors (`w2v_model.wv`) and delete the
# full training model. After this cell `w2v_model` no longer exists, so cells that
# use it need the model to be trained or loaded again first.
word_vectors = w2v_model.wv
del w2v_model

In [8]:
# Check the dimensionality of a single embedding. `word_vectors` already is the
# keyed-vectors object, so it is indexed directly (as most_similar is called directly
# below); the extra `.wv` hop is a deprecated self-alias that newer gensim removes.
word_vectors['computer'].shape

(100,)

In [10]:
# Qualitative check of the embeddings: nearest neighbours of a common word
word_vectors.most_similar('computer')

[('internet', 0.708969235420227),
 ('desktop', 0.6651554107666016),
 ('calculator', 0.6596795320510864),
 ('settings', 0.6567734479904175),
 ('laptop', 0.6565127372741699),
 ('phone', 0.6472017168998718),
 ('browser', 0.635584831237793),
 ('portfolio', 0.6346469521522522),
 ('wifi', 0.6323057413101196),
 ('desk', 0.6210508346557617)]

## Step 3: Convert tweets into sentences of vectors

In [12]:
# The longest tweet (in words) over both data sets fixes the padded sentence length
sequence_length_train = max(map(len, x_text_train))
sequence_length_test = max(map(len, x_text_test))
sequence_length = max(sequence_length_train, sequence_length_test)
print('Maximum sequence length of train and test data:', sequence_length)

# Pad every tweet with "<PAD/>" up to that common length
x_text_train_pad = pad_sentences(
    x_text_train,
    padding_word="<PAD/>",
    sequence_length=sequence_length,
)
x_text_test_pad = pad_sentences(
    x_text_test,
    padding_word="<PAD/>",
    sequence_length=sequence_length,
)

# The unpadded tweets are not used below; drop them
del x_text_train, x_text_test

Maximum sequence length of train and test data: 50


In [13]:
# Hold out 1% of the padded training tweets for validation (fixed seed => repeatable split)
x_train, x_val, y_train, y_val = train_test_split(
    x_text_train_pad,
    y_train_full,
    test_size=0.01,
    random_state=42,
)

In [14]:
# Sanity check: (number of training tweets, words per padded tweet)
len(x_train), len(x_train[0])

(198000, 50)

In [15]:
# Sanity check: (number of validation tweets, words per padded tweet)
len(x_val), len(x_val[0])

(2000, 50)

In [16]:
# TODO: SLOW!

def get_tweets_tensor(tweets, indices=None, verbose=False, vectors=None, dim=None):
    '''Map every word of the selected tweets to its word2vec vector.

    Padding words ('<PAD/>') are left as all-zero vectors.

    Arguments:
        tweets: sequence of equally long (padded) lists of words.
        indices: positions in `tweets` to convert, in output order. None or any
            empty sequence (list, tuple, array) converts every tweet.
        verbose: if True, print a progress line every 100000 tweets.
        vectors: word -> vector lookup supporting `vectors[word]`; defaults to
            the notebook-level `word_vectors`.
        dim: length of one word vector; defaults to the notebook-level
            `vector_length`.

    Returns:
        Float tensor of shape (number of selected tweets, words per tweet, dim).
    '''
    if vectors is None:
        vectors = word_vectors
    if dim is None:
        dim = vector_length

    # "No selection" means "all tweets". Testing the length (rather than comparing
    # with []) makes (), range(0) and empty arrays behave like the empty list.
    if indices is None or len(indices) == 0:
        indices = range(len(tweets))

    nb_tweets = len(indices)
    nb_words = len(tweets[0]) if len(tweets) > 0 else 0
    tweets_vec = np.zeros((nb_tweets, nb_words, dim), dtype=np.float32)

    for idx_t, orig_idx in enumerate(indices):
        for idx_w, word in enumerate(tweets[orig_idx]):
            if word != '<PAD/>':
                # Index the keyed vectors directly; padding rows stay zero
                tweets_vec[idx_t, idx_w] = vectors[word]
        if verbose and idx_t % 100000 == 0:
            # Report against the number of tweets actually being converted
            print('Transformed {}/{} tweets'.format(idx_t+1, nb_tweets))

    return torch.from_numpy(tweets_vec)

## Step 4: Classification

In [17]:
# Training hyperparameters
num_epochs = 1          # passes over the training set
batch_size = 100        # tweets per mini-batch
learning_rate = 0.001   # step size given to the Adam optimizer

In [18]:
class ListDataset(Dataset):
    """Pairs a list of samples with a list of targets.

    Item `i` is the tuple `(data_list[i], target_list[i])`; the dataset
    length is the number of samples.

    Arguments:
        data_list (python list): the samples.
        target_list (python list): one target (label) per sample.
    """

    def __init__(self, data_list, target_list):
        n_samples, n_targets = len(data_list), len(target_list)
        assert n_samples == n_targets
        self.data_list, self.target_list = data_list, target_list

    def __len__(self):
        samples = self.data_list
        return len(samples)

    def __getitem__(self, index):
        sample = self.data_list[index]
        target = self.target_list[index]
        return sample, target

In [19]:
# Pair the padded tweets with their labels and serve them in shuffled mini-batches
train_dataset = ListDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [25]:
# Network hyperparameters
N = len(train_loader.dataset.data_list)      # Number of training tweets
S = len(train_loader.dataset.data_list[0])   # Number of words in one padded tweet
V = vector_length                            # Length of word vectors
K = 3                                        # Kernel height in words (kernel is K x V); must be odd
C = 128                                      # Number of convolutional filters
NUM_CLASSES = 2                              # Output neurons of the fully connected layer
# (not called `F`, which would overwrite the `torch.nn.functional as F` import alias)

class CNN(nn.Module):
    """Two-layer convolutional classifier over tweets given as word-vector sequences.

    Input:  float tensor of shape (B, S, V) - batch, words per tweet, vector length.
    Output: tensor of shape (B, NUM_CLASSES) with one unnormalized score per class.

    The layer sizes are read from the notebook-level constants S, V, K, C and
    NUM_CLASSES when the network is constructed.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(             # input shape (1, S, V)
                in_channels=1,              # input channels
                out_channels=C,             # number of filters
                kernel_size=(K, V),         # each filter spans K words and the full vector
                padding=(K // 2, 0)         # keeps S positions (for odd K)
        )                                   # output shape (C, S, 1)
        self.relu = nn.ReLU()               # ReLU activation
        self.max_pool1 = nn.MaxPool1d(2)    # halves the number of positions: S -> S//2
        self.conv2 = nn.Conv1d(             # input shape (C, S//2)
                in_channels=C,              # input channels
                out_channels=C,             # number of filters
                kernel_size=K,              # filter size
                padding=K // 2              # keeps S//2 positions (for odd K)
        )                                   # output shape (C, S//2)
        self.max_pool2 = nn.MaxPool1d(S // 2)   # max over all remaining positions -> 1 value per filter
        self.out = nn.Linear(C, NUM_CLASSES)    # fully connected layer, one output per class

    def forward(self, x):
        out = x.unsqueeze(1)                          # (B, 1, S, V): add the channel axis
        out = self.conv1(out)                         # (B, C, S, 1)
        out = self.relu(out).squeeze(3)               # (B, C, S)
        out = self.max_pool1(out)                     # (B, C, S//2)
        out = self.conv2(out)                         # (B, C, S//2)
        out = self.relu(out)                          # (B, C, S//2)
        out = self.max_pool2(out).squeeze(2).float()  # (B, C)
        out = self.out(out)                           # (B, NUM_CLASSES)
        return out

In [20]:
import caps_net

SyntaxError: invalid syntax (conv_layer.py, line 23)

In [40]:
# Build the capsule network from the local `caps_net` module as the model to train.
# NOTE(review): with this network the training cell below stops with
# "Expected 4D tensor as input, got 3D tensor instead": get_tweets_tensor yields
# (batch, words, vector) tensors without a channel axis, whereas the CNN class adds
# that axis itself in forward(). Presumably these sizes were taken from an image
# setup - confirm before using this model on tweets.
cnn = caps_net.Net(num_conv_in_channel=1,
                num_conv_out_channel=256,
                num_primary_unit=8,
                primary_unit_size=1152,
                num_classes=2,
                output_unit_size=16,
                num_routing=3,
                use_reconstruction_loss=False,
                regularization_scale=0.0005,
                cuda_enabled=False)

In [41]:
# Uncomment the next line to train the plain CNN class instead of the capsule network
#cnn = CNN()
print(cnn)  # net architecture

Net (
  (conv1): ConvLayer (
    (conv0): Conv2d(1, 256, kernel_size=(9, 9), stride=(1, 1), padding=(1, 0))
    (relu): ReLU (inplace)
  )
  (primary): CapsuleLayer (
    (conv_units): ModuleList (
      (0): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (1): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (2): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (3): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (4): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (5): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (6): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
      (7): Conv2d(256, 32, kernel_size=(9, 9), stride=(2, 2))
    )
  )
  (digits): CapsuleLayer (
  )
)


In [42]:
# Adam updates every parameter of the network with the configured learning rate
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)   # optimize all cnn parameters
# Cross-entropy on raw class scores; targets are class indices, not one-hot vectors
loss_func = nn.CrossEntropyLoss()                                  # the target label is not one-hotted

In [43]:
# Mini-batches per epoch; unlike len(train_dataset)//batch_size this also counts
# a final, smaller batch, so the progress line never shows e.g. "Iter [2000/1999]"
n_batches = len(train_loader)

for epoch in range(num_epochs):  # loop over the dataset multiple times
    for i, batch_indices in enumerate(train_loader.batch_sampler):   # iterate over mini-batches
        # Only the sampled tweet indices are drawn here; the word vectors are
        # looked up per batch so the whole data set is never held as one tensor
        sentences = get_tweets_tensor(train_loader.dataset.data_list, batch_indices, verbose=False)
        x = Variable(sentences)

        # Labels of the same tweets (indexing with a list of positions)
        labels = torch.from_numpy(train_loader.dataset.target_list[batch_indices])
        y = Variable(labels, requires_grad=False)

        # Forward + Backward + Optimize
        optimizer.zero_grad()          # clear gradients left from the previous step
        outputs = cnn(x)               # forward pass: class scores
        loss = loss_func(outputs, y)   # cross-entropy of the scores against the labels
        loss.backward()                # backpropagation, compute gradients
        optimizer.step()               # apply gradients

        if (i+1) % 100 == 0:
            # loss.data[0] only works on old PyTorch; newer versions need loss.item()
            loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  %(epoch+1, num_epochs, i+1, n_batches, loss_value))

print('Finished Training')

ValueError: Expected 4D tensor as input, got 3D tensor instead.

In [33]:
# Score the held-out tweets: the predicted class is the index of the largest output
val_output = cnn(Variable(get_tweets_tensor(x_val)))
val_classes = torch.max(val_output, 1)[1]
y_val_pred = val_classes.data.numpy().squeeze()

print('Validation accuracy:', accuracy_score(y_val, y_val_pred))

Validation accuracy: 0.8295


## Step 5: Make predictions for test data and save

In [93]:
# Classify all test tweets in one forward pass; predicted class = index of the largest output
test_output = cnn(Variable(get_tweets_tensor(x_text_test_pad, verbose=False)))
test_classes = torch.max(test_output, 1)[1]
y_pred = test_classes.data.numpy().squeeze()

In [94]:
# Relabel class 0 as -1 in place; class 1 is left untouched
# NOTE(review): presumably the submission format expects -1/1 labels - confirm
y_pred[y_pred == 0] = -1
y_pred

array([-1, -1, -1, ..., -1,  1, -1])

In [95]:
# One id per prediction, counting from 1
ids = np.arange(1, len(y_pred) + 1)
ids

array([    1,     2,     3, ...,  9998,  9999, 10000])

In [96]:
# Write the ids and their predicted labels to the submission file
# (create_csv_submission is imported from the local `data` module)
create_csv_submission(ids, y_pred, 'kaggle_submission.csv')