# Twitter Sentiment Classification - Version 1

In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
import Cython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset, DataLoader

from torchtext import data

from helpers import *
from data import create_csv_submission

## Step 1: Load tweets

In [2]:
DATA_PATH = '../twitter-datasets/'

# The full corpus is used: 2'500'000 negative and 2'500'000 positive tweets.
# For quick experiments switch to the 100'000-tweet subsets
# 'train_neg.txt' / 'train_pos.txt'.
TRAIN_NEG_PATH, TRAIN_POS_PATH, TEST_PATH = (
    os.path.join(DATA_PATH, file_name)
    for file_name in ('train_neg_full.txt', 'train_pos_full.txt', 'test_data.txt')
)

In [3]:
# Load the training tweets and their labels through the project helper.
# NOTE(review): presumably each tweet comes back as a list of tokens (they are
# later fed to word2vec and padded word by word) -- confirm in helpers.py.
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)

In [4]:
# Load the unlabeled test tweets through the project helper.
# NOTE(review): assumed to use the same tokenization as the training tweets -- confirm in helpers.py.
x_text_test = load_test_data(TEST_PATH)

## Step 2: Build word2vec vocabulary

In [5]:
# Dimensionality of every word embedding
vector_length = 100

# The embeddings are learned on train and test tweets together, and
# min_count=1 keeps even words that occur only once, so every word seen
# later has a vector.
w2v_model = gensim.models.Word2Vec(
    sentences=x_text_train + x_text_test,
    size=vector_length,
    min_count=1,
    workers=4,
)

In [6]:
# Print the model summary (vocabulary size, vector size, initial learning rate)
print(w2v_model)

Word2Vec(vocab=496335, size=100, alpha=0.025)


In [7]:
# Training is finished: keep only the word vectors (`.wv`) and drop the
# reference to the full training model.
word_vectors = w2v_model.wv
del w2v_model

In [8]:
# `word_vectors` already is the KeyedVectors object (it was taken from
# `w2v_model.wv`), so index it directly instead of going through the
# deprecated `.wv` self-alias. The result must have `vector_length` entries.
word_vectors['computer'].shape

(100,)

In [9]:
# Sanity check of the embedding: list the nearest neighbours of 'computer'
word_vectors.most_similar('computer')

[('internet', 0.7272481918334961),
 ('desktop', 0.6671649813652039),
 ('phone', 0.6602565050125122),
 ('browser', 0.6511868238449097),
 ('laptop', 0.6457754373550415),
 ('phones', 0.6350281238555908),
 ('calculator', 0.6316317915916443),
 ('gmail', 0.6251569390296936),
 ('ipods', 0.6223116517066956),
 ('keyboard', 0.6169487833976746)]

## Step 3: Convert tweets into sentences of vectors

In [10]:
# The longest tweet (in words) over both train and test data fixes the
# common sentence length every tweet is padded to.
sequence_length_train = max(map(len, x_text_train))
sequence_length_test = max(map(len, x_text_test))
sequence_length = max(sequence_length_train, sequence_length_test)
print('Maximum sequence length of train and test data:', sequence_length)

# Pad both corpora to that length with the '<PAD/>' token
x_text_train_pad = pad_sentences(
    x_text_train,
    padding_word="<PAD/>",
    sequence_length=sequence_length,
)
x_text_test_pad = pad_sentences(
    x_text_test,
    padding_word="<PAD/>",
    sequence_length=sequence_length,
)

# Drop the names of the unpadded corpora; only the padded versions are used below
del x_text_train, x_text_test

Maximum sequence length of train and test data: 74


In [11]:
# Hold out 10% of the labeled tweets for validation; the fixed random_state
# makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x_text_train_pad,
    y_train_full,
    test_size=0.1,
    random_state=42,
)

In [12]:
# Number of training tweets and number of words in the first (padded) tweet
len(x_train), len(x_train[0])

(2250000, 74)

In [13]:
# Number of validation tweets and number of words in the first (padded) tweet
len(x_val), len(x_val[0])

(250000, 74)

In [14]:
# TODO: SLOW! Every word is looked up one by one in a pure Python loop.

def get_tweets_tensor(tweets, indices=None, verbose=False):
    '''Map every word of the selected tweets to its word2vec vector.

    Padding words ('<PAD/>') keep an all-zero vector.

    Arguments:
        tweets: sequence of padded tweets, each a sequence of words. The
            length of the first tweet is used as the sentence length.
        indices: positions in `tweets` to convert, in output order (list,
            range or numpy array). Leave as None, or pass an empty
            sequence, to convert every tweet.
        verbose: if True, print a progress line every 100000 tweets.

    Returns:
        A float32 torch tensor of shape
        (number of selected tweets, sentence length, vector_length).
    '''
    # Emptiness is tested with len(): `indices == []` is an elementwise
    # comparison when `indices` is a numpy array, which is ambiguous as a
    # condition and triggers a numpy warning.
    use_all = indices is None or len(indices) == 0

    nb_tweets = len(tweets) if use_all else len(indices)
    sentence_length = len(tweets[0]) if len(tweets) > 0 else 0
    tweets_vec = np.zeros((nb_tweets, sentence_length, vector_length), dtype=np.float32)

    selected = tweets if use_all else (tweets[orig_idx] for orig_idx in indices)
    for idx_t, tweet in enumerate(selected):
        for idx_w, word in enumerate(tweet):
            if word != '<PAD/>':
                # `word_vectors` is the KeyedVectors object itself, so it is
                # indexed directly (no deprecated `.wv` hop).
                tweets_vec[idx_t, idx_w] = word_vectors[word]
        if verbose and idx_t % 100000 == 0:
            # Report progress against the number of tweets actually converted
            print('Transformed {}/{} tweets'.format(idx_t + 1, nb_tweets))

    return torch.from_numpy(tweets_vec)

## Step 4: Classification

In [15]:
# Hyper Parameters: passes over the training data, tweets per mini-batch,
# and the learning rate handed to the optimizer
num_epochs, batch_size, learning_rate = 1, 100, 0.001

In [16]:
class ArrayDataset(Dataset):
    """Dataset pairing an array of samples with an array of targets.

    Sample ``i`` is ``(data_array[i], target_array[i])``; both arrays are
    indexed along their first dimension and must agree in its size.

    Arguments:
        data_array: object with a ``shape`` attribute holding the samples.
        target_array: object with a ``shape`` attribute holding the targets
            (labels).
    """

    def __init__(self, data_array, target_array):
        n_samples = data_array.shape[0]
        n_targets = target_array.shape[0]
        assert n_samples == n_targets
        self.data_array, self.target_array = data_array, target_array

    def __getitem__(self, index):
        sample = self.data_array[index]
        target = self.target_array[index]
        return sample, target

    def __len__(self):
        # The size of the first dimension is the number of samples
        return self.data_array.shape[0]

In [17]:
class ListDataset(Dataset):
    """Dataset pairing a sequence of samples with a sequence of targets.

    Sample ``i`` is ``(data_list[i], target_list[i])``. Both containers only
    need to support ``len()`` and indexing, and must have the same length.

    Arguments:
        data_list: sequence holding the samples.
        target_list: sequence holding the targets (labels).
    """

    def __init__(self, data_list, target_list):
        n_samples, n_targets = len(data_list), len(target_list)
        assert n_samples == n_targets
        self.data_list, self.target_list = data_list, target_list

    def __getitem__(self, index):
        sample = self.data_list[index]
        target = self.target_list[index]
        return sample, target

    def __len__(self):
        # One entry of data_list per sample
        return len(self.data_list)

In [18]:
# Wrap the padded training tweets and their labels, and let the loader
# draw shuffled mini-batches of `batch_size` samples from them.
train_dataset = ListDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [19]:
# Network hyperparameters
# N and S are read off the training set wrapped by the loader.
N = len(train_dataset.data_list)             # Number of tweets (eg 200000)
S = len(train_dataset.data_list[0])          # Number of words in one sentence (eg 50)
V = vector_length                            # Length of word vectors (eg 100)
K, C = 3, 64                                 # Kernel width (K*V), number of convolutional filters
# NOTE(review): this rebinds `F` and thereby hides the
# `import torch.nn.functional as F` made at the top of the notebook.
F = 2                                        # Number of output neurons in fully connected layer

class CNN(nn.Module):
    """Single-layer convolutional network for sentence classification.

    Each tweet is an (S, V) matrix of word vectors. C filters of size
    (K, V) slide over the words, the response of every filter is
    max-pooled over the whole sentence, and a linear layer maps the C
    pooled features to F class scores.

    The sizes S, V, K, C and F are read from the module-level
    hyperparameters when the network is constructed.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(             # input shape (1, S, V)
                in_channels=1,              # input channels
                out_channels=C,             # number of filters
                kernel_size=(K, V),         # filter size
                # Pad along the sentence only, derived from K so that the
                # output never gets shorter than S words (exactly S for odd
                # K). A fixed padding of 1 is only right for K == 3 and
                # makes the max-pooling below fail for larger kernels.
                padding=(K // 2, 0),
        )                                   # output shape (C, S, 1) for odd K
        self.relu = nn.ReLU()               # ReLU activation
        self.max_pool = nn.MaxPool1d(S)     # max-pool each filter into 1 output
        self.out = nn.Linear(C, F)          # fully connected layer, output F classes

    def forward(self, x):
        """Compute class scores for a batch of tweets.

        Arguments:
            x: float tensor of shape (batch, S, V).

        Returns:
            Tensor of shape (batch, F) with unnormalized class scores.
        """
        out = x.unsqueeze(1)                 # add the channel dimension: (batch, 1, S, V)
        out = self.conv1(out)                # (batch, C, S, 1)
        out = self.relu(out).squeeze(3)      # (batch, C, S)
        out = self.max_pool(out).squeeze(2)  # (batch, C)
        return self.out(out)                 # (batch, F)

In [20]:
# Instantiate the network using the hyperparameters defined above
cnn = CNN()
print(cnn)  # net architecture

CNN (
  (conv1): Conv2d(1, 64, kernel_size=(3, 100), stride=(1, 1), padding=(1, 0))
  (relu): ReLU ()
  (max_pool): MaxPool1d (size=74, stride=74, padding=0, dilation=1, ceil_mode=False)
  (out): Linear (64 -> 2)
)


In [21]:
# Cross-entropy on the raw class scores; the target labels are class
# indices, not one-hot vectors.
loss_func = nn.CrossEntropyLoss()

# Adam optimizes all parameters of the CNN
optimizer = torch.optim.Adam(params=cnn.parameters(), lr=learning_rate)

In [22]:
# Total number of mini-batches per epoch, as produced by the batch sampler
# (unlike len(train_dataset)//batch_size this also counts a final, smaller batch)
num_batches = len(train_loader.batch_sampler)

for epoch in range(num_epochs):  # loop over the dataset multiple times
    # Only the shuffled index batches are taken from the loader; the words of
    # each batch are converted to vectors on the fly by get_tweets_tensor.
    for i, batch_indices in enumerate(train_loader.batch_sampler):   # iterate over mini-batches
        # Converting tweets to vectors and storing it in a variable
        sentences = get_tweets_tensor(train_loader.dataset.data_list, batch_indices, verbose=False)
        x = Variable(sentences)

        # Converting labels to a variable
        labels = torch.from_numpy(train_loader.dataset.target_list[batch_indices])
        y = Variable(labels, requires_grad=False)

        # Forward + Backward + Optimize
        optimizer.zero_grad()         # clear gradients of the previous step
        outputs = cnn(x)              # cnn output (class scores)
        loss = loss_func(outputs, y)  # cross-entropy loss of this mini-batch
        loss.backward()               # backpropagation, compute gradients
        optimizer.step()              # apply gradients

        if (i+1) % 100 == 0:
            # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch idiom that
            # matches the Variable API used here; newer versions need loss.item().
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  %(epoch+1, num_epochs, i+1, num_batches, loss.data[0]))

print('Finished Training')

Epoch [1/1], Iter [100/22500] Loss: 0.5279
Epoch [1/1], Iter [200/22500] Loss: 0.3991
Epoch [1/1], Iter [300/22500] Loss: 0.3297
Epoch [1/1], Iter [400/22500] Loss: 0.3542
Epoch [1/1], Iter [500/22500] Loss: 0.3576
Epoch [1/1], Iter [600/22500] Loss: 0.4598
Epoch [1/1], Iter [700/22500] Loss: 0.5672
Epoch [1/1], Iter [800/22500] Loss: 0.3419
Epoch [1/1], Iter [900/22500] Loss: 0.4263
Epoch [1/1], Iter [1000/22500] Loss: 0.3910
Epoch [1/1], Iter [1100/22500] Loss: 0.3435
Epoch [1/1], Iter [1200/22500] Loss: 0.2903
Epoch [1/1], Iter [1300/22500] Loss: 0.3546
Epoch [1/1], Iter [1400/22500] Loss: 0.3693
Epoch [1/1], Iter [1500/22500] Loss: 0.3804
Epoch [1/1], Iter [1600/22500] Loss: 0.3193
Epoch [1/1], Iter [1700/22500] Loss: 0.3856
Epoch [1/1], Iter [1800/22500] Loss: 0.4196
Epoch [1/1], Iter [1900/22500] Loss: 0.3849
Epoch [1/1], Iter [2000/22500] Loss: 0.4527
Epoch [1/1], Iter [2100/22500] Loss: 0.4103
Epoch [1/1], Iter [2200/22500] Loss: 0.3842
Epoch [1/1], Iter [2300/22500] Loss: 0.42

Epoch [1/1], Iter [18600/22500] Loss: 0.4516
Epoch [1/1], Iter [18700/22500] Loss: 0.3830
Epoch [1/1], Iter [18800/22500] Loss: 0.3160
Epoch [1/1], Iter [18900/22500] Loss: 0.3741
Epoch [1/1], Iter [19000/22500] Loss: 0.4106
Epoch [1/1], Iter [19100/22500] Loss: 0.4050
Epoch [1/1], Iter [19200/22500] Loss: 0.2652
Epoch [1/1], Iter [19300/22500] Loss: 0.3877
Epoch [1/1], Iter [19400/22500] Loss: 0.3028
Epoch [1/1], Iter [19500/22500] Loss: 0.3052
Epoch [1/1], Iter [19600/22500] Loss: 0.4146
Epoch [1/1], Iter [19700/22500] Loss: 0.3539
Epoch [1/1], Iter [19800/22500] Loss: 0.4154
Epoch [1/1], Iter [19900/22500] Loss: 0.4051
Epoch [1/1], Iter [20000/22500] Loss: 0.2842
Epoch [1/1], Iter [20100/22500] Loss: 0.3116
Epoch [1/1], Iter [20200/22500] Loss: 0.3227
Epoch [1/1], Iter [20300/22500] Loss: 0.3063
Epoch [1/1], Iter [20400/22500] Loss: 0.3853
Epoch [1/1], Iter [20500/22500] Loss: 0.3737
Epoch [1/1], Iter [20600/22500] Loss: 0.3204
Epoch [1/1], Iter [20700/22500] Loss: 0.2939
Epoch [1/1

In [37]:
# Evaluate the first n predictions on the held-out validation data
# (never more than there are validation tweets)
n = min(10000, len(x_val))
# The indices are passed as a plain list: get_tweets_tensor compares them with
# `[]`, which is an ambiguous elementwise comparison for numpy arrays.
val_output_n = cnn(Variable(get_tweets_tensor(x_val, indices=list(range(n)), verbose=False)))
# The predicted class is the index of the largest score in each row
y_val_pred_n = torch.max(val_output_n, 1)[1].data.numpy().squeeze()
accuracy(y_val_pred_n, y_val[:n], verbose=True)

  if sys.path[0] == '':


Accuracy: 0.8403%


0.84030000000000005

## Step 5: Make predictions for test data and save

In [32]:
# Run the whole padded test set through the network in a single forward pass
test_output = cnn(
    Variable(get_tweets_tensor(x_text_test_pad, verbose=False))
)
# Index of the largest class score per tweet, as a numpy array
y_pred = test_output.max(1)[1].data.numpy().squeeze()

In [33]:
# Rewrite class index 0 as label -1, in place
np.putmask(y_pred, y_pred == 0, -1)
y_pred

array([-1,  1, -1, ..., -1,  1, -1])

In [34]:
# Ids count the predictions starting at 1: 1, 2, ..., len(y_pred)
ids = np.arange(1, len(y_pred) + 1)
ids

array([    1,     2,     3, ...,  9998,  9999, 10000])

In [35]:
# Hand the ids and predicted labels to the project helper.
# NOTE(review): presumably this writes 'kaggle_submission.csv' in the working directory -- confirm in data.py.
create_csv_submission(ids, y_pred, 'kaggle_submission.csv')