# Twitter Sentiment Classification - Version 1

In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
import Cython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

from helpers import *
from data import create_csv_submission

## Step 1: Load tweets

In [2]:
# Directory holding the tweet corpora, relative to this notebook.
DATA_PATH = '../twitter-datasets/'

# Small training set: 100'000 negative and 100'000 positive tweets.
TRAIN_NEG_PATH, TRAIN_POS_PATH = (
    os.path.join(DATA_PATH, file_name)
    for file_name in ('train_neg.txt', 'train_pos.txt')
)
# Full training set (2'500'000 tweets per class): uncomment to use it instead.
#TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt')
#TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt')

# Unlabelled tweets to predict for the submission.
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [3]:
# Read the positive and negative training tweets together with their labels.
# NOTE(review): `load_data_and_labels` is not defined in this notebook (presumably
# it comes from the `helpers` star-import); later cells treat each tweet as a
# sequence of words and the labels as 0/1 -- confirm against the helper.
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)

In [4]:
# Read the unlabelled test tweets.
# NOTE(review): `load_test_data` is not defined in this notebook (presumably it
# comes from the `helpers` star-import) -- confirm it tokenizes like the training loader.
x_text_test = load_test_data(TEST_PATH)

## Step 2: Build word2vec vocabulary

In [5]:
# Dimensionality of every word embedding.
vector_length = 100

# The embeddings are fitted on train AND test tweets; min_count=1 keeps every
# token in the vocabulary, so the per-word lookups further down cannot miss.
w2v_corpus = x_text_train + x_text_test
w2v_model = gensim.models.Word2Vec(
    w2v_corpus,
    size=vector_length,
    min_count=1,
    workers=4,
)

In [6]:
# Show the model summary (vocabulary size, vector size, learning rate).
print(w2v_model)

Word2Vec(vocab=103942, size=100, alpha=0.025)


In [7]:
# Training is finished: keep only the word vectors (`.wv`) and delete the
# model object itself, which no later cell in this notebook uses.
word_vectors = w2v_model.wv
del w2v_model

In [8]:
# Sanity check: one embedding has `vector_length` components.
# `word_vectors` already is the `.wv` object, so it is indexed directly
# (the same way `most_similar` is called on it in the next cell).
word_vectors['computer'].shape

(100,)

In [9]:
# Sanity check: list the words whose vectors are most similar to a sample word.
word_vectors.most_similar('computer')

[('pocket', 0.8003799915313721),
 ('data', 0.8002138137817383),
 ('security', 0.7997422218322754),
 ('etch', 0.7943888902664185),
 ('notes', 0.791228175163269),
 ('desk', 0.7858865261077881),
 ('networking', 0.774237334728241),
 ('protocols', 0.7616873979568481),
 ('wave', 0.7598108053207397),
 ('wallet', 0.7540020942687988)]

## Step 3: Convert tweets into sentences of vectors

In [10]:
# The longest tweet across train and test fixes the common sentence length.
sequence_length_train = max(map(len, x_text_train))
sequence_length_test = max(map(len, x_text_test))
sequence_length = max(sequence_length_train, sequence_length_test)
print('Maximum sequence length of train and test data:', sequence_length)

# Both splits are padded with the same token up to the same length.
padding_options = dict(padding_word="<PAD/>", sequence_length=sequence_length)
x_text_train_pad = pad_sentences(x_text_train, **padding_options)
x_text_test_pad = pad_sentences(x_text_test, **padding_options)

Maximum sequence length of train and test data: 50


In [11]:
# Mapping every word to a vector from word2vec
# Padding words are mapped to zero

def tweets_to_vectors(tweets_pad, keyed_vectors, seq_len, vec_len,
                      padding_word='<PAD/>', log_every=25000):
    """Turn padded tweets into a dense array of word vectors.

    Args:
        tweets_pad: sequence of tweets, each a sequence of `seq_len` words.
        keyed_vectors: mapping-like object returning a length-`vec_len`
            vector for `keyed_vectors[word]`.
        seq_len: number of words per (padded) tweet.
        vec_len: length of one word vector.
        padding_word: token whose positions stay all-zero.
        log_every: print a progress line every `log_every` tweets.

    Returns:
        Array of shape (len(tweets_pad), seq_len, vec_len).
    """
    n_tweets = len(tweets_pad)
    # Start from zeros so padding positions need no explicit write and the
    # zero vector always has the right length (no hard-coded 100).
    tweets_vec = np.zeros((n_tweets, seq_len, vec_len))

    # TODO: SLOW! (one Python-level lookup per word)
    for idx_t, tweet in enumerate(tweets_pad):
        for idx_w, word in enumerate(tweet):
            if word != padding_word:
                tweets_vec[idx_t, idx_w] = keyed_vectors[word]
        if idx_t % log_every == 0:
            print('Transformed {}/{} tweets'.format(idx_t+1, n_tweets))
    return tweets_vec


tweets_vec_train = tweets_to_vectors(x_text_train_pad, word_vectors,
                                     sequence_length, vector_length)
tweets_vec_test = tweets_to_vectors(x_text_test_pad, word_vectors,
                                    sequence_length, vector_length)

Transformed 1/200000 tweets
Transformed 25001/200000 tweets
Transformed 50001/200000 tweets
Transformed 75001/200000 tweets
Transformed 100001/200000 tweets
Transformed 125001/200000 tweets
Transformed 150001/200000 tweets
Transformed 175001/200000 tweets
Transformed 1/10000 tweets


In [12]:
# Hold out 10% of the vectorised training tweets for validation;
# the fixed random_state keeps the split the same on every run.
x_train, x_val, y_train, y_val = train_test_split(tweets_vec_train, y_train_full, test_size=0.1, random_state=42)

In [13]:
# Shapes of the training inputs and labels
x_train.shape, y_train.shape

((180000, 50, 100), (180000,))

In [14]:
# Shapes of the validation inputs and labels
x_val.shape, y_val.shape

((20000, 50, 100), (20000,))

In [15]:
# Shape of the vectorised test tweets
tweets_vec_test.shape

(10000, 50, 100)

## Step 4: Classification

In [16]:
# Training hyper-parameters
num_epochs, batch_size = 1, 100   # passes over the training set, tweets per mini-batch
learning_rate = 1e-3              # step size handed to the optimizer

In [17]:
# Wrap the training arrays as tensors (no copy is needed for from_numpy)
# and serve them in shuffled mini-batches.
train_features = torch.from_numpy(x_train)
train_labels = torch.from_numpy(y_train)
train_dataset = TensorDataset(train_features, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [18]:
# Network hyperparameters
# The dimensions are read from the training array that backs `train_dataset`
# instead of `train_loader.dataset.data_tensor`, because that attribute name
# is specific to old PyTorch releases (newer ones store `.tensors`).
N, S, V = x_train.shape  # N: training tweets (eg 180000), S: words per sentence (eg 50), V: word-vector length (eg 100)
K = 3                    # Kernel width (K*V)
C = 32                   # Number of convolutional filters
# NOTE(review): this rebinds `F`, which the import cell uses for
# `torch.nn.functional`; nothing in this notebook calls the functional API
# afterwards, but consider renaming if that changes.
F = 2                    # Number of output neurons in fully connected layer

class CNN(nn.Module):
    """Single-convolution sentence classifier over word-vector matrices.

    A sentence is treated as a one-channel image of shape (S, V). Each filter
    spans `kernel_width` consecutive words and the full vector length, the
    filter responses are max-pooled over the whole sentence, and a linear
    layer maps the pooled features to class scores.

    Args:
        seq_len: words per sentence; defaults to the notebook-level `S`.
        vec_len: word-vector length; defaults to the notebook-level `V`.
        kernel_width: words covered by one filter; defaults to `K`.
        num_filters: number of convolutional filters; defaults to `C`.
        num_classes: number of output scores; defaults to `F`.

    Raises:
        ValueError: if the kernel is wider than the padded sentence.
    """

    def __init__(self, seq_len=None, vec_len=None, kernel_width=None,
                 num_filters=None, num_classes=None):
        super(CNN, self).__init__()
        # Fall back to the notebook-level hyperparameters so `CNN()` behaves
        # exactly as before; the globals are only read when an argument is omitted.
        seq_len = S if seq_len is None else seq_len
        vec_len = V if vec_len is None else vec_len
        kernel_width = K if kernel_width is None else kernel_width
        num_filters = C if num_filters is None else num_filters
        num_classes = F if num_classes is None else num_classes

        # "Same" padding along the word axis for stride 1: (kernel - 1) / 2.
        pad = (kernel_width - 1) // 2
        # Resulting length along the word axis (equals seq_len for odd kernels).
        conv_len = seq_len + 2 * pad - kernel_width + 1
        if conv_len < 1:
            raise ValueError(
                'kernel_width={} is too large for seq_len={}'.format(kernel_width, seq_len))

        self.conv1 = nn.Conv2d(                      # input shape (1, S, V)
                in_channels=1,                       # input channels
                out_channels=num_filters,            # number of filters
                kernel_size=(kernel_width, vec_len), # filter covers whole word vectors
                padding=(pad, 0),                    # pad the word axis only
        ).double()                                   # float64 weights to match float64 inputs
        self.relu = nn.ReLU()                        # ReLU activation
        self.max_pool = nn.MaxPool1d(conv_len)       # max-pool each filter into 1 output
        self.out = nn.Linear(num_filters, num_classes)  # fully connected layer

    def forward(self, x):
        """Return one row of class scores per input sentence.

        Args:
            x: batch of sentences, shape (batch, S, V), float64.
        """
        out = x.unsqueeze(1)                         # add the channel axis
        out = self.conv1(out)
        out = self.relu(out).squeeze(3)              # drop the collapsed vector axis
        out = self.max_pool(out).squeeze(2).float()  # one value per filter, cast for the float32 linear layer
        out = self.out(out)
        return out

In [19]:
# Build the network from the notebook-level hyperparameters and show its layers
cnn = CNN()
print(cnn)  # net architecture

CNN (
  (conv1): Conv2d(1, 32, kernel_size=(3, 100), stride=(1, 1), padding=(1, 0))
  (relu): ReLU ()
  (max_pool): MaxPool1d (size=50, stride=50, padding=0, dilation=1, ceil_mode=False)
  (out): Linear (32 -> 2)
)


In [20]:
# Adam over every parameter of the network, with cross-entropy on the raw class scores
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)   # optimize all cnn parameters
loss_func = nn.CrossEntropyLoss()                                  # the target label is not one-hotted

In [21]:
# Mini-batches per epoch. len(train_loader) also counts a final partial batch,
# which len(train_dataset)//batch_size would miss.
total_steps = len(train_loader)

for epoch in range(num_epochs):  # loop over the dataset multiple times
    for i, (sentences, labels) in enumerate(train_loader):   # iterate over mini-batches
        x = Variable(sentences)
        y = Variable(labels, requires_grad=False)

        # Forward + Backward + Optimize
        optimizer.zero_grad()         # clear gradients left from the previous step
        outputs = cnn(x)              # class scores for this mini-batch
        loss = loss_func(outputs, y)  # cross-entropy against the integer labels
        loss.backward()               # backpropagation, compute gradients
        optimizer.step()              # apply gradients

        if (i+1) % 100 == 0:
            # NOTE(review): `loss.data[0]` is the old-PyTorch way to read the
            # scalar; newer releases expect `loss.item()` -- confirm the target version.
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  %(epoch+1, num_epochs, i+1, total_steps, loss.data[0]))

print('Finished Training')

Epoch [1/1], Iter [100/1800] Loss: 0.6104
Epoch [1/1], Iter [200/1800] Loss: 0.4573
Epoch [1/1], Iter [300/1800] Loss: 0.4746
Epoch [1/1], Iter [400/1800] Loss: 0.5232
Epoch [1/1], Iter [500/1800] Loss: 0.3982
Epoch [1/1], Iter [600/1800] Loss: 0.4205
Epoch [1/1], Iter [700/1800] Loss: 0.3745
Epoch [1/1], Iter [800/1800] Loss: 0.4700
Epoch [1/1], Iter [900/1800] Loss: 0.3834
Epoch [1/1], Iter [1000/1800] Loss: 0.4419
Epoch [1/1], Iter [1100/1800] Loss: 0.4094
Epoch [1/1], Iter [1200/1800] Loss: 0.3339
Epoch [1/1], Iter [1300/1800] Loss: 0.4181
Epoch [1/1], Iter [1400/1800] Loss: 0.3679
Epoch [1/1], Iter [1500/1800] Loss: 0.4670
Epoch [1/1], Iter [1600/1800] Loss: 0.4493
Epoch [1/1], Iter [1700/1800] Loss: 0.4068
Epoch [1/1], Iter [1800/1800] Loss: 0.4140
Finished Training


In [22]:
# Print 10 predictions from the validation data next to their true labels
val_output_10 = cnn(Variable(torch.from_numpy(x_val[:10])))
# torch.max(..., 1)[1] is the index of the highest score per row, i.e. the predicted class
y_val_pred_10 = torch.max(val_output_10, 1)[1].data.numpy().squeeze()
print(y_val_pred_10, 'prediction number')
print(y_val[:10], 'real number')
# NOTE(review): `accuracy` is not defined in this notebook (presumably from `helpers`);
# the output below shows a fraction printed with a '%' sign -- confirm the intended format.
accuracy(y_val_pred_10, y_val[:10], verbose=True)

[1 1 0 1 1 1 0 1 0 1] prediction number
[0 1 0 1 1 1 0 0 0 1] real number
Accuracy: 0.8%


0.80000000000000004

In [23]:
# Validation accuracy: score the whole held-out split in a single forward pass
val_inputs = Variable(torch.from_numpy(x_val))
val_output = cnn(val_inputs)
val_argmax = torch.max(val_output, 1)[1]      # index of the highest score = predicted class
y_val_pred = val_argmax.data.numpy().squeeze()

accuracy(y_val_pred, y_val, verbose=True)

Accuracy: 0.80595%


0.80595000000000006

## Step 5: Make predictions for test data and save

In [24]:
# Predict a class for every test tweet (single forward pass over the whole set)
test_inputs = Variable(torch.from_numpy(tweets_vec_test))
test_output = cnn(test_inputs)
test_argmax = torch.max(test_output, 1)[1]    # index of the highest score = predicted class
y_pred = test_argmax.data.numpy().squeeze()

In [25]:
# Relabel class 0 as -1 in place, then display the predictions
is_class_zero = (y_pred == 0)
y_pred[is_class_zero] = -1
y_pred

array([-1, -1, -1, ..., -1,  1, -1])

In [26]:
# One 1-based id per prediction: 1, 2, ..., len(y_pred)
ids = np.arange(1, len(y_pred) + 1)
ids

array([    1,     2,     3, ...,  9998,  9999, 10000])

In [27]:
# Write the ids and predicted labels to 'kaggle_submission.csv'.
# NOTE(review): `create_csv_submission` comes from the project's `data` module;
# the exact column layout is defined there -- confirm before uploading.
create_csv_submission(ids, y_pred, 'kaggle_submission.csv')