# Model 4

In [1]:
import os 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gensim
import Cython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset, DataLoader

from helpers import *
from data import create_csv_submission

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
cuda_available = torch.cuda.is_available()

## Step 1: Load tweets

In [3]:
DATA_PATH = '../twitter-datasets/'
MODEL_PATH = '../models/'

#TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg.txt') # 100'000 negative tweets
#TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos.txt') # 100'000 positive tweets
TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt') # 2'500'000 negative tweets
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt') # 2'500'000 positive tweets
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [4]:
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)

In [5]:
x_text_test = load_test_data(TEST_PATH)

## tfidf

In [6]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1)
matrix = vectorizer.fit_transform([x for x in x_text_train + x_text_test])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size : {}'.format(len(tfidf)))

vocab size : 496335


## Step 2: Build word2vec vocabulary

In [7]:
vector_length = 100

In [8]:
w2v_model = gensim.models.Word2Vec(x_text_train + x_text_test, min_count=1, workers=8, size=vector_length)

In [9]:
print(w2v_model)

Word2Vec(vocab=433403, size=100, alpha=0.025)


In [10]:
w2v_model.save(MODEL_PATH + 'twitter_w2v_preprocessed.bin')

In [8]:
w2v_model = gensim.models.Word2Vec.load(MODEL_PATH + 'twitter_w2v.bin')

In [9]:
# When training finished delete the training model but retain the word vectors:
word_vectors = w2v_model.wv
del w2v_model

In [10]:
word_vectors.wv['computer'].shape

(100,)

In [11]:
word_vectors.most_similar('computer')

[('internet', 0.708969235420227),
 ('desktop', 0.6651554107666016),
 ('calculator', 0.6596795320510864),
 ('settings', 0.6567735075950623),
 ('laptop', 0.6565126180648804),
 ('phone', 0.6472017765045166),
 ('browser', 0.635584831237793),
 ('portfolio', 0.6346469521522522),
 ('wifi', 0.6323057413101196),
 ('desk', 0.6210508346557617)]

## Step 3: Convert tweets into sentences of vectors

In [12]:
# Compute the number of words of the longest tweet to get the maximal sentence length
sequence_length_train = max(len(x) for x in x_text_train)
sequence_length_test = max(len(x) for x in x_text_test)
sequence_length = max(sequence_length_train, sequence_length_test)
print('Maximum sequence length of train and test data:', sequence_length)

x_text_train_pad = pad_sentences(x_text_train, padding_word="<PAD/>", sequence_length=sequence_length)
x_text_test_pad = pad_sentences(x_text_test, padding_word="<PAD/>", sequence_length=sequence_length)

del x_text_train
del x_text_test

Maximum sequence length of train and test data: 74


In [13]:
# Split into training and validation data
x_train, x_val, y_train, y_val = train_test_split(x_text_train_pad, y_train_full, test_size=0.01, random_state=42)

In [14]:
len(x_train), len(x_train[0])

(2475000, 74)

In [15]:
len(x_val), len(x_val[0])

(25000, 74)

In [16]:
# TODO: SLOW!

def get_tweets_tensor(tweets, indices=[], verbose=False):
    '''Mapping every word to a vector from word2vec
    Padding words are mapped to zero
    Leave indices empty to map every tweet in tweets
    '''

    nb_tweets = len(tweets) if len(indices)==0 else len(indices)
    tweets_vec = np.zeros((nb_tweets, len(tweets[0]), vector_length), dtype=np.float32)
    
    if indices == []:
        for idx_t, tweet in enumerate(tweets):
            for idx_w, word in enumerate(tweet):
                if word != '<PAD/>':
                    tweets_vec[idx_t, idx_w] = word_vectors.wv[word] * tfidf[word]
            if verbose and idx_t % 100000 == 0:
                print('Transformed {}/{} tweets'.format(idx_t+1, (len(x_text_train_pad))))
    else:
        for idx_t, orig_idx in enumerate(indices):
            for idx_w, word in enumerate(tweets[orig_idx]):
                if word != '<PAD/>':
                    tweets_vec[idx_t, idx_w] = word_vectors.wv[word] * tfidf[word]  
            if verbose and idx_t % 100000 == 0:
                print('Transformed {}/{} tweets'.format(idx_t+1, (len(x_text_train_pad))))
    
    return torch.from_numpy(tweets_vec)

## Step 4: Classification

In [17]:
# Hyper Parameters
num_epochs = 3
batch_size = 100
learning_rate = 0.001

In [18]:
class ListDataset(Dataset):
    """Dataset wrapping data and target lists.

    Each sample will be retrieved by indexing both lists along the first
    dimension.

    Arguments:
        data_list (python list): contains sample data.
        target_list (python list): contains sample targets (labels).
    """

    def __init__(self, data_list, target_list):
        assert len(data_list) == len(target_list)
        self.data_list = data_list
        self.target_list = target_list

    def __getitem__(self, index):
        return self.data_list[index], self.target_list[index]

    def __len__(self):
        return len(self.data_list)

In [19]:
train_dataset = ListDataset(x_train, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)

In [21]:
# Network hyperparameters
N = len(train_loader.dataset.data_list)      # Number of tweets (eg 200000)
S = len(train_loader.dataset.data_list[0])   # Number of words in one sentence (eg 50)
V = vector_length                            # Length of word vectors (eg 100)
K = 3                                        # Kernel width (K*V)
C = 256                                      # Number of convolutional filters
F = 2                                        # Number of output neurons in fully connected layer

class Model4(nn.Module):
    def __init__(self):
        super(Model4, self).__init__()
        self.bn1 = nn.BatchNorm1d(1)
        self.conv1 = nn.Conv2d(             # input shape (1, S, V)
                in_channels=1,              # input channels
                out_channels=C,             # number of filters
                kernel_size=(K,V),          # filter size
                padding=(K//2,0)               # to keep size S
        )                                   # output shape (C, S, 1)
        self.relu = nn.ReLU()               # ReLU activation
        self.bn = nn.BatchNorm1d(C)
        self.max_pool1 = nn.MaxPool1d(2)    # max-pool each filter into S/2 output   
        self.conv2 = nn.Conv1d(             # input shape (C, S/2)
                in_channels=C,              # input channels
                out_channels=C,             # number of filters (one for each input channel)
                kernel_size=K,              # filter size
                padding=K//2                   # to keep size S/2
        )                                   # output shape (C, S/2)
        self.max_pool2 = nn.MaxPool1d(int(S/2)) # max pool each filter into 1 output
        self.dropout= nn.Dropout(p=0.2)
        self.out = nn.Linear(C, F)          # fully connected layer, output F classes
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):        
        out = x.unsqueeze(1)                
        out = self.conv1(out)        
        out = self.bn(out)
        out = self.relu(out).squeeze(3)        
        out = self.dropout(out)
        out = self.max_pool1(out)        
        out = self.conv2(out)        
        out = self.bn(out)
        out = self.relu(out)        
        out = self.dropout(out)
        out = self.max_pool2(out).squeeze(2)        
        out = self.out(out)        
        out = self.softmax(out)
        return out

In [22]:
cnn = Model4()
if torch.cuda.is_available():
    cnn.cuda()
print(cnn)  # net architecture

Model4(
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True)
  (conv1): Conv2d (1, 256, kernel_size=(3, 100), stride=(1, 1), padding=(1, 0))
  (relu): ReLU()
  (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True)
  (max_pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d (256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (max_pool2): MaxPool1d(kernel_size=37, stride=37, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.2)
  (out): Linear(in_features=256, out_features=2)
  (softmax): Softmax()
)


In [23]:
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)   # optimize all cnn parameters
if torch.cuda.is_available():
    loss_func = nn.CrossEntropyLoss().cuda()                                  # the target label is not one-hotted
else:
    loss_func = nn.CrossEntropyLoss()

In [24]:
cnn.train()

for epoch in range(num_epochs):  # loop over the dataset multiple times 
    for i, batch_indices in enumerate(train_loader.batch_sampler):   # iterate over mini-batches  
        # Converting tweets to vectors and storing it in a variable
        sentences = get_tweets_tensor(train_loader.dataset.data_list, batch_indices, verbose=False)
        if torch.cuda.is_available():
            sentences = sentences.cuda()
        x = Variable(sentences)
        
        # Converting labels to a variable
        labels = torch.from_numpy(train_loader.dataset.target_list[batch_indices])
        if torch.cuda.is_available():
            labels = labels.cuda()
        y = Variable(labels, requires_grad=False)
        
        # Forward + Backward + Optimize
        optimizer.zero_grad() # reset gradient
        outputs = cnn(x) # cnn output
        loss = loss_func(outputs, y) # clear gradients for this training step
        loss.backward() # backpropagation, compute gradients
        optimizer.step() # apply gradients
        
        if (i+1) % 1000 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' 
                  %(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))

print('Finished Training')

Epoch [1/3], Iter [1000/24750] Loss: 0.4858
Epoch [1/3], Iter [2000/24750] Loss: 0.5104
Epoch [1/3], Iter [3000/24750] Loss: 0.4586
Epoch [1/3], Iter [4000/24750] Loss: 0.4572
Epoch [1/3], Iter [5000/24750] Loss: 0.4990
Epoch [1/3], Iter [6000/24750] Loss: 0.3994
Epoch [1/3], Iter [7000/24750] Loss: 0.5217
Epoch [1/3], Iter [8000/24750] Loss: 0.4135
Epoch [1/3], Iter [9000/24750] Loss: 0.4457
Epoch [1/3], Iter [10000/24750] Loss: 0.5316
Epoch [1/3], Iter [11000/24750] Loss: 0.4600
Epoch [1/3], Iter [12000/24750] Loss: 0.4674
Epoch [1/3], Iter [13000/24750] Loss: 0.4056
Epoch [1/3], Iter [14000/24750] Loss: 0.4205
Epoch [1/3], Iter [15000/24750] Loss: 0.4388
Epoch [1/3], Iter [16000/24750] Loss: 0.4622
Epoch [1/3], Iter [17000/24750] Loss: 0.4367
Epoch [1/3], Iter [18000/24750] Loss: 0.4236
Epoch [1/3], Iter [19000/24750] Loss: 0.4627
Epoch [1/3], Iter [20000/24750] Loss: 0.4879
Epoch [1/3], Iter [21000/24750] Loss: 0.4587
Epoch [1/3], Iter [22000/24750] Loss: 0.4774
Epoch [1/3], Iter [

In [25]:
# Evaluate accuracy of predictions from validation data
cnn.eval()
accuracy = 0
nb_steps = 0

step_size = 100
for i in range(0, 25000, step_size):
    val_output = cnn(Variable(get_tweets_tensor(x_val[i:i+step_size]).cuda()))
    y_val_pred = torch.max(val_output.cpu(), 1)[1].data.numpy().squeeze()
    accuracy += accuracy_score(y_val[i:i+step_size], y_val_pred)
    nb_steps += 1

print('Validation accuracy:', accuracy/nb_steps)

Validation accuracy: 0.77364


## Step 5: Make predictions for test data and save

In [26]:
cnn.cpu()

Model4(
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True)
  (conv1): Conv2d (1, 256, kernel_size=(3, 100), stride=(1, 1), padding=(1, 0))
  (relu): ReLU()
  (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True)
  (max_pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d (256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (max_pool2): MaxPool1d(kernel_size=37, stride=37, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.2)
  (out): Linear(in_features=256, out_features=2)
  (softmax): Softmax()
)

In [27]:
torch.save(cnn, './model4.pth')

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
test_output = cnn(Variable(get_tweets_tensor(x_text_test_pad, verbose=False)))
y_pred = torch.max(test_output, 1)[1].data.numpy().squeeze()

In [None]:
y_pred[y_pred == 0] = -1
y_pred

In [None]:
ids = np.arange(len(y_pred)+1)[1:]
ids

In [None]:
create_csv_submission(ids, y_pred, 'kaggle_submission.csv')