In [None]:
import csv
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import DataLoader, Dataset

import numpy as np
import gensim
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%load_ext autoreload

## Setup the Training, Validation, and Test/Prediction Sets

In [None]:
# Fixed seed so the shuffle below (and therefore the train/validation split)
# is identical after every "Restart Kernel -> Run All".
SEED = 42
# Number of shuffled rows that go to training; the remaining rows are validation.
TRAIN_SIZE = 135000

# Pre-trained word2vec vectors used to embed every token.
word_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
stop_words = set(stopwords.words('english'))
trainings_df = pd.read_table("train.tsv")  # 156060 (original author's note)
test_df = pd.read_table("test.tsv")  # 66292 (original author's note)

# Shuffle once, deterministically, then split by position.
trainings_df = trainings_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
train_df = trainings_df[:TRAIN_SIZE]
val_df = trainings_df[TRAIN_SIZE:]

# Drop rows with missing values from each split.
train_df = train_df.dropna()
test_df = test_df.dropna()
val_df = val_df.dropna()

In [None]:
# Sanity check: row counts of the test / train / validation frames, followed by
# the column labels of the test and training frames.
print(test_df.shape[0], train_df.shape[0], val_df.shape[0])
print(test_df.columns, train_df.columns)

## Custom Dataset class

In [None]:
class SentimentData(Dataset):
    """Dataset of dimensionality-reduced sentence embeddings with their labels.

    The frame is converted to ``(embeddings, labels)`` by ``get_data`` and the
    embeddings are then passed through ``reduction``.

    Args:
        df: frame handed to ``get_data``.
        reduction: reducer object exposing ``fit_transform`` (and ``transform``
            when ``fit`` is False), e.g. a scikit-learn ``PCA``.
        fit: when True (default, the original behaviour) the reducer is fitted
            on this data via ``fit_transform``. Pass False to reuse a reducer
            that was already fitted elsewhere (typically on the training
            split) so that this data is projected into the same space.
    """

    def __init__(self, df, reduction, fit=True):
        embeddings, labels = get_data(df)
        if fit:
            reduced_embeddings = reduction.fit_transform(embeddings)
        else:
            # Re-fitting on held-out data would put it in a different basis
            # than the training data; only project it.
            reduced_embeddings = reduction.transform(embeddings)

        self.embeddings = reduced_embeddings
        self.labels = labels

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(reduced_embedding, label)`` pair at position ``idx``."""
        return self.embeddings[idx], self.labels[idx]

### Helper Methods for processing the sentences into embeddings

In [None]:
def filter_sentence(sentence):
    """Tokenize a sentence and keep only the tokens that can be embedded.

    Args:
        sentence: raw text of one phrase.

    Returns:
        List of lower-cased tokens that are not in ``stop_words`` and that
        ``word_model`` has a vector for. May be empty.
    """
    tokens = gensim.utils.simple_preprocess(sentence)
    # Membership is tested on the KeyedVectors object itself: the ``.vocab``
    # attribute was removed in gensim 4, while ``in`` works on 3.x and 4.x.
    return [w.lower() for w in tokens if w not in stop_words and w in word_model]

def get_sentence_embedding(sentence):
    """Average the word vectors of a tokenized sentence.

    Args:
        sentence: sequence of tokens, each of which must be present in
            ``word_model``.

    Returns:
        1-D float array of length ``word_model.vector_size`` holding the mean
        of the token vectors; an all-zero vector when ``sentence`` is empty
        (the mean of zero rows would otherwise be NaN).

    Raises:
        AssertionError: if a looked-up vector contains NaN or infinity.
    """
    dim = word_model.vector_size
    if len(sentence) == 0:
        return np.zeros(dim)

    vectors = np.zeros((len(sentence), dim))
    for i, word in enumerate(sentence):
        # Index the KeyedVectors directly; the ``.wv`` indirection is
        # deprecated on KeyedVectors in gensim 3.x and absent in gensim 4.
        embedding = word_model[word]
        assert np.all(np.isfinite(embedding))
        vectors[i] = embedding
    return np.mean(vectors, axis=0)


def get_data(df):
    """Turn a phrase/sentiment frame into an embedding matrix and a label vector.

    Every ``Phrase`` is tokenized with ``filter_sentence``; phrases with no
    remaining tokens are skipped. The kept phrases are embedded with
    ``get_sentence_embedding`` and the global scalar mean of the whole matrix
    is subtracted.

    Args:
        df: table with ``Phrase`` and ``Sentiment`` columns.

    Returns:
        ``(embeddings, labels)`` where ``embeddings`` is a float array of shape
        ``(num_kept, 300)`` and ``labels`` is a float array of shape
        ``(num_kept,)`` holding the sentiment class of each kept phrase.
        Labels are class indices rather than one-hot rows because
        ``nn.CrossEntropyLoss`` does not take one-hot encoded targets.
    """
    # Walk the two columns directly: DataFrame.iterrows() materialises a
    # Series per row, which is very slow on a frame of this size.
    all_sentences = []
    for phrase, sentiment in zip(df['Phrase'], df['Sentiment']):
        filtered = filter_sentence(phrase)
        if filtered:
            all_sentences.append((filtered, sentiment))

    num_data = len(all_sentences)
    embeddings = np.zeros((num_data, 300))
    labels = np.zeros(num_data)
    for i, (sentence, sentiment) in enumerate(all_sentences):
        embeddings[i] = get_sentence_embedding(sentence)
        labels[i] = sentiment

    # Normalize the data a bit before PCA by removing the global mean.
    # Skipped when nothing was kept: the mean of an empty array is NaN and
    # emits a RuntimeWarning.
    if num_data:
        embeddings -= np.mean(embeddings)
    return embeddings, labels

## The Training and Validation Loops

In [None]:
def train(model, trainloader, valloader, epochs=2, lr=1e-3, regularization=1e-4, print_every=2, log_every=2000):
    """Train ``model`` with Adam and cross-entropy loss, reporting accuracy along the way.

    Args:
        model: ``nn.Module`` mapping a float batch to class scores.
        trainloader: iterable of ``(inputs, labels)`` training batches.
        valloader: iterable of ``(inputs, labels)`` validation batches.
        epochs: number of passes over ``trainloader``.
        lr: Adam learning rate.
        regularization: Adam ``weight_decay``.
        print_every: accuracy on both loaders is reported after every epoch
            whose index is a multiple of this value.
        log_every: the mean running loss is printed every ``log_every``
            mini-batches.

    Returns:
        The same ``model`` object, trained in place.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=regularization)

    for epoch in range(epochs):  # loop over the dataset multiple times
        # Make sure Dropout (and similar layers) are in training mode, even if
        # the model was handed over in eval mode or an evaluation left it there.
        model.train()

        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs = inputs.float()
            labels = labels.long()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / log_every))
                running_loss = 0.0

        if epoch % print_every == 0:
            test(model, trainloader, name="train")
            test(model, valloader)

    print('Finished Training')
    test(model, valloader)
    return model

def test(model, valloader, name="val"):
    correct = 0
    total = 0
    with torch.no_grad():
        for data in valloader:
            
            images, labels = data
            images = images.float()
            labels = labels.long()
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the %s %% images: %d %%' % (name,
        100 * correct / total))
    
    

In [None]:
class _FittedReduction:
    """Adapter that makes an already-fitted reducer project without re-fitting.

    ``SentimentData`` calls ``fit_transform`` on whatever reducer it receives;
    this wrapper routes that call to the wrapped reducer's ``transform``.
    """

    def __init__(self, fitted):
        self._fitted = fitted

    def fit_transform(self, X):
        return self._fitted.transform(X)

    def transform(self, X):
        return self._fitted.transform(X)


def get_loaders(train_df, val_df, reduction=None, batch_size=4):
    """Build the training and validation ``DataLoader`` objects.

    The reducer is fitted on the training frame only; the validation frame is
    projected with that fitted reducer so both splits share one feature space.

    Args:
        train_df: frame used to build the training ``SentimentData``.
        val_df: frame used to build the validation ``SentimentData``.
        reduction: reducer with ``fit_transform`` (and ideally ``transform``).
            Defaults to a fresh ``PCA(n_components=20)`` per call, so no
            fitted state is shared between calls.
        batch_size: mini-batch size for both loaders.

    Returns:
        ``(trainloader, valloader)``. The training loader shuffles; the
        validation loader keeps a fixed order.
    """
    if reduction is None:
        reduction = PCA(n_components=20)

    train_data = SentimentData(train_df, reduction)
    # Reducers without ``transform`` cannot be reused, so they keep the
    # original behaviour of being fitted on the validation data as well.
    val_reduction = _FittedReduction(reduction) if hasattr(reduction, "transform") else reduction
    val_data = SentimentData(val_df, val_reduction)

    trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    return trainloader, valloader

In [None]:
# Width of the PCA-reduced embeddings fed to the network, width of the
# network's final layer, and the mini-batch size.
input_size, output_size, batch_size = 200, 5, 16

# Build both loaders with a PCA that reduces to `input_size` components.
t, v = get_loaders(
    train_df,
    val_df,
    reduction=PCA(n_components=input_size),
    batch_size=batch_size,
)
  

In [None]:
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, input):
        leading = input.shape[0]
        return input.view(leading, -1)
class UnFlatten(nn.Module):
    """Reshape to ``(first_dim, 1, everything_else)`` by adding a singleton axis."""

    def forward(self, input):
        leading = input.shape[0]
        return input.view(leading, 1, -1)

# Widths of the fully connected layers and channel counts of the three
# 1-D convolutions.
hidden_size = [256, 128, 128, 256]
channels = [1, 32, 64, 32]

# Linear projection -> three Conv1d/ReLU/Dropout/MaxPool stages -> linear classifier.
complicated = nn.Sequential(
    nn.Linear(input_size, hidden_size[0]),
    nn.ReLU(),
    UnFlatten(),
    nn.Conv1d(channels[0], channels[1], kernel_size=3, padding=1, stride=1),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.MaxPool1d(kernel_size=2),
    nn.Conv1d(channels[1], channels[2], kernel_size=5, padding=2, stride=1),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.MaxPool1d(kernel_size=2),
    nn.Conv1d(channels[2], channels[3], kernel_size=3, padding=1, stride=1),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.MaxPool1d(kernel_size=2),
    Flatten(),
    # The convolutions keep the length (padding matches kernel size) and each
    # of the three MaxPool1d(2) layers halves it, so the flattened width is
    # channels[3] * hidden_size[0] / 8. Integer division is required: `/`
    # yields a float in Python 3, which nn.Linear rejects as in_features.
    nn.Linear(channels[3] * (hidden_size[0] // 8), output_size),
)

# Plain fully connected baseline: Linear + ReLU for each hidden width,
# followed by a final Linear layer producing `output_size` scores.
simple = nn.Sequential(
    *[
        layer
        for fan_in, fan_out in zip([input_size] + hidden_size[:-1], hidden_size)
        for layer in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_size[-1], output_size),
)

In [None]:
# Train the convolutional model on the loaders built above.
trained = train(
    complicated,
    t,
    v,
    epochs=10,
    lr=1e-3,
    regularization=5e-3,
)