### Functions

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
pd.set_option('display.max_colwidth', -1)
import nltk
nltk.download('stopwords')

negative = 0
somewhat_negative = 1
neutral = 2
somewhat_positive = 3
positive = 4

train_fn = '/home/vule/projects/pinocchio/data/movie-review-sentiment-analysis-kernels-only/train.tsv'
test_fn = '/home/vule/projects/pinocchio/data/movie-review-sentiment-analysis-kernels-only/test.tsv'
glove_fn = '/home/vule/projects/pinocchio/data/glove-global-vectors-for-word-representation/glove.6B.100d.txt'

def load_data(fn):
    ''' Load data '''
    return pd.read_csv(fn, delimiter='\t')


def clean_str(in_str):
    ''' Turn url's into url, remove anything that's not alphanumeric or a space. Then lowercase what's left. '''
    in_str = str(in_str)
    # replace urls with 'url'
    in_str = re.sub(
        r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", in_str)
    in_str = re.sub(r'([^\s\w]|_)+', '', in_str)
    return in_str.strip().lower()


def load_train_data(fn):
    df = load_data(fn)
    pd.set_option('display.max_colwidth', -1)
    df = df[['Phrase', 'Sentiment']]
    # Create balanced set for training
    sample_size = min(df.Sentiment.value_counts())
    train_data = pd.concat([df[df['Sentiment'] == label].sample(sample_size) for label in [
                           negative, somewhat_negative, neutral, somewhat_positive, positive]]).sample(frac=1)
    en_stopwords = stopwords.words('english') + ['']
    train_data.Phrase = train_data.Phrase.apply(lambda xs: [x for x in clean_str(xs).split(' ') if x not in en_stopwords])
    train_data = train_data.reset_index(drop=True)
    return train_data


def stats(df):
    ''' Stast on length.'''
    df['l'] = df['Phrase'].apply(lambda x: len(str(x).split(' ')))
    print("mean length of sentence: " + str(df.l.mean()))
    print("max length of sentence: " + str(df.l.max()))
    print("std dev length of sentence: " + str(df.l.std()))


[nltk_data] Downloading package stopwords to /home/vule/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load data

In [2]:
# Load data
train_data = load_train_data(train_fn)
stats(train_data)
train_data.sample(3)

mean length of sentence: 4.878252262443439
max length of sentence: 28
std dev length of sentence: 3.9553379101225126


Unnamed: 0,Phrase,Sentiment,l
16402,"[superficial, midlife, crisis]",2,3
19806,[apology],2,1
19728,"[deform, families]",1,2


### Preprocess data

In [3]:
from collections import Counter, defaultdict
def sent_2_isent(sent, word2index):
    return np.array([word2index[x] if x in word2index else word2index[oov_token] for x in sent])

def padding(xs):
    padding_values = np.ones((max_length - len(xs)), dtype=int) * word2index[padding_token]
    return np.concatenate((xs, padding_values), axis=None)

def isent_2_sent(isent, index2word):
    sent = [index2word[i] for i in isent]
    return sent

def train_test_split(df):
    n = int(len(train_data) * 0.9)
    train_df, validation_df = df[:n], df[n:]
    X_train, y_train = train_df.iPhase.to_list(), train_df.Sentiment.to_list()
    X_validation, y_validation = validation_df.iPhase.to_list(), validation_df.Sentiment.to_list()
    return [np.stack(x) for x in [X_train, y_train, X_validation, y_validation]]

max_length = 30
max_features = 20000
oov_token='<unw>'
padding_token = '<padding>'
vocabs = [w[0] for w in Counter(sum(train_data.Phrase.to_list(), [])).most_common(max_features)]
vocabs = vocabs + [oov_token, padding_token]
word2index = {w: i for i, w in enumerate(vocabs)}
index2word = {i: w for i, w in enumerate(vocabs)}
train_data['iPhase'] = train_data.Phrase.apply(lambda xs: padding(sent_2_isent(xs, word2index)))

### DataLoader

In [4]:
from fastai.data.core import DataLoaders
from torch import tensor
import torch
from fastcore.foundation import L
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

seqs = L([(tensor(x, dtype=int), y) for x, y in zip(train_data.iPhase, train_data.Sentiment)])
bs = 32
cut = int(len(seqs) * 0.9)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

cuda


In [5]:
b = dls.one_batch()
x, y = b[0], b[1]
x.shape, y.shape, x.device, y.device

(torch.Size([32, 30]),
 torch.Size([32]),
 device(type='cpu'),
 device(type='cpu'))

### Model 1: Random embeddings

In [46]:
import torch
from torch import nn
from torch.nn import Module
import torch

class Model1(Module):
    def __init__(self, vocab_sz, embedding_dim, n_filter, max_length, n_grams, n_label):
        super(Model1, self).__init__()
        self.embedding_dim = embedding_dim
        self.n_filter = n_filter
        self.max_length = max_length
        self.n_grams = n_grams
        self.vocab_sz = vocab_sz
        self.embedding_layer = torch.nn.Embedding(vocab_sz, embedding_dim)
        self.convs = [nn.Conv2d(1, n_filter, (n_gram, embedding_dim)) for n_gram in n_grams]
        self.linear = torch.nn.Linear(n_filter * len(n_grams), n_label)
        self.dropout = nn.Dropout(0.5)
    def forward(self, x):
        x = self.embedding_layer(x);
        outs = []
        for n_gram, conv in zip(self.n_grams, self.convs):
            out = torch.unsqueeze(x, dim=1)
            out = conv(out)
            out = torch.squeeze(out, dim=-1)
            out = nn.MaxPool1d(self.max_length + 1 - n_gram)(out)
            out = torch.squeeze(out, dim=-1)
            outs.append(out)
        out = torch.cat(outs, dim=1)
        out = self.dropout(out)
        out = self.linear(out)
        return out

embedding_dim = 100
n_filter = 100
max_length = 30
n_grams = [3, 4, 5]
n_label=5
vocab_sz = len(vocabs)

model1 = Model1(vocab_sz, embedding_dim, n_filter, max_length, n_grams, n_label)
# acc 0.55

### Model 2: Static word2vec

In [10]:
embeddings_index = {}
f = open(glove_fn)
for line in f:
    values = line.split()
    word = values[0]
    if word in vocabs:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
f.close()

print(f'Found {len(embeddings_index)} word vectors.')

Found 13408 word vectors.


In [60]:
em = torch.nn.Embedding(vocab_sz, embedding_dim)
em.weight.requires_grad = False
em.weight.requires_grad

False

In [61]:
import torch
from torch import nn
from torch.nn import Module
import torch

def load_embedding_value(embedding_layer, embeddings_index):
    data = embedding_layer.weight.data
    for i, word in enumerate(vocabs):
        if word in embeddings_index:
            data[i] = tensor(embeddings_index[word])

class Model2(Module):
    def __init__(self, vocab_sz, embedding_dim, n_filter, max_length, n_grams, n_label, embeddings_index, is_static = True):
        super(Model2, self).__init__()
        self.embedding_dim = embedding_dim
        self.n_filter = n_filter
        self.max_length = max_length
        self.n_grams = n_grams
        self.vocab_sz = vocab_sz
        self.embedding_layer = torch.nn.Embedding(vocab_sz, embedding_dim)
        load_embedding_value(self.embedding_layer, embeddings_index)
        if is_static:
            self.embedding_layer.weight.requires_grad = False
        self.convs = [nn.Conv2d(1, n_filter, (n_gram, embedding_dim)) for n_gram in n_grams]
        self.linear = torch.nn.Linear(n_filter * len(n_grams), n_label)
        self.dropout = nn.Dropout(0.5)
    def forward(self, x):
        x = self.embedding_layer(x);
        outs = []
        for n_gram, conv in zip(self.n_grams, self.convs):
            out = torch.unsqueeze(x, dim=1)
            out = conv(out)
            out = torch.squeeze(out, dim=-1)
            out = nn.MaxPool1d(self.max_length + 1 - n_gram)(out)
            out = torch.squeeze(out, dim=-1)
            outs.append(out)
        out = torch.cat(outs, dim=1)
        out = self.dropout(out)
        out = self.linear(out)
        return out

embedding_dim = 100
n_filter = 100
max_length = 30
n_grams = [3, 4, 5]
n_label=5
vocab_sz = len(vocabs)

model2 = Model2(vocab_sz, embedding_dim, n_filter, max_length, n_grams, n_label, embeddings_index)
model3 = Model2(vocab_sz, embedding_dim, n_filter, max_length, n_grams, n_label, embeddings_index, False)
# 0.56

### Train

In [43]:
from fastai.learner import Learner
from fastai.callback.all import *
from torch.functional import F
from fastai.metrics import accuracy

In [47]:
learn = Learner(dls, model1, loss_func=F.cross_entropy, metrics=accuracy)
# learn.lr_find()
learn.fit_one_cycle(5, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.510838,1.452271,0.377262,00:18
1,1.206798,1.179396,0.498303,00:18
2,1.03862,1.106215,0.528281,00:18
3,0.933979,1.078371,0.540158,00:18
4,0.898879,1.070843,0.542421,00:18


In [62]:
learn = Learner(dls, model2, loss_func=F.cross_entropy, metrics=accuracy)
# learn.lr_find()
learn.fit_one_cycle(7, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.550643,1.541938,0.315045,00:08
1,1.559598,1.542817,0.309672,00:08
2,1.558234,1.539035,0.313348,00:07
3,1.554708,1.536203,0.312783,00:07
4,1.545742,1.531136,0.309672,00:07
5,1.539345,1.528565,0.309955,00:07
6,1.538152,1.523771,0.320984,00:08


In [63]:
learn = Learner(dls, model3, loss_func=F.cross_entropy, metrics=accuracy)
# learn.lr_find()
learn.fit_one_cycle(7, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.427449,1.365077,0.440328,00:21
1,1.144023,1.111611,0.532523,00:23
2,0.9649,1.068674,0.540158,00:22
3,0.845506,1.065659,0.549774,00:22
4,0.755679,1.073505,0.552602,00:22
5,0.664506,1.080762,0.556278,00:21
6,0.668115,1.078468,0.559389,00:21
