In [34]:
import tqdm
import nltk
import torch
import numpy as np
import pandas as pd
from torch import nn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import confusion_matrix, classification_report
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /data/jkimbf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment). 
        If you like, you are free to use columns other than "text" for prediction.
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

In [3]:
train_df, valid_df, test_df = [
    load_data(x, columns=['text', 'stars'], folder='data') for x in ['train', 'valid', 'test']
]

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [4]:
train_df.head()

Unnamed: 0,text,stars
0,I've been here a handful of times now and I've...,5
1,The service was terrible. The food was just ok...,1
2,Alil pricey for the location but completly get...,4
3,Don't get your car washed here. Paid 11 and my...,1
4,Cute but tight. Not expensive and creative. I ...,5


In [5]:
print(len(train_df), len(valid_df), len(test_df))

18000 2000 4000


In [8]:
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

def lower(s):
    """
    :param s: a string.
    return a string with lower characters
    Note that we allow the input to be nested string of a list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, list):
        return [lower(t) for t in s]
    if isinstance(s, str):
        return s.lower()
    else:
        raise NotImplementedError("unknown datatype")


def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)


def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results

def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

import numpy as np

def get_onehot_vector(feats, feats_dict):
    """
    :param data: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

In [10]:
train_df['tokens'] = train_df['text'].map(tokenize).map(filter_stopwords).map(lower)
valid_df['tokens'] = valid_df['text'].map(tokenize).map(filter_stopwords).map(lower)
test_df['tokens'] = test_df['text'].map(tokenize).map(filter_stopwords).map(lower)

In [13]:
glove_dict = {}
with open('glove.twitter.27B.200d.txt', 'r') as f:
    for i, line in enumerate(f):
        values = line.split()
        if i == 38522: continue # ' ' token not used
        glove_dict[values[0]] = np.asarray(values[1:], 'float32')



In [17]:
""" Averaged embedding for <UNK> token """
glove_dict['<UNK>'] = np.mean(list(glove_dict.values()), axis=0)

In [22]:
def tokens_to_avg_emb(texts):
    records = []
    for tokens in texts:
        record = [
            glove_dict[token] if token in glove_dict.keys() else glove_dict['<UNK>'] for token in tokens
        ]
        records.append(np.mean(record, axis=0))
    return np.asarray(records)

In [25]:
train_x = tokens_to_avg_emb(train_df['tokens'])
valid_x = tokens_to_avg_emb(valid_df['tokens'])
train_y = train_df['stars']
valid_y = valid_df['stars']

In [28]:
class MyDataset(Dataset):
    
    def __init__(self, seq, y):
        assert len(seq) == len(y)
        self.seq = seq
        self.y = y-1
    
    def __getitem__(self, idx):
        return np.asarray(self.seq[idx]), self.y[idx]

    def __len__(self):
        return len(self.seq)

In [57]:
batch_size = 16

train_loader = DataLoader(MyDataset(train_x, train_y), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(MyDataset(valid_x, valid_y), batch_size=batch_size)

In [58]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(200, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 5)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x

In [59]:
model = MLP()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 10

for e in range(num_epochs):
    print('epoch', e+1)
    model.train()
    total_acc = 0
    total_loss = 0
    total_count = 0
    with tqdm.tqdm(train_loader) as t:
        for x, y in t:
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            total_acc += (logits.argmax(1) == y).sum().item()
            total_count += y.size(0)
            total_loss += loss.item()
            optimizer.step()
            t.set_postfix({'loss': total_loss/total_count, 'acc': total_acc/total_count})

    model.eval()
    y_pred = []
    y_true = []
    with tqdm.tqdm(valid_loader) as t:
        for x, y in t:
            logits = model(x)
            total_acc += (logits.argmax(1) == y).sum().item()
            total_count += len(y)
            y_pred += logits.argmax(1).tolist()
            y_true += y.tolist()
    print(classification_report(y_true, y_pred))
    print("\n\n")
    print(confusion_matrix(y_true, y_pred))

epoch 1


100%|██████████| 1125/1125 [00:05<00:00, 212.53it/s, loss=0.0777, acc=0.495]
100%|██████████| 125/125 [00:00<00:00, 1303.17it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.46      0.85      0.59       282
           1       0.00      0.00      0.00       136
           2       0.00      0.00      0.00       212
           3       0.40      0.38      0.39       466
           4       0.70      0.80      0.74       904

    accuracy                           0.57      2000
   macro avg       0.31      0.41      0.35      2000
weighted avg       0.47      0.57      0.51      2000




[[241   0   1  11  29]
 [ 94   0   2  29  11]
 [ 80   0   0  90  42]
 [ 59   0   1 175 231]
 [ 55   0   0 128 721]]
epoch 2


100%|██████████| 1125/1125 [00:05<00:00, 209.54it/s, loss=0.0634, acc=0.577]
100%|██████████| 125/125 [00:00<00:00, 1030.83it/s]


              precision    recall  f1-score   support

           0       0.52      0.89      0.65       282
           1       0.14      0.02      0.04       136
           2       0.32      0.19      0.24       212
           3       0.47      0.43      0.45       466
           4       0.76      0.79      0.77       904

    accuracy                           0.60      2000
   macro avg       0.44      0.46      0.43      2000
weighted avg       0.57      0.60      0.57      2000




[[250   2   8   4  18]
 [ 90   3  19  20   4]
 [ 62  11  41  68  30]
 [ 41   4  44 200 177]
 [ 41   1  18 134 710]]
epoch 3


100%|██████████| 1125/1125 [00:05<00:00, 215.87it/s, loss=0.0607, acc=0.596]
100%|██████████| 125/125 [00:00<00:00, 1574.68it/s]


              precision    recall  f1-score   support

           0       0.58      0.85      0.69       282
           1       0.29      0.04      0.07       136
           2       0.40      0.24      0.30       212
           3       0.46      0.53      0.49       466
           4       0.77      0.77      0.77       904

    accuracy                           0.62      2000
   macro avg       0.50      0.48      0.46      2000
weighted avg       0.60      0.62      0.59      2000




[[239   3  11   7  22]
 [ 78   5  24  27   2]
 [ 41   6  50  87  28]
 [ 28   2  31 247 158]
 [ 28   1   9 173 693]]
epoch 4


100%|██████████| 1125/1125 [00:05<00:00, 221.96it/s, loss=0.0593, acc=0.61] 
100%|██████████| 125/125 [00:00<00:00, 1571.20it/s]


              precision    recall  f1-score   support

           0       0.62      0.85      0.71       282
           1       0.29      0.10      0.15       136
           2       0.39      0.25      0.31       212
           3       0.46      0.40      0.43       466
           4       0.73      0.82      0.77       904

    accuracy                           0.62      2000
   macro avg       0.50      0.49      0.47      2000
weighted avg       0.58      0.62      0.59      2000




[[239   5  10   5  23]
 [ 73  14  23  22   4]
 [ 28  23  54  72  35]
 [ 24   4  41 188 209]
 [ 24   3  10 126 741]]
epoch 5


100%|██████████| 1125/1125 [00:05<00:00, 207.80it/s, loss=0.0585, acc=0.615]
100%|██████████| 125/125 [00:00<00:00, 1275.10it/s]


              precision    recall  f1-score   support

           0       0.64      0.85      0.73       282
           1       0.29      0.13      0.18       136
           2       0.39      0.31      0.34       212
           3       0.47      0.40      0.43       466
           4       0.74      0.82      0.78       904

    accuracy                           0.62      2000
   macro avg       0.51      0.50      0.49      2000
weighted avg       0.60      0.62      0.60      2000




[[239   7  10   5  21]
 [ 69  18  27  17   5]
 [ 24  25  65  66  32]
 [ 22   8  45 187 204]
 [ 21   4  18 121 740]]
epoch 6


100%|██████████| 1125/1125 [00:05<00:00, 208.84it/s, loss=0.0579, acc=0.614]
100%|██████████| 125/125 [00:00<00:00, 1524.79it/s]


              precision    recall  f1-score   support

           0       0.63      0.86      0.73       282
           1       0.26      0.14      0.18       136
           2       0.33      0.31      0.32       212
           3       0.47      0.38      0.42       466
           4       0.75      0.81      0.78       904

    accuracy                           0.62      2000
   macro avg       0.49      0.50      0.48      2000
weighted avg       0.59      0.62      0.60      2000




[[243   8   8   4  19]
 [ 70  19  30  14   3]
 [ 27  30  65  62  28]
 [ 25  10  67 175 189]
 [ 22   6  26 119 731]]
epoch 7


100%|██████████| 1125/1125 [00:05<00:00, 218.04it/s, loss=0.0576, acc=0.617]
100%|██████████| 125/125 [00:00<00:00, 1498.38it/s]


              precision    recall  f1-score   support

           0       0.70      0.74      0.72       282
           1       0.33      0.15      0.20       136
           2       0.40      0.31      0.35       212
           3       0.47      0.45      0.46       466
           4       0.73      0.83      0.78       904

    accuracy                           0.63      2000
   macro avg       0.52      0.50      0.50      2000
weighted avg       0.60      0.63      0.61      2000




[[210  16  16   6  34]
 [ 55  20  32  22   7]
 [ 16  16  65  80  35]
 [ 12   7  38 212 197]
 [  9   2  12 133 748]]
epoch 8


100%|██████████| 1125/1125 [00:05<00:00, 215.50it/s, loss=0.0572, acc=0.623]
100%|██████████| 125/125 [00:00<00:00, 1304.18it/s]


              precision    recall  f1-score   support

           0       0.66      0.83      0.74       282
           1       0.36      0.18      0.24       136
           2       0.38      0.32      0.35       212
           3       0.49      0.49      0.49       466
           4       0.77      0.79      0.78       904

    accuracy                           0.63      2000
   macro avg       0.53      0.52      0.52      2000
weighted avg       0.62      0.63      0.62      2000




[[233  11  13   6  19]
 [ 59  25  30  17   5]
 [ 22  25  67  72  26]
 [ 17   7  46 227 169]
 [ 21   2  19 146 716]]
epoch 9


100%|██████████| 1125/1125 [00:05<00:00, 207.25it/s, loss=0.0568, acc=0.621]
100%|██████████| 125/125 [00:00<00:00, 1072.86it/s]


              precision    recall  f1-score   support

           0       0.61      0.87      0.72       282
           1       0.27      0.11      0.16       136
           2       0.37      0.25      0.30       212
           3       0.49      0.38      0.43       466
           4       0.73      0.85      0.79       904

    accuracy                           0.63      2000
   macro avg       0.50      0.49      0.48      2000
weighted avg       0.59      0.63      0.60      2000




[[245   6   8   4  19]
 [ 73  15  26  16   6]
 [ 35  22  52  66  37]
 [ 23  11  37 177 218]
 [ 23   2  16  95 768]]
epoch 10


100%|██████████| 1125/1125 [00:05<00:00, 208.39it/s, loss=0.0564, acc=0.625]
100%|██████████| 125/125 [00:00<00:00, 1116.09it/s]


              precision    recall  f1-score   support

           0       0.66      0.82      0.73       282
           1       0.38      0.15      0.21       136
           2       0.43      0.26      0.33       212
           3       0.48      0.39      0.43       466
           4       0.71      0.86      0.78       904

    accuracy                           0.63      2000
   macro avg       0.53      0.50      0.50      2000
weighted avg       0.60      0.63      0.60      2000




[[232   5  10   5  30]
 [ 65  20  25  16  10]
 [ 26  18  56  69  43]
 [ 15   9  31 183 228]
 [ 15   0   9 106 774]]
