In [8]:
import pandas as pd
import boto3
import os

while 'models' not in os.listdir():
    os.chdir('..')

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

for bucket in s3.buckets.all():
    print(bucket.name)

BUCKET_NAME = 'pawel123r'
train_set = 'data/train.csv'
val_set = 'data/val.csv'

def download_file(file_name: str, bucket: str):
    s3_client.download_file(bucket, file_name, f"temp/{file_name.split('/')[-1]}")  

def download_return_dataframe(file_name: str, bucket: str):
    download_file(file_name, bucket)
    return pd.read_csv(f"temp/{file_name.split('/')[-1]}")

train = download_return_dataframe(train_set, BUCKET_NAME)
val = download_return_dataframe(val_set, BUCKET_NAME)

def split(dataframe):
    return dataframe['input'], dataframe['label']

X_train, y_train = split(train)
X_val, y_val = split(val)

pawel123r


In [9]:
X_train.head()

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
4                                 i am feeling grouchy
Name: input, dtype: object

In [10]:
y_train.head()

0    sadness
1    sadness
2      anger
3       love
4      anger
Name: label, dtype: object

In [12]:
X_val.head()

0    im feeling quite sad and sorry for myself but ...
1    i feel like i am still looking at a blank canv...
2                       i feel like a faithful servant
3                    i am just feeling cranky and blue
4    i can have for a treat or if i am feeling festive
Name: input, dtype: object

In [13]:
y_val.head()

0    sadness
1    sadness
2       love
3      anger
4        joy
Name: label, dtype: object

In [38]:
type(y_train_num)

numpy.ndarray

In [14]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(y_train)
y_train_num = enc.transform(y_train)
y_val_num = enc.transform(y_val)

print(y_train_num[0:5], y_val_num[0:5])

[4 4 0 3 0] [4 4 3 0 2]


In [15]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize_series(df):

    df_tokenized = df.apply(lambda setence: wordpunct_tokenize(setence))

    # Lemmatize words
    def lemmatize_list(words):

        lemmatized_words = []

        for word in words:
            lemmatized_words.append(lemmatizer.lemmatize(word).lower())

        return lemmatized_words


    df_lemmatized = df_tokenized.apply(lambda words: lemmatize_list(words))
        
    return df_lemmatized

X_train_lem = lemmatize_series(X_train)
X_val_lem = lemmatize_series(X_val)

In [16]:
import torchtext as text

vec = text.vocab.GloVe(name='6B', dim=300)

# Vectorize
def vectorize(words):
    return vec.get_vecs_by_tokens(words)

X_train_vec = X_train_lem.apply(lambda words: vectorize(words))
X_val_vec = X_val_lem.apply(lambda words: vectorize(words))

.vector_cache/glove.6B.zip: 862MB [02:42, 5.32MB/s]                               
100%|█████████▉| 399999/400000 [00:24<00:00, 16176.05it/s]


In [17]:
len(X_train_vec)

16000

In [18]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import one_hot
import numpy as np
from torch.nn.utils.rnn import pad_sequence
import torch

class Emotions(Dataset):

    def __init__(self, inputs, labels, num_classes):

        self.inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
        self.labels = labels
        self.num_classes = num_classes

    def __len__(self):
        assert len(self.inputs) == len(self.labels), f"Length of inputs ({len(self.inputs)}) and labels ({len(self.labels)}) lists don't match"
        return len(self.inputs)
    
    def __getitem__(self, index):

        input = self.inputs[index]
        label = torch.tensor(self.labels[index])

        return input, one_hot(label, self.num_classes)
    
num_classes = len(np.unique(y_train_num))
traindataset = Emotions(X_train_vec, y_train_num, num_classes)
valdataset = Emotions(X_val_vec, y_val_num, num_classes)

In [19]:
traindataset.__getitem__(69)

(tensor([[-0.1329,  0.1699, -0.1436,  ..., -0.2378,  0.1477,  0.6290],
         [ 0.1305, -0.1191, -0.4308,  ..., -0.2434, -0.2493,  0.5582],
         [ 0.1463, -0.0660,  0.0798,  ...,  0.4928, -0.0553, -0.1069],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([0, 0, 1, 0, 0, 0]))

In [20]:
batch_size = 64
trainLoader = DataLoader(traindataset, batch_size=batch_size, shuffle=True, drop_last=True)
valLoader = DataLoader(valdataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [35]:
import torch
from torch import nn
import torchtext

class LSTM(nn.Module):

    def __init__(self, batch_size, num_classes, kwargs = None):

        super().__init__()

        if kwargs is not None:
            self.kwargs = kwargs
        else:
            self.kwargs = {
                'input_size': 300,
                'hidden_size': 100,
                'num_layers': 4,
                'batch_first': True,
                'dropout': 0.05,
            }
        
        assert self.kwargs['batch_first'], "batch_first must be true!"

        self.lstm = nn.LSTM(**self.kwargs)
        self.linear1 = nn.Linear(self.kwargs['hidden_size'], self.kwargs['hidden_size']//2)
        self.linear2 = nn.Linear(self.kwargs['hidden_size']//2, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        self.batch_size = batch_size
        self.h0 = torch.zeros(size=(self.kwargs['num_layers'], batch_size, self.kwargs['hidden_size']))
        self.c0 = torch.zeros(size=(self.kwargs['num_layers'], batch_size, self.kwargs['hidden_size']))

        self.hc = (self.h0, self.c0)

    def forward(self, x):
        
        x, _ = self.lstm(x, self.hc)
        # print(x[:,-1,:][0])
        x = self.relu(self.linear1(x[:,-1,:]))
        x = self.linear2(x)
        
        return x


batch_size = 64
model = LSTM(batch_size=batch_size, num_classes=6)
inputs, labels = next(iter(trainLoader))

output = model(inputs)

output.size()

torch.Size([64, 6])

In [36]:
def test(model, dataloader, criterion):

    with torch.no_grad():

        running_loss = 0
        test_loss = 0
        k = 20
        
        for batch, (X, y) in enumerate(dataloader):
            
            try:
                # print('Sizes: ', X.size(), y.size(), type(X[0][0][0]))
                # print('Samples: ', X[0], y[0])
                y_pred = model(X)
                # print('Sizes: ', y_pred.size())
                # print('Samples: ', y_pred[0])
                
                loss = criterion(y_pred, y.float())

                test_loss += loss.item()
                running_loss += loss.item()

                if (batch + 1) % k == 0:
                    print(f'[TEST No batch: {batch + 1:3d}] loss: {running_loss / k:.3f}')
                    running_loss = 0.0
            except Exception as exp:
                print(exp)
                print(X.size(), y.size())
        
        return {'test_loss': test_loss/len(dataloader)}

def train(model, trainloader, valloader, optimizer, criterion, epochs):

    train_losses = []
    val_losses = []
    k = 20

    for epoch in range(epochs):

        total_loss = 0
        running_loss = 0

        for batch, (X, y) in enumerate(trainloader):

            optimizer.zero_grad()
            
            # print('Sizes: ', X.size(), y.size(), type(X[0][0][0]))
            # print('Samples: ', X[0], y[0])
            y_pred = model(X)
            # print('Sizes: ', y_pred.size())
            # print('Samples: ', y_pred[0])
            loss = criterion(y_pred, y.float())
            loss.backward()

            optimizer.step()

            total_loss += loss.item()
            running_loss += loss.item()

            if (batch + 1) % k == 0:
                print(f'[TRAIN No epoch/batch: {epoch + 1}/{batch + 1:3d}] loss: {running_loss / k:.3f}')
                running_loss = 0.0
            
        #     break
        # break
        print(f'[No epoch: {epoch + 1:5d}] train loss: {total_loss / len(trainloader):.3f}')
        train_losses.append(total_loss / len(trainloader))

        val_result = test(model, valloader, criterion)
        print(f"[No epoch: {epoch + 1:5d}] val loss: {val_result['test_loss']:.3f}")
        val_losses.append(val_result['test_loss'])

    return {'train_losses': train_losses, 'val_losses': val_losses}

model = LSTM(batch_size=batch_size, num_classes=6)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) 
criterion = nn.CrossEntropyLoss()

train(model, trainLoader, valLoader, optimizer, criterion, 50)

[TRAIN No epoch/batch: 1/ 20] loss: 1.722
[TRAIN No epoch/batch: 1/ 40] loss: 1.659
[TRAIN No epoch/batch: 1/ 60] loss: 1.610
[TRAIN No epoch/batch: 1/ 80] loss: 1.610
[TRAIN No epoch/batch: 1/100] loss: 1.567
[TRAIN No epoch/batch: 1/120] loss: 1.619
[TRAIN No epoch/batch: 1/140] loss: 1.567
[TRAIN No epoch/batch: 1/160] loss: 1.573
[TRAIN No epoch/batch: 1/180] loss: 1.580
[TRAIN No epoch/batch: 1/200] loss: 1.546
[TRAIN No epoch/batch: 1/220] loss: 1.564
[TRAIN No epoch/batch: 1/240] loss: 1.571
[No epoch:     1] train loss: 1.598
[TEST No batch:  20] loss: 1.581
[No epoch:     1] val loss: 1.581
[TRAIN No epoch/batch: 2/ 20] loss: 1.554
[TRAIN No epoch/batch: 2/ 40] loss: 1.558
[TRAIN No epoch/batch: 2/ 60] loss: 1.580
[TRAIN No epoch/batch: 2/ 80] loss: 1.584
[TRAIN No epoch/batch: 2/100] loss: 1.589
[TRAIN No epoch/batch: 2/120] loss: 1.588
[TRAIN No epoch/batch: 2/140] loss: 1.578
[TRAIN No epoch/batch: 2/160] loss: 1.567
[TRAIN No epoch/batch: 2/180] loss: 1.595
[TRAIN No epoch

KeyboardInterrupt: 