In [1]:
import os
import random
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torchtext import data
from torchtext import datasets

# Single seed shared by every random number generator the notebook touches.
SEED = 1234

# Seed Python's `random`, NumPy and PyTorch together: seeding torch alone
# leaves any `random.shuffle` / `np.random` call different on every run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

## Read Data

In [2]:
# Containers for the three CSV columns; the next cell gives them their contents.
id_text, comments, labels = [], [], []

# header=None: the file has no header row, so columns are addressed as 0, 1, 2.
# NOTE(review): this rebinds `data`, shadowing the `torchtext.data` module
# imported at the top of the notebook.
data = pd.read_csv('./data/train_data.csv', header=None)

In [3]:
# Pull the three columns straight out of the frame instead of appending row by
# row: rebinding the lists keeps the cell idempotent (re-running it no longer
# duplicates every entry) and avoids the slow `iterrows` loop.
id_text = data[0].tolist()
# str(...) per element so non-string cells (e.g. NaN for an empty comment)
# become text, exactly as the row loop did.
comments = [str(comment) for comment in data[1]]
labels = data[2].tolist()

In [4]:
# One long string holding every comment, used only to count tokens.
all_words = ' '.join(comments)

# Whitespace tokenisation; words ordered from most to least frequent.
word_counts = Counter(all_words.split())
word_list = [word for word, _ in word_counts.most_common()]

# Ids start at 1 so that 0 stays free for padding.
vocab_to_int = {word: rank for rank, word in enumerate(word_list, start=1)}
int_to_vocab = dict(zip(vocab_to_int.values(), vocab_to_int.keys()))

# Each comment becomes the list of ids of its whitespace-separated tokens.
encoded_reviews = [
    [vocab_to_int[token] for token in text.split()]
    for text in comments
]

# Same list object as `labels` (no copy is made).
encoded_labels = labels

In [5]:
# Drop every review longer than 256 tokens together with its label.
# Built as a filter rather than popping inside the loop: removing items from a
# list while enumerating it shifts the remaining items left, so the element
# right after each removal was never examined and consecutive over-long
# reviews slipped through.
kept_pairs = [
    (review, label)
    for review, label in zip(encoded_reviews, encoded_labels)
    if len(review) <= 256
]
# Rebind to new lists; the original `labels` list is left untouched.
encoded_reviews = [review for review, _ in kept_pairs]
encoded_labels = [label for _, label in kept_pairs]

In [6]:
# Remove reviews with no tokens. The labels are filtered first because the
# test looks at the still-unfiltered `encoded_reviews` at the same position.
encoded_labels = np.array([
    encoded_labels[pos]
    for pos in range(len(encoded_labels))
    if len(encoded_reviews[pos])
])
encoded_reviews = list(filter(len, encoded_reviews))

In [7]:
# Sanity check: reviews and labels should have the same count.
print(len(encoded_reviews), len(encoded_labels), sep='\n')

16076
16076


## Pad sentences

In [8]:
def pad_text(encoded_reviews, seq_length):
    """Bring every review to exactly `seq_length` token ids.

    Reviews shorter than `seq_length` are left-padded with zeros; reviews of
    that length or longer keep only their first `seq_length` ids.

    Args:
        encoded_reviews: iterable of lists of token ids.
        seq_length: target length of each row.

    Returns:
        A NumPy array built from the padded/truncated rows.
    """
    def _fit(tokens):
        # Positive when the review is too short and needs leading zeros.
        missing = seq_length - len(tokens)
        if missing > 0:
            return [0] * missing + tokens
        return tokens[:seq_length]

    return np.array([_fit(tokens) for tokens in encoded_reviews])

In [9]:
# Every review becomes a row of exactly 256 ids (left-padded with 0 or truncated).
padded_reviews = pad_text(encoded_reviews, seq_length=256)

## Shuffle the data, then split it into training, validation and test sets

In [16]:
# Pair each (unpadded) encoded review with its label so the two stay aligned.
both = [*zip(encoded_reviews, encoded_labels)]

In [18]:
import random

# Shuffle the (review, label) pairs in place.
# NOTE(review): this uses the module-level `random` generator, so the resulting
# order depends on that generator's state when the cell runs. No visible cell
# consumes the shuffled pairs for the train/valid/test split; confirm whether
# this cell is still needed.
random.shuffle(both)

In [29]:
# Unzip the pairs: `a` is the tuple of reviews, `b` the tuple of labels.
# Raises ValueError if `both` is empty (nothing to unpack).
a, b = zip(*both)

In [31]:
# Number of reviews that came out of the unzip above.
len(a)

16076

In [10]:
import random

# Split proportions: 80% train, with the remainder halved into validation and test.
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2
total = padded_reviews.shape[0]
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

# Shuffle before slicing: the slices below would otherwise follow the order of
# the rows in the CSV file. One seeded permutation is applied to both arrays so
# reviews and labels stay aligned and the split is the same on every run.
shuffled_order = np.random.RandomState(SEED).permutation(total)
shuffled_reviews = padded_reviews[shuffled_order]
shuffled_labels = encoded_labels[shuffled_order]

# [0, train_cutoff) -> train, [train_cutoff, valid_cutoff) -> valid, rest -> test.
train_x, train_y = torch.from_numpy(shuffled_reviews[:train_cutoff]), torch.from_numpy(shuffled_labels[:train_cutoff])
valid_x, valid_y = torch.from_numpy(shuffled_reviews[train_cutoff : valid_cutoff]), torch.from_numpy(shuffled_labels[train_cutoff : valid_cutoff])
test_x, test_y = torch.from_numpy(shuffled_reviews[valid_cutoff:]), torch.from_numpy(shuffled_labels[valid_cutoff:])

In [11]:
from torch.utils.data import TensorDataset, DataLoader

In [12]:
# Mini-batch size shared by all three loaders.
batch_size = 32

# Training set: reshuffled every epoch.
train_data = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation set: fixed order.
valid_data = TensorDataset(valid_x, valid_y)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

# Test set: fixed order.
test_data = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [13]:
from sklearn.model_selection import train_test_split

# Stratified 90/10 split over the full padded data set, with a fixed random_state.
# NOTE(review): this draws from all of `padded_reviews`, so rows that the
# earlier manual split assigned to `test_x` / `test_y` can also land in
# `X_train`; confirm whether `test_loader` is still meant to be a held-out set.
X_train, X_test, y_train, y_test = train_test_split(
    padded_reviews,
    encoded_labels,
    test_size=0.1,
    random_state=42,
    stratify=encoded_labels,
)

In [14]:
# Convert the stratified split to tensors. The 10% portion (X_test / y_test)
# is used as the validation set from here on.
train_x = torch.from_numpy(X_train)
valid_x = torch.from_numpy(X_test)
train_y = torch.from_numpy(y_train)
valid_y = torch.from_numpy(y_test)

In [15]:
from torch.utils.data import TensorDataset, DataLoader

In [17]:
# Mini-batch size shared by both loaders.
batch_size = 32

# Training set: reshuffled every epoch.
train_data = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation set: fixed order.
valid_data = TensorDataset(valid_x, valid_y)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)