In [14]:
import torch
import pandas as pd
import numpy as np
from scipy.spatial import distance
from collections import defaultdict
import random
from tqdm import tqdm
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import re
import string

# Notebook-wide plotting defaults: ggplot theme and a large default canvas.
plt.style.use('ggplot')
plt.rc('figure', figsize=(30, 15))

In [15]:
class RegressionNet(torch.nn.Module):
    """Fully connected classifier with four Tanh hidden layers.

    Despite its name, the network ends in a linear layer producing one score
    (logit) per class, so it is a classifier rather than a regressor.

    Args:
        input_dim: size of each input vector (default 50).
        hidden_dim: width of the first three hidden layers (default 1000).
        bottleneck_dim: width of the last hidden layer (default 500).
        num_classes: number of output classes (default 2).

    The defaults reproduce the layer sizes 50 -> 1000 -> 1000 -> 1000 -> 500 -> 2.
    Attribute names (fc1..fc5, ac1..ac4, sm) are part of the state_dict layout
    and must stay stable.
    """

    def __init__(self, input_dim=50, hidden_dim=1000, bottleneck_dim=500, num_classes=2):
        super().__init__()
        # Layers are created in forward order so that parameter initialisation
        # consumes the RNG in a fixed, predictable sequence.
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.ac1 = torch.nn.Tanh()
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.ac2 = torch.nn.Tanh()
        self.fc3 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.ac3 = torch.nn.Tanh()
        self.fc4 = torch.nn.Linear(hidden_dim, bottleneck_dim)
        self.ac4 = torch.nn.Tanh()
        self.fc5 = torch.nn.Linear(bottleneck_dim, num_classes)
        # Softmax is applied only in `inference`, never in `forward`.
        self.sm = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw class scores (logits) for a batch `x`.

        No softmax is applied here; the output is meant to be fed to a loss
        that works on logits (e.g. torch.nn.CrossEntropyLoss) or to argmax.
        """
        x = self.fc1(x)
        x = self.ac1(x)
        x = self.fc2(x)
        x = self.ac2(x)
        x = self.fc3(x)
        x = self.ac3(x)
        x = self.fc4(x)
        x = self.ac4(x)
        x = self.fc5(x)
        return x

    def inference(self, x):
        """Return class probabilities: softmax over dim 1 of the logits."""
        x = self.forward(x)
        x = self.sm(x)
        return x

In [16]:
# Build the classifier with its default layer sizes. Weights are randomly
# initialised and no seed is set in the cells shown here, so training results
# can differ from run to run.
net = RegressionNet()

In [17]:
def preprocessing(text):
    """Lower-case `text` and insert spaces so that `str.split()` yields tokens.

    Steps, in order:
      1. lower-case; drop HTML-like tags, asterisks and URLs; turn literal
         backslash-n sequences into spaces;
      2. separate opening quotes from the following word;
      3. separate periods, commas, colons, ellipses and other symbols;
      4. separate brackets and double quotes;
      5. separate closing quotes;
      6. split English contractions Treebank-style ("don't" -> "do n't",
         "it's" -> "it 's").

    Args:
        text: raw text (str).

    Returns:
        The processed string; tokens are whitespace-separated and there may be
        runs of several spaces.
    """
    text = text.lower()

    # Tags become a space (not an empty string) so that the words on either
    # side of e.g. "<br />" are not glued into one token.
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'\*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Literal two-character sequence backslash + n, not a real newline.
    text = text.replace("\\n", ' ')

    # Opening quotes get a space after them. An apostrophe is treated as a
    # quote only when it is NOT between two word characters; word-internal
    # apostrophes ("don't", "it's") stay attached so the contraction rules at
    # the end of this function can still see them.
    text = re.sub(r"([«“‘„]|[`]+|(?<!\w)'|'(?!\w))", r"\1 ", text)

    text = re.sub(r"([a-zA-Z])(\.)", r"\1 \2", text)
    text = re.sub(r"(\.)([a-zA-Z])", r"\1 \2", text)
    # Commas/colons are split off unless followed by a digit (keeps "1,000").
    text = re.sub(r"([:,])([^\d])", r" \1 \2", text)
    text = re.sub(r"\.\.\.", r" ... ", text)
    text = re.sub(r"[;:@#$%&]", r" \g<0> ", text)
    # Final period of the text, optionally followed by closing brackets/quotes.
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r"\1 \2\3 ", text)
    text = re.sub(r"[?!]", r" \g<0> ", text)
    # Apostrophe followed by a space: detach it from the preceding character.
    text = re.sub(r"([^'])' ", r"\1 ' ", text)

    text = re.sub(r'(\S)(\))', r"\1 \2", text)
    text = re.sub(r'(\()(\S)', r"\1 \2", text)
    text = re.sub(r'(\))(\S)', r"\1 \2", text)
    text = re.sub(r'(\S)(\()', r"\1 \2", text)

    text = re.sub(r'(\S)(\")', r"\1 \2 ", text)
    text = re.sub(r'(\")(\S)', r"\1 \2", text)

    text = re.sub(r"([»”’]+)", r" \1", text)

    # Contractions. The text is already lower-case, so only lower-case
    # suffixes are listed. The lookahead accepts a following space or the end
    # of the text, so a contraction in final position is split as well.
    text = re.sub(r"([^' ])('s|'m|'d|')(?= |$)", r"\1 \2", text)
    text = re.sub(r"([^' ])('ll|'re|'ve|n't)(?= |$)", r"\1 \2", text)

    return text

In [18]:
def tokenize_dataset(dataset, stem=0):
    """
        Split every text on whitespace and optionally truncate the tokens.

        arg: dataset - iterable of texts (str)
             stem    - 0 keeps tokens unchanged; any other value is used as
                       the slice bound `token[:stem]`
        return: list with one list of tokens per text
    """
    tokenized = []
    for text in dataset:
        words = text.split()
        if stem != 0:
            # Crude "stemming": keep only the leading part of each token.
            words = [word[:stem] for word in words]
        tokenized.append(words)
    return tokenized

In [19]:
def vectorize(tokenized_texts, word2emb, emb_mode=50, test_mode = False):
    """Turn each tokenized text into the mean of its known token embeddings.

    Args:
        tokenized_texts: iterable of token lists.
        word2emb: mapping token -> embedding vector of length `emb_mode`.
        emb_mode: embedding dimensionality (default 50).
        test_mode: when True, also return the tokens missing from `word2emb`.

    Returns:
        Array of shape (number of texts, emb_mode). A text with no token in
        `word2emb` (including an empty text) maps to the zero vector. With
        `test_mode=True` the result is a tuple `(array, not_found)` where
        `not_found` lists every out-of-vocabulary token occurrence in order.
    """
    text_vectors = []
    not_found = []
    for tok_text in tokenized_texts:
        text_vec = np.zeros(emb_mode)
        known = 0
        for token in tok_text:
            if token in word2emb:
                text_vec += word2emb[token]
                known += 1
            else:
                not_found.append(token)
        # Dividing by zero here would produce a NaN vector (0/0) and silently
        # poison any model trained on the features; keep the zero vector.
        if known:
            text_vec /= known
        text_vectors.append(text_vec)

    # np.stack raises on an empty list; return an empty matrix of the right width.
    if text_vectors:
        matrix = np.stack(text_vectors)
    else:
        matrix = np.zeros((0, emb_mode))

    if test_mode:
        return matrix, not_found
    return matrix

In [20]:
# Training split: texts and labels live in two parallel files that are read
# line by line, with surrounding whitespace stripped from every line.
train_texts_path = "./filimdb_evaluation/FILIMDB/train.texts"
train_labels_path = "./filimdb_evaluation/FILIMDB/train.labels"

with open(train_texts_path, encoding='utf-8') as inp:
    train_texts = [line.strip() for line in inp]
with open(train_labels_path, encoding='utf-8') as inp:
    train_labels = [line.strip() for line in inp]

# Normalise the raw texts, then split them into whitespace-separated tokens.
proc_train = [preprocessing(text) for text in train_texts]
token_train = tokenize_dataset(proc_train)

In [21]:
%%time
# Load the GloVe table: each line is a token followed by its vector
# components, all separated by whitespace.
word2emb = {}
with open("glove.6B/glove.6B.50d.txt", mode="r", encoding='utf-8') as inp:
    for line in inp:
        spl = line.strip().split()
        word, components = spl[0], spl[1:]
        word2emb[word] = np.array(components, dtype=np.float32)

Wall time: 6.72 s


In [22]:
# One feature row per training text: the mean GloVe vector of its tokens
# (vectorize defaults to 50-dimensional embeddings).
X_train = vectorize(token_train, word2emb)

In [23]:
def gen_Y(y_labels):
    """Encode string labels as a NumPy array: 1 for 'pos', 0 for anything else."""
    return np.array([1 if label == 'pos' else 0 for label in y_labels])
# Binary targets aligned with train_texts: 1 for 'pos', 0 for every other label.
Y_train = gen_Y(train_labels)

In [24]:
# Sanity check: features and targets should have the same number of rows.
print(X_train.shape, Y_train.shape)

(15000, 50) (15000,)


In [25]:
# First 20 training examples as tensors: float32 features, int64 class ids.
X_small = torch.tensor(X_train[:20], dtype=torch.float32)
Y_small = torch.tensor(Y_train[:20], dtype=torch.long)

In [30]:
# Cross-entropy over raw logits. Quick sanity check: three examples whose
# logits strongly favour the correct class should give a loss close to zero.
loss_func = torch.nn.CrossEntropyLoss()
test1 = torch.tensor([[0, 20], [0, 20], [20, 0]], dtype=torch.float32)
test2 = torch.tensor([1, 1, 0], dtype=torch.long)
loss = loss_func(test1, test2)
print(loss)

# Plain SGD over all parameters of the network.
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)

tensor(0.)


In [27]:
# Full-batch training on the small subset: every iteration runs one forward
# pass, one backward pass and one SGD step over the same X_small / Y_small.
num_ep = 1000
for ep in tqdm(range(num_ep)):
    # Gradients accumulate across backward() calls, so reset them first.
    optimizer.zero_grad()

    # Call the module itself rather than .forward() so that any registered
    # hooks run; the output is the raw logits expected by CrossEntropyLoss.
    output = net(X_small)
    loss = loss_func(output, Y_small)
    loss.backward()
    optimizer.step()

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:09<00:00, 101.02it/s]


In [28]:
# Accuracy on X_small, the same examples the loop above trained on, so this
# measures how well the network fits them, not how well it generalises.
with torch.no_grad():  # evaluation only: no autograd graph is needed
    logits = net(X_small)
preds = logits.argmax(dim=1)
print((preds == Y_small).float().mean())

tensor(0.8000)
