In [None]:
# !pip install kaggle

# !rm -rf .kaggle/

# !mkdir .kaggle
# !touch .kaggle/kaggle.json
# !chmod 600 ~/.kaggle/kaggle.json
# !pip install kaggle -q      # At first I suspected the kaggle API had stopped working, which is why there was no .kaggle folder. (This did not work.)
# !rm -rf /root/.kaggle.      # When I tried to create the folder, it reported that the file or directory already exists.
# !mkdir /root/.kaggle        # Succeeded.
# !mv kaggle.json /root/.kaggle/kaggle.json    # Not sure whether the full destination path is required; I previously used only /root/.kaggle and it failed. I have not had time to verify this.
# !ls /root/.kaggle/kaggle.json
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
#!mv .kaggle/kaggle.json /root
#!mv .kaggle /root/
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# !unzip ./imdb-dataset-of-50k-movie-reviews.zip

In [None]:
# Colab only: mount Google Drive at /content/drive so files under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from collections import Counter

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
# The value is kept as a plain string and echoed so the choice is visible in the output.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

In [None]:
# Load the labelled training CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/NLP243/243_HW1/data/hw1_train-1.csv')
# Rename the raw text column to 'review', the name the rest of the notebook uses.
# NOTE(review): the pattern 'textstr ' ends with a space; presumably the CSV header
# does too -- confirm with df.columns.tolist().
df.columns = df.columns.str.replace('textstr ', 'review')
# Hold out 20% for validation. The fixed random_state makes the split (and the
# vocabulary / label statistics derived from it) identical on every run.
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Preview the first 10 rows of the training split.
train_data.head(10)

In [None]:
# Show the column names of df to check the result of the column rename.
print(df.columns.tolist())

In [None]:
# Display the 'review' text column of the training split.
train_data['review']

In [None]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# Blank English pipeline; only its vocab is used below.
nlp = English()
# Create a blank Tokenizer with just the English vocab
# NOTE(review): no prefix/suffix/infix rules are passed, so this presumably splits on
# whitespace only (e.g. 'test.' stays one token) -- confirm with the probe cell below.
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Sanity-check the tokenizer on a short sentence; the cell displays the tokenized result.
tokens = tokenizer('this is a test.')
tokens

In [None]:
# Maximum number of distinct tokens kept in the vocabulary (before '<unk>' is added).
vocab_size = 8_000

# Flat list of every token string seen in the training reviews, in corpus order.
all_tokens = []
for rev in train_data['review']:
  tokens = tokenizer(rev)
  all_tokens += [tok.text for tok in tokens]

In [None]:
# Frequency table over all training tokens.
count = Counter(all_tokens)
# Keep the `vocab_size` most frequent tokens; ids follow frequency rank (0 = most common).
top_entries = count.most_common(vocab_size)
tokens, counts = zip(*top_entries)
vocab = dict(zip(tokens, range(len(tokens))))
# Every out-of-vocabulary token shares the last id.
vocab['<unk>'] = len(vocab)
vocab

In [None]:
# '<unk>' was appended last, so its id equals the number of kept tokens.
print(vocab['<unk>'])
# 'relevant' is only in the vocabulary if it ranks among the most frequent tokens of
# this split; fall back to the '<unk>' id instead of raising KeyError.
print(vocab.get('relevant', vocab['<unk>']))

In [None]:
# Missing values become the literal string 'none' (which is also a key of label_dict).
# Both splits are filled: the validation labels go through the same `label.split()`
# encoding later, which would fail on NaN. Reassigning (instead of inplace=True)
# avoids mutating a slice produced by train_test_split.
train_data = train_data.fillna('none')
val_data = val_data.fillna('none')
uniq_labels = train_data['label'].unique()

# A label cell may hold several space-separated labels; collect the distinct
# individual names. Sorting makes the printed mapping identical on every run
# (plain set order depends on string hashing).
all_labels = sorted({name for line in uniq_labels for name in line.split()})

# Print a dict-literal template that can be pasted into a cell as the label mapping.
print("{")

for i, name in enumerate(all_labels):
  print('\t"'+name+'" : ', i ," ,")

print("}")

In [None]:
# Label name -> class index. The position of a name in the list is its index,
# so the order below must not be changed.
label_dict = {
    name: index
    for index, name in enumerate([
        "movie.subjects",              # 0
        "movie.starring.character",    # 1
        "movie.gross_revenue",         # 2
        "movie.initial_release_date",  # 3
        "movie.production_companies",  # 4
        "movie.starring.actor",        # 5
        "person.date_of_birth",        # 6
        "actor.gender",                # 7
        "movie.produced_by",           # 8
        "movie.directed_by",           # 9
        "movie.rating",                # 10
        "movie.estimated_budget",      # 11
        "movie.music",                 # 12
        "movie.language",              # 13
        "none",                        # 14
        "gr.amount",                   # 15
        "movie.country",               # 16
        "movie.genre",                 # 17
        "movie.locations",             # 18
    ])
}

In [None]:
class IMDBDataset(Dataset):
  """Map-style dataset that turns one DataFrame row into (token ids, label ids).

  Args:
    data: frame with a 'review' column (text) and a 'label' column
      (one or more space-separated label names).
    vocab: token string -> integer id; must contain the key '<unk>'.
    label_dict: label name -> integer id.

  Relies on the notebook-level `tokenizer` and `device` objects.
  """

  def __init__(self, data: pd.DataFrame, vocab, label_dict):
    self.data = data
    self.vocab = vocab
    # Id substituted for any token that is not in the vocabulary.
    self.default = self.vocab['<unk>']
    self.labels = label_dict

  def tokenize(self, text: str):
    """Split `text` with the notebook-level tokenizer and return the token strings."""
    return [i.text for i in tokenizer(text)]

  def encode_tokens(self, tokens):
    """Map token strings to ids (unknown tokens -> '<unk>' id) as a 1-D long tensor."""
    encoded = [self.vocab.get(token, self.default) for token in tokens]
    # Explicit dtype: an empty list would otherwise produce a float tensor,
    # which cannot be used as embedding indices.
    return torch.tensor(encoded, device=device, dtype=torch.long)

  def encode_label(self, label: str):
    """Map a space-separated label string to a 1-D long tensor of label ids.

    Raises:
      KeyError: if a label name is not present in the mapping given to __init__.
    """
    # Use the mapping passed to the constructor rather than a global of the same
    # name, so the dataset honours the `label_dict` argument it was built with.
    encoded = [self.labels[k] for k in label.split()]
    return torch.tensor(encoded, device=device, dtype=torch.long)

  def __getitem__(self, n: int):
    """Return (token id tensor, label id tensor) for the n-th row (positional)."""
    review = self.data['review'].iloc[n]
    classes = self.data['label'].iloc[n]
    return self.encode_tokens(self.tokenize(review)), self.encode_label(classes)

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.data)

In [None]:
# Wrap both splits; they share the vocabulary built from the training split and the same label mapping.
train_ds = IMDBDataset(train_data, vocab, label_dict)
val_ds = IMDBDataset(val_data, vocab, label_dict)

In [None]:
# One example per batch, shuffled, with the default collation (no collate_fn is passed).
# NOTE(review): items are variable-length token/label tensors, so batch_size=1 is
# presumably used to avoid stacking unequal lengths -- confirm before raising it.
train_loader = DataLoader(train_ds, batch_size=1, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1, shuffle=True)

In [None]:
# Pull one (token ids, label ids) batch from the training loader to inspect it.
next(iter(train_loader))

In [None]:
# class MLP(nn.Module):
#   #a multi-layered perceptron based classifier
#     def __init__(self, num_features,out_features):
#         """
#         Args:
#             num_features (int): the size of the input feature vector
#         """
#         super(MLP, self).__init__()
#         self.fc1 = nn.Linear(in_features=num_features, out_features=64)
#         print("num f:", num_features)
#         self.fc2 = nn.Linear(in_features=64,out_features=32)
#         self.fc3 = nn.Linear(in_features=32,out_features=out_features)

#     def forward(self, x_in, apply_softmax=False):
#         """The forward pass of the classifier
        
#         Args:
#             x_in (torch.Tensor): an input data tensor. 
#                 x_in.shape should be (batch, num_features)
#             apply_softmax (bool): a flag for the sigmoid activation
#                 should be false if used with the Cross Entropy losses
#         Returns:
#             the resulting tensor. tensor.shape should be (batch,)
#         """
#         y_out_1 = torch.relu(self.fc1(x_in))
#         y_out_2 = self.fc2(y_out_1)
#         y_out = self.fc3(y_out_2)
#         return y_out

class MLP(nn.Module):
  """Bag-of-embeddings classifier: embed tokens, sum over the sequence, then a 2-layer MLP.

  Args:
    n_tokens: number of rows in the embedding table (must exceed the largest token id).
    emb_dim: embedding size.
    hidden_dim: width of the hidden layer.
    output_dim: number of output logits per example.
  """

  def __init__(self, n_tokens, emb_dim, hidden_dim, output_dim):
    super().__init__()
    self.embedding = nn.Embedding(n_tokens, emb_dim)
    self.fc1 = nn.Linear(emb_dim, hidden_dim)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Compute one row of logits per example.

    Args:
      x: integer token ids of shape [B, seq_len]; a single 1-D sequence
        [seq_len] is treated as a batch of one.

    Returns:
      Tensor of shape [B, output_dim] holding raw (un-normalised) logits.
    """
    if x.dim() == 1:
      x = x.unsqueeze(0)
    # [B, seq_len] -> [B, seq_len, emb_dim] -> [B, emb_dim]:
    # embedded = embedding(tok_0) + embedding(tok_1) + ... for each example,
    # so the output has one row per example instead of one row per token.
    embedded = self.embedding(x).sum(dim=1)
    hidden = self.relu(self.fc1(embedded))
    # The output width comes from fc2 itself; no module-level `output_dim` is needed.
    return self.fc2(hidden)
    

In [None]:
# The embedding table needs exactly one row per vocabulary id ('<unk>' included).
# Deriving the size from `vocab` keeps this cell idempotent: the previous
# `vocab_size = vocab_size + 1` grew on every re-run and was then incremented again.
vocab_size = len(vocab)
emb_dim = 100
hidden_dim = 200
# One logit per label in the label mapping.
output_dim = len(label_dict)
model = MLP(vocab_size, emb_dim, hidden_dim, output_dim).to(device)

In [None]:
def train(loader, model, optimizer, loss_fn):
  """Run one optimisation pass over `loader`.

  Args:
    loader: iterable of (x, y) batches.
    model: module called as `model(x)` to produce logits.
    optimizer: optimizer over the model's parameters.
    loss_fn: callable `loss_fn(logits, y)` returning a scalar tensor.

  Returns:
    Mean of the per-batch loss values, or NaN when `loader` yields no batches.
  """
  model.train()
  losses = []
  pbar = tqdm(loader)
  for x, y in pbar:
    optimizer.zero_grad()
    logits = model(x)
    loss = loss_fn(logits, y)

    loss.backward()  # calculate gradients for w/b
    optimizer.step()  # update weights according to optimizer rules

    # Progress is reported through the bar only; per-batch prints would flood the output.
    batch_loss = loss.item()
    losses.append(batch_loss)
    pbar.set_postfix({'loss': batch_loss})

  if not losses:
    # Nothing was processed: there is no meaningful average (avoids ZeroDivisionError).
    return float('nan')
  return sum(losses) / len(losses)


def evaluate(loader, model, loss_fn, score_fn):
  """Score the model's argmax predictions over `loader`.

  Args:
    loader: iterable of (x, y) batches.
    model: module called as `model(x)` to produce logits.
    loss_fn: unused; kept so existing call sites keep working.
    score_fn: callable `score_fn(labels, predictions)`; both arguments are lists
      holding one NumPy array per batch.

  Returns:
    Whatever `score_fn` returns.
  """
  model.eval()
  predictions = []
  labels = []
  # No gradients are needed for scoring; skipping them saves memory and time.
  with torch.no_grad():
    for x, y in tqdm(loader):
      logits = model(x)
      pred = torch.argmax(logits, dim=-1)
      # .cpu() first: Tensor.numpy() is not available for tensors living on a GPU.
      predictions.append(pred.cpu().numpy())
      labels.append(y.cpu().numpy())
  score = score_fn(labels, predictions)
  return score

In [None]:
# Debug cell: push ONE training batch through the model and report tensor shapes.
# It only uses names that already exist at this point (`model`, `train_loader`) and
# does not touch the weights; the full optimisation loop is the `train()` function.
with torch.no_grad():
  x, y = next(iter(train_loader))
  logits = model(x)
print("Input shape", x.shape)
print("output shape", y.shape)
print("logits shape", logits.shape)

In [None]:
# Training configuration.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
loss_fn = nn.BCEWithLogitsLoss()
score_fn = accuracy_score
n_epochs = 3
# Best validation accuracy seen so far; a checkpoint is written only when it improves.
best_acc = 0
for epoch in range(n_epochs):
  avg_loss = train(train_loader, model, optimizer, loss_fn)
  print('train loss: ', avg_loss)
  accuracy = evaluate(val_loader, model, loss_fn, score_fn)
  print('val accuracy: ', accuracy)
  # Save only models that clear the 0.7 floor AND beat the previous best.
  if accuracy > best_acc and accuracy > 0.7:
    # Record the new best so a later, worse epoch cannot overwrite the checkpoint.
    best_acc = accuracy
    torch.save(model.state_dict(), 'best-model.pt')

In [None]:
# Padding experiment: a length-6 id sequence is extended to length 10 with -1 fillers.
s1 = torch.randint(0, 10, (1, 6))
# Filler block of -1.0 values (default float dtype).
pad = torch.full((1, 4), -1.0)
s3 = torch.cat((s1, pad), dim=1)
# A second sequence that already has the target length of 10.
s2 = torch.randint(0, 10, (1, 10))

In [None]:
# Scratch check: concatenating a one-element list along dim 0 gives back a tensor shaped like s1.
torch.cat([s1], dim=0)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Scratch: unfitted multi-label binarizer; none of the cells above use it.
mlb = MultiLabelBinarizer()

In [None]:
# Fit on a single sample that carries three labels, so the binarizer learns those three classes.
mlb.fit([['apple', 'banana', 'orange']])

In [None]:
# Encode one sample with two of the three known labels; with classes in sorted
# order this should display [[1, 1, 0]] -- check the cell output.
mlb.transform([['apple', 'banana']])

In [None]:
# Scratch: unfitted one-hot encoder; nothing in the visible cells uses it yet.
ohe = OneHotEncoder()