In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
import torch, torchtext
from torchtext import data
import pdb

# Seed every random number generator the notebook imports, not only torch, so a
# fresh "Restart & Run All" is repeatable.
# NOTE(review): torchtext's batch shuffling is believed to draw from Python's
# `random` state — confirm against the installed torchtext version.
SEED = 32
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Excerpt field: lower-cased token sequences, batch dimension first; batches come
# back as a (token_ids, lengths) pair because include_lengths is set.
text = data.Field(
    sequential=True,
    lower=True,
    batch_first=True,
    include_lengths=True,
)

# Target field: a single float per example, used as-is (no vocabulary lookup).
label = data.Field(
    sequential=False,
    use_vocab=False,
    dtype=torch.float32,
)

In [None]:
# Pair each example attribute name with the Field that processes it
# (order matches the value lists passed to Example.fromlist for training rows).
fields = [('text', text), ('label', label)]

In [None]:
# Load the competition's train and test tables from the read-only Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# One torchtext Example per training row: the excerpt goes to the 'text' field and
# the target to the 'label' field.
train_set = [
    data.Example.fromlist([excerpt, target], fields)
    for excerpt, target in zip(train_df.excerpt, train_df.target)
]

In [None]:
# One torchtext Example per test row; the test table only feeds the 'text' field.
test_set = [
    data.Example.fromlist([excerpt], [('text', text)])
    for excerpt in test_df.excerpt
]

In [None]:
# vars(test_set[0])

In [None]:
# Wrap the example lists in Dataset objects; the test dataset carries only the 'text' field.
train_data = data.Dataset(train_set, fields)
test_data = data.Dataset(test_set, [('text', text)])

In [None]:
# vars(test_data.examples[0])

In [None]:
import torchtext.vocab as vocab

# Load pre-trained word vectors from the GloVe text file at this path.
# NOTE(review): the model cell sets embedding_dim = 300, which presumably must match
# the width of these vectors — confirm if the file is swapped.
custom_embeddings = vocab.Vectors(name='/kaggle/input/glove6b/glove.6B.300d.txt')

In [None]:
# Build the text vocabulary from the training data only and attach the loaded vectors to it.
text.build_vocab(train_data, vectors=custom_embeddings)

In [None]:
# Report how many distinct entries the text vocabulary holds.
print(f'size of input vocab {len(text.vocab)}')

In [None]:
# Show the token stored at index 120. Indexing the Vocab object directly performs a
# token -> index lookup, so an integer key would not return the token at that position;
# `itos` is the index -> token list.
text.vocab.itos[120]

In [None]:
# Use the GPU when one is available; batches are created directly on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build batch iterators for both datasets: 32 examples per batch, examples compared by
# token count, and each batch sorted by that key (the model packs padded sequences).
# NOTE(review): Iterator.splits is believed to create every iterator after the first with
# train=False, which sorts that dataset by sort_key — so test_iterator likely does NOT
# yield rows in test_df order. Confirm before aligning its predictions with test ids.
train_iterator, test_iterator = data.BucketIterator.splits((train_data, test_data),
                                                   batch_size=32,
                                                   sort_key= lambda x:len(x.text),
                                                   sort_within_batch=True,
                                                   device=device)

In [None]:
# Number of training batches and the example capacity they represent, derived from the
# iterator itself instead of hard-coded counts that go stale when data or batch size change.
len(train_iterator), len(train_iterator) * train_iterator.batch_size

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    """Bidirectional multi-layer LSTM that maps a token-id sequence to ``output_dim`` values.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values predicted per sequence.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied between stacked LSTM layers.
        pretrained_embeddings: optional ``(vocab_size, embedding_dim)`` tensor copied into
            the embedding table. When omitted, the vectors attached to the notebook-level
            ``text`` field are used, which is what this class always did before.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if pretrained_embeddings is None:
            # Backward-compatible default: fall back to the global field's vectors.
            pretrained_embeddings = text.vocab.vectors
        self.embedding.weight.data.copy_(pretrained_embeddings)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            dropout=dropout,
                            batch_first=True, bidirectional=True)

        # The head consumes the final states of BOTH directions, hence 2 * hidden_dim.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text, text_length):
        """Return predictions of shape ``(batch, output_dim)``.

        Args:
            text: ``(batch, seq_len)`` tensor of token ids (batch first).
            text_length: ``(batch,)`` tensor with the unpadded length of each row.
        """
        embedded = self.embedding(text)
        # Lengths must live on the CPU for packing. enforce_sorted=False lets callers pass
        # batches in any order; PyTorch restores the original order in the returned states.
        packed = nn.utils.rnn.pack_padded_sequence(embedded,
                                                   text_length.cpu(),
                                                   batch_first=True,
                                                   enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed)
        # hidden is (n_layers * 2, batch, hidden_dim); the last two slices are the top
        # layer's forward and backward final states. Using only hidden[-1] would throw
        # away the forward direction of the bidirectional LSTM.
        sequence_summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(sequence_summary)

In [None]:
from tqdm import tqdm

In [None]:
import time

def train(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch must expose ``batch.text`` as a ``(token_ids, lengths)`` pair and
    ``batch.label``; labels are reshaped to a column before being given to ``criterion``.
    The model is put in training mode and a tqdm progress bar wraps the loop.
    """
    model.train()
    running_loss = 0

    with tqdm(dataloader) as progress:
        for batch in progress:
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            tokens, lengths = batch.text
            scores = model(tokens, lengths)
            batch_loss = criterion(scores, batch.label.reshape((-1, 1)))

            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

    # Average over the number of batches, not the number of examples.
    return running_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader, criterion):
    """Return the mean per-batch loss of ``model`` over ``dataloader`` without training.

    The model is switched to evaluation mode and gradients are disabled. Each batch must
    expose ``batch.text`` as a ``(token_ids, lengths)`` pair and ``batch.label``; labels
    are reshaped to a column before being given to ``criterion``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in dataloader:
            tokens, lengths = batch.text
            scores = model(tokens, lengths)
            targets = batch.label.reshape((-1, 1))
            batch_losses.append(criterion(scores, targets).item())

    # Average over the number of batches, not the number of examples.
    return sum(batch_losses) / len(dataloader)

In [None]:
# Model hyper-parameters.
size_of_vocab = len(text.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 1
num_layers = 2
dropout = 0.2

# The iterators place every batch on `device`, so the model has to live there too;
# otherwise the first forward pass fails with a device mismatch whenever a GPU is present.
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes,
                  num_layers, dropout).to(device)

# Optimisation settings.
N_EPOCHS = 10
learning_rate = 0.01

def RMSELoss(yhat, y):
    """Root-mean-squared error between predictions ``yhat`` and targets ``y``."""
    squared_error = (yhat - y).pow(2)
    return squared_error.mean().sqrt()

# RMSE loss optimised with plain SGD.
criterion = RMSELoss
optimizer = torch.optim.SGD(model.parameters(), learning_rate)

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)

    # NOTE: no held-out split exists in this notebook, so this second number is the loss
    # on the SAME training data measured in eval mode (dropout off). It is not a
    # validation score; the variable keeps its historical name.
    valid_loss = evaluate(model, train_iterator, criterion)

    print(f'epoch {epoch + 1}/{N_EPOCHS} | train loss : {train_loss} '
          f'& train-set loss (eval mode) : {valid_loss}')

In [None]:
# Number of batches the test iterator will yield (test examples carry no label field).
len(test_iterator)

In [None]:
# Predict a target for every test excerpt, in the SAME row order as test_df.
#
# test_iterator is not used here: BucketIterator.splits builds it with train=False, which
# sorts the test set by length, so its predictions would not line up with test_df['id'].
# A plain Iterator with sorting and shuffling disabled walks test_data in its stored order.
ordered_test_iterator = data.Iterator(test_data, batch_size=32, device=device,
                                      train=False, shuffle=False, sort=False,
                                      sort_within_batch=False)

model.eval()
output = []
with torch.no_grad():
    for batch in ordered_test_iterator:
        # Local names deliberately avoid `text`, which is the notebook's Field object.
        tokens, lengths = batch.text

        # The model packs padded sequences, which expects lengths in descending order:
        # sort inside the batch, predict, then scatter back to the original positions.
        order = torch.argsort(lengths, descending=True)
        sorted_preds = model(tokens[order], lengths[order]).reshape(-1)
        preds = torch.empty_like(sorted_preds)
        preds[order] = sorted_preds

        # .cpu() keeps this working when the model runs on a GPU.
        output.extend(preds.cpu().tolist())

In [None]:
# Start the submission table from the test ids; the target column is added later.
submission = pd.DataFrame({'id': test_df['id']})

In [None]:
# Preview a few predictions and the total count instead of dumping the whole list.
output[:5], len(output)

In [None]:
# Attach predictions positionally: output[i] is written next to the i-th test id.
# NOTE(review): this is only correct if `output` is in test_df row order — verify how
# the predictions were generated.
submission['target'] = output

In [None]:
# Write the submission file to the current working directory, without the index column.
submission.to_csv('submission.csv', index=False)