In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Загрузка данных

In [None]:
# Root folder of the competition files on Kaggle.
input_path = "/kaggle/input/commonlitreadabilityprize/"

# Only the excerpt text (and, for training, its readability target) is read;
# the sample submission provides the frame that is filled in at the end.
train = pd.read_csv(input_path + "train.csv", usecols=["excerpt", "target"])
test = pd.read_csv(input_path + "test.csv", usecols=["excerpt"])
sub = pd.read_csv(input_path + "sample_submission.csv")

In [None]:
import spacy
import re
from collections import Counter
import string

# Подготовка текста (токенизация, создание словаря)

In [None]:
tok = spacy.load('en_core_web_sm')

# Compiled once here instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# Matches punctuation, digits, carriage returns, tabs and newlines.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    Runs of non-ASCII characters are replaced by a space, the text is
    lowercased, and punctuation, digits, carriage returns, tabs and newlines
    are replaced by spaces. The result is split with the spaCy tokenizer only
    (``tok.tokenizer``), not the full pipeline.

    Args:
        text: Raw excerpt string.

    Returns:
        List of token strings, without whitespace-only tokens.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", ascii_only.lower())
    # Replacing punctuation with spaces leaves runs of consecutive spaces, and
    # spaCy turns the extra spaces into stand-alone whitespace tokens. Drop
    # them so they do not enter the vocabulary or use up sequence positions.
    return [token.text for token in tok.tokenizer(nopunct) if not token.is_space]

In [None]:
# Count the number of occurrences of each token over all training excerpts
counts = Counter(
    token
    for excerpt in train['excerpt']
    for token in tokenize(excerpt)
)

In [None]:
# Drop tokens that occur fewer than two times in the training set
print("num_words before:", len(counts))
rare_words = [word for word, freq in counts.items() if freq < 2]
for word in rare_words:
    del counts[word]
print("num_words after:", len(counts))

In [None]:
# Build the vocabulary: index 0 is the empty string, index 1 is the "UNK"
# placeholder, and the retained tokens follow in the iteration order of `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: index for index, word in enumerate(words)}

In [None]:
def encode_sentence(text, vocab2index, N=200):
    """Convert ``text`` into a fixed-length array of vocabulary indices.

    Args:
        text: Raw excerpt string; it is tokenized with ``tokenize``.
        vocab2index: Mapping from token to integer id. Tokens missing from it
            are mapped to the id stored under ``"UNK"``.
        N: Length of the returned array. Longer sequences are truncated and
            shorter ones are right-padded with zeros.

    Returns:
        Tuple ``(encoded, length)``: the integer array of size ``N`` and the
        number of positions actually filled (at most ``N``).
    """
    ids = []
    for word in tokenize(text):
        ids.append(vocab2index.get(word, vocab2index["UNK"]))
    enc1 = np.array(ids)

    length = min(N, len(enc1))
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = enc1[:length]
    return encoded, length

In [None]:
# encode_sentence returns (ids_array, length), which is a ragged pair.
# Without dtype=object, NumPy >= 1.24 raises ValueError when asked to build an
# array from it (older versions only warned). The explicit object dtype keeps
# the stored value a 2-element array: [ids_array, length].
train['encoded'] = train['excerpt'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object)
)
train.head()

In [None]:
#from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Класс Dataset (особенность pytorch - нужно переопределять dataset под свои нужды)

In [None]:
class CommonLitReadabiltyDataset(Dataset):
    """Map-style dataset that pairs encoded excerpts with their targets.

    Every entry of ``X`` is indexed as ``entry[0]`` to obtain the array of
    token ids; anything else stored in the entry is ignored.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        """Return ``(token_ids, target)`` with the ids as an int32 tensor."""
        sample = self.X[idx]
        token_ids = sample[0].astype(np.int32)
        ids_tensor = torch.from_numpy(token_ids)
        target = self.y[idx]
        return ids_tensor, target

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = list(train['encoded'])
y = list(train['target'])

# Fix the seed so the 90/10 split is the same on every run; otherwise the
# validation scores are not comparable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

train_dataset = CommonLitReadabiltyDataset(X_train, y_train)
val_dataset = CommonLitReadabiltyDataset(X_valid, y_valid)

# Класс модели

In [None]:
class LSTM_model(nn.Module):
    """Embedding -> dropout -> LSTM -> linear + tanh -> linear regression head.

    The forward pass returns one value per input sequence, computed from the
    final hidden state of the last LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, linear1):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim, linear1)
        self.act1 = nn.Tanh()
        self.linear2 = nn.Linear(linear1, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a batch of token-id sequences to a ``(batch, 1)`` prediction."""
        embedded = self.dropout(self.embeddings(x))
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] is the final hidden state of the last LSTM layer.
        last_hidden = hidden[-1]
        projected = self.linear1(last_hidden)
        return self.linear2(self.act1(projected))

In [None]:
# Model hyperparameters
vocab_size = len(words)  # one embedding row per vocabulary entry
embedding_dim = 500
hidden_dim = 80
linear1 = 50

model_ft = LSTM_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    linear1=linear1,
)
model_ft = model_ft.to('cuda')
model_ft

In [None]:
def train_epoch(model,criterion,optimizer,dataset,epoch):
    """Run one training pass over ``dataset``.

    Args:
        model: Network to train; it is switched to train mode.
        criterion: Loss function called as ``criterion(predictions, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        dataset: Dataset yielding ``(inputs, label)`` pairs.
        epoch: Epoch number, used only in the printed messages.

    Returns:
        Tuple ``(model, epoch_loss)`` where ``epoch_loss`` is the per-sample
        average of the batch losses (its square root is printed as RMSE).
    """
    # Run on whatever device the model already lives on; fall back to the
    # previous hard-coded 'cuda' only when the model has no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    data_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)
    dataset_size = len(dataset)

    print(f"Epoch#{epoch}. Train")
    model.train()

    running_loss = 0.0  # sum of per-sample losses over the epoch

    for inputs, labels in tqdm(data_loader):
        # Move the batch to the model's device.
        inputs = inputs.to(device).type(torch.long)
        labels = labels.to(device).type(torch.float)
        optimizer.zero_grad()
        outputs = model(inputs)
        # The notebook's model emits (batch, 1) while the collated targets are
        # (batch,). Passing them as-is makes the loss broadcast to
        # (batch, batch) and compare every prediction with every target.
        # Flatten both so the comparison is element-wise.
        loss = criterion(outputs.reshape(-1), labels.reshape(-1))
        loss.backward()   # backpropagate gradients
        optimizer.step()  # optimizer step
        # The criterion returns a batch mean; weight it by the batch size.
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / dataset_size
    print(f'Loss RMSE: { np.sqrt(epoch_loss) }')
    print(f"Epoch#{epoch} (Train) completed.")
    return model, epoch_loss

In [None]:
def valid_epoch(model,criterion,optimizer,dataset,epoch):
    """Evaluate ``model`` on ``dataset`` without updating its weights.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss function called as ``criterion(predictions, targets)``.
        optimizer: Unused; kept so the signature mirrors ``train_epoch``.
        dataset: Dataset yielding ``(inputs, label)`` pairs.
        epoch: Epoch number, used only in the printed messages.

    Returns:
        Tuple ``(model, epoch_loss)`` where ``epoch_loss`` is the per-sample
        average of the batch losses (its square root is printed as RMSE).
    """
    # Run on whatever device the model already lives on; fall back to the
    # previous hard-coded 'cuda' only when the model has no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    # Sample order does not affect the accumulated loss, so no shuffling.
    data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=4)
    dataset_size = len(dataset)

    print(f"Epoch#{epoch}. Validation")
    model.eval()

    running_loss = 0.0  # sum of per-sample losses over the epoch

    with torch.no_grad():
        for inputs, labels in tqdm(data_loader):
            # Move the batch to the model's device.
            inputs = inputs.to(device).type(torch.long)
            labels = labels.to(device).type(torch.float)
            outputs = model(inputs)
            # Flatten both tensors: (batch, 1) predictions against (batch,)
            # targets would otherwise broadcast to (batch, batch) inside the
            # loss and report a wrong value.
            loss = criterion(outputs.reshape(-1), labels.reshape(-1))
            # The criterion returns a batch mean; weight it by the batch size.
            running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / dataset_size
    print(f'Loss RMSE: { np.sqrt(epoch_loss) } ')
    print(f"Epoch#{epoch} (Validation) completed. ")
    return model, epoch_loss

In [None]:
from tqdm import tqdm

In [None]:
# Mean-squared-error objective; its square root is what the epoch functions print.
criterion = nn.MSELoss()
# Adam over all parameters of the model.
optimizer = optim.Adam(model_ft.parameters(), lr=3e-5)

# Тренировка модели

In [None]:
import copy

best_model = model_ft
best_epoch = 1
best_loss = float('inf')
num_epochs = 10

train_loss_history = []
val_loss_history = []

for epoch in range(1, num_epochs + 1):
    # training
    model_ft, train_loss = train_epoch(model_ft, criterion, optimizer, train_dataset, epoch)
    train_loss_history.append(train_loss)

    # validation
    model_ft, val_loss = valid_epoch(model_ft, criterion, optimizer, val_dataset, epoch)
    val_loss_history.append(val_loss)

    if val_loss < best_loss:
        # Record the new best score; without this every epoch counts as an
        # improvement and the comparison is meaningless.
        best_loss = val_loss
        best_epoch = epoch
        # Snapshot the weights. A plain assignment only aliases model_ft,
        # which keeps training, so the "best" model would always end up being
        # the last-epoch model.
        best_model = copy.deepcopy(model_ft)

# Inference - для соревнования на Kaggle (вывод на тестовой выборке)

In [None]:
model = best_model
model.eval()

# encode_sentence returns (ids_array, length), which is a ragged pair.
# dtype=object is required for NumPy >= 1.24 to build an array from it.
test['encoded'] = test['excerpt'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object)
)
excerpts_test = test['encoded']

# Stack the id arrays into a single (n_samples, N) array before converting,
# and place the batch on the same device as the model.
test_ids = np.array([excerpts_test[i][0] for i in range(len(test))])
X_test = torch.from_numpy(test_ids).type(torch.long).to(next(model.parameters()).device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    y_hat = model(X_test)
y_hat

In [None]:
# Copy the predictions to host memory and store them as the submission target.
sub["target"] = y_hat.detach().cpu().numpy()
sub

In [None]:
# Write the submission frame to CSV without the index column.
sub.to_csv("submission_lstm.csv", index=False)