## Бібліотеки

In [None]:
import numpy as np
import pandas as pd
import torch

## Дані

In [None]:
# Read the training split, then keep only the rows whose excerpt is present.
df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
has_excerpt = df['excerpt'].notnull()
df = df[has_excerpt]

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) remaining after dropping rows without an excerpt.
df.shape

## Очистка даних

In [None]:
# Lower-case every excerpt and expose the result as a plain array.
lowered_excerpts = df['excerpt'].str.lower()
text = lowered_excerpts.values

In [None]:
# Display the lower-cased excerpts.
text

In [None]:
# Print the ASCII punctuation characters defined by the standard library.
from string import punctuation
print(punctuation)

In [None]:
# Whitelist of characters kept during cleaning: lower-case Latin letters plus the space.
alphabet = "qwertyuiopasdfghjklzxcvbnm "

In [None]:
# Keep only characters from `alphabet`. Other whitespace (newlines, tabs, ...)
# is turned into a space rather than dropped, otherwise the last word of one
# line and the first word of the next would be fused into a single token.
# Every remaining character (digits, punctuation, ...) is removed.
text = [
    ''.join(c if c in alphabet else (' ' if c.isspace() else '') for c in te)
    for te in text
]

In [None]:
# Inspect the first two cleaned excerpts.
text[:2]

In [None]:
# Apply the alphabet filter to the first excerpt only and display the result.
''.join([c for c in text[0] if c in alphabet])

In [None]:
# Concatenate all excerpts into one space-separated string.
all_text = ' '.join(text)

In [None]:
# Display the first 100 characters of the concatenated text.
all_text[:100]

## Кодування тексту

In [None]:
from collections import Counter

# Split the whole corpus on whitespace and tally how often each word occurs.
words = all_text.split()
total_words = len(words)
count_words = Counter(words)

# Asking for as many entries as there are tokens returns every distinct word,
# ordered from most to least frequent, as (word, count) pairs.
sorted_words = count_words.most_common(total_words)

In [None]:
#count_words

In [None]:
# Toy example of the value -> position mapping built for the vocabulary.
{v:i for i, v in enumerate(["hello", "world"])}

In [None]:
# First attempt: 0-based ids, so the most frequent word gets id 0.
# pad_features pads with 0 as well, so the mapping is rebuilt 1-based in the next cell.
vocab_to_int = {w:i for i, (w,c) in enumerate(sorted_words)}

In [None]:
# Word -> id by frequency rank, numbered from 1 so that 0 never denotes a word.
vocab_to_int = {word: rank for rank, (word, _count) in enumerate(sorted_words, start=1)}

In [None]:
#vocab_to_int

In [None]:
# Number of distinct words in the vocabulary.
len(vocab_to_int)

In [None]:
# Turn each cleaned excerpt into the list of ids of its whitespace-separated words.
text_int = [[vocab_to_int[w] for w in te.split()] for te in text]
print (text_int[0:3])

In [None]:
# Regression targets taken from the 'target' column; display their mean.
encoded_labels = df['target'].values
encoded_labels.mean()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Number of tokens per encoded excerpt: plot the distribution, then show summary statistics.
text_len = [len(x) for x in text_int]
pd.Series(text_len).hist()
plt.show()
pd.Series(text_len).describe()

In [None]:
# Positions of excerpts that still contain at least one token after cleaning.
non_empty = [i for i, l in enumerate(text_len) if l > 0]
# Keep sequences and labels aligned by filtering both with the same positions.
text_int = [text_int[i] for i in non_empty]
encoded_labels = [encoded_labels[i] for i in non_empty]

In [None]:
# Display the id sequence of the first excerpt.
text_int[0]

In [None]:
# Display the first ten targets.
encoded_labels[:10]

In [None]:
# Sanity check: labels and sequences should have the same count.
len(encoded_labels), len(text_int)

In [None]:
def pad_features(reviews_int, seq_length):
    """Build a fixed-width integer matrix from variable-length id sequences.

    Row ``i`` holds ``reviews_int[i]``. A sequence shorter than ``seq_length``
    is left-padded with 0; a longer one keeps only its first ``seq_length``
    ids.

    Returns an array of shape ``(len(reviews_int), seq_length)`` with dtype int.
    """
    n_rows = len(reviews_int)
    features = np.zeros((n_rows, seq_length), dtype = int)

    for row, review in enumerate(reviews_int):
        kept = review[:seq_length]
        n_kept = len(kept)
        if n_kept:
            # Right-align the ids; the untouched leading cells stay 0 (padding).
            features[row, seq_length - n_kept:] = np.array(kept)

    return features

In [None]:
# Pad or truncate every encoded excerpt to exactly 50 ids.
features = pad_features(text_int, 50)

In [None]:
# Display the first ten padded rows.
features[:10,:]

In [None]:
# (number of excerpts, sequence length)
features.shape

## Підготовка датасету

In [None]:
# Sequential split: the first 80% of rows train the model, the rest is halved
# into validation and test parts (no shuffling happens at this stage).
split_frac = 0.8
len_feat = len(features)
train_end = int(split_frac*len_feat)

train_x, remaining_x = features[:train_end], features[train_end:]
train_y, remaining_y = encoded_labels[:train_end], encoded_labels[train_end:]

# Both halves are cut at the same position so features and labels stay aligned.
valid_end = int(len(remaining_x)*0.5)

valid_x, test_x = remaining_x[:valid_end], remaining_x[valid_end:]
valid_y, test_y = remaining_y[:valid_end], remaining_y[valid_end:]

In [None]:
# Sizes of the train / validation / test parts.
len(train_x), len(valid_x), len(test_x)

In [None]:
from torch.utils.data import DataLoader, TensorDataset
# Wrap each (features, targets) pair into a TensorDataset.
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(np.array(xs)), torch.from_numpy(np.array(ys)))
    for xs, ys in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))
)

batch_size = 50
# All three loaders share the same settings: shuffled batches of `batch_size`,
# four worker processes, and the trailing incomplete batch is dropped.
loader_options = dict(shuffle=True, batch_size=batch_size, num_workers=4, drop_last=True)
train_loader = DataLoader(train_data, **loader_options)
valid_loader = DataLoader(valid_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

In [None]:
# Pull a single batch to check tensor shapes before training.
dataiter = iter(train_loader)
# Use the built-in next(): DataLoader iterators no longer expose a .next() method.
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

## Архітектура нейронної мережі

In [None]:
from torch import nn

class RegLSTM(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear head, one value per sequence.

    Args:
        n_vocab: size of the embedding table (number of distinct ids).
        n_embed: embedding dimension.
        n_hidden: number of hidden units in each LSTM layer.
        n_output: number of outputs of the linear head per time step.
        n_layers: number of stacked LSTM layers.
        drop_p: dropout probability used between LSTM layers and before the head.
    """

    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM

        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)

    def forward (self, input_words):
        """Run a batch of id sequences through the network.

        Args:
            input_words: integer tensor of shape (batch, seq_length).

        Returns:
            A tuple ``(prediction, h)`` where ``prediction`` has shape (batch,)
            and holds the last head output at the last time step of every
            sequence, and ``h`` is the ``(h_n, c_n)`` state returned by the LSTM.
        """
        # Take the batch size from the input itself, not from a global, so the
        # model works for any batch size (including a single sequence).
        n_batch = input_words.size(0)

        embedded_words = self.embedding(input_words)    # (batch, seq_length, n_embed)
        # No initial state is passed, so the LSTM starts every batch from zeros.
        lstm_out, h = self.lstm(embedded_words)         # (batch, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)  # (batch*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch*seq_length, n_output)
        per_sequence = fc_out.view(n_batch, -1)         # (batch, seq_length*n_output)

        # Keep only the final value of each row: last output of the last time step.
        last_out = per_sequence[:, -1]                  # (batch,)

        return last_out, h

    def init_hidden (self, batch_size):
        """Return a zero ``(h, c)`` pair shaped (n_layers, batch_size, n_hidden).

        The tensors share dtype and device with the model parameters, so they
        always live where the model lives. Note that ``forward`` does not take
        an initial state; this helper is only useful to callers that manage the
        state themselves.
        """
        weights = next(self.parameters()).detach()
        state_shape = (self.n_layers, batch_size, self.n_hidden)
        return (weights.new_zeros(state_shape), weights.new_zeros(state_shape))


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyper-parameters of the regressor.
n_layers = 2
output_size = 1
embedding_dim = 5
hidden_dim = 256 // 4
vocab_size = len(vocab_to_int) + 1  # word ids start at 1; the extra slot is the 0 padding id

net = RegLSTM(vocab_size, embedding_dim, hidden_dim, output_size, n_layers, drop_p=0.1)
net.to(device)
print(net)

## Навчання нейронної мережі

In [None]:
from torch import optim
from tqdm import tqdm

criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.00001)

print_every = 300
step = 0
n_epochs = 50  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
# is_available must be *called*; the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(n_epochs):
    # net.forward() takes no initial state and starts every batch from zeros,
    # so no hidden state needs to be created or detached between batches.
    for inputs, labels in tqdm(train_loader):
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)

        net.zero_grad()
        output, h = net(inputs)
        # Square root of the batch MSE, i.e. the loss being optimised is RMSE.
        loss = criterion(output.squeeze(), labels.float())**0.5
        loss.backward()
        # In-place variant; the un-suffixed clip_grad_norm is deprecated.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        if (step % print_every) == 0:
            net.eval()
            valid_losses = []

            # No gradients are needed while measuring validation loss.
            with torch.no_grad():
                for v_inputs, v_labels in valid_loader:
                    # Move the *validation* batch to the device (not the training one).
                    v_inputs, v_labels = v_inputs.to(device), v_labels.to(device)

                    v_output, v_h = net(v_inputs)
                    v_loss = criterion(v_output.squeeze(), v_labels.float())**0.5
                    valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

In [None]:
# Display the targets of the last training batch left over from the loop above.
labels

## Тестування отриманих результатів

In [None]:
net.eval()
test_losses = []
num_correct = 0
test_h = net.init_hidden(batch_size)

for v_inputs, v_labels in test_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs)#.to(device)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

                preds = torch.round(v_output.squeeze())
                correct_tensor = preds.eq(v_labels.float().view_as(preds).to(device))
                correct = np.squeeze(correct_tensor.to("cpu").numpy())
                num_correct += np.sum(correct)

In [None]:
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

In [None]:
print("Test Loss: {:.4f}".format(np.mean(valid_losses)))

In [None]:
def predict(net, review, seq_length = 200):
    """Predict the target value for a single piece of text.

    Args:
        net: trained model whose forward pass returns ``(output, state)``.
        review: the text to score.
        seq_length: length the encoded text is padded/truncated to.

    Returns:
        The prediction as a Python float, or ``None`` (after printing a
        message) when the text contains no word known to the vocabulary.
    """
    # Run on the device the model lives on rather than guessing from CUDA availability.
    device = next(net.parameters()).device

    # Lower-case and keep only characters from `alphabet` (other whitespace
    # becomes a space) so the tokens are comparable to the vocabulary keys.
    cleaned = ''.join(
        c if c in alphabet else (' ' if c.isspace() else '')
        for c in str(review).lower()
    )
    # Words missing from the vocabulary have no id; skip them instead of raising KeyError.
    encoded_words = [vocab_to_int[word] for word in cleaned.split() if word in vocab_to_int]

    if not encoded_words:
        print("Треба більше слів написати!")
        return None

    padded_words = pad_features([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)

    net.eval()
    with torch.no_grad():
        output, _ = net(padded_words)

    # Flatten first so that indexing works even when the output has one element.
    return output.reshape(-1)[0].item()

In [None]:
# Score a short sample text, padded to 200 ids.
seq_length = 200
test_review = 'some text'
predict(net, test_review, seq_length)

In [None]:
# Score a single-word sample, padded to 200 ids.
seq_length = 200
test_review = 'love'
predict(net, test_review, seq_length)

In [None]:
# Score another sample, re-using the seq_length set in the previous cell.
predict(net, 'power look', seq_length)