In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from torchtext import vocab
import random
from sklearn.model_selection import KFold

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def seed_everything(seed):
    """Seed all random number generators used by this notebook.

    Sets ``PYTHONHASHSEED`` in the process environment and seeds Python's
    ``random`` module, NumPy's global generator, and PyTorch (CPU plus the
    current CUDA device), then prints a confirmation line.

    Args:
        seed: Integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The cuDNN flags (cudnn.deterministic / cudnn.benchmark) are not touched
    # here; the original author left those two assignments commented out.
    print("Seeding done")


seed_everything(42)

In [None]:
# File name of the pre-trained GloVe vectors and the directory it is loaded from.
VECTOR_NAME = 'glove.6B.300d.txt'
VECTOR_PATH = './'
# Locations of the competition's train and test CSV files.
train_dataset_path = '../input/commonlitreadabilityprize/train.csv'
test_dataset_path = '../input/commonlitreadabilityprize/test.csv'

In [None]:
data_csv = pd.read_csv(train_dataset_path)
data_csv.head()

In [None]:
len(data_csv)

In [None]:
# Number of token ids every excerpt is padded / truncated to.
TEXT_LENGTH = 200
# Width of one word vector; matches the 300-d GloVe file named in VECTOR_NAME.
EMBEDDING_SIZE = 300
# Default hidden size of the recurrent layers in the models below.
HIDDEN_SIZE = 200
# Default batch size used as a keyword default by the models and by test().
BATCH_SIZE=64

In [None]:
# Whitespace-separated word count of every excerpt.
data_csv['lengths'] = data_csv['excerpt'].apply(lambda x: len(str(x).split()))
# Lower-case the excerpts in place; the vocabulary is built from this lower-cased text.
data_csv['excerpt'] = data_csv['excerpt'].apply(lambda x:str(x).lower())

In [None]:
def get_word_to_index(texts):
    """Build a token -> integer id vocabulary from an iterable of texts.

    Ids 0, 1 and 2 are reserved for ``<PAD>``, ``<START>`` and ``<END>``.
    Every other token produced by ``word_tokenize`` receives the next free
    id (starting at 3) the first time it is encountered.

    Args:
        texts: Iterable of strings to tokenize.

    Returns:
        dict mapping each token to its integer id.
    """
    vocabulary = {'<PAD>': 0, '<START>': 1, '<END>': 2}
    next_id = 3

    for passage in texts:
        for token in word_tokenize(passage):
            if token in vocabulary:
                continue
            vocabulary[token] = next_id
            next_id += 1

    return vocabulary

In [None]:
# Vocabulary built from the (already lower-cased) training excerpts.
word_to_index_dict = get_word_to_index(data_csv['excerpt'])
# Number of entries in the vocabulary, including the three reserved tokens.
VOCABULARY_SIZE = len(word_to_index_dict.keys())
print(VOCABULARY_SIZE)

In [None]:
def get_tensor_from_text(text):
    """Encode ``text`` as a fixed-length tensor of vocabulary ids.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens
    missing from ``word_to_index_dict`` are dropped (there is no unknown-word
    id). The id sequence is then truncated or right-padded with 0 (the
    ``<PAD>`` id) to exactly ``TEXT_LENGTH`` entries.

    Args:
        text: Raw excerpt string.

    Returns:
        1-D ``torch.long`` tensor of length ``TEXT_LENGTH`` created on ``device``.
    """
    tokens = word_tokenize(text.lower())

    # Single pass: keep only in-vocabulary tokens and map them to their ids.
    token_ids = [
        word_to_index_dict[token]
        for token in tokens
        if token in word_to_index_dict
    ]

    # Truncate with TEXT_LENGTH (not a literal 200) so truncation and padding
    # always agree on the output length.
    token_ids = token_ids[:TEXT_LENGTH]
    token_ids.extend([0] * (TEXT_LENGTH - len(token_ids)))

    return torch.tensor(token_ids, device=device, dtype=torch.long)

In [None]:
class CommonLitDataset(torch.utils.data.Dataset):
    """Dataset that serves the ``excerpt`` column of a DataFrame as id tensors.

    Only the encoded text is returned; no target value is produced.
    """

    def __init__(self, dataset):
        # DataFrame that must provide an 'excerpt' column.
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the encoded excerpt stored at positional row ``index``."""
        excerpt = self.dataset['excerpt'].iloc[index]
        return get_tensor_from_text(excerpt)

In [None]:
!cp -r  ../input/glove6b/glove.6B.300d.txt ./

In [None]:
# Load the pre-trained word vectors from VECTOR_PATH/VECTOR_NAME via torchtext.
embeddings = vocab.Vectors(VECTOR_NAME,VECTOR_PATH)

def create_embedding_matrix(embeddings):
    """Build the (VOCABULARY_SIZE, EMBEDDING_SIZE) embedding matrix.

    Every row starts as uniform random values in [0, 1). For each vocabulary
    token, the vector returned by ``embeddings[token]`` replaces the random
    row unless that vector is entirely zero, in which case the random row is
    kept.

    Args:
        embeddings: Mapping-like object whose ``[token]`` lookup returns a
            1-D tensor of length ``EMBEDDING_SIZE``.

    Returns:
        ``numpy.ndarray`` of shape ``(VOCABULARY_SIZE, EMBEDDING_SIZE)``.
    """
    embedding_matrix = np.random.rand(VOCABULARY_SIZE, EMBEDDING_SIZE)

    for token, index in word_to_index_dict.items():
        # Look the vector up once and test it with a vectorised comparison
        # instead of converting it to a Python list for every token.
        vector = embeddings[token]
        if bool((vector != 0).any()):
            embedding_matrix[index] = vector

    # NOTE(review): rows for tokens without a pre-trained vector -- presumably
    # including '<PAD>' at index 0 -- stay random rather than zero; confirm
    # that this is intended for the padding row.
    return embedding_matrix

In [None]:
class CommonLitModel(nn.Module):
    """Bidirectional LSTM regressor on top of frozen pre-trained embeddings.

    Token ids are embedded, passed through channel-wise dropout, and fed to a
    bidirectional LSTM. The last entry of the LSTM's final hidden states goes
    through two linear layers that produce one value per sample.

    Args:
        embedding_matrix: Array-like of shape (VOCABULARY_SIZE, EMBEDDING_SIZE)
            used as the frozen embedding weights.
        num_layers: Number of stacked LSTM layers.
        batch_size: Batch size used only by ``init_hidden``.
        hidden_size: Hidden size of the LSTM (per direction).
    """

    # Drop probability of the channel-wise dropout applied to the embeddings.
    EMBEDDING_DROPOUT = 0.2

    def __init__(self,embedding_matrix,num_layers,batch_size=BATCH_SIZE,hidden_size = HIDDEN_SIZE):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding_layer = nn.Embedding(VOCABULARY_SIZE,EMBEDDING_SIZE,padding_idx = 0)
        self.embedding_layer.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
        self.embedding_layer.weight.requires_grad = False

        # Layers are sized from the ``hidden_size`` argument so that they agree
        # with ``self.hidden_size`` (and therefore with ``init_hidden``).
        self.lstm_layer = nn.LSTM(EMBEDDING_SIZE,hidden_size,batch_first = True,num_layers = self.num_layers,bidirectional=True)

        self.output_layer_1 = nn.Linear(hidden_size,100)
        self.output_layer_2 = nn.Linear(100,1)

    def _spatial_dropout(self, embedded):
        """Drop whole embedding channels of a (batch, time, channels) tensor.

        ``F.dropout2d`` is given an explicit 4-D (batch, channels, time, 1)
        input because its handling of 3-D inputs is deprecated and differs
        between PyTorch releases.
        """
        channels_first = embedded.permute(0, 2, 1).unsqueeze(-1)
        dropped = F.dropout2d(channels_first, self.EMBEDDING_DROPOUT, training=self.training)
        return dropped.squeeze(-1).permute(0, 2, 1)

    def forward(self,input_text,hidden_state,cell_state):
        """Predict one value per sample.

        Args:
            input_text: (batch, time) tensor of token ids.
            hidden_state: Accepted for interface compatibility; not fed to the LSTM.
            cell_state: Accepted for interface compatibility; not fed to the LSTM.

        Returns:
            Tuple ``(prediction, h_n, c_n)`` where ``prediction`` has shape
            (batch, 1) and ``h_n`` / ``c_n`` are the LSTM's final states.
        """
        # Kept local: a tensor stored on the module would keep the last batch
        # alive and would be serialized together with the model.
        embedded = self.embedding_layer(input_text.long().to(device))
        embedded = self._spatial_dropout(embedded)

        lstm_output,(hidden_state,cell_state) = self.lstm_layer(embedded)
        # NOTE: index -1 selects a single (batch, hidden) slice of h_n, i.e. the
        # reverse direction of the last layer, not a concatenation of both directions.
        linear_output_1 = self.output_layer_1(hidden_state[-1,:,:])
        linear_output_2 = self.output_layer_2(linear_output_1)
        return linear_output_2,hidden_state,cell_state

    def init_hidden(self):
        """Return zero ``(h_0, c_0)`` tensors in the layout ``nn.LSTM`` expects.

        The shape is (2 * num_layers, batch_size, hidden_size); the leading
        dimension is layers * directions even when ``batch_first=True``.
        """
        state_shape = (2*self.num_layers, self.batch_size, self.hidden_size)
        hidden_state = torch.zeros(state_shape, dtype=torch.float32, device=device)
        cell_state = torch.zeros(state_shape, dtype=torch.float32, device=device)
        return hidden_state, cell_state

In [None]:
class CommonLitAttentionModel(nn.Module):
    """Two-stage bidirectional LSTM regressor with a multiplicative attention step.

    The first LSTM encodes the embedded tokens. Its final hidden-state slice is
    projected to the width of the LSTM outputs and multiplied element-wise with
    them; a softmax over the time axis turns the product into weights that
    re-scale the LSTM outputs. A second LSTM reads the re-weighted sequence and
    two linear layers map its final hidden-state slice to one value per sample.

    Args:
        embedding_matrix: Array-like of shape (VOCABULARY_SIZE, EMBEDDING_SIZE)
            used as the frozen embedding weights.
        num_layers: Number of stacked layers in each LSTM.
        batch_size: Batch size used only by ``init_hidden``.
        hidden_size: Hidden size of the LSTMs (per direction).
    """

    # Drop probability of the channel-wise dropout applied to the embeddings.
    EMBEDDING_DROPOUT = 0.2

    def __init__(self,embedding_matrix,num_layers,batch_size=BATCH_SIZE,hidden_size = HIDDEN_SIZE):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding_layer = nn.Embedding(VOCABULARY_SIZE,EMBEDDING_SIZE,padding_idx = 0)
        self.embedding_layer.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
        self.embedding_layer.weight.requires_grad = False

        # All layers are sized from the ``hidden_size`` argument so that they
        # agree with ``self.hidden_size`` (and therefore with ``init_hidden``).
        self.lstm_layer = nn.LSTM(EMBEDDING_SIZE,hidden_size,batch_first = True,num_layers = self.num_layers,bidirectional=True)

        # Projects the (batch, hidden) state to the 2*hidden width of the LSTM outputs.
        self.attention_linear_layer = nn.Linear(hidden_size,2*hidden_size)

        self.lstm_layer_2 = nn.LSTM(2*hidden_size,hidden_size,batch_first=True,num_layers = self.num_layers,bidirectional=True)

        self.linear_1 = nn.Linear(hidden_size,100)
        self.linear_2 = nn.Linear(100,1)

    def _spatial_dropout(self, embedded):
        """Drop whole embedding channels of a (batch, time, channels) tensor.

        ``F.dropout2d`` is given an explicit 4-D (batch, channels, time, 1)
        input because its handling of 3-D inputs is deprecated and differs
        between PyTorch releases.
        """
        channels_first = embedded.permute(0, 2, 1).unsqueeze(-1)
        dropped = F.dropout2d(channels_first, self.EMBEDDING_DROPOUT, training=self.training)
        return dropped.squeeze(-1).permute(0, 2, 1)

    def forward(self,input_text,hidden_state,cell_state):
        """Predict one value per sample.

        Args:
            input_text: (batch, time) tensor of token ids.
            hidden_state: Accepted for interface compatibility; not fed to the LSTMs.
            cell_state: Accepted for interface compatibility; not fed to the LSTMs.

        Returns:
            Tuple ``(prediction, h_n, c_n)`` where ``prediction`` has shape
            (batch, 1) and ``h_n`` / ``c_n`` are the second LSTM's final states.
        """
        # Activations are kept local: tensors stored on the module would keep
        # the last batch alive and would be serialized together with the model.
        embedded = self.embedding_layer(input_text.long().to(device))
        embedded = self._spatial_dropout(embedded)

        lstm_output,(hidden_state,cell_state) = self.lstm_layer(embedded)
        # Index -1 selects a single (batch, hidden) slice of h_n.
        final_state = hidden_state[-1,:,:]

        # (batch, 2*hidden) -> (batch, 1, 2*hidden) so it broadcasts over time.
        attention_query = self.attention_linear_layer(final_state).unsqueeze(1)
        attention_scores = lstm_output * attention_query
        # Softmax runs over the time axis, independently for every feature.
        attention_weights = F.softmax(attention_scores,dim=1)
        # The weights re-scale the sequence; it is not summed over time.
        global_context = attention_weights * lstm_output

        lstm_output_2, (hidden_state_2,cell_state_2) = self.lstm_layer_2(global_context)
        final_state_2 = hidden_state_2[-1,:,:]

        linear_output_1 = self.linear_1(final_state_2)
        final_output_2 = self.linear_2(linear_output_1)

        return final_output_2,hidden_state_2,cell_state_2

    def init_hidden(self):
        """Return zero ``(h_0, c_0)`` tensors in the layout ``nn.LSTM`` expects.

        The shape is (2 * num_layers, batch_size, hidden_size); the leading
        dimension is layers * directions even when ``batch_first=True``.
        """
        state_shape = (2*self.num_layers, self.batch_size, self.hidden_size)
        hidden_state = torch.zeros(state_shape, dtype=torch.float32, device=device)
        cell_state = torch.zeros(state_shape, dtype=torch.float32, device=device)
        return hidden_state, cell_state

In [None]:
# Reference: General attention mechanism
class CommonLitCNNLSTMAttentionEnsembleModel(nn.Module):
    """Two-branch regressor combining an attention-LSTM and a CNN-GRU encoder.

    Branch 1 ("long context"): a bidirectional LSTM encodes the embeddings, a
    multiplicative attention step re-weights its outputs, the re-weighted and
    raw outputs are concatenated and read by a second bidirectional LSTM.

    Branch 2 ("short context"): a separate bidirectional LSTM encodes the
    embeddings, three 1-D convolutions (kernel sizes 3, 5, 7) run over its
    outputs, each convolution output is read by its own bidirectional GRU, and
    the three final GRU state slices are merged by a linear layer.

    The two branch vectors are concatenated and mapped to one value per sample
    by three linear layers.

    Attribute names and layer construction order are part of the contract of
    any checkpoint saved with ``torch.save(model)`` and must not be changed.

    Args:
        embedding_matrix: Array-like of shape (VOCABULARY_SIZE, EMBEDDING_SIZE)
            used as the frozen embedding weights.
        num_layers: Number of stacked layers in every recurrent module.
        dropout_prob: Inter-layer dropout passed to every recurrent module.
        batch_size: Stored on the instance; not used by ``forward``.
        hidden_size: Hidden size of the recurrent modules (per direction).
    """

    # Drop probability of the channel-wise dropout applied to the embeddings.
    # Defined on the class (not in __init__) so that instances restored from a
    # pickled checkpoint, which bypass __init__, can still read it.
    EMBEDDING_DROPOUT = 0.2

    def __init__(self,embedding_matrix,num_layers,dropout_prob,batch_size=BATCH_SIZE,hidden_size = HIDDEN_SIZE):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_prob = dropout_prob

        self.embedding_layer = nn.Embedding(VOCABULARY_SIZE,EMBEDDING_SIZE,padding_idx = 0)
        self.embedding_layer.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
        self.embedding_layer.weight.requires_grad = False

        # ---- Branch 1: LSTM -> attention -> LSTM ----
        self.lstm_layer_1 = nn.LSTM(EMBEDDING_SIZE,hidden_size,dropout=self.dropout_prob,batch_first = True,num_layers = self.num_layers,bidirectional=True)

        # Projects the (batch, hidden) state to the 2*hidden width of the LSTM outputs.
        self.attention_linear_layer = nn.Linear(hidden_size,2*hidden_size)

        # Input is the attention context concatenated with the raw outputs: 4*hidden.
        self.lstm_layer_2 = nn.LSTM(4*hidden_size,hidden_size,dropout=self.dropout_prob,batch_first=True,num_layers = self.num_layers,bidirectional=True)

        # ---- Branch 2: LSTM -> three convolutions -> three GRUs ----
        self.lstm_layer_3 = nn.LSTM(EMBEDDING_SIZE,hidden_size,dropout=self.dropout_prob,batch_first = True,num_layers = self.num_layers,bidirectional=True)
        self.conv1 = nn.Conv1d(in_channels = 2*hidden_size, out_channels=hidden_size,kernel_size=3,stride=1)
        self.conv2 = nn.Conv1d(in_channels = 2*hidden_size, out_channels=hidden_size,kernel_size=5,stride=1)
        self.conv3 = nn.Conv1d(in_channels = 2*hidden_size, out_channels=hidden_size,kernel_size=7,stride=1)

        self.low_rnn = nn.GRU(hidden_size,hidden_size,dropout=self.dropout_prob,batch_first = True,num_layers = self.num_layers,bidirectional=True)
        self.med_rnn = nn.GRU(hidden_size,hidden_size,dropout=self.dropout_prob,batch_first = True,num_layers = self.num_layers,bidirectional=True)
        self.high_rnn = nn.GRU(hidden_size,hidden_size,dropout=self.dropout_prob,batch_first = True,num_layers = self.num_layers,bidirectional=True)

        self.lstm_features_concat_layer = nn.Linear(3*hidden_size,hidden_size)

        # ---- Output head ----
        self.output_linear_1 = nn.Linear(2*hidden_size,hidden_size)
        self.output_linear_2 = nn.Linear(hidden_size,hidden_size // 2)
        self.output_linear_3 = nn.Linear(hidden_size// 2,1)

    def _spatial_dropout(self, embedded):
        """Drop whole embedding channels of a (batch, time, channels) tensor.

        ``F.dropout2d`` is given an explicit 4-D (batch, channels, time, 1)
        input because its handling of 3-D inputs is deprecated and differs
        between PyTorch releases. In eval mode this is an identity operation.
        """
        channels_first = embedded.permute(0, 2, 1).unsqueeze(-1)
        dropped = F.dropout2d(channels_first, self.EMBEDDING_DROPOUT, training=self.training)
        return dropped.squeeze(-1).permute(0, 2, 1)

    def forward(self,input_text):
        """Predict one value per sample.

        Args:
            input_text: (batch, time) tensor of token ids. The time dimension
                must be at least 7 because the convolutions use no padding.

        Returns:
            Tensor of shape (batch, 1).
        """
        # Kept local: a tensor stored on the module would keep the last batch
        # alive and would be serialized together with the model.
        embedded = self.embedding_layer(input_text.long().to(device))
        embedded = self._spatial_dropout(embedded)

        # ---- Branch 1 ----
        lstm_output_1,(hidden_state_1,cell_state) = self.lstm_layer_1(embedded)
        # Index -1 selects a single (batch, hidden) slice of h_n.
        final_state_1 = hidden_state_1[-1,:,:]

        # (batch, 2*hidden) -> (batch, 1, 2*hidden) so it broadcasts over time.
        attention_linear_output = self.attention_linear_layer(final_state_1)
        attention_linear_output = attention_linear_output.unsqueeze(1)
        attention_multiplied_context = lstm_output_1 * attention_linear_output
        # Softmax runs over the time axis, independently for every feature.
        softmax_attention = F.softmax(attention_multiplied_context,dim=1)
        global_context = softmax_attention * lstm_output_1

        # (batch, time, 4*hidden_size)
        final_context_words = torch.cat([global_context,lstm_output_1],dim=2)

        lstm_output_2, (hidden_state_2,cell_state_2) = self.lstm_layer_2(final_context_words)
        final_state_2 = hidden_state_2[-1,:,:]

        # ---- Branch 2 ----
        lstm_output_3,(hidden_state_3,cell_state_3) = self.lstm_layer_3(embedded)
        # Conv1d wants (batch, channels, time); the GRUs want (batch, time, channels).
        lstm_output_3 = lstm_output_3.permute(0,2,1)
        conv_1_output = self.conv1(lstm_output_3).permute(0,2,1)
        conv_2_output = self.conv2(lstm_output_3).permute(0,2,1)
        conv_3_output = self.conv3(lstm_output_3).permute(0,2,1)

        low_lstm_output,hidden_state_low = self.low_rnn(conv_1_output)
        med_lstm_output,hidden_state_med = self.med_rnn(conv_2_output)
        high_lstm_output,hidden_state_high = self.high_rnn(conv_3_output)
        # (batch, 3*hidden) -> (batch, hidden)
        concat_features = torch.cat([hidden_state_low[-1,:,:],hidden_state_med[-1,:,:],hidden_state_high[-1,:,:]],dim=1)
        lstm_linear_concat_output = self.lstm_features_concat_layer(concat_features)

        # ---- Output head ----
        # NOTE(review): the three linear layers are applied back to back with no
        # activation in between; left unchanged to stay compatible with trained weights.
        short_long_context_features = torch.cat([final_state_2,lstm_linear_concat_output],dim=1)
        linear_output_1 = self.output_linear_1(short_long_context_features)
        linear_output_2 = self.output_linear_2(linear_output_1)
        linear_output_3 = self.output_linear_3(linear_output_2)

        return linear_output_3

In [None]:
embedding_matrix = create_embedding_matrix(embeddings)

In [None]:
# Load the test split and wrap it in the Dataset consumed by test().
test_data_csv = pd.read_csv(test_dataset_path)
# NOTE(review): `criterion` is not referenced anywhere else in the visible notebook.
criterion = nn.MSELoss()
dataset = CommonLitDataset(test_data_csv)

In [None]:
def test(dataset,embedding_matrix,hidden_size=HIDDEN_SIZE,batch_size=BATCH_SIZE,
         model_path='../input/commonreadability-models/lstm_attention_cnn_gru_attention_1.pkl',
         submission_path='./submission.csv'):
    """Run the saved model over ``dataset`` and write the submission CSV.

    Args:
        dataset: ``CommonLitDataset`` whose ``.dataset`` frame has ``id`` and
            ``excerpt`` columns.
        embedding_matrix: Unused; kept so existing callers keep working.
        hidden_size: Unused; kept so existing callers keep working.
        batch_size: Number of excerpts scored per forward pass.
        model_path: Path of the pickled model to load.
        submission_path: Where the ``id,target`` CSV is written.
    """
    testloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=batch_size)
    print(len(testloader))

    # SECURITY: torch.load unpickles arbitrary Python objects; only load
    # checkpoints from a trusted source.
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # NOTE: recent PyTorch releases default torch.load to weights_only=True,
    # which rejects full-module pickles like this one; pass weights_only=False
    # there if the file is trusted.
    model = torch.load(model_path, map_location=device)
    model = model.to(device)  # moved once, not once per batch
    model.eval()

    preds = []
    with torch.no_grad():
        for inputs in testloader:
            outputs = model(inputs.to(device))
            # (batch, 1) -> flat list of floats; works for any batch size.
            preds.extend(outputs.reshape(-1).tolist())

    # Take the ids from the frame that produced the predictions so the two
    # columns are guaranteed to line up row for row.
    sub_df = pd.DataFrame({
        'id':dataset.dataset['id'].values,
        'target':preds
    })
    sub_df.to_csv(submission_path,index=False)

In [None]:
test(dataset,embedding_matrix)

In [None]:
# Read the written submission back and display it as a sanity check.
x = pd.read_csv('./submission.csv')
x