# Hugging Face models

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import StratifiedKFold

import transformers
import torch
from tqdm.notebook import tqdm

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Load the competition tables from the attached dataset directory.
train_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlitreadabilityprize/train.csv'
)
# Debug aid: uncomment to work on a small sample only.
# train_df = train_df.head(100)
test_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlitreadabilityprize/test.csv'
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train_df.shape

In [None]:
# Number of bins grows with log2 of the row count: floor(1 + log2(n)).
num_bins = int(np.floor(np.log2(len(train_df)) + 1))

# Cut the continuous target into `num_bins` equal-width intervals and store
# the integer interval code of every row in a 'bins' column.
bin_codes = pd.cut(train_df['target'], bins=num_bins, labels=False)
train_df.loc[:, 'bins'] = bin_codes

# Plain numpy views of the target values and their bin codes.
target = train_df['target'].to_numpy()
bins = train_df['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between two sets of values.

    Implemented with numpy only, so it does not depend on
    ``mean_squared_error`` having been imported before this function is
    called.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The square root of the mean squared difference, as a numpy float.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Refuse mismatched shapes explicitly; numpy would otherwise broadcast
    # them silently and return a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("rmse_score requires at least one sample")
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [None]:
# Word count of every excerpt (whitespace-separated tokens).
train_sent_len = [len(text.split()) for text in train_df['excerpt']]
test_sent_len = [len(text.split()) for text in test_df['excerpt']]

# One unit-wide bin per integer length. The edges run to max + 1 so that the
# largest length gets its own bin (with edges ending at max, the last bin is
# closed on both sides and merges the two largest lengths) and so that there
# are always at least two edges, even when every excerpt has the same length.
# density=True normalises each histogram, which makes the y axis a real
# proportion and keeps the two sets comparable when they differ in row count.
plt.hist(train_sent_len, bins=range(min(train_sent_len), max(train_sent_len) + 2),
         alpha=0.4, color="red", density=True)

plt.hist(test_sent_len, bins=range(min(test_sent_len), max(test_sent_len) + 2),
         alpha=0.4, color="blue", density=True)


# Legend entries are matched to the histograms in the order they were drawn.
labels = ['Train','Test']
plt.legend(labels)
plt.xlabel("length of sentence")
plt.ylabel("proportion")
plt.title("comparing number of words per sentence distribution in Train and Test")
plt.show()

In [None]:
from transformers import AutoTokenizer,AutoModel

# The tokenizer and the encoder are both loaded from the same local directory
# (presumably a RoBERTa-large checkpoint, judging by the path - confirm).
roberta_dir = '/kaggle/input/robertalarge'
tokenizer = AutoTokenizer.from_pretrained(roberta_dir)
model = AutoModel.from_pretrained(roberta_dir)

In [None]:
# Move the model's parameters onto the selected device (GPU when available).
model = model.to(device)

In [None]:
def data_encode(data, maximum_length):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Uses the module-level ``tokenizer``. Every sequence is padded up to, and
    truncated down to, ``maximum_length`` tokens so the batch can be stacked
    into rectangular tensors.

    Args:
        data: Object exposing ``.values.tolist()`` that yields the texts
            (the callers in this file pass a pandas Series of strings).
        maximum_length: Target token length of every encoded sequence.

    Returns:
        The tokenizer's output for the batch, requested as PyTorch tensors
        and including the attention mask.
    """
    return tokenizer(
        data.values.tolist(),
        add_special_tokens=True,
        max_length=maximum_length,
        # Explicit replacements for the deprecated `pad_to_max_length=True`,
        # which also relied on truncation being switched on implicitly.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

def get_embeddings(encoded):
    """Run the encoder on a tokenized batch and return one vector per text.

    Uses the module-level ``model`` and ``device``.

    Args:
        encoded: Tokenizer output for a batch (as produced by
            ``data_encode``); it must support ``.to(device)`` and ``**``
            unpacking into the model call.

    Returns:
        A numpy array holding, for each sequence, the last-layer hidden state
        at position 0 (presumably the leading special token - confirm for
        the checkpoint in use).
    """
    encoded = encoded.to(device)
    # Inference only: without no_grad the forward pass records an autograd
    # graph that is never used and only costs memory.
    with torch.no_grad():
        hidden_states = model(**encoded)['last_hidden_state']
    return hidden_states[:, 0].cpu().numpy()

In [None]:
# t= data_encode(train_texts[0:50],200)

In [None]:
# get_embeddings(t)

In [None]:
def _embed_in_batches(texts, step, max_length):
    """Encode ``texts`` in consecutive slices of ``step`` rows and embed them.

    Args:
        texts: Sliceable collection of texts accepted by ``data_encode``.
        step: Number of texts sent through the model per forward pass.
        max_length: Token length passed on to ``data_encode``.

    Returns:
        A list with one embedding vector per input text, in input order.
    """
    vectors = []
    for start in tqdm(range(0, len(texts), step)):
        batch = data_encode(texts[start:start + step], max_length)
        # extend() iterates the 2-D batch array, adding one row per text.
        vectors.extend(get_embeddings(batch))
    return vectors


max_len = 200 # 200 for actual training
# NOTE(review): batch_size is not read anywhere in this cell; the number of
# texts per forward pass is `incr`. Kept in case another cell relies on it.
batch_size = 100
incr = 20

train_texts = train_df['excerpt']
train_target = train_df['target']
train_embeddings = _embed_in_batches(train_texts, incr, max_len)

test_texts = test_df['excerpt']
test_embeddings = _embed_in_batches(test_texts, incr, max_len)

In [None]:
# train_embeddings = []
# train_embeddings = model(**train_input)['last_hidden_state'][:,0].detach().numpy()
    
# train_embeddings = np.array(train_embeddings)

# test_embeddings = model(**test_input)['last_hidden_state'][:,0].detach().numpy()

# test_embeddings = np.array(test_embeddings)

In [None]:
# Stratification labels for the k-fold split: the target is cut into
# floor(1 + log2(n)) equal-width intervals and each row gets its interval code.
num_bins = int(np.floor(np.log2(len(train_df)) + 1))
train_df.loc[:, 'bins'] = pd.cut(
    train_df['target'],
    bins=num_bins,
    labels=False,
)
bins = train_df['bins'].to_numpy()

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf',random_state=None):
    """Fit one SVR per stratified fold and average their test predictions.

    For every fold an ``SVR`` is trained on the training part, scored with
    RMSE on the held-out part (printed), and used to predict ``X_test``.

    Args:
        X: Training features, one row per sample.
        y: Training targets aligned with ``X`` (array-like or pandas Series).
        X_test: Features to predict, one row per sample.
        bins: Per-sample class labels used to stratify the folds. Defaults to
            the module-level ``bins`` as it was when this function was defined.
        nfolds: Number of folds.
        C: Regularisation parameter forwarded to ``SVR``.
        kernel: Kernel name forwarded to ``SVR``.
        random_state: When given, folds are shuffled with this seed. The
            default ``None`` keeps the original unshuffled, deterministic
            split.

    Returns:
        Numpy array with the mean of the per-fold predictions for ``X_test``.
    """
    # Work on plain arrays so that fold indices are always positional; indexing
    # a pandas Series with an integer array is label-based and breaks as soon
    # as the index is not 0..n-1.
    X = np.asarray(X)
    y = np.asarray(y)
    X_test = np.asarray(X_test)

    # A seed may only be passed to StratifiedKFold together with shuffle=True.
    kfold = StratifiedKFold(
        n_splits=nfolds,
        shuffle=random_state is not None,
        random_state=random_state,
    )
    scores = []
    preds = np.zeros(X_test.shape[0])
    for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):
        svr_model = SVR(C=C, kernel=kernel, gamma='auto')
        svr_model.fit(X[train_idx], y[train_idx])

        prediction = svr_model.predict(X[valid_idx])
        score = np.sqrt(mean_squared_error(y[valid_idx], prediction))  # RMSE
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)

        # Accumulate; divided by the fold count on return.
        preds += svr_model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
# Stack the per-text vectors into 2-D float64 matrices (one row per text).
train_embeddings = np.asarray(train_embeddings, dtype=np.float64)
test_embeddings = np.asarray(test_embeddings, dtype=np.float64)

In [None]:
# Five cross-validated SVR runs on the same embeddings.
# NOTE(review): get_preds_svm is called with identical arguments each time and
# its fold split is built without shuffling, so these runs look like they
# produce the same predictions - confirm whether varying seeds were intended.
svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = [
    get_preds_svm(train_embeddings, train_target, test_embeddings)
    for _ in range(5)
]

In [None]:
# Element-wise mean of the five prediction vectors.
svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

In [None]:
# Pair every test id with its averaged prediction and write the submission
# file without the frame's index column.
submission = pd.DataFrame({'id': test_df['id'], 'target': svm_preds})
submission.to_csv('submission.csv', index=False)