In [None]:
# Install sentence-transformers from a source tree shipped in an attached input dataset.
# The tree is first copied to /tmp and pip builds from that copy
# (presumably because the input directory is read-only and the kernel runs offline - TODO confirm).
!cp -r /kaggle/input/save-sentence-transformer-repo/sentence-transformers /tmp/sentence-transformers
!pip install /tmp/sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Local directory holding the saved sentence-embedding model (loaded from disk, not by hub name).
modelPath = "/kaggle/input/save-sentence-transformer-repo/bert-base/"

# Shared encoder used by the later cells to embed both discourse snippets and full essays.
model = SentenceTransformer(modelPath)

In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import pickle

In [None]:
# Number of cross-validation folds; also reused below as the StratifiedKFold random_state.
NFOLDs = 5

In [None]:
# Read the train and test tables in one pass; the tuple order fixes which frame is which.
dataset_train, dataset_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/feedback-prize-effectiveness/train.csv',
        '/kaggle/input/feedback-prize-effectiveness/test.csv',
    )
)

In [None]:
def _read_essays(essay_ids, split):
    """Return the full essay text for every entry of ``essay_ids``.

    Parameters
    ----------
    essay_ids : pd.Series
        Essay identifiers; the same id may appear on several rows.
    split : str
        Sub-directory of the competition data holding the ``.txt`` files
        (``'train'`` or ``'test'``).

    Returns
    -------
    pd.Series
        Essay texts aligned with ``essay_ids`` (same index, same order).
    """
    texts = {}
    # Read each distinct file exactly once, and close the handle deterministically
    # instead of leaving it to the garbage collector.
    for essay_id in pd.unique(essay_ids):
        with open(f'/kaggle/input/feedback-prize-effectiveness/{split}/{essay_id}.txt') as essay_file:
            texts[essay_id] = essay_file.read()
    return essay_ids.map(texts)


dataset_train['text'] = _read_essays(dataset_train['essay_id'], 'train')
dataset_test['text'] = _read_essays(dataset_test['essay_id'], 'test')
dataset_train.head(2)

In [None]:
# Integer class ids; this order is also the column order of predict_proba used for the submission.
effectiveness_map = {"Ineffective":0, "Adequate":1,"Effective":2}

mapped_target = dataset_train["discourse_effectiveness"].map(effectiveness_map)

# Series.map turns any label missing from the dict into NaN. Fail here with a clear
# message rather than later inside the fold split or the classifier.
unmapped_labels = dataset_train.loc[mapped_target.isna(), "discourse_effectiveness"].unique()
if len(unmapped_labels):
    raise ValueError(f"Unexpected discourse_effectiveness labels: {unmapped_labels.tolist()}")

dataset_train["target"] = mapped_target.astype(int)

In [None]:
# Preview the first two rows of the training frame.
dataset_train.head(2)

In [None]:
# Assign every training row to one of NFOLDs stratified validation folds.
# NOTE(review): rows sharing an essay_id can land in different folds while sharing the
# same essay text features; if that inflates CV scores, consider a grouped splitter.
skfold = StratifiedKFold(n_splits=NFOLDs,shuffle=True,random_state=NFOLDs)

# split() yields *positional* indices, so write them into a NumPy array instead of
# using label-based .loc (which is only correct on a default RangeIndex). Building the
# column in one go also keeps it integer-typed instead of float.
fold_ids = np.full(len(dataset_train), -1, dtype=int)
for i, (_, holdout_positions) in enumerate(skfold.split(dataset_train, dataset_train["target"])):
    fold_ids[holdout_positions] = i
assert (fold_ids >= 0).all(), "every row must belong to exactly one validation fold"

dataset_train["fold"] = fold_ids
print(dataset_train.fold.value_counts())

In [None]:
# Collects one test-set probability matrix per fold; averaged when building the submission.
preds = []

In [None]:
# Embed the discourse snippets of both frames (train first, then test) and keep each
# row's vector as a plain Python list in a new column.
for frame in (dataset_train, dataset_test):
    sentences = frame["discourse_text"].tolist()
    embeddings = model.encode(sentences, show_progress_bar=True, batch_size=256)
    frame["discourse_text_encoding"] = embeddings.tolist()

In [None]:
def _encode_essays(frame):
    """Embed the ``text`` column of ``frame``, encoding each essay only once.

    The essay text is loaded per ``essay_id``, so rows sharing an id carry identical
    text. Encoding one representative row per id and broadcasting the vector avoids
    pushing the same document through the model once per row.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain ``essay_id`` and ``text`` columns.

    Returns
    -------
    list
        One embedding (list of floats) per row of ``frame``, in row order.
    """
    essays = frame.drop_duplicates("essay_id")
    vectors = model.encode(essays["text"].tolist(), show_progress_bar=True, batch_size=256).tolist()
    by_essay = dict(zip(essays["essay_id"], vectors))
    # Rows of the same essay share one list object; nothing downstream mutates it.
    return [by_essay[essay_id] for essay_id in frame["essay_id"]]


dataset_train["text_encoding"] = _encode_essays(dataset_train)
dataset_test["text_encoding"] = _encode_essays(dataset_test)

In [None]:
# Preview the test frame, which by now also carries the two embedding columns.
dataset_test.head()

In [None]:
def _embedding_block(frame, column):
    """Turn a column of per-row embedding lists into a CSR matrix block.

    Converting explicitly (rather than handing nested Python lists to ``sparse.hstack``)
    makes the block's shape and dtype unambiguous.
    """
    return sparse.csr_matrix(np.asarray(frame[column].tolist(), dtype=np.float64))


# Start from an empty list so re-running this cell never averages stale fold predictions.
preds = []

# Test-set embeddings do not depend on the fold: build them once, outside the loop.
test_discourse_text = _embedding_block(dataset_test, "discourse_text_encoding")
test_text = _embedding_block(dataset_test, "text_encoding")

for fold in range(NFOLDs):
    # Rows marked with the current fold index are the validation set; the rest train.
    dataset_train_ = dataset_train[dataset_train['fold']!=fold]
    dataset_eval_ = dataset_train[dataset_train['fold']==fold]

    # Discourse-snippet embeddings
    train_discourse_text = _embedding_block(dataset_train_, "discourse_text_encoding")
    eval_discourse_text = _embedding_block(dataset_eval_, "discourse_text_encoding")

    # Full-essay embeddings
    train_text = _embedding_block(dataset_train_, "text_encoding")
    eval_text = _embedding_block(dataset_eval_, "text_encoding")

    # discourse_type, one-hot encoded; the encoder is fitted on the training part only.
    # OneHotEncoder already returns a sparse matrix, so no extra wrapping is needed.
    ohe = OneHotEncoder()
    tr_type_ohe = ohe.fit_transform(dataset_train_["discourse_type"].values.reshape(-1,1))
    eval_type_ohe = ohe.transform(dataset_eval_["discourse_type"].values.reshape(-1,1))
    te_type_ohe = ohe.transform(dataset_test["discourse_type"].values.reshape(-1,1))

    # Feature layout: [one-hot discourse_type | discourse embedding | essay embedding]
    train_dat = sparse.hstack((tr_type_ohe,train_discourse_text,train_text), format="csr")
    eval_dat = sparse.hstack((eval_type_ohe,eval_discourse_text,eval_text), format="csr")
    test_dat = sparse.hstack((te_type_ohe,test_discourse_text,test_text), format="csr")

    # Model
    clf = LogisticRegression(max_iter=500,penalty="l2",C=1.0131816333513533,)
    clf.fit(train_dat, dataset_train_["target"].values)

    # Persist the fitted encoder and classifier for this fold. The in-memory objects
    # are used directly below; there is no need to read the pickle straight back.
    with open(f"ohe_{fold}.pkl", 'wb') as f:
        pickle.dump(ohe, f)
    with open(f"clf_{fold}.pkl", 'wb') as f:
        pickle.dump(clf, f)

    # Validation log-loss; passing labels keeps the probability columns and the
    # class ids aligned even if a fold were to miss a class.
    ev_preds = clf.predict_proba(eval_dat)
    ev_loss = log_loss(dataset_eval_["target"].values,ev_preds,labels=clf.classes_)
    print("Fold : {} EV score: {}".format(fold,ev_loss))

    # Test-set class probabilities for this fold
    preds.append(clf.predict_proba(test_dat))

In [None]:
# Submission template. Use the same absolute input root as the other data-loading cells
# so the read does not depend on the kernel's current working directory.
dataset_submission = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/sample_submission.csv")

In [None]:
# Average the per-fold test probabilities into one (n_test_rows, n_classes) matrix.
all_preds = np.array(preds).mean(0)
print(all_preds.shape)

# Column order follows the class ids 0/1/2 used for the training target.
class_columns = ["Ineffective", "Adequate", "Effective"]
# Raises if the matrix does not have exactly one column per class.
pred_frame = pd.DataFrame(all_preds, columns=class_columns)

# NOTE(review): assumes "discourse_id" is the row key shared by the test table and the
# sample submission - confirm against the data. When both frames carry it, align on it
# instead of trusting that the two files list rows in the same order; otherwise fall
# back to positional assignment.
id_column = "discourse_id"
if (
    id_column in dataset_test.columns
    and id_column in dataset_submission.columns
    and dataset_test[id_column].is_unique
):
    pred_frame.index = dataset_test[id_column].to_numpy()
    pred_frame = pred_frame.reindex(dataset_submission[id_column].to_numpy())
    if pred_frame.isna().any().any():
        raise ValueError("sample_submission contains ids that are missing from the test set")

for column in class_columns:
    dataset_submission.loc[:, column] = pred_frame[column].to_numpy()
dataset_submission.head()

In [None]:
# Write the filled-in submission to the current working directory, without the index column.
dataset_submission.to_csv('submission.csv',index=False)