In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [None]:
# Number of stratified cross-validation folds. The same value is also passed as
# the StratifiedKFold random_state and bounds the per-fold training loop, so one
# model is trained (and one test prediction produced) per fold.
NFOLDs = 5

In [None]:
# Load the competition's train and test tables from the same input directory.
INPUT_DIR = '/kaggle/input/feedback-prize-effectiveness'
dataset_tr = pd.read_csv(f'{INPUT_DIR}/train.csv')
dataset_te = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [None]:
def _load_essays(essay_ids, essay_dir):
    """Return the essay text for each id in `essay_ids`, aligned to its index.

    Parameters
    ----------
    essay_ids : pandas.Series
        Essay identifiers; the file `<essay_dir>/<essay_id>.txt` is read for each.
    essay_dir : str
        Directory holding one `.txt` file per essay.

    Returns
    -------
    pandas.Series
        Same index as `essay_ids`, holding the file contents as strings.

    Each distinct essay is read exactly once (an essay id can repeat across
    rows), and every file handle is closed by the `with` block instead of
    being left for the garbage collector.
    """
    texts = {}
    for essay_id in essay_ids.unique():
        with open(f'{essay_dir}/{essay_id}.txt') as essay_file:
            texts[essay_id] = essay_file.read()
    return essay_ids.map(texts)


# Attach the full essay text to every row of both tables.
dataset_tr['text'] = _load_essays(dataset_tr['essay_id'], '/kaggle/input/feedback-prize-effectiveness/train')
dataset_te['text'] = _load_essays(dataset_te['essay_id'], '/kaggle/input/feedback-prize-effectiveness/test')
dataset_tr.head(2)

In [None]:
# Integer class id for each effectiveness label; the ids double as the
# classifier targets.
effectiveness_map = {"Ineffective": 0, "Adequate": 1, "Effective": 2}
dataset_tr["target"] = dataset_tr["discourse_effectiveness"].map(effectiveness_map)

# Series.map yields NaN for any label that is not a key of the mapping. Fail
# here with a clear message instead of carrying NaN targets into the fold
# split and model fitting.
_unmapped_labels = dataset_tr.loc[dataset_tr["target"].isna(), "discourse_effectiveness"].unique()
if len(_unmapped_labels) > 0:
    raise ValueError(
        "discourse_effectiveness contains labels missing from effectiveness_map: "
        f"{list(_unmapped_labels)}"
    )

In [None]:
# Replace the index with a fresh 0..n-1 range (old index discarded). The fold
# assignment below writes through `.loc` using the positional row indices that
# StratifiedKFold.split returns, so labels and positions must coincide.
dataset_tr = dataset_tr.reset_index(drop=True)

In [None]:
# Tag every training row with the index of the fold in which it is held out.
# NOTE(review): the split is stratified on the target only, while the "text"
# feature is the whole essay looked up by essay_id; rows sharing an essay can
# therefore land in different folds -- confirm whether grouping by essay_id is
# wanted for the validation score.
skfold = StratifiedKFold(n_splits=NFOLDs, shuffle=True, random_state=NFOLDs)
fold_splits = skfold.split(dataset_tr, dataset_tr["target"])
for fold_id, (_, holdout_rows) in enumerate(fold_splits):
    # Only the held-out rows are needed; the training part is implied by the rest.
    dataset_tr.loc[holdout_rows, "fold"] = fold_id
print(dataset_tr["fold"].value_counts())

In [None]:
# Accumulator for test-set class probabilities: the cross-validation loop
# appends one predict_proba array per fold, and they are averaged afterwards.
preds = []

In [None]:
def _fit_transform_splits(encoder, splits):
    """Fit `encoder` on the first split only, then transform the remaining ones.

    Parameters
    ----------
    encoder : object with `fit_transform` / `transform`
        e.g. a TfidfVectorizer or OneHotEncoder.
    splits : sequence
        Inputs for the encoder, training split first.

    Returns
    -------
    list
        One encoded matrix per split, in the same order as `splits`.
    """
    train_split, *other_splits = splits
    encoded = [encoder.fit_transform(train_split)]
    encoded.extend(encoder.transform(split) for split in other_splits)
    return encoded


def _build_fold_features(frames):
    """Build the stacked sparse feature matrix for each frame in `frames`.

    Parameters
    ----------
    frames : sequence of pandas.DataFrame
        Training frame first; each frame must have the columns
        "discourse_type", "discourse_text" and "text".

    Returns
    -------
    list of scipy.sparse matrices (CSR)
        One per frame, with columns ordered as: one-hot discourse type,
        discourse-text TF-IDF, essay-text TF-IDF. All encoders are fitted on
        the first (training) frame only.
    """
    feature_groups = [
        # handle_unknown="ignore": a discourse type absent from the training
        # fold is encoded as all zeros instead of raising at transform time.
        _fit_transform_splits(
            OneHotEncoder(handle_unknown="ignore"),
            [frame["discourse_type"].values.reshape(-1, 1) for frame in frames],
        ),
        _fit_transform_splits(
            TfidfVectorizer(ngram_range=(1, 2), norm='l2', smooth_idf=True),
            [frame["discourse_text"] for frame in frames],
        ),
        # Separate vectorizer: the essay text gets its own vocabulary.
        _fit_transform_splits(
            TfidfVectorizer(ngram_range=(1, 2), norm='l2', smooth_idf=True),
            [frame["text"] for frame in frames],
        ),
    ]
    # zip(*...) regroups "per feature group -> per split" into
    # "per split -> per feature group" before stacking column-wise.
    return [sparse.hstack(parts, format="csr") for parts in zip(*feature_groups)]


# Start from an empty list on every run of this cell, so re-running it does not
# append a second set of fold predictions that would skew the later average.
preds = []

for n_fold in range(NFOLDs):
    # Rows tagged with the current fold are the validation set; all others train.
    dataset_tr_ = dataset_tr[dataset_tr['fold'] != n_fold]
    dataset_eval_ = dataset_tr[dataset_tr['fold'] == n_fold]

    # Training, validation, and test feature matrices
    tr_tfidf, eval_tfidf, te_tfidf = _build_fold_features((dataset_tr_, dataset_eval_, dataset_te))

    # Model
    # NOTE(review): C looks like the output of a hyper-parameter search -- confirm its origin.
    clf = LogisticRegression(max_iter=500, penalty="l2", C=1.0131816333513533)
    clf.fit(tr_tfidf, dataset_tr_["target"].values)

    # Validation: pass the classifier's own class list so the probability
    # columns are matched to labels explicitly rather than inferred from y_true.
    ev_preds = clf.predict_proba(eval_tfidf)
    ev_loss = log_loss(dataset_eval_["target"].values, ev_preds, labels=clf.classes_)
    print("Fold : {} EV score: {}".format(n_fold, ev_loss))

    # Test
    preds.append(clf.predict_proba(te_tfidf))

In [None]:
# Load the submission template. Uses the same absolute input directory as the
# train/test loads, so the read does not depend on the current working directory.
dataset_submission = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')

In [None]:
# Average the per-fold test probabilities element-wise across folds (axis 0).
all_preds = np.asarray(preds).mean(axis=0)
print(all_preds.shape)
# Probability column k is written to the label whose target id is k
# (0 = Ineffective, 1 = Adequate, 2 = Effective).
# NOTE(review): relies on submission rows being in the same order as dataset_te -- confirm.
for class_idx, label in enumerate(("Ineffective", "Adequate", "Effective")):
    dataset_submission.loc[:, label] = all_preds[:, class_idx]
dataset_submission.head()

In [None]:
# Write the submission file without the DataFrame index column. `index` is a
# boolean flag, so pass False explicitly rather than relying on None being falsy.
dataset_submission.to_csv('submission.csv', index=False)