#### Experiment Log:


| Version | Models Used | CV Score | LB Score | Comments |
| --- | --- | --- | --- | --- |
| v1 | LogisticRegression <br> RandomForestClassifier | - | - | Baseline (Errored Out)
| v2 | LogisticRegression <br> RandomForestClassifier | 0.8898984 <br> 0.8877021 | - | Baseline
| v3 | LogisticRegression <br> RandomForestClassifier | 0.8898984 <br> 0.8877021 | 0.868 | Baseline
| v4 | LogisticRegression <br> RandomForestClassifier | 0.7463457 <br> 0.8374394 | 0.784 | Text Preprocessing <br> TF-IDF
| v5 | LogisticRegression <br> RandomForestClassifier | 0.7534682 <br> 0.7823450 | 0.761 | Text Preprocessing <br> Lemmatization <br> TF-IDF <br> Glove Embeddings
| v6 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7534682 <br> 0.6215837 <br> 0.5861382 | 0.695 | Text Preprocessing <br> Lemmatization <br> TF-IDF <br> Glove Embeddings
| v7 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7932787 <br> 0.6045388 <br> 0.5755040 | 0.685 | Text Preprocessing <br> Lemmatization <br> Glove Embeddings
| v8 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7932841 <br> 0.6041433 <br> 0.5748984 | 0.677 | Text Preprocessing <br> Lemmatization <br> Glove Embeddings <br> New features added
| v9 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7013439 <br> 0.6039653 <br> 0.5749525 | 0.673 | Text Preprocessing <br> Lemmatization <br> Glove Embeddings <br> Quantile Transformer
| v10 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7012418 <br> 0.5843301 <br> 0.5770625 | 0.679 | Text Preprocessing <br> Lemmatization <br> Glove Embeddings <br> Quantile Transformer
| v11 | LogisticRegression <br> XGBoost <br> LightGBM <br> Voting Classifier | Error | - | Text Preprocessing <br> Lemmatization <br> Glove Embeddings <br> Quantile Transformer
| v13 | LogisticRegression <br> XGBoost <br> LightGBM <br> Voting Classifier | 0.2121148 | 0.681 | Text Preprocessing <br> Lemmatization <br> Glove Embeddings <br> Quantile Transformer
| v14 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7034141 <br> 0.6002171 <br> 0.5768558 | 0.669 | Text Preprocessing <br> Lemmatization <br> Glove + FastText Embeddings <br> Quantile Transformer
| v15 | LogisticRegression <br> XGBoost <br> LightGBM | 0.6929070 <br> 0.6058480 <br> 0.5769126 | 0.685 | Sentence-Transformers
| v16 | XGBoost <br> LightGBM | Error | - | Text Preprocessing (handle OOV words) <br> Lemmatization <br> Glove + FastText Embeddings <br> Quantile Transformer
| v17 | XGBoost <br> LightGBM | 0.5981064 <br> 0.5738860 | 0.673 | Text Preprocessing (handle OOV words) <br> Lemmatization <br> Glove + FastText Embeddings <br> Quantile Transformer
| v18 | LogisticRegression <br> XGBoost <br> LightGBM | 0.8130772 <br> 0.6800559 <br> 0.6760178 | 0.672 | Text Preprocessing <br> Lemmatization <br> Glove + FastText Embeddings <br> Quantile Transformer <br> GroupKFold
| v19 | LogisticRegression <br> XGBoost <br> LightGBM | 0.8135349 <br> 0.6804664 <br> 0.6747677 | - | Text Preprocessing <br> Lemmatization <br> Glove + FastText Embeddings <br> TextBlob to handle OOV tokens <br> Quantile Transformer <br> GroupKFold
| v21 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7428862 <br> 0.6817771 <br> 0.6779332 | 0.672 | Preprocessing <br> Lemmatization <br> Glove + FastText Embeddings <br> Min Max Scaler <br> Stratified GroupKFold
| v22 | LogisticRegression <br> XGBoost <br> LightGBM | 0.7022837 <br> 0.5999361 <br> 0.5741513 | 0.669 | Text Preprocessing <br> Lemmatization <br> Glove + FastText Embeddings <br> Quantile Transformer
| v23 | LogisticRegression <br> XGBoost <br> LightGBM | - | - | Modified Text Preprocessing <br> Discourse text removal from Essay <br> Lemmatization <br> Glove + FastText Embeddings <br> Quantile Transformer

## Import libraries

In [None]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import re
import nltk
import string
from textblob import TextBlob
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer

import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Register tqdm's `progress_apply` on pandas objects; the cells below rely on it.
tqdm.pandas()
# Seed NumPy's global RNG so the random vectors drawn in sent2vec for
# out-of-vocabulary words are reproducible between runs.
np.random.seed(42)

## Load source datasets

In [None]:
train = pd.read_csv('../input/feedback-prize-effectiveness/train.csv')


def _read_train_essay(essay_id):
    """Return the full text of one training essay, closing the file afterwards."""
    with open(f'../input/feedback-prize-effectiveness/train/{essay_id}.txt') as handle:
        return handle.read()


# Several discourse rows share one essay, so read every essay file exactly once
# and broadcast the text to its rows instead of re-opening the file per row.
essay_lookup = {essay_id: _read_train_essay(essay_id) for essay_id in tqdm(train["essay_id"].unique())}
train["essay_text"] = train["essay_id"].map(essay_lookup)
del essay_lookup

print(f"train: {train.shape}")
train.head()

In [None]:
test = pd.read_csv('../input/feedback-prize-effectiveness/test.csv')


def _read_test_essay(essay_id):
    """Return the full text of one test essay, closing the file afterwards."""
    with open(f'../input/feedback-prize-effectiveness/test/{essay_id}.txt') as handle:
        return handle.read()


# Several discourse rows share one essay, so read every essay file exactly once
# and broadcast the text to its rows instead of re-opening the file per row.
essay_lookup = {essay_id: _read_test_essay(essay_id) for essay_id in tqdm(test["essay_id"].unique())}
test["essay_text"] = test["essay_id"].map(essay_lookup)
del essay_lookup

print(f"test: {test.shape}")
test.head()

In [None]:
# Integer-encode the target labels (0 = Ineffective, 1 = Adequate, 2 = Effective).
effectiveness_codes = {"Ineffective": 0, "Adequate": 1, "Effective": 2}
train["discourse_effectiveness"] = train["discourse_effectiveness"].map(effectiveness_codes)
train['discourse_effectiveness'].value_counts()

In [None]:
# Detach the target column from the feature frame; pop() removes it in place
# and hands back the Series whose values become the label array.
Ytrain = train.pop('discourse_effectiveness').values

print(f"train: {train.shape} \ntest: {test.shape} \nYtrain: {Ytrain.shape}")

## Feature Engineering

### Helper Functions

In [None]:
def contraction_count(sent):
    """Count the English contractions in *sent*.

    The patterns are the same ones `decontraction` expands, applied in the
    same order. Each match is removed from the working string before the next
    pattern runs, so one contraction is counted once: "won't" is 1, not 3
    (it would otherwise also match the "n't" and "'t" patterns).

    Args:
        sent: Text to scan.

    Returns:
        Number of contraction occurrences found.
    """
    patterns = (
        r"won\'t",
        r"can\'t",
        r"n\'t",
        r"\'re",
        r"\'s",
        r"\'d",
        r"\'ll",
        r"\'t",
        r"\'ve",
        r"\'m",
    )
    count = 0
    for pattern in patterns:
        # Keep the stripped text so later, more general patterns cannot
        # re-match a contraction that has already been counted.
        sent, hits = re.subn(pattern, '', sent)
        count += hits
    return count

In [None]:
def pos_count(sent):
    """Count selected part-of-speech tags in *sent*.

    The text is tokenized and tagged with NLTK. The returned Series holds six
    counts, in this order: nouns, pronouns, verbs, adjectives, interjections
    and numerals.
    """
    # One tuple of tags per output slot, in the order of the returned Series.
    tag_groups = (
        ('NN', 'NNP', 'NNS'),                        # Noun
        ('PRP', 'PRP$'),                             # Pronoun
        ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),   # Verb
        ('JJ', 'JJR', 'JJS'),                        # Adjective
        ('UH',),                                     # Interjection
        ('CD',),                                     # Numerics
    )
    totals = [0] * len(tag_groups)

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    for tagged in tagged_tokens:
        tag = tagged[1]
        for slot, group in enumerate(tag_groups):
            if tag in group:
                totals[slot] += 1

    return pd.Series(totals)

In [None]:
def decontraction(phrase):
    """Expand common English contractions in *phrase* and return the result.

    Substitutions run in a fixed order: the specific forms ("won't", "can't")
    are rewritten before the generic suffix rules.
    """
    rewrites = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rewrites:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

In [None]:
def remove_punctuations(text):
    """Return *text* with every character of `string.punctuation` deleted."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [None]:
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of *text*.

    Every word is POS-tagged, and the first letter of its tag selects the
    WordNet part of speech passed to the lemmatizer; tags with no entry in the
    table fall back to noun. The lemmas are joined back with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tag_prefix_to_pos = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
    }

    lemmas = []
    for word, tag in nltk.pos_tag(text.split()):
        wordnet_pos = tag_prefix_to_pos.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)

In [None]:
def sent2vec(text):
    """Turn *text* into one L2-normalised 300-dimensional vector.

    The text is tokenized, purely alphabetic tokens are kept, each token is
    looked up in the module-level `embeddings_index`, the word vectors are
    summed and the sum is scaled to unit length.

    Args:
        text: Text to embed.

    Returns:
        A length-300 NumPy array. It is all zeros when the text has no
        alphabetic tokens or when the summed vector has zero norm.
    """
    words = [w for w in nltk.word_tokenize(text) if w.isalpha()]
    if not words:
        return np.zeros(300)

    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except KeyError:
            # Out-of-vocabulary word: use a small random vector instead of
            # dropping the token. Only a missing key is handled here; any
            # other error is a real problem and should surface.
            M.append(np.random.uniform(-0.01, 0.01, 300))

    v = np.array(M).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid 0/0 producing a vector of NaNs.
        return np.zeros(300)

    return v / norm

### Create basic text features

In [None]:
def _essay_without_discourse(row):
    """Return the row's essay text with its own discourse segment cut out.

    When the discourse text was not found in the essay (discourse_idx == -1)
    the essay is returned unchanged; slicing with -1 would otherwise drop and
    duplicate characters at the end of the essay.
    """
    essay = row['essay_text']
    start = row['discourse_idx']
    if start < 0:
        return essay
    tail = essay[start + row['discourse_len']:].strip()
    return essay[:start] + ' ' + tail


# Character offset of the (stripped) discourse inside its essay, -1 if absent.
train['discourse_idx'] = train.progress_apply(lambda x: x['essay_text'].find(x['discourse_text'].strip()), axis=1)
# Temporary column: length of the stripped discourse, needed only for the cut.
train['discourse_len'] = train['discourse_text'].progress_apply(lambda x: len(x.strip()))
train['essay_text'] = train.progress_apply(_essay_without_discourse, axis=1)
train.drop('discourse_len', axis=1, inplace=True)
train.head()

In [None]:
def _essay_without_discourse(row):
    """Return the row's essay text with its own discourse segment cut out.

    The two remaining parts are joined with a single space, which is what the
    training-set transform does, so train and test features are built from
    identically processed text. When the discourse text was not found in the
    essay (discourse_idx == -1) the essay is returned unchanged; slicing with
    -1 would otherwise drop and duplicate characters at the end of the essay.
    """
    essay = row['essay_text']
    start = row['discourse_idx']
    if start < 0:
        return essay
    tail = essay[start + row['discourse_len']:].strip()
    return essay[:start] + ' ' + tail


# Character offset of the (stripped) discourse inside its essay, -1 if absent.
test['discourse_idx'] = test.progress_apply(lambda x: x['essay_text'].find(x['discourse_text'].strip()), axis=1)
# Temporary column: length of the stripped discourse, needed only for the cut.
test['discourse_len'] = test['discourse_text'].progress_apply(lambda x: len(x.strip()))
test['essay_text'] = test.progress_apply(_essay_without_discourse, axis=1)
test.drop('discourse_len', axis=1, inplace=True)
test.head()

In [None]:
def text_features(df, col):
    """Add surface-level text statistics of column *col* to *df*.

    New columns are named ``f"{col}_<feature>"`` and cover word, unique-word
    and character counts, stop-word and punctuation counts, upper-case and
    title-case word counts, mean word length, newline-separated paragraph
    count, contraction count and TextBlob polarity / subjectivity.

    Args:
        df: DataFrame that holds the text column; it is modified in place.
        col: Name of the text column to describe.

    Returns:
        The same DataFrame, with the feature columns added.
    """
    # Build the lookup sets once. Evaluating stopwords.words('english') for
    # every single word dominated the runtime of this function.
    stop_word_set = set(stopwords.words('english'))
    punctuation_set = set(string.punctuation)

    df[f"{col}_num_words"] = df[col].progress_apply(lambda x: len(str(x).split()))
    df[f"{col}_num_unique_words"] = df[col].progress_apply(lambda x: len(set(str(x).split())))
    df[f"{col}_num_chars"] = df[col].progress_apply(lambda x: len(str(x)))
    df[f"{col}_num_stopwords"] = df[col].progress_apply(lambda x: sum(1 for w in str(x).lower().split() if w in stop_word_set))
    df[f"{col}_num_punctuations"] = df[col].progress_apply(lambda x: sum(1 for c in str(x) if c in punctuation_set))
    df[f"{col}_num_words_upper"] = df[col].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[f"{col}_num_words_title"] = df[col].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    # np.mean of an empty list is NaN; such rows are left as NaN here.
    df[f"{col}_mean_word_len"] = df[col].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df[f"{col}_num_paragraphs"] = df[col].progress_apply(lambda x: len(x.split('\n')))
    df[f"{col}_num_contractions"] = df[col].progress_apply(contraction_count)

    # Run the sentiment analysis once per text and reuse both components.
    sentiments = [TextBlob(x).sentiment for x in tqdm(df[col])]
    df[f"{col}_polarity"] = [s[0] for s in sentiments]
    df[f"{col}_subjectivity"] = [s[1] for s in sentiments]
    #df[[f'{col}_nn_count',f'{col}_pr_count',f'{col}_vb_count',f'{col}_jj_count',f'{col}_uh_count',f'{col}_cd_count']] = df[col].progress_apply(pos_count)
    return df

In [None]:
# Stand-alone table of distinct discourse rows; text features are added to it.
discourse_columns = ['discourse_id', 'discourse_text', 'discourse_idx']
discourse_train = train[discourse_columns].drop_duplicates().copy()
print(f"discourse_train: {discourse_train.shape}")

discourse_train = text_features(discourse_train, "discourse_text")
discourse_train.head()

In [None]:
# Stand-alone essay table with exactly one row per essay_id.
#
# essay_text now differs between rows of the same essay (each row had its own
# discourse removed), so a plain drop_duplicates() keeps several rows per
# essay_id. Every later merge on essay_id would then multiply rows and the
# feature matrix would no longer line up with Ytrain. De-duplicating on the
# key keeps the merges many-to-one.
# NOTE(review): this keeps the first row's text per essay, so all discourses
# of an essay share the essay features computed with the first discourse
# removed. Keying this table by discourse_id would give per-discourse context
# but needs matching changes in the downstream merge cells.
essay_train = train[['essay_id','essay_text']].copy()
essay_train.drop_duplicates(subset='essay_id', inplace=True)
print(f"essay_train: {essay_train.shape}")

essay_train = text_features(essay_train, "essay_text")
essay_train.head()

In [None]:
# Stand-alone table of distinct test discourse rows; text features are added to it.
discourse_columns = ['discourse_id', 'discourse_text', 'discourse_idx']
discourse_test = test[discourse_columns].drop_duplicates().copy()
print(f"discourse_test: {discourse_test.shape}")

discourse_test = text_features(discourse_test, "discourse_text")
discourse_test.head()

In [None]:
# Stand-alone essay table with exactly one row per essay_id.
#
# essay_text now differs between rows of the same essay (each row had its own
# discourse removed), so a plain drop_duplicates() keeps several rows per
# essay_id. Every later merge on essay_id would then multiply rows and the
# predictions would no longer line up with the submission rows.
# De-duplicating on the key keeps the merges many-to-one.
# NOTE(review): this keeps the first row's text per essay, so all discourses
# of an essay share the essay features computed with the first discourse
# removed. Keying this table by discourse_id would give per-discourse context
# but needs matching changes in the downstream merge cells.
essay_test = test[['essay_id','essay_text']].copy()
essay_test.drop_duplicates(subset='essay_id', inplace=True)
print(f"essay_test: {essay_test.shape}")

essay_test = text_features(essay_test, "essay_text")
essay_test.head()

### Text Preprocessing

In [None]:
def text_cleanup(df, col):
    """Normalise the text column *col* of *df* in place and return *df*.

    Steps, in order: lower-case and drop backslashes / turn underscores into
    spaces, collapse whitespace runs, expand contractions, strip punctuation,
    lemmatize.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        col: Name of the column to clean.

    Returns:
        The same DataFrame with the cleaned column.
    """
    # Convert to lower case, drop backslashes, turn underscores into spaces
    df[col] = df[col].progress_apply(lambda x: str(x).lower().replace('\\', '').replace('_', ' '))

    # Collapse every run of whitespace (spaces, tabs, newlines) into one space.
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    df[col] = df[col].progress_apply(lambda x: re.sub(r'\s+', ' ', x))

    # Replace contractions ("don't" with "do not" and "we've" with "we have").
    # This must run before punctuation removal, which deletes the apostrophes.
    df[col] = df[col].progress_apply(decontraction)

    # Remove punctuations
    df[col] = df[col].progress_apply(remove_punctuations)

    # Lemmatize words
    df[col] = df[col].progress_apply(lemmatize_words)

    return df

In [None]:
# Built once at module level so clean_text does not rebuild them per call.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise *text* for embedding lookup.

    Steps, in order: lower-case, expand contractions, delete every character
    that is neither a word character nor whitespace, lemmatize each token as
    a noun and then as a verb, drop English stop words, re-join with spaces.

    Args:
        text: Raw text.

    Returns:
        The cleaned text as a single space-joined string.
    """
    text = text.lower()
    # Expand contractions while the apostrophes are still present; stripping
    # punctuation first would leave decontraction with nothing to match.
    text = decontraction(text)
    # `flags` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.UNICODE caps the number of
    # replacements at 32 instead of setting a flag.
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    tokens = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    tokens = [lemmatizer.lemmatize(token, "v") for token in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [None]:
# Clean the discourse text of both splits with clean_text, so the embedding
# lookups below operate on identically normalised text.
discourse_train['discourse_text'] = discourse_train['discourse_text'].progress_apply(clean_text)
discourse_test['discourse_text'] = discourse_test['discourse_text'].progress_apply(clean_text)

In [None]:
# Clean the essay text of both splits with the same clean_text routine.
essay_train['essay_text'] = essay_train['essay_text'].progress_apply(clean_text)
essay_test['essay_text'] = essay_test['essay_text'].progress_apply(clean_text)

### Glove Embeddings

In [None]:
# SECURITY NOTE: unpickling executes arbitrary code from the file; only load
# embedding files from a trusted source.
# Unpickle straight from the file handle so the raw bytes are not kept alive
# in a separate variable for the rest of the notebook.
with open("../input/nlp-word-embeddings/Glove_Embeddings.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

embeddings_index = processed_data['glove_embeddings_index']
print('Word vectors found: {}'.format(len(embeddings_index)))

# Drop the container object; embeddings_index keeps the lookup table alive.
del processed_data
gc.collect()

In [None]:
# Key the table by discourse_id so the embedding frame can be merged on it.
discourse_train = discourse_train.set_index('discourse_id')

# One sent2vec vector per discourse, laid out as 300 named columns.
col_list = [f'discourse_glove_{dim}' for dim in range(300)]
glove_vec = [sent2vec(sentence) for sentence in tqdm(discourse_train["discourse_text"].values)]
glove_vec_df = pd.DataFrame(
    data=np.array(glove_vec),
    index=discourse_train.index,
    columns=col_list,
)
print(f"glove_vec_df: {glove_vec_df.shape}")

discourse_train = discourse_train.merge(glove_vec_df, how="inner", on="discourse_id", sort=False)

del glove_vec
del glove_vec_df
gc.collect()

print(f"discourse_train: {discourse_train.shape}")
discourse_train.head()

In [None]:
# Key the table by essay_id so the embedding frame can be merged on it.
essay_train = essay_train.set_index('essay_id')

# One sent2vec vector per essay row, laid out as 300 named columns.
col_list = [f'essay_glove_{dim}' for dim in range(300)]
glove_vec = [sent2vec(document) for document in tqdm(essay_train["essay_text"].values)]
glove_vec_df = pd.DataFrame(
    data=np.array(glove_vec),
    index=essay_train.index,
    columns=col_list,
)
print(f"glove_vec_df: {glove_vec_df.shape}")

essay_train = essay_train.merge(glove_vec_df, how="inner", on="essay_id", sort=False)

del glove_vec
del glove_vec_df
gc.collect()

print(f"essay_train: {essay_train.shape}")
essay_train.head()

In [None]:
# Key the table by discourse_id so the embedding frame can be merged on it.
discourse_test = discourse_test.set_index('discourse_id')

# One sent2vec vector per discourse, laid out as 300 named columns.
col_list = [f'discourse_glove_{dim}' for dim in range(300)]
glove_vec = [sent2vec(sentence) for sentence in tqdm(discourse_test["discourse_text"].values)]
glove_vec_df = pd.DataFrame(
    data=np.array(glove_vec),
    index=discourse_test.index,
    columns=col_list,
)
print(f"glove_vec_df: {glove_vec_df.shape}")

discourse_test = discourse_test.merge(glove_vec_df, how="inner", on="discourse_id", sort=False)

del glove_vec
del glove_vec_df
gc.collect()

print(f"discourse_test: {discourse_test.shape}")
discourse_test.head()

In [None]:
# Key the table by essay_id so the embedding frame can be merged on it.
essay_test = essay_test.set_index('essay_id')

# One sent2vec vector per essay row, laid out as 300 named columns.
col_list = [f'essay_glove_{dim}' for dim in range(300)]
glove_vec = [sent2vec(document) for document in tqdm(essay_test["essay_text"].values)]
glove_vec_df = pd.DataFrame(
    data=np.array(glove_vec),
    index=essay_test.index,
    columns=col_list,
)
print(f"glove_vec_df: {glove_vec_df.shape}")

essay_test = essay_test.merge(glove_vec_df, how="inner", on="essay_id", sort=False)

del glove_vec
del glove_vec_df
gc.collect()

print(f"essay_test: {essay_test.shape}")
essay_test.head()

In [None]:
# Release the GloVe lookup table; sent2vec reads the global `embeddings_index`,
# which is rebound to the FastText table in the next section.
del embeddings_index
gc.collect()

### FastText Embeddings

In [None]:
# SECURITY NOTE: unpickling executes arbitrary code from the file; only load
# embedding files from a trusted source.
# Unpickle straight from the file handle so the raw bytes are not kept alive
# in a separate variable for the rest of the notebook.
with open("../input/nlp-word-embeddings/FastText_Embeddings.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

embeddings_index = processed_data['fasttext_embeddings_index']
print('Word vectors found: {}'.format(len(embeddings_index)))

# Drop the container object; embeddings_index keeps the lookup table alive.
del processed_data
gc.collect()

In [None]:
# One sent2vec vector per discourse (FastText lookup), as 300 named columns.
col_list = [f'discourse_fasttext_{dim}' for dim in range(300)]
fasttext_vec = [sent2vec(sentence) for sentence in tqdm(discourse_train["discourse_text"].values)]
fasttext_vec_df = pd.DataFrame(
    data=np.array(fasttext_vec),
    index=discourse_train.index,
    columns=col_list,
)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

discourse_train = discourse_train.merge(fasttext_vec_df, how="inner", on="discourse_id", sort=False)

del fasttext_vec
del fasttext_vec_df
gc.collect()

# The raw text has served its purpose once both embeddings are attached.
discourse_train = discourse_train.drop(columns='discourse_text')
print(f"discourse_train: {discourse_train.shape}")
discourse_train.head()

In [None]:
# One sent2vec vector per essay row (FastText lookup), as 300 named columns.
col_list = [f'essay_fasttext_{dim}' for dim in range(300)]
fasttext_vec = [sent2vec(document) for document in tqdm(essay_train["essay_text"].values)]
fasttext_vec_df = pd.DataFrame(
    data=np.array(fasttext_vec),
    index=essay_train.index,
    columns=col_list,
)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

essay_train = essay_train.merge(fasttext_vec_df, how="inner", on="essay_id", sort=False)

del fasttext_vec
del fasttext_vec_df
gc.collect()

# The raw text has served its purpose once both embeddings are attached.
essay_train = essay_train.drop(columns='essay_text')
print(f"essay_train: {essay_train.shape}")
essay_train.head()

In [None]:
# One sent2vec vector per discourse (FastText lookup), as 300 named columns.
col_list = [f'discourse_fasttext_{dim}' for dim in range(300)]
fasttext_vec = [sent2vec(sentence) for sentence in tqdm(discourse_test["discourse_text"].values)]
fasttext_vec_df = pd.DataFrame(
    data=np.array(fasttext_vec),
    index=discourse_test.index,
    columns=col_list,
)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

discourse_test = discourse_test.merge(fasttext_vec_df, how="inner", on="discourse_id", sort=False)

del fasttext_vec
del fasttext_vec_df
gc.collect()

# The raw text has served its purpose once both embeddings are attached.
discourse_test = discourse_test.drop(columns='discourse_text')
print(f"discourse_test: {discourse_test.shape}")
discourse_test.head()

In [None]:
# One sent2vec vector per essay row (FastText lookup), as 300 named columns.
col_list = [f'essay_fasttext_{dim}' for dim in range(300)]
fasttext_vec = [sent2vec(document) for document in tqdm(essay_test["essay_text"].values)]
fasttext_vec_df = pd.DataFrame(
    data=np.array(fasttext_vec),
    index=essay_test.index,
    columns=col_list,
)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

essay_test = essay_test.merge(fasttext_vec_df, how="inner", on="essay_id", sort=False)

del fasttext_vec
del fasttext_vec_df
gc.collect()

# The raw text has served its purpose once both embeddings are attached.
essay_test = essay_test.drop(columns='essay_text')
print(f"essay_test: {essay_test.shape}")
essay_test.head()

In [None]:
# Release the FastText lookup table now that the embedding features are built.
del embeddings_index
gc.collect()

### Merge all datasets

In [None]:
# Attach the discourse-level and essay-level feature tables to the training
# rows. validate='many_to_one' makes pandas raise a MergeError if a feature
# table holds more than one row per key; without it such a table silently
# duplicates training rows and breaks their alignment with Ytrain.
train = pd.merge(
    train,
    discourse_train,
    how='inner',
    on='discourse_id',
    sort=False,
    validate='many_to_one'
)

train = pd.merge(
    train,
    essay_train,
    how='inner',
    on='essay_id',
    sort=False,
    validate='many_to_one'
)

print(f"train: {train.shape}")
train.head()

In [None]:
# Attach the discourse-level and essay-level feature tables to the test rows.
# validate='many_to_one' makes pandas raise a MergeError if a feature table
# holds more than one row per key; without it such a table silently duplicates
# test rows, and predictions no longer line up with the submission file.
test = pd.merge(
    test,
    discourse_test,
    how='inner',
    on='discourse_id',
    sort=False,
    validate='many_to_one'
)

test = pd.merge(
    test,
    essay_test,
    how='inner',
    on='essay_id',
    sort=False,
    validate='many_to_one'
)

print(f"test: {test.shape}")
test.head()

In [None]:
# The per-discourse and per-essay feature tables have been merged into
# train/test above; drop them to free memory.
del discourse_train, essay_train
del discourse_test, essay_test
gc.collect()

### Additional features

In [None]:
# Discourse-to-essay ratios of the basic text statistics.
# A zero essay-side denominator yields +/-inf, which the later fillna(0) does
# not clear; map those to NaN here so they are treated like any other
# undefined ratio.
for ratio_key in ['num_words', 'num_unique_words', 'num_chars',
                  'num_stopwords', 'num_punctuations', 'mean_word_len']:
    train[f'{ratio_key}_ratio'] = (
        train[f'discourse_text_{ratio_key}'] / train[f'essay_text_{ratio_key}']
    ).replace([np.inf, -np.inf], np.nan)
train.head()

In [None]:
# Discourse-to-essay ratios of the basic text statistics.
# A zero essay-side denominator yields +/-inf, which the later fillna(0) does
# not clear; map those to NaN here so they are treated like any other
# undefined ratio.
for ratio_key in ['num_words', 'num_unique_words', 'num_chars',
                  'num_stopwords', 'num_punctuations', 'mean_word_len']:
    test[f'{ratio_key}_ratio'] = (
        test[f'discourse_text_{ratio_key}'] / test[f'essay_text_{ratio_key}']
    ).replace([np.inf, -np.inf], np.nan)
test.head()

### Label Encoding and Feature Scaling

In [None]:
# Fit the encoder on the union of train and test categories so that both
# splits share one integer code per discourse type.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
le = LabelEncoder().fit(pd.concat([train['discourse_type'], test['discourse_type']]))
train['discourse_type'] = le.transform(train['discourse_type'])
test['discourse_type'] = le.transform(test['discourse_type'])
train.head()

In [None]:
# Identifier and raw-text columns are not model inputs.
non_feature_columns = ['discourse_id', 'essay_id', 'discourse_text', 'essay_text']

train = train.drop(columns=non_feature_columns)
test = test.drop(columns=non_feature_columns)

# Missing feature values are replaced by zero.
train = train.fillna(0)
test = test.fillna(0)

In [None]:
features = list(test.columns)

# Fit the quantile mapping on the training rows only, then apply the same
# fitted transform to both splits.
qt = QuantileTransformer(
    n_quantiles=1000,
    output_distribution='normal',
    random_state=42,
)
qt.fit(train[features])

train[features] = qt.transform(train[features])
test[features] = qt.transform(test[features])

In [None]:
# Independent copies used as the model design matrices; the originals are
# deleted in the next cell.
Xtrain = train.copy()
Xtest = test.copy()
print(f"Xtrain: {Xtrain.shape} \nXtest: {Xtest.shape}")

In [None]:
# Xtrain / Xtest hold the data from here on; drop the source frames and the
# fitted transformer to free memory.
del train, test, qt
gc.collect()

## Models Training

### Logistic Regression

In [None]:
FOLD = 5
SEEDS = [42]

# Running totals across every (seed, fold) pair.
counter = 0
oof_score = 0
y_pred_final_lr = np.zeros((Xtest.shape[0], 3))


for seed in SEEDS:
    seed_score = 0
    splitter = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (fit_rows, holdout_rows) in enumerate(splitter.split(Xtrain, Ytrain)):
        counter += 1

        train_x = Xtrain.iloc[fit_rows]
        train_y = Ytrain[fit_rows]
        val_x = Xtrain.iloc[holdout_rows]
        val_y = Ytrain[holdout_rows]

        model = LogisticRegression(max_iter=2000, random_state=42)
        model.fit(train_x, train_y)

        # Score the held-out fold and accumulate this fold's test probabilities.
        y_pred = model.predict_proba(val_x)
        y_pred_final_lr += model.predict_proba(Xtest)

        score = log_loss(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

        # Persist the fold model under a running index.
        with open(f'FPE_LR_Model_{counter}.pkl', 'wb') as file:
            pickle.dump(model, file)

        del model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test probabilities and fold scores.
y_pred_final_lr = y_pred_final_lr / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

### XGBoost

In [None]:
FOLD = 5
SEEDS = [42]

# Running totals across every (seed, fold) pair.
counter = 0
oof_score = 0
y_pred_final_xgb = np.zeros((Xtest.shape[0], 3))


for sidx, seed in enumerate(SEEDS):
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain[val]

        model = XGBClassifier(
            # The multiclass probability objective is spelled 'multi:softprob'.
            objective='multi:softprob',
            eval_metric='mlogloss',
            booster='gbtree',
            # NOTE(review): sample_type is documented as a DART option and is
            # presumably a no-op with booster='gbtree' - confirm and remove.
            sample_type='weighted',
            tree_method='hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            # The scikit-learn wrapper takes the boosting-round budget as
            # n_estimators; `num_round` is not one of its parameters, so the
            # default budget applied and early stopping had little room.
            n_estimators=5000,
            num_class=3,
            max_depth=9,
            max_leaves=36,
            learning_rate=0.095,
            subsample=0.7024,
            colsample_bytree=0.5289,
            min_child_weight=15,
            reg_lambda=0.05465,
            verbosity=0,
            random_state=42
        )

        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)],
                  early_stopping_rounds=100, verbose=50)

        # iteration_range is half-open [begin, end), so the end must be
        # best_iteration + 1 to include the best round itself.
        best_range = (0, model.best_iteration + 1)
        y_pred = model.predict_proba(val_x, iteration_range=best_range)
        y_pred_final_xgb += model.predict_proba(Xtest, iteration_range=best_range)

        score = log_loss(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

        # Persist the fold model under a running index.
        with open(f'FPE_XGB_Model_{counter}.pkl', 'wb') as file:
            pickle.dump(model, file)

        del model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test probabilities and fold scores.
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

### LightGBM

In [None]:
# LightGBM training configuration for the 3-class problem.
params = {
    "objective": 'multiclass',
    'metric': 'multi_logloss',
    'boosting': 'gbdt',
    'num_class': 3,
    'is_unbalance': True,
    "learning_rate": 0.05,
    "lambda_l2": 0.0256,
    "num_leaves": 52,
    "max_depth": 10,
    "feature_fraction": 0.503,
    "bagging_fraction": 0.741,
    "bagging_freq": 8,
    "bagging_seed": 10,
    "min_data_in_leaf": 10,
    "verbosity": -1,
    "random_state": 42,
}
# Upper bound on boosting rounds passed to lgb.train.
num_rounds = 5000

In [None]:
FOLD = 5
SEEDS = [42]

# Running totals across every (seed, fold) pair.
counter = 0
oof_score = 0
y_pred_final_lgb = np.zeros((Xtest.shape[0], 3))


for seed in SEEDS:
    seed_score = 0
    splitter = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (fit_rows, holdout_rows) in enumerate(splitter.split(Xtrain, Ytrain)):
        counter += 1

        train_x = Xtrain.iloc[fit_rows]
        train_y = Ytrain[fit_rows]
        val_x = Xtrain.iloc[holdout_rows]
        val_y = Ytrain[holdout_rows]

        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        model = lgb.train(
            params,
            lgtrain,
            num_rounds,
            valid_sets=[lgtrain, lgvalidation],
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        # Predict with the early-stopped best iteration for both the held-out
        # fold and the test set.
        best_round = model.best_iteration
        y_pred = model.predict(val_x, num_iteration=best_round)
        y_pred_final_lgb += model.predict(Xtest, num_iteration=best_round)

        score = log_loss(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

        # Persist the fold model under a running index.
        with open(f'FPE_LGB_Model_{counter}.pkl', 'wb') as file:
            pickle.dump(model, file)

        del model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test probabilities and fold scores.
y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

## Create submission file

In [None]:
# Weighted blend of the three models' averaged test probabilities.
y_pred_final = (y_pred_final_lr * 0.1) + (y_pred_final_xgb * 0.55) + (y_pred_final_lgb * 0.35)

submission = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
# Probability column i belongs to integer label i (0 = Ineffective,
# 1 = Adequate, 2 = Effective), matching the target encoding used for training.
for class_idx, class_name in enumerate(['Ineffective', 'Adequate', 'Effective']):
    submission[class_name] = y_pred_final[:, class_idx]
submission.to_csv("./submission.csv", index=False)
submission.head()

In [None]:
## Good Day!!