###    Library Imports

In [1]:
import os

import warnings

import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *

from xgboost import XGBClassifier, XGBRFClassifier

from sklearn.calibration import *
import joblib
pd.options.display.max_columns = 50
set_config(display="diagram")
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict

from joblib import parallel_backend
from joblib.memory import Memory
def allow_stopping(func):
    """Decorate *func* so that a ``KeyboardInterrupt`` stops it quietly.

    The returned wrapper forwards all positional and keyword arguments to
    *func* and returns its result.  If the call is interrupted with
    ``KeyboardInterrupt`` the message "Program Stopped" is printed and
    ``None`` is returned instead of propagating the interrupt.  A garbage
    collection pass runs after every call, interrupted or not.

    Args:
        func: The callable to protect.

    Returns:
        A wrapper with the same name and docstring as *func*.
    """
    # Imported locally so the decorator works even when the surrounding
    # file does not import these modules at the top.
    import functools
    import gc

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            print("Program Stopped")
        finally:
            # ``finally`` guarantees the collection also happens on the
            # normal return path, not only after an interrupt.
            gc.collect()

    return wrapper

###    ColumnSelectors

In [3]:
class ColumnSelectors:
    """Group the dataset's column names by measurement level.

    ``dtype_info`` maps every prefixed column name to one of the labels
    "Binary", "Nominal", "Ordinal" or "Ratio".  For each label the instance
    exposes the list of matching names (``*_cols``) and a
    ``make_column_selector`` object built from those names joined with "|"
    as its ``pattern`` argument.
    """

    def __init__(self, default=None):
        # NOTE: ``default`` is accepted but never read by this class.
        self.dtype_info = {
            "binary__v_1": "Binary",
            "binary__v_11": "Binary",
            "binary__v_14": "Binary",
            "binary__v_26": "Binary",
            "binary__v_27": "Binary",
            "binary__v_28": "Binary",
            "binary__v_30": "Binary",
            "binary__v_9": "Binary",
            "nominal__v_12": "Nominal",
            "nominal__v_18": "Nominal",
            "nominal__v_20": "Nominal",
            "nominal__v_21": "Nominal",
            "nominal__v_25": "Nominal",
            "nominal__v_3": "Nominal",
            "nominal__v_32": "Nominal",
            "nominal__v_4": "Nominal",
            "ordinal__v_0": "Ordinal",
            "ordinal__v_10": "Ordinal",
            "ordinal__v_13": "Ordinal",
            "ordinal__v_15": "Ordinal",
            "ordinal__v_17": "Ordinal",
            "ordinal__v_19": "Ordinal",
            "ordinal__v_22": "Ordinal",
            "ordinal__v_23": "Ordinal",
            "ordinal__v_24": "Ordinal",
            "ordinal__v_29": "Ordinal",
            "ordinal__v_31": "Ordinal",
            "ordinal__v_33": "Ordinal",
            "ordinal__v_5": "Ordinal",
            "ordinal__v_6": "Ordinal",
            "ratio__v_16": "Ratio",
            "ratio__v_2": "Ratio",
            "ratio__v_34": "Ratio",
            "ratio__v_35": "Ratio",
            "ratio__v_36": "Ratio",
            "ratio__v_37": "Ratio",
            "ratio__v_38": "Ratio",
            "ratio__v_39": "Ratio",
            "ratio__v_40": "Ratio",
            "ratio__v_7": "Ratio",
            "ratio__v_8": "Ratio",
        }

        def names_with(level):
            # Column names carrying the given label, in mapping order.
            return [
                name for name, label in self.dtype_info.items() if label == level
            ]

        def selector_for(names):
            # One alternation pattern covering every listed name.
            return make_column_selector(pattern="|".join(names))

        self.ordinal_cols = names_with("Ordinal")
        self.nominal_cols = names_with("Nominal")
        self.binary_cols = names_with("Binary")
        self.ratio_cols = names_with("Ratio")

        self.ordinal = selector_for(self.ordinal_cols)
        self.nominal = selector_for(self.nominal_cols)
        self.binary = selector_for(self.binary_cols)
        self.ratio = selector_for(self.ratio_cols)

    def ordinal_selector(self):
        """Return the selector built from the "Ordinal" column names."""
        return self.ordinal

    def nominal_selector(self):
        """Return the selector built from the "Nominal" column names."""
        return self.nominal

    def binary_selector(self):
        """Return the selector built from the "Binary" column names."""
        return self.binary

    def ratio_selector(self):
        """Return the selector built from the "Ratio" column names."""
        return self.ratio

# Module-level ColumnSelectors instance shared by the cells below.
column_directory = ColumnSelectors()

### Prepare Data

In [7]:
# Default (Kaggle) locations of the input files and the output directory.
DATA_PATH = "/kaggle/input/students-drop-out-prediction/"
DATA_SAVE_PATH = "/kaggle/working/"
TRAIN_DATA = "train.csv"
TEST_DATA = "test.csv"
# 0 -> prefix the paths above with LOCAL_PATH; any other value keeps them as is.
KAGGLE_ENV = 0

if KAGGLE_ENV == 0:
    LOCAL_PATH = "../data/"
    # Plain concatenation is deliberate: the Kaggle paths are absolute, so
    # os.path.join would discard LOCAL_PATH instead of prefixing it.
    DATA_PATH = LOCAL_PATH + DATA_PATH
    DATA_SAVE_PATH = LOCAL_PATH + DATA_SAVE_PATH

raw_data = pd.read_csv(DATA_PATH + TRAIN_DATA, index_col=0)
raw_data_eval = pd.read_csv(DATA_PATH + TEST_DATA, index_col=0)

# Map each raw column name (the part after "__") to its type-prefixed name,
# e.g. "v_1" -> "binary__v_1".
saved_dtypes_info = column_directory.dtype_info
raw_dtypes_info = {
    prefixed_name.split("__")[1]: prefixed_name
    for prefixed_name in saved_dtypes_info
}

raw_data.rename(columns=raw_dtypes_info, inplace=True)
raw_data_eval.rename(columns=raw_dtypes_info, inplace=True)




Unnamed: 0_level_0,nominal__v_21,nominal__v_32,ordinal__v_15,nominal__v_4,binary__v_1,ratio__v_8,nominal__v_12,nominal__v_25,nominal__v_20,nominal__v_18,nominal__v_3,binary__v_11,binary__v_14,binary__v_26,binary__v_27,binary__v_30,binary__v_9,ordinal__v_31,binary__v_28,ordinal__v_13,ordinal__v_33,ordinal__v_17,ordinal__v_19,ordinal__v_29,ordinal__v_23,ordinal__v_24,ordinal__v_10,ordinal__v_5,ordinal__v_6,ordinal__v_22,ordinal__v_0,ratio__v_7,ratio__v_2,ratio__v_16,ratio__v_34,ratio__v_35,ratio__v_36,ratio__v_37,ratio__v_38,ratio__v_39,ratio__v_40,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
0,3,6,11,55,1,1,16,5,60,30,17,1,0,0,1,0,0,26,0,14,29,48,24,22.571429,3,15,25,37,14,28.666667,12,13.9,-0.3,0.79,1,93,47,1,48,26,0,1
1,8,94,12,65,10,4,22,4,173,199,413,5,7,4,2,10,5,27,4,23,30,55,27,18.800000,4,19,26,46,13,27.000000,13,14.9,5.4,6.51,6,29,37,25,53,35,7,2
2,3,166,10,155,1,1,16,1,309,30,31,1,0,0,1,1,0,31,0,14,29,48,24,19.333333,3,15,25,37,12,25.750000,12,11.1,0.6,2.02,1,8,34,4,43,38,2,1
3,3,6,13,121,1,1,16,1,6,23,70,1,0,0,1,0,1,26,0,14,30,48,25,19.117143,3,15,27,40,15,25.600000,12,11.1,0.6,2.02,7,37,40,27,43,65,0,1
4,4,51,18,21,5,17,22,118,168,106,44,7,0,3,8,8,7,49,8,33,41,61,41,29.500000,7,28,32,50,27,30.909091,14,10.9,8.4,12.51,16,22,25,25,24,23,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,3,89,13,102,1,1,16,1,6,23,17,1,0,0,1,1,0,27,0,14,29,48,23,18.200000,3,15,25,43,10,25.000000,12,7.6,2.6,0.32,7,77,42,38,12,24,5,2
3792,7,8,20,130,5,1,25,6,10,29,44,5,5,2,1,3,3,30,8,22,34,55,27,24.471429,3,15,35,44,24,30.462500,15,12.4,8.5,3.79,13,75,18,35,63,78,6,2
3793,5,168,14,59,1,15,23,1,174,28,42,7,3,4,1,2,7,30,8,17,32,57,25,25.333333,8,24,33,49,11,32.000000,20,9.9,1.4,3.51,12,56,13,34,12,79,6,0
3794,4,89,19,122,2,9,17,189,256,105,95,9,2,2,2,5,10,33,9,14,32,55,31,28.842857,9,23,30,43,16,34.055556,18,21.2,4.3,8.08,10,55,36,21,48,76,6,1


###    Load Data

In [None]:
import pandas as pd


class DFCollection:
    """
    Contains all the data used.

    Every parquet file is loaded when an instance is created, so
    construction performs file I/O and fails if a file is missing.
    ``save_all`` writes the loaded frames back interactively and
    ``categorise_data`` splits a frame by column type.
    """

    def __init__(self):
        self.c_sel = ColumnSelectors()

        self.file_path = "/kaggle/input/students-drop-out-prediction/"
        if KAGGLE_ENV == 0:
            self.file_path = "../data/"

        self.data = self._read("train.parquet")
        self.prediction_data = self._read("test.parquet")
        self.data_logits = self._read("data_with_ridit.hdfs")
        self.final_data = self._read("final_data.parquet")
        self.final_pred_data = self._read("final_pred_data.parquet")
        self.baseline_prediction_data = self._read("baseline.parquet")
        self.master = pd.concat(
            [self.final_data, self.baseline_prediction_data], axis=0, ignore_index=True
        )
        self.core_frames = [
            self.data,
            self.prediction_data,
            self.data_logits,
            self.final_data,
            self.final_pred_data,
            self.baseline_prediction_data,
        ]
        # Same order as ``core_frames``; ``save_all`` zips the two together.
        save_paths = [
            "train.parquet",
            "test.parquet",
            "data_with_ridit.hdfs",
            "final_data.parquet",
            "final_pred_data.parquet",
            "baseline.parquet",
        ]
        self.save_paths = [self.file_path + x for x in save_paths]
        # Names come from the bare file name: splitting the full path on "."
        # yields "" for every entry when the path starts with "../".
        self.core_names = [self._core_name(x) for x in self.save_paths]
        # In-place renames, so the objects held in ``core_frames`` see them too.
        self.final_data.rename(columns={"label": "target"}, inplace=True)
        self.data.rename(columns={"label": "target"}, inplace=True)

        # Distinct values observed per column in ``master``.
        self.nominal_categories = {
            col: self.master.loc[:, col].unique() for col in self.c_sel.nominal_cols
        }
        self.ordinal_categories = {
            col: self.master.loc[:, col].unique() for col in self.c_sel.ordinal_cols
        }

    def _read(self, file_name: str) -> pd.DataFrame:
        """Read ``file_name`` from ``self.file_path`` as a parquet file."""
        return pd.read_parquet(self.file_path + file_name, engine="fastparquet")

    @staticmethod
    def _core_name(path: str) -> str:
        """Return the file name in *path* up to its first dot, without directories."""
        return path.rsplit("/", 1)[-1].split(".")[0]

    @staticmethod
    def __save__(df: pd.DataFrame, loc: str):
        """
        Write *df* to *loc* as a brotli-compressed parquet file.

        Returns "Saved Successfully" on success and "Save Failed" if writing
        raised an error.  ``KeyboardInterrupt`` is not swallowed, so the
        caller can still abort.
        """
        try:
            df.to_parquet(loc, engine="fastparquet", compression="brotli")
        except Exception:
            return "Save Failed"
        return "Saved Successfully"

    def save_all(self):
        """
        Before Saving all objects ask question for each of them.
        And for each question if the answer is yes proceed to save otherwise continue.

        Accepted answers: "Yes"/"y" saves the frame, "No"/"n" skips it and
        "Exit"/"e" stops asking.  Any other input repeats the question.
        Ctrl-C at any point prints the exit message and returns.
        """
        exit_msg = "Exiting!"
        try:
            for df_name, df, df_loc in zip(
                self.core_names, self.core_frames, self.save_paths
            ):
                base_question = f"Do you want to save {df_name}?(Yes/No/Exit)"
                skip_msg = f"Skipping {df_name}"
                while True:
                    answer = input(base_question)
                    if answer in ["Yes", "y"]:
                        msg = self.__save__(df, df_loc)
                        print(f"{df_name}: {msg}")
                        break
                    elif answer in ["No", "n"]:
                        print(skip_msg)
                        break
                    elif answer in ["Exit", "e"]:
                        print(exit_msg)
                        return
                    else:
                        print("Not Valid Input")
        except KeyboardInterrupt:
            print(exit_msg)
            return

    def categorise_data(self, df: pd.DataFrame = None):
        """
        Split a frame into its ordinal, nominal, binary and ratio columns.

        Args:
            df: Frame to split.  Anything that is not a ``DataFrame``
                (including the default ``None``) selects ``self.final_data``.

        Returns:
            A tuple ``(ordinal_data, nominal_data, binary_data, ratio_data)``
            of column subsets taken with the lists held by ``self.c_sel``.
        """
        if not isinstance(df, pd.DataFrame):
            df = self.final_data
        ordinal_data = df.loc[:, self.c_sel.ordinal_cols]
        nominal_data = df.loc[:, self.c_sel.nominal_cols]
        binary_data = df.loc[:, self.c_sel.binary_cols]
        ratio_data = df.loc[:, self.c_sel.ratio_cols]
        return ordinal_data, nominal_data, binary_data, ratio_data


# if __name__ == "__main__":
#     db = DFCollection()
