In [None]:
import os
import gc

import numpy as np 
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import category_encoders as ce

In [None]:
class DataLoader():
    """Container for the data frames and column lists shared by the pipeline.

    Every data slot starts as ``None`` and is expected to be filled in by the
    caller; this class itself performs no loading.
    """

    def __init__(self):
        # Full frames.
        self.train = None
        self.dev = None
        self.test = None
        # Feature matrices and targets for the train / dev / test splits.
        self.X_train = None
        self.X_dev = None
        self.X_test = None
        self.y_train = None
        self.y_dev = None
        # Model outputs.
        self.prediction_dev = None
        self.prediction_test = None

        # Submission frame placeholder.
        self.sub = None
        # Columns used as model inputs.
        self.feature_columns = [
            'timestamp',
            'user_id',
            'content_id',
            'content_type_id',
            'task_container_id',
            'prior_question_elapsed_time',
            'prior_question_had_explanation',
        ]
        # Column holding the prediction target.
        self.label_column = 'answered_correctly'
        # Feature columns treated as categorical.
        self.cat_features = [
            'user_id',
            'content_id',
            'content_type_id',
            'task_container_id',
            'prior_question_had_explanation',
        ]
        # Feature columns treated as numeric.
        self.numeric_columns = ['timestamp', 'prior_question_elapsed_time']

    @property
    def numeric_features(self):
        """Alias of ``numeric_columns``, named to match ``cat_features``.

        The scaler helpers in this file read ``numeric_features``; exposing it
        as a property keeps a single underlying list so the two names can
        never drift apart.
        """
        return self.numeric_columns

    @numeric_features.setter
    def numeric_features(self, value):
        self.numeric_columns = value

In [None]:
class PreProcess():
    """Preprocessing steps that operate in place on a data-holder object.

    The ``dd`` argument of each method is an object exposing ``train`` /
    ``test`` data frames plus the column lists ``feature_columns``,
    ``cat_features``, ``label_column`` and the numeric column list
    (``numeric_features`` or ``numeric_columns``).
    """

    def __init__(self, test_size=0.2):
        """Create the preprocessor.

        Args:
            test_size: Fraction of the training data held out as the dev
                split by ``custom_train_test_split``.
        """
        # Fitted MinMaxScaler, created lazily by the scaling methods.
        self.min_max_scaler = None
        self.test_size = test_size

    @staticmethod
    def _numeric_cols(dd):
        """Return the numeric column list of ``dd``.

        Prefers ``dd.numeric_features`` and falls back to
        ``dd.numeric_columns`` so that holders defining either name work.
        """
        cols = getattr(dd, 'numeric_features', None)
        if cols is None:
            cols = dd.numeric_columns
        return cols

    # Static so it can be called on the class or on an instance; the original
    # signature had no ``self`` and therefore failed on instances.
    @staticmethod
    def set_missing_values_column(df):
        """Add a ``nb_missing_values`` column counting missing cells per row.

        Args:
            df: Data frame, modified in place.

        Returns:
            The same data frame with the extra column.
        """
        df['nb_missing_values'] = df.shape[1] - df.count(axis=1)
        return df

    def preprocess_MinMaxScaler(self, dd, which):
        """Min-max scale the numeric columns of one frame of ``dd`` in place.

        The scaler is fitted on the selected frame the first time this method
        runs and is reused afterwards, so call it with ``'train'`` before
        ``'test'`` to avoid fitting on test data.

        Args:
            dd: Data holder.
            which: ``'train'`` or ``'test'``.

        Returns:
            ``dd``.

        Raises:
            ValueError: If ``which`` is neither ``'train'`` nor ``'test'``.
        """
        # Select only the requested frame so that scaling the test set still
        # works after ``dd.train`` has been released.
        if which == 'train':
            frame = dd.train
        elif which == 'test':
            frame = dd.test
        else:
            raise ValueError(
                "which must be 'train' or 'test', got {!r}".format(which))

        cols = self._numeric_cols(dd)
        if self.min_max_scaler is None:
            scaler = MinMaxScaler()
            scaler.fit(frame[cols])
            # Store only after a successful fit.
            self.min_max_scaler = scaler
        frame[cols] = self.min_max_scaler.transform(frame[cols])
        return dd

    def preprocess_MinMaxScaler_all(self, dd):
        """Fit a MinMaxScaler on train and scale train and test in place.

        The fitted scaler is kept in ``self.min_max_scaler``.

        Returns:
            ``dd``.
        """
        cols = self._numeric_cols(dd)
        scaler = MinMaxScaler()
        scaler.fit(dd.train[cols])
        dd.train[cols] = scaler.transform(dd.train[cols])
        dd.test[cols] = scaler.transform(dd.test[cols])
        self.min_max_scaler = scaler
        return dd

    def preprocess_OrdinalEncoder_all(self, dd):
        """Ordinal-encode the categorical columns of train and test in place.

        The encoder is fitted on train only.

        Returns:
            ``dd``.
        """
        # NOTE(review): with default settings the encoder raises on categories
        # not seen during fit -- confirm test contains no unseen values.
        encoder = OrdinalEncoder()
        encoder.fit(dd.train[dd.cat_features])
        dd.train[dd.cat_features] = encoder.transform(dd.train[dd.cat_features])
        dd.test[dd.cat_features] = encoder.transform(dd.test[dd.cat_features])
        return dd

    def preprocess_target_encoder(self, dd):
        """Target-encode the categorical columns of train and test in place.

        The encoder is fitted on the train features against the label column.
        ``gc.collect()`` is called between steps to release intermediates.

        Returns:
            ``dd``.
        """
        encoder = ce.TargetEncoder(cols=dd.cat_features)
        gc.collect()
        encoder.fit(dd.train[dd.feature_columns], dd.train[dd.label_column])
        gc.collect()
        dd.train[dd.feature_columns] = encoder.transform(
            dd.train[dd.feature_columns], dd.train[dd.label_column])
        gc.collect()
        dd.test[dd.feature_columns] = encoder.transform(
            dd.test[dd.feature_columns])
        gc.collect()
        return dd

    def custom_train_test_split(self, dd):
        """Split ``dd.train`` into train/dev parts and release ``dd.train``.

        Fills ``dd.X_train``, ``dd.X_dev``, ``dd.y_train`` and ``dd.y_dev``
        using ``self.test_size`` and a fixed random seed, then sets
        ``dd.train`` to ``None``.

        Returns:
            ``dd``.
        """
        # Imported here because the file's import cell does not provide it.
        from sklearn.model_selection import train_test_split

        dd.X_train, dd.X_dev, dd.y_train, dd.y_dev = train_test_split(
            dd.train[dd.feature_columns],
            dd.train[dd.label_column],
            test_size=self.test_size,
            random_state=42,
        )
        # Drop the full frame now that the splits hold the data.
        dd.train = None
        gc.collect()
        return dd