In [None]:
from IPython.display import clear_output
!pip install mljar-supervised --user
clear_output()

In [None]:
import numpy as np 
import pandas as pd
import gc
from supervised import AutoML

In [None]:
# Name of the label column; used to select the labels inside predict().
TARGET = 'target'
# Output path label; not referenced by any cell visible in this notebook.
SAVE_PATH = 'agModels-predictClass' 
# Default seed picked up by seedBasic() when it is called without arguments.
DEFAULT_RANDOM_SEED = 2021

import os 
import random
def seedBasic(seed=DEFAULT_RANDOM_SEED):
    """Seed the stdlib and NumPy global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Apply the default seed once for the whole notebook.
seedBasic()

In [None]:
# Read the competition train and test tables from the relative ../input directory.
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")
# Run a garbage-collection pass once both tables are loaded.
gc.collect()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Row counts for target == 0 and target == 1, displayed as a (zeros, ones) tuple.
len(train[train["target"] == 0]) ,len(train[train["target"] == 1])

In [None]:
# Distinct values present in the target column.
train["target"].unique()

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are converted to float16 or float32 when their min and max lie
    within that type's finite range, and to float64 otherwise. Columns whose
    dtype is not one of the signed-integer or float dtypes listed in
    ``numerics`` are left untouched.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to shrink.
        verbose: When true, print the final memory footprint in MiB and the
            percentage saved.

    Returns:
        The same ``df`` object, with its numeric columns downcast.

    Note:
        Only the value *range* is checked. Casting to float16/float32 rounds
        values that need more precision than those types provide.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # If min/max are NaN (no values), no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type can hold
            # the range (this also covers NaN/inf extrema).
            target_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target_type = candidate
                    break
            df[col] = df[col].astype(target_type)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-byte baseline does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Shrink both tables; reduce_memory_usage modifies the frame it is given and
# returns that same frame.
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)
# Run a garbage-collection pass after the dtype conversions.
gc.collect()

In [None]:
# Label column name (same value as the TARGET assigned in an earlier cell).
TARGET = "target"
# Number of stratified folds built in predict(); also the averaging divisor there.
N_SPLITS = 4
# Seed constant for this section; note it differs from DEFAULT_RANDOM_SEED (2021).
SEED = 42
# Column labels of `train` at this point; this includes the "target" column.
FEATURES = train.columns
# Presumably the shuffle setting intended for the fold splitter.
IS_SHUFFLE = False

In [None]:
def saveMiddelPrediction(middle_pred,foldnum):
    """Write one fold's test predictions to ``submission{foldnum}.csv``.

    The sample submission file is read as a template, its ``target`` column
    is replaced by ``middle_pred`` and the result is saved without the index.

    Args:
        middle_pred: Predictions to store, one per row of the sample
            submission (assumed to be in the same row order — TODO confirm
            against the caller).
        foldnum: Fold identifier interpolated into the output filename.
    """
    submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
    # Item assignment always sets the column; attribute assignment would only
    # create a plain Python attribute if the column were ever missing, and the
    # predictions would silently not reach the CSV.
    submission["target"] = middle_pred
    submission.to_csv(f"submission{foldnum}.csv",index=False)
    

In [None]:
def predict(train_df):
    """Fit one AutoML model per stratified fold and average the test predictions.

    ``train_df`` is split into ``N_SPLITS`` stratified folds on ``TARGET``.
    For each fold an ``AutoML`` model is fitted on that fold's held-out rows;
    column 1 of its ``predict_proba`` output for the global ``test`` frame is
    saved through ``saveMiddelPrediction`` and added, weighted by
    ``1 / N_SPLITS``, to the running average.

    Args:
        train_df: Training frame containing every column of ``test`` plus
            the ``TARGET`` column.

    Returns:
        1-D NumPy array with ``test.shape[0]`` averaged predictions.
    """
    from sklearn.model_selection import StratifiedKFold

    features = test.columns
    test_pred = np.zeros(test.shape[0])
    # Honour the notebook-level split settings. scikit-learn rejects a
    # random_state when shuffle is off, so the seed is passed only when shuffling.
    skf = StratifiedKFold(
        n_splits=N_SPLITS,
        shuffle=IS_SHUFFLE,
        random_state=SEED if IS_SHUFFLE else None,
    )

    # NOTE(review): each model is fitted on the fold's held-out indices only
    # (about 1/N_SPLITS of the rows); the complementary training indices are
    # not used. Behaviour is deliberately left unchanged here — confirm that
    # this is intentional.
    for fold, (_, fold_idx) in enumerate(skf.split(X=train_df, y=train_df[TARGET])):
        print(f"===== fold {fold} =====")
        fold_rows = train_df.iloc[fold_idx]
        X_fold, y_fold = fold_rows[features], fold_rows[TARGET]

        automl = AutoML()
        automl.fit(X_fold, y_fold)
        middle_pred = automl.predict_proba(test)[:, 1]
        saveMiddelPrediction(middle_pred, fold)
        test_pred += middle_pred / N_SPLITS
        gc.collect()

    return test_pred

In [None]:
# Run the per-fold AutoML fitting and keep the fold-averaged test predictions.
test_pred = predict(train)

In [None]:
# Build the final submission from the sample file and the averaged predictions.
submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
# Item assignment always sets the column; attribute assignment would only set
# a plain Python attribute if the column were ever missing from the template.
submission["target"] = test_pred
submission.to_csv("submission.csv",index=False)
submission.head()