#### 1. [Reduce memory usage](#section-one)
#### 2. [CatBoostClassifier](#section-two)
#### 3. [LGBM](#section-three)
#### 4. [XGBClassifier](#section-four)
#### 5. [LOGISTIC REGRESSION](#section-five)
#### 6. [ENSEMBLING](#section-six)

In [None]:
import pandas as pd
import datatable as dt
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import gc

In [None]:
# Read the competition train/test CSV files with datatable and convert each to a pandas DataFrame.
train_csv_path = '../input/tabular-playground-series-oct-2021/train.csv'
test_csv_path = '../input/tabular-playground-series-oct-2021/test.csv'
data_train = dt.fread(train_csv_path).to_pandas()
data_test = dt.fread(test_csv_path).to_pandas()

### Reduce memory usage
<a id="section-one"></a>

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    limits (inclusive) contain the column's min and max. Float columns are
    cast to float16, then float32, and otherwise float64, based on the
    column's min and max only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with columns replaced by their downcast versions.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: the choice looks at range only, not precision; float16 keeps
            # roughly three significant decimal digits, so this cast is lossy.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Reached when no smaller float type matched (e.g. NaN/inf min or max).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero starting footprint cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# Downcast both frames (train first, then test); the helper prints the new footprint of each.
data_train, data_test = (
    reduce_memory_usage(frame, verbose=True) for frame in (data_train, data_test)
)

In [None]:
def _bool_columns_to_int(frame):
    """Cast every bool-dtype column of ``frame`` to 0/1 integers, in place.

    Columns are picked by dtype rather than by testing ``True in values``:
    that membership test also matches any numeric column containing 1
    (because ``1 == True``), which forced a scan and a ``replace`` on
    columns that were already numeric.

    Only columns whose dtype is bool are converted; object-dtype columns
    holding True/False are left untouched.

    Returns the same ``frame`` object.
    """
    bool_columns = [name for name in frame.columns if frame[name].dtype == bool]
    for name in bool_columns:
        frame[name] = frame[name].astype(np.int64)
    return frame


data_train = _bool_columns_to_int(data_train)
data_test = _bool_columns_to_int(data_test)

In [None]:
# Column bookkeeping: every column except the row id and the label is a model feature.
TARGET = 'target'
FEATURES = [col for col in data_train.columns if col not in ['id', TARGET]]

X = data_train[FEATURES]
Y = data_train[TARGET]
# Test features must come from the test frame (previously sliced from data_train by mistake).
X_test = data_test[FEATURES]

# Row-wise aggregates over the original feature columns, added as two extra features.
data_train["mean"] = data_train[FEATURES].mean(axis=1)
data_train["std"] = data_train[FEATURES].std(axis=1)

# Keep the label separately, then strip label and id so data_train holds model inputs only.
y = data_train[TARGET]
data_train = data_train.drop([TARGET, 'id'], axis = 1)

# data_test keeps its 'id' column; the prediction cells skip it with data_test.iloc[:, 1:],
# which assumes 'id' is the first column -- TODO confirm against the CSV layout.
data_test["mean"] = data_test[FEATURES].mean(axis=1)
data_test["std"] = data_test[FEATURES].std(axis=1)


gc.collect()

In [None]:
# Standardize data: f0..f241 except f22 and f43, followed by the two row-wise aggregates.
float_columns = [f'f{idx}' for idx in range(242) if idx not in (22, 43)]
float_columns += ['mean', 'std']

scaler = StandardScaler()
for name in float_columns:
    train_column = data_train[name].to_numpy().reshape(-1, 1)
    test_column = data_test[name].to_numpy().reshape(-1, 1)
    # The scaler is re-fitted on each training column; the matching test column
    # is transformed with those same training statistics.
    data_train[name] = scaler.fit_transform(train_column)
    data_test[name] = scaler.transform(test_column)

gc.collect()

#### Next we will build several models one by one and then combine them into an ensemble.

## CatBoosting
<a id="section-two"></a>


In [None]:
# CatBoost hyper-parameters shared by every CatBoostClassifier in this notebook.
# - 'min_data_in_leaf' was listed twice (116, then 193); Python keeps the last
#   value, so 193 is the value that was actually in effect and is kept here.
# - 'objective' duplicated 'loss_function' (same value); only one is passed
#   because the two names are aliases of the same CatBoost option.
params_cat = {
    'min_data_in_leaf': 193,
    'depth': 4,
    'iterations': 284,
    'learning_rate': 0.5307391048885213,
    'l2_leaf_reg': 3.766159322596347,
    'loss_function': 'CrossEntropy',
    'eval_metric': 'AUC',
    'task_type': 'GPU',
    'verbose': 0,
}

In [None]:
# Train model using only part of available data: 5-fold stratified CV over the
# first `train_set` rows. For every seed, the test-set probabilities of the five
# fold models are averaged and published as the global `prediction_cat<seed>`.
train_set = 800000
seed = [1,2]

for i in seed:
    model_cat = CatBoostClassifier(random_seed = i, **params_cat)
    test_pred = np.zeros(data_test.shape[0])
    print(f'{i} seed')
    # shuffle=True is needed for random_state to have any effect; scikit-learn
    # raises ValueError when random_state is set while shuffle is False.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    for train_index, test_index in skf.split(data_train.iloc[:train_set, :], y.iloc[:train_set]):
        # split() yields positions inside the first `train_set` rows, so use
        # positional indexing for both the features and the labels.
        x = data_train.iloc[train_index, :]
        y_train = y.iloc[train_index]
        x_val = data_train.iloc[test_index, :]
        y_val = y.iloc[test_index]

        #fit model and make final prediction
        model_cat.fit(x, y_train, eval_set = (x_val, y_val), use_best_model=True)
        # data_test still carries its leading id column, hence iloc[:, 1:].
        test_pred += model_cat.predict_proba(data_test.iloc[:, 1:])[:, -1] / skf.get_n_splits()

        #define roc_auc for each test fold
        roc_auc = roc_auc_score(y_val.values, model_cat.predict_proba(x_val)[:, -1])
        print(f'AUC score = {roc_auc}')

    # Later cells refer to prediction_cat1 / prediction_cat2 by name.
    globals()[f'prediction_cat{i}'] = test_pred

In [None]:
# Write the fold-averaged CatBoost test predictions to disk, one CSV per seed.
for out_name, values in (
    ("prediction_cat1.csv", prediction_cat1),
    ("prediction_cat2.csv", prediction_cat2),
):
    np.savetxt(out_name, values, delimiter=",")

In [None]:
# Hold-out predictions for the ensemble: each seed's model is fitted on the first
# 800000 training rows and then scores the remaining rows, which it was not fitted on.
# The positive-class probabilities are stored as the global `cat_pred_tr<seed>`.
seed = [1,2]
holdout_start = 800000
for i in seed:
    model_cat = CatBoostClassifier(random_seed = i, **params_cat)
    model_cat.fit(data_train.iloc[:holdout_start, :], y[:holdout_start])
    holdout_proba = model_cat.predict_proba(data_train.iloc[holdout_start:, :])
    globals()[f'cat_pred_tr{i}'] = holdout_proba[:, -1]

In [None]:
# Write the CatBoost hold-out predictions to disk, one CSV per seed.
for out_name, values in (
    ("cat_pred_tr1.csv", cat_pred_tr1),
    ("cat_pred_tr2.csv", cat_pred_tr2),
):
    np.savetxt(out_name, values, delimiter=",")

### LGBM MODEL
<a id="section-three"></a>

In [None]:
# LightGBM hyper-parameters shared by every LGBMClassifier in this notebook.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=62,
    max_depth=512,
    learning_rate=0.05,
    n_estimators=5000,
    reg_alpha=29.5,
    device='gpu',
    reg_lambda=94.1,
    n_jobs=4,
    subsample=0.5,
    subsample_freq=2,
    colsample_bytree=0.41,
    min_child_samples=117,
    min_child_weight=426,
)

In [None]:
# Train model using only part of available data: 5-fold stratified CV over the
# first `train_set` rows. For every seed, the test-set probabilities of the five
# fold models are averaged and published as the global `prediction_lgb<seed>`.
train_set = 800000
seed = [1,2]
for i in seed:
    model_lgbm = lgb.LGBMClassifier(random_seed = i, **lgb_params)
    test_pred = np.zeros(data_test.shape[0])
    # shuffle=True is needed for random_state to have any effect; scikit-learn
    # raises ValueError when random_state is set while shuffle is False.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    for train_index, test_index in skf.split(data_train.iloc[:train_set, :], y.iloc[:train_set]):
        # split() yields positions inside the first `train_set` rows, so use
        # positional indexing for both the features and the labels.
        x_train = data_train.iloc[train_index, :]
        y_train = y.iloc[train_index]

        x_valid = data_train.iloc[test_index, :]
        y_valid = y.iloc[test_index]

        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments are
        # believed to be removed in newer LightGBM releases in favour of
        # callbacks -- confirm against the installed version.
        model_lgbm.fit(x_train, y_train, eval_set = [(x_valid, y_valid)],
                  eval_metric='auc',
                  early_stopping_rounds = 200,
                  verbose = 1000
                  )

        # data_test still carries its leading id column, hence iloc[:, 1:].
        test_pred += model_lgbm.predict_proba(data_test.iloc[:, 1:])[:, -1] / skf.get_n_splits()
        auc = roc_auc_score(y_valid, model_lgbm.predict_proba(x_valid)[:, -1])

        gc.collect()
        print(f'auc = {auc}')

    # Later cells refer to prediction_lgb1 / prediction_lgb2 by name.
    globals()[f'prediction_lgb{i}'] = test_pred

In [None]:
# Write the fold-averaged LightGBM test predictions to disk, one CSV per seed.
for out_name, values in (
    ("prediction_lgb1.csv", prediction_lgb1),
    ("prediction_lgb2.csv", prediction_lgb2),
):
    np.savetxt(out_name, values, delimiter=",")

In [None]:
# Hold-out predictions for the ensemble: each seed's model is fitted on rows
# [0, 700000), early-stopped against rows [700000, 800000), and then scores the
# rows from 800000 onwards, which it was not fitted on. The positive-class
# probabilities are stored as the global `lgbm_pred_tr<seed>`.
seed = [1,2]
fit_rows = slice(None, 700000)
stop_rows = slice(700000, 800000)
holdout_rows = slice(800000, None)
for i in seed:
    model_lgbm = lgb.LGBMClassifier(random_seed = i, **lgb_params)
    early_stop_set = [(data_train.iloc[stop_rows, :], y[stop_rows])]
    model_lgbm.fit(
        data_train.iloc[fit_rows, :],
        y[fit_rows],
        eval_set = early_stop_set,
        early_stopping_rounds = 100,
        verbose = 500,
    )
    holdout_proba = model_lgbm.predict_proba(data_train.iloc[holdout_rows, :])
    globals()[f'lgbm_pred_tr{i}'] = holdout_proba[:, -1]

In [None]:
# Write the LightGBM hold-out predictions to disk, one CSV per seed.
for out_name, values in (
    ("lgbm_pred_tr1.csv", lgbm_pred_tr1),
    ("lgbm_pred_tr2.csv", lgbm_pred_tr2),
):
    np.savetxt(out_name, values, delimiter=",")

### XGBClassifier
<a id="section-four"></a>

In [None]:
# XGBoost hyper-parameters shared by every XGBClassifier in this notebook.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=6,
    n_estimators=9500,
    learning_rate=0.007279718158350149,
    subsample=0.7,
    colsample_bytree=0.2,
    colsample_bylevel=0.6000000000000001,
    min_child_weight=56.41980735551558,
    reg_lambda=75.56651890088857,
    reg_alpha=0.11766857055687065,
    gamma=0.6407823221122686,
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)

In [None]:
# Train model using only part of available data: 5-fold stratified CV over the
# first `train_set` rows. For every seed, the test-set probabilities of the five
# fold models are averaged and published as the global `prediction_xgb<seed>`.
train_set = 800000
seed = [1,2]
for i in seed:
    model_xgb = xgb.XGBClassifier(random_state = i, **xgb_params)
    test_pred = np.zeros(data_test.shape[0])
    # shuffle=True is needed for random_state to have any effect; scikit-learn
    # raises ValueError when random_state is set while shuffle is False.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    for train_index, test_index in skf.split(data_train.iloc[:train_set, :], y.iloc[:train_set]):
        # split() yields positions inside the first `train_set` rows, so use
        # positional indexing for both the features and the labels.
        x_train = data_train.iloc[train_index, :]
        y_train = y.iloc[train_index]

        x_valid = data_train.iloc[test_index, :]
        y_valid = y.iloc[test_index]

        model_xgb.fit(x_train, y_train)

        # data_test still carries its leading id column, hence iloc[:, 1:].
        test_pred += model_xgb.predict_proba(data_test.iloc[:, 1:])[:, -1] / skf.get_n_splits()
        auc = roc_auc_score(y_valid, model_xgb.predict_proba(x_valid)[:, -1])

        gc.collect()
        print(f'auc = {auc}')

    # Later cells refer to prediction_xgb1 / prediction_xgb2 by name.
    globals()[f'prediction_xgb{i}'] = test_pred



In [None]:
# Write the fold-averaged XGBoost test predictions to disk, one CSV per seed.
for out_name, values in (
    ("prediction_xgb1.csv", prediction_xgb1),
    ("prediction_xgb2.csv", prediction_xgb2),
):
    np.savetxt(out_name, values, delimiter=",")

In [None]:
# Hold-out predictions for the ensemble: each seed's model is fitted on the first
# 800000 training rows and then scores the remaining rows, which it was not fitted on.
# The positive-class probabilities are stored as the global `xgb_pred_tr<seed>`.
seed = [1,2]
holdout_start = 800000
for i in seed:
    model_xgb = xgb.XGBClassifier(random_state = i, **xgb_params)
    model_xgb.fit(data_train.iloc[:holdout_start, :], y[:holdout_start])
    holdout_proba = model_xgb.predict_proba(data_train.iloc[holdout_start:, :])
    globals()[f'xgb_pred_tr{i}'] = holdout_proba[:, -1]

In [None]:
# Write the XGBoost hold-out predictions to disk, one CSV per seed.
for out_name, values in (
    ("xgb_pred_tr1.csv", xgb_pred_tr1),
    ("xgb_pred_tr2.csv", xgb_pred_tr2),
):
    np.savetxt(out_name, values, delimiter=",")

### LOGISTIC REGRESSION
<a id="section-five"></a>

In [None]:
# Train model using only part of available data: 3-fold stratified CV over the
# first `train_set` rows. For every seed, the test-set probabilities of the three
# fold models are averaged and published as the global `prediction_log<seed>`.
train_set = 800000
seed = [1,2]
# The model is fitted on plain arrays, so score plain arrays too (avoids a
# feature-name mismatch between fit and predict). data_test still carries its
# leading id column, hence iloc[:, 1:]. Converted once, reused for every fold.
test_matrix = data_test.iloc[:, 1:].values
for i in seed:
    model_log = LogisticRegression(random_state=i, solver='liblinear')
    test_pred = np.zeros(data_test.shape[0])
    # shuffle=True is needed for random_state to have any effect; scikit-learn
    # raises ValueError when random_state is set while shuffle is False.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)

    for train_index, test_index in skf.split(data_train.iloc[:train_set, :], y.iloc[:train_set]):
        # split() yields positions inside the first `train_set` rows, so use
        # positional indexing for both the features and the labels.
        x = data_train.iloc[train_index, :]
        y_train = y.iloc[train_index]

        x_valid = data_train.iloc[test_index, :]
        y_valid = y.iloc[test_index]

        model_log.fit(x.values, y_train.values)

        test_pred += model_log.predict_proba(test_matrix)[:, -1] / skf.get_n_splits()
        auc = roc_auc_score(y_valid.values, model_log.predict_proba(x_valid.values)[:, -1])

        print(f'auc = {auc}')

    # Later cells refer to prediction_log1 / prediction_log2 by name.
    globals()[f'prediction_log{i}'] = test_pred

In [None]:
# Write the fold-averaged logistic-regression test predictions to disk, one CSV per seed.
for out_name, values in (
    ("prediction_log1.csv", prediction_log1),
    ("prediction_log2.csv", prediction_log2),
):
    np.savetxt(out_name, values, delimiter=",")

In [None]:
# Hold-out predictions for the ensemble: a single logistic regression (seed 1) is
# fitted on the first 800000 training rows and then scores the remaining rows,
# which it was not fitted on. Only the positive-class probability is kept.
holdout_start = 800000
model_log = LogisticRegression(random_state=1,solver='liblinear')
model_log.fit(data_train.iloc[:holdout_start, :], y[:holdout_start])
holdout_proba = model_log.predict_proba(data_train.iloc[holdout_start:, :])
log_pred_tr1 = holdout_proba[:, -1]

In [None]:
# Write the logistic-regression hold-out predictions to disk.
np.savetxt(fname="log_pred_tr1.csv", X=log_pred_tr1, delimiter=",")

### ENSEMBLING
<a id="section-six"></a>

#### Now we have 7 predictions from 4 models. Each prediction was obtained on a held-out part of the training set that the model never saw, so these predictions should look like predictions for the real test data. We gather them in a DataFrame and use it to train a plain linear regression. Then we build the same set of 7 predictions for the test data and apply the trained linear regression to get the final result.

In [None]:
# Load previously made predictions for the test dataset.
# `genfromtxt` is called through the already-imported numpy namespace (the bare
# name was never imported). Each variable is read from the file with the matching
# seed suffix; the xgb1/xgb2 paths used to be crossed. This assumes the dataset
# folders are named after the file they contain -- TODO confirm.
prediction_xgb1 = np.genfromtxt('../input/prediction-xgb1/prediction_xgb1.csv', delimiter=',')
prediction_xgb2 = np.genfromtxt('../input/prediction-xgb2/prediction_xgb2.csv', delimiter=',')
prediction_cat1 = np.genfromtxt('../input/prediction-cat1/prediction_cat1.csv', delimiter=',')
prediction_cat2 = np.genfromtxt('../input/prediction-cat2/prediction_cat2.csv', delimiter=',')
prediction_log1 = np.genfromtxt('../input/prediction-log1/prediction_log1.csv', delimiter=',')
prediction_lgb1 = np.genfromtxt('../input/prediction-lgb1/prediction_lgb1.csv', delimiter=',')
prediction_lgb2 = np.genfromtxt('../input/prediction-lgb2/prediction_lgb2.csv', delimiter=',')

In [None]:
# Test-set predictions of every base model, one column per model/seed.
# Column names and order match the hold-out frame the meta-model is trained on.
prediction_frame = pd.DataFrame({'cat1' : prediction_cat1,
                                 'cat2' : prediction_cat2,
                                 'lgbm1' : prediction_lgb1,
                                 'lgbm2' : prediction_lgb2,
                                 'xgb1' : prediction_xgb1,
                                 'xgb2' : prediction_xgb2,
                                 'log1': prediction_log1})
# Preview the frame built in this cell (previously previewed `train_pred`,
# which is not defined until a later cell).
prediction_frame.head()

In [None]:
# Load previously made predictions for the unseen (held-out) part of the train dataset.
# `genfromtxt` is called through the already-imported numpy namespace (the bare
# name was never imported).
xgb_pred_tr1 = np.genfromtxt('../input/xgb-pred-tr1/xgb_pred_tr1.csv', delimiter=',')
xgb_pred_tr2 = np.genfromtxt('../input/xgb-pred-tr2/xgb_pred_tr2.csv', delimiter=',')
cat_pred_tr1 = np.genfromtxt('../input/cat-pred-tr1/cat_pred_tr1.csv', delimiter=',')
cat_pred_tr2 = np.genfromtxt('../input/cat-pred-tr2/cat_pred_tr2.csv', delimiter=',')
log_pred_tr1 = np.genfromtxt('../input/log-pred-tr1/log_pred_tr1.csv', delimiter=',')
lgbm_pred_tr1 = np.genfromtxt('../input/lgbm-predtr1/lgbm_pred_tr1 (2).csv', delimiter=',')
lgbm_pred_tr2 = np.genfromtxt('../input/lgbm-predtr2/lgbm_pred_tr2 (1).csv', delimiter=',')

In [None]:
# Hold-out predictions of every base model, one column per model/seed.
holdout_columns = {
    'cat1': cat_pred_tr1,
    'cat2': cat_pred_tr2,
    'lgbm1': lgbm_pred_tr1,
    'lgbm2': lgbm_pred_tr2,
    'xgb1': xgb_pred_tr1,
    'xgb2': xgb_pred_tr2,
    'log1': log_pred_tr1,
}
train_pred = pd.DataFrame(holdout_columns)
train_pred.head()

In [None]:
# Train the linear meta-model on the base models' hold-out predictions
# (3-fold CV), and average the three fold models' outputs on the test predictions.
lin_model = LinearRegression()

skf = StratifiedKFold(n_splits=3)
prediction_test = np.zeros(prediction_frame.shape[0])

# Labels of the hold-out rows only. The fold indices produced below are positions
# inside this slice, so labels must be taken from it and not from the full `y`
# (indexing the full target picked labels of the first training rows instead).
y_holdout = y.values[800000:]

for train_index, test_index in skf.split(train_pred, y_holdout):
    x_train = train_pred.iloc[train_index, :]
    y_train = y_holdout[train_index]

    x_valid = train_pred.iloc[test_index, :]
    y_valid = y_holdout[test_index]

    lin_model.fit(x_train, y_train)

    valid_pred = lin_model.predict(x_valid)
    auc = roc_auc_score(y_valid, valid_pred)
    print(auc)

    prediction_test += lin_model.predict(prediction_frame)/skf.get_n_splits()

In [None]:
# Pair each test id with its blended prediction and write the submission file.
submission_columns = {'id': data_test['id'], 'target': prediction_test}
sub = pd.DataFrame(submission_columns)
sub.to_csv("sub_forest.csv", header=True, index=False)