## Import libraries and data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
# List the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)
# Any results you write to the current directory are saved as output.

import gc

In [None]:
# Load the competition files: the labelled training rows, the unlabelled test
# rows, and the sample submission (indexed by ``id``).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/cat-in-the-dat-ii/train.csv",
        "../input/cat-in-the-dat-ii/test.csv",
    )
)
submission = pd.read_csv(
    "../input/cat-in-the-dat-ii/sample_submission.csv",
    index_col='id',
)

## Data profile

In [None]:
import pandas_profiling as pp
# Build the profiling report on a random 1% sample of the training rows.
# NOTE(review): no random_state is passed, so the sample differs between runs.
sample_profile=train.sample(frac=0.01)
pp.ProfileReport(sample_profile)

## Interaction testing

In [None]:
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp
# Candidate feature names: every training column except the row id and label.
all_cols = list(train.columns)
for non_feature in ('id', 'target'):
    all_cols.remove(non_feature)

# Fits the model with the interaction term
# This will also automatically include the main effects for each factor
#for col1 in all_cols:
#    for col2 in all_cols:
#        formula=f"target~C("+col1+f")*C("+col2+f")"
#        model = ols(formula, train).fit()
#        print(col1 + f"*" + col2 + f" p = {model.f_pvalue: .4f}")

## Target encode everything to begin with:

In [None]:
#Source: https://www.kaggle.com/caesarlupum/2020-20-lines-target-encoding
#train.sort_index(inplace=True)
# Separate the label and the test ids from the raw feature columns.
train_y, test_id = train['target'], test['id']
tt = train.drop(columns=['target', 'id'])
te = test.drop(columns='id')

In [None]:
#from sklearn.metrics import roc_auc_score
# Every raw feature column is target encoded.
cat_feat_to_encode = tt.columns.tolist()

import category_encoders as ce
from sklearn.model_selection import StratifiedKFold

# Out-of-fold target encoding: each fold's rows are encoded by an encoder that
# was fitted on the other four folds only, so no row sees its own label.
oof_parts = []
oof_splitter = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
for tr_idx, oof_idx in oof_splitter.split(tt, train_y):
    ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=0.2)
    ce_target_encoder.fit(tt.iloc[tr_idx, :], train_y.iloc[tr_idx])
    oof_parts.append(ce_target_encoder.transform(tt.iloc[oof_idx, :]))

# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the growing frame on every fold. One concat at the end gives the
# same result; the original row labels are kept so the frame can be re-sorted.
oof = pd.concat(oof_parts)

In [None]:
# Test rows are encoded by an encoder fitted on the complete training set;
# training rows take their out-of-fold encodings, put back into row order.
ce_target_encoder = ce.TargetEncoder(smoothing=0.2, cols=cat_feat_to_encode)
ce_target_encoder.fit(tt, train_y)
te = ce_target_encoder.transform(te)
tt = oof.sort_index()

In [None]:
# Tag the target-encoded columns with a '_tt' suffix in both frames.
tt, te = (encoded.add_suffix('_tt') for encoded in (tt, te))

## Merge train and test

In [None]:
# Stack the training rows (without the label) on top of the test rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index is left at its default, so the row labels are kept
# exactly as before.
trest = pd.concat([train.drop('target', axis=1), test])
trest.info()

In [None]:
# Share of non-missing cells in each row. Despite the column name this is a
# completeness ratio (1.0 = nothing missing), not a count of missing values.
# DataFrame.count(axis=1) computes the same numbers as the earlier row-wise
# apply without calling a Python lambda once per row.
trest['missing_count'] = trest.count(axis=1) / trest.shape[1]

In [None]:
# Working copy of the combined train+test frame without the id column; the
# encoding steps that follow are applied to this frame.
trest1 = trest.drop(columns='id')
trest1.head(n=10)

## Ordinals into numerics

In [None]:
# Of the ordinal features only ord_0 already holds numeric values. The
# string-valued ordinals get an explicit ordered categorical dtype so that
# their integer codes follow the natural order of the levels.

ord_cols = ['ord_0', 'ord_1', 'ord_2', 'ord_3']
from pandas.api.types import CategoricalDtype

# Level order of each ordinal feature, lowest first.
ord_1 = CategoricalDtype(
    ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    ordered=True,
)
ord_2 = CategoricalDtype(
    ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    ordered=True,
)
ord_3 = CategoricalDtype(
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'],
    ordered=True,
)
ord_4 = CategoricalDtype(
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'],
    ordered=True,
)

In [None]:
# Turn the string ordinals into integer codes through their ordered dtypes.
# Train and test rows share this frame, so a single pass covers both.
# Values that are missing (or not among the declared levels) get code -1.
ordinal_dtypes = (
    ('ord_1', ord_1),
    ('ord_2', ord_2),
    ('ord_3', ord_3),
    ('ord_4', ord_4),
)
for ord_name, ord_dtype in ordinal_dtypes:
    trest1[ord_name] = trest1[ord_name].astype(ord_dtype).cat.codes

## Handmade imputation from a great R notebook: https://www.kaggle.com/ccccat/let-s-overfit-some

In [None]:
#https://www.kaggle.com/ccccat/let-s-overfit-some
# ord_0 is still the raw numeric column at this point, so its gaps are NaN and
# not the -1 category code. replace(-1, 2.01) alone never matched anything and
# the gaps fell through to the blanket fillna(-1) below; fill them explicitly.
trest1['ord_0'] = trest1['ord_0'].replace(-1, 2.01).fillna(2.01)
# ord_1 and ord_2 hold category codes, where -1 marks a missing value.
trest1['ord_1'] = trest1['ord_1'].replace(-1, 1.86)
trest1['ord_2'] = trest1['ord_2'].replace(-1, 2.37)

# ord_5 gets its own placeholder level; whatever is still missing becomes -1.
trest1['ord_5'] = trest1['ord_5'].fillna('Zx')
trest1 = trest1.fillna(-1)
trest1.head()

## Frequency encode ord_5 after imputing NAs

In [None]:
# https://www.kaggle.com/lucamassaron/categorical-feature-encoding-with-tensorflow
# Encoding frequencies for some columns

#def frequency_encoding(column, df, df_test=None):
#    frequencies = df[column].value_counts().reset_index()
#    df_values = df[[column]].merge(frequencies, how='left',left_on=column, right_on='index').iloc[:,-1].values
#    if df_test is not None:
#        df_test_values = df_test[[column]].merge(frequencies, how='left', left_on=column, right_on='index').fillna(1).iloc[:,-1].values
#    else:
#        df_test_values = None
#    return df_values, df_test_values

#freq_encoded = list()

#for column in ['ord_5']:
#    train_values, test_values = frequency_encoding(column, train1, test1)
#    train1[column+'_counts'] = train_values/ len(train1)
#    test1[column+'_counts'] = test_values/ len(test1)
#    freq_encoded.append(column+'_counts')

## Make ord5 numerical

In [None]:
# Lookup table: every distinct ord_5 string and its rank in sorted order.
s = sorted(trest1['ord_5'].map(str).unique().tolist())
n = range(len(s))
sn = pd.DataFrame({'ord_5': s, 'ord_5.1': n})

# Attach the rank as 'ord_5.1' and drop the raw string column. The merge is on
# a column, so pandas ignores the existing row index and the result comes back
# with a fresh 0..n-1 index.
trest1['ord_5'] = trest1['ord_5'].map(str)
trest1 = pd.merge(trest1, sn, on='ord_5', how='left').drop(['ord_5'], axis=1)

trest1.info()

## Make dummy variables

In [None]:
# One-hot encode the binary columns, the low-cardinality nominal columns,
# day/month and the first three ordinal columns.
dummy_cols = [
    'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
    'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
    'day', 'month',
    'ord_0', 'ord_1', 'ord_2',
]
dft = pd.get_dummies(trest1, columns=dummy_cols)

In [None]:
# Preview the first 20 rows of the one-hot encoded frame.
dft.head(20)

## Hash encoding of the high-cardinality nominal features - don't do it (tried, much regret)

In [None]:
# Nominal columns treated as high-cardinality; their raw versions are dropped
# from the model frame further down.
high_card_feats = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

In [None]:
from sklearn.feature_extraction import FeatureHasher
# Hasher producing 12 output features from string input.
# NOTE: the only code that uses it is the commented-out experiment below.
fh = FeatureHasher(n_features=12, input_type='string')
#dft1[high_card_feats] = dft1[high_card_feats].apply(lambda x: x.astype(str))
#dfte1[high_card_feats] = dfte1[high_card_feats].apply(lambda x: x.astype(str))
#tt1=pd.DataFrame([])
#te1=pd.DataFrame([])

#for col in high_card_feats:
#    hashed_features = fh.fit_transform(dft1[col])
#    hashed_features = hashed_features.toarray()
#    hashed_features = pd.DataFrame(hashed_features)
#    hashed_features = hashed_features.add_prefix(col)
#    tt1=pd.concat([tt1,hashed_features],axis=1)
    
#    hashed_features_test = fh.fit_transform(dfte1[col])
#    hashed_features_test = hashed_features_test.toarray()
#    hashed_features_test = pd.DataFrame(hashed_features_test)
#    hashed_features_test = hashed_features_test.add_prefix(col)
#    te1=pd.concat([te1,hashed_features_test],axis=1)

## Merge dft and target encoded variables

In [None]:
# Stack the target-encoded train and test rows (train first), number them
# 0..n-1, and attach them column-wise to the one-hot encoded frame.
tte = pd.concat([tt, te], axis=0, ignore_index=True)
dft4 = pd.concat([dft, tte], axis=1)
dft4.head()

## Interactions NAH

In [None]:
#dft4['ord_02']=dft4['ord_0'].map(str)+dft4['ord_2'].map(str)
#dfte4['ord_02']=dfte4['ord_0'].map(str)+dfte4['ord_2'].map(str)
#dft4['ord_01']=dft4['ord_0'].map(str)+dft4['ord_1'].map(str)
#dfte4['ord_01']=dfte4['ord_0'].map(str)+dfte4['ord_1'].map(str)

#numericals
#dft4['ord_02']=dft4['ord_0_tt']*dft4['ord_2_tt']
#dfte4['ord_02']=dfte4['ord_0']*dfte4['ord_2_tt']
#dft4['ord_01']=dft4['ord_0_tt']*dft4['ord_1_tt']
#dfte4['ord_01']=dfte4['ord_0_tt']*dfte4['ord_1_tt']

#dft4['nomord_05']=dft4['ord_0_tt']*dft4['nom_5_tt']
#dfte4['nomord_05']=dfte4['ord_0_tt']*dfte4['nom_5_tt']
#dft4['nomord_09']=dft4['ord_0_tt']*dft4['nom_9_tt']
#dfte4['nomord_09']=dfte4['ord_0_tt']*dfte4['nom_9_tt']

In [None]:
# Leave the raw high-cardinality nominal columns out of the model frame for now.
dft4 = dft4.drop(columns=high_card_feats)

In [None]:
#dft4 = pd.get_dummies(dft4, columns=['ord_02'],prefix=['ord_02'], drop_first=True, dummy_na=True) #drop muuda false
#dfte4 = pd.get_dummies(dfte4, columns=['ord_02'],prefix=['ord_02'], drop_first=True, dummy_na=True)
#dft4 = pd.get_dummies(dft4, columns=['ord_01'],prefix=['ord_01'], drop_first=True, dummy_na=True) #drop muuda false
#dfte4 = pd.get_dummies(dfte4, columns=['ord_01'],prefix=['ord_01'], drop_first=True, dummy_na=True)

In [None]:
# Preview the first rows of the combined feature frame.
dft4.head()

In [None]:
# Split the combined frame back into its train and test parts. The training
# rows were stacked first, so they are the first len(train) rows. Taking the
# boundary from the data replaces the hard-coded 600000 / 1000000 row counts.
n_train = len(train)
X = pd.DataFrame(dft4.iloc[:n_train])
Xe = pd.DataFrame(dft4.iloc[n_train:])
# Boolean label for the models below.
y = train['target'].astype(bool)
#X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

Let's see which variables to drop:

Old one:

In [None]:
#X1=dft4.drop(['id','target',
           #  'ord_0','ord_1','ord_2',
#             'ord_3','ord_4',
#             'nom_0_nan','nom_1_nan','nom_2_nan','nom_3_nan','nom_4_nan','day','month',
           # 'ord_02_nan','ord_02_-1.02','ord_02_1.04','ord_02_2.02',
           # 'ord_0_tt','ord_02_2.0-1','ord_02_-1.03','ord_02_3.03',
            # 'ord_01_nan','ord_01_3.02','ord_01_3.0-1','ord_01_2.03','ord_01_-1.01','ord_01_1.02',
#              'ord_0_nan','ord_1_nan', 'ord_2_nan','ord_0_2.0'
            # 'ord_02_3.0-1','ord_02_2.03','ord_01_2.02','ord_01_2.0-1','ord_01_1.03','ord_01_1.0-1','ord_01_-1.00'
#           ],axis=1)
#Xe1=dfte4.drop(['id',
            #   'ord_0','ord_1','ord_2',
#               'ord_3','ord_4',
#               'nom_0_nan','nom_1_nan','nom_2_nan','nom_3_nan','nom_4_nan','day','month',
          #  'ord_02_nan','ord_02_-1.02','ord_02_1.04','ord_02_2.02',
          #  'ord_0_tt','ord_02_2.0-1','ord_02_-1.03','ord_02_3.03',
           #  'ord_01_nan','ord_01_3.02','ord_01_3.0-1','ord_01_2.03','ord_01_-1.01','ord_01_1.02',
#              'ord_0_nan','ord_1_nan', 'ord_2_nan','ord_0_2.0'
           #  'ord_02_3.0-1','ord_02_2.03','ord_01_2.02','ord_01_2.0-1','ord_01_1.03','ord_01_1.0-1','ord_01_-1.00'
#             ],axis=1)

After much OH:

In [None]:
# Columns excluded from the model inputs. One shared list is applied to both
# frames: previously 'ord_5_tt' was dropped from the training matrix only, so
# the test matrix had one more column than the matrix the models were fitted
# on and could not be scored by them.
drop_cols = ['ord_3', 'month_3.0', 'ord_4', 'nom_1_Star', 'ord_5_tt']
X1 = X.drop(drop_cols, axis=1)
Xe1 = Xe.drop(drop_cols, axis=1)

In [None]:
import statsmodels.api as sm
# Ordinary least squares of the label on the selected features plus an
# intercept column; the printed summary is used to inspect the coefficients.
X2 = sm.add_constant(X1)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

## Calculate multicollinearity between variables (be patient):

In [None]:
#Code for VIF Calculation
#https://statinfer.com/204-1-9-issue-of-multicollinearity-in-python/
#Writing a function to calculate the VIF values
import statsmodels.formula.api as sm1

def vif_cal(input_data):
    """Print and return the variance inflation factor (VIF) of every column.

    Each column is regressed, with an intercept, on all remaining columns by
    ordinary least squares. Its VIF is ``1 / (1 - R^2)`` of that fit, rounded
    to two decimals.

    The regression is solved directly with numpy instead of a ``"y~x"``
    formula, which depended on the formula engine finding the local variables
    ``y`` and ``x`` by name.

    Args:
        input_data: DataFrame whose columns are all numeric (or boolean).

    Returns:
        dict mapping each column name to its rounded VIF. A column that the
        other columns reproduce exactly gets ``inf``; a column with no
        variance (or a frame without rows) gets ``nan``.
    """
    xvar_names = input_data.columns
    values = input_data.to_numpy(dtype=float)
    intercept = np.ones((values.shape[0], 1))
    vifs = {}
    for i, name in enumerate(xvar_names):
        target = values[:, i]
        # Design matrix: intercept plus every column except the current one.
        design = np.hstack([intercept, np.delete(values, i, axis=1)])
        coefs = np.linalg.lstsq(design, target, rcond=None)[0]
        residual = target - design @ coefs
        centred = target - target.mean()
        total_ss = float(centred @ centred)
        if not total_ss > 0:
            # R^2 is undefined when the column has no variance.
            vif = float('nan')
        else:
            rsq = 1.0 - float(residual @ residual) / total_ss
            # Guard the division when the fit is exact.
            vif = round(1.0 / (1.0 - rsq), 2) if rsq < 1.0 else float('inf')
        print (name, " VIF = " , vif)
        vifs[name] = vif
    return vifs

In [None]:
#Calculating VIF values using that function
#Multicollinearity not an issue for dummy encoded factors

#vif_cal(input_data=X)

## Logistic regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_regression


In [None]:
    # Hold out 20% of the training rows, fit a class-balanced logistic
    # regression on the rest and report the hold-out ROC AUC.
    X_train, X_test, y_train, y_test = train_test_split(
        X1, y, test_size=0.2, random_state=42
    )
    lr = LogisticRegression(class_weight="balanced", C=1)
    lr.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    preds_val = lr.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds_val)
    print("score: %f" % (score))

0.789246

In [None]:
#pd.DataFrame({'id': test_id, 'target': lr.predict_proba(Xe1)[:,-1]}).to_csv('submission.csv', index=False) #logreg
#pd.DataFrame({'id': test_id, 'target': glm.predict_proba(Xe1)[:,-1]}).to_csv('submission.csv', index=False) #GLM

## XGboost 

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
# Same 80/20 hold-out split as for the logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y, test_size=0.2, random_state=42
)

# Unfitted gradient-boosted classifier; fitting happens in a later cell.
xgb_params = dict(
    objective='binary:logistic',
    learning_rate=0.3,
    max_depth=3,
    n_estimators=100,
    scale_pos_weight=2,
    random_state=2020,
    subsample=0.8,
    colsample_bytree=0.3,
)
model = xgb.XGBClassifier(**xgb_params)

In [None]:
#model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True,)

In [None]:
#preds_val = model.predict_proba(X_test)[:,1]
#score = roc_auc_score(y_test ,preds_val)
#print("score: %f" % (score))

0.7865 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=2020,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=2, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [None]:
#pd.DataFrame({'id': test_id, 'target': model.predict_proba(Xe1)[:,-1]}).to_csv('submission.csv', index=False) #xgb

## CatBoost with 3-fold cross-validation

In [None]:
#!pip install --upgrade scikit-learn
import category_encoders as ce
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from sklearn.metrics import auc
#from sklearn.metrics import plot_roc_curve
import datetime
from time import time
from catboost import CatBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import StackingClassifier

In [None]:
def make_classifier():
    """Return a new, unfitted CatBoost classifier with this notebook's fixed settings."""
    return CatBoostClassifier(
        task_type="CPU",
        loss_function='CrossEntropy',
        eval_metric="AUC",
        n_estimators=500,
        learning_rate=0.05,
        early_stopping_rounds=10,
        random_seed=2019,
        silent=True,
    )

In [None]:
#X_train,X_test,y_train,y_test=train_test_split(X1,y,random_state=42,test_size=0.2)
y1 = train['target']
scoring = "roc_auc"

folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# Only column 0 of these buffers is filled by the loop below.
fold_preds = np.zeros([Xe1.shape[0], 3])
oof_preds = np.zeros([X1.shape[0], 3])
results = {}  # fold number -> validation ROC AUC of that fold

# Fit Folds
for i, (trn_idx, val_idx) in enumerate(folds.split(X1, y1)):
    print(f"Fold {i} stacking....")
    clf = make_classifier()
    # split() yields positional indices, so rows are selected with iloc;
    # loc only gave the same rows while the index happened to be 0..n-1.
    # NOTE(review): make_classifier sets early_stopping_rounds, but no
    # eval_set is passed to fit here - confirm whether early stopping is
    # expected to apply.
    clf.fit(X1.iloc[trn_idx, :], y1.iloc[trn_idx])
    tmp_pred = clf.predict_proba(X1.iloc[val_idx, :])[:, 1]

    oof_preds[val_idx, 0] = tmp_pred
    # Test predictions are averaged over the folds.
    fold_preds[:, 0] += clf.predict_proba(Xe1)[:, 1] / folds.n_splits

    results[i] = metrics.roc_auc_score(y1.iloc[val_idx], tmp_pred)
    print(f"Fold {i} AUC: {results[i]:.5f}")

# Score the out-of-fold predictions of all rows together. Previously the dict
# was overwritten inside the loop, so only the last fold's AUC was reported.
estimator_performance = {
    'stack_score': metrics.roc_auc_score(y1, oof_preds[:, 0]),
}
print(estimator_performance)

0.78628

In [None]:
# Write the fold-averaged CatBoost test predictions as the submission file.
submission = submission.assign(target=fold_preds[:, 0])
submission.to_csv('submission.csv')
submission.head()

## Blend logreg and CB

In [None]:
# Equal-weight blend of the logistic-regression and CatBoost test predictions;
# this overwrites the submission file written earlier.
lr_test_pred = lr.predict_proba(Xe1)[:, -1]
cb_test_pred = fold_preds[:, 0]
blend = 0.5 * lr_test_pred + 0.5 * cb_test_pred
submission['target'] = blend
submission.to_csv('submission.csv')
submission.head()