In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from string import ascii_uppercase
import collections
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from eli5.sklearn import PermutationImportance
import gc

In [None]:
def label_point(x, y, val, ax):
    """
    Write a text label next to each point of a scatter plot.

    x, y : pandas Series with the point coordinates.
    val  : pandas Series with the label of each point (rendered with str()).
    ax   : matplotlib Axes (anything exposing .text(x, y, s)) to draw on.

    The three Series are aligned on their index before drawing.
    """
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    # Iterate column-wise instead of with iterrows(): iterrows() builds one Series
    # per row and upcasts mixed numeric rows to float, which turned an integer
    # label such as 3 into the text "3.0".
    for px, py, label in zip(points['x'], points['y'], points['val']):
        # small horizontal offset so the text does not sit on top of the marker
        ax.text(px+.02, py, str(label))

def WOE_based_IV_category(df,target,cont_var):
    """
    Weight of Evidence (WOE) and Information Value (IV) per distinct value of a column.

    df       : DataFrame holding both columns.
    target   : name of the target column; rows with target > 0 count as events,
               rows with target < 1 count as non-events.
    cont_var : name of the column whose distinct values form the bins.

    Returns a DataFrame with one row per distinct value (in order of first
    appearance) and the columns bin, No_events, No_nonevents, event_pct,
    nonevent_pct, WOE and IV.

    Missing values of `cont_var` form their own bin and are counted; the previous
    equality filter could never match NaN, so that bin always showed zero counts.
    A bin with zero events or zero non-events yields an infinite or NaN WOE,
    because the shares are not floored before the logarithm.
    """
    # Count everything in a single grouped pass instead of filtering the whole
    # frame twice for every distinct value.
    counts=pd.DataFrame({
        "bin":df[cont_var].to_numpy(),
        "No_events":(df[target]>0).to_numpy().astype("int64"),
        "No_nonevents":(df[target]<1).to_numpy().astype("int64")
    })
    # sort=False keeps the first-appearance order that unique() used to give
    woe_df=counts.groupby("bin",sort=False,dropna=False).sum().reset_index()

    woe_df["event_pct"]=woe_df["No_events"]/woe_df["No_events"].sum()
    woe_df["nonevent_pct"]=woe_df["No_nonevents"]/woe_df["No_nonevents"].sum()
    woe_df["WOE"]=np.log(woe_df["event_pct"]/woe_df["nonevent_pct"])
    woe_df["IV"]=(woe_df["event_pct"]-woe_df["nonevent_pct"])*woe_df["WOE"]
    return woe_df
        
        
def WOE_based_IV(df,target,cont_var, limits):
    """
    Weight of Evidence (WOE) and Information Value (IV) for a binned numeric column.

    df       : DataFrame holding both columns.
    target   : name of the target column; target > 0 is an event, target < 1 a non-event.
    cont_var : name of the numeric column to bin.
    limits   : sequence of bin edges; consecutive edges form the bins (lower, upper].

    Returns a DataFrame with one row per bin and the columns bin, No_events,
    No_nonevents, event_pct, nonevent_pct, WOE and IV. Rows outside every bin
    are not counted.
    """
    labels=[]
    events=[]
    non_events=[]
    for lower, upper in zip(limits[:-1], limits[1:]):
        # lower edge is exclusive, upper edge inclusive
        inside=(lower<df[cont_var])&(df[cont_var]<=upper)
        target_in_bin=df.loc[inside, target]
        events.append(np.nansum(target_in_bin>0))
        non_events.append(np.nansum(target_in_bin<1))
        labels.append("lower: "+str(lower)+" - upper: "+str(upper))

    woe_df=pd.DataFrame({
        "bin":labels,
        "No_events":events,
        "No_nonevents":non_events
    })
    woe_df["event_pct"]=woe_df["No_events"]/sum(woe_df["No_events"])
    woe_df["nonevent_pct"]=woe_df["No_nonevents"]/sum(woe_df["No_nonevents"])
    # share of events relative to share of non-events, on a log scale
    woe_df["WOE"]=np.log(woe_df["event_pct"]/woe_df["nonevent_pct"])
    woe_df["IV"]=(woe_df["event_pct"]-woe_df["nonevent_pct"])*woe_df["WOE"]
    return woe_df
        
def fit_model_using_classifier(alg,
                               dtrain,
                               predictors,
                               target="target",
                               performCV=True,
                               printFeatureImportance=True,
                               cv_folds=3,
                               repeat=5,
                               scoring='roc_auc',
                               only_top_x_feature=60
                              ):
    """
    Cross-validate a classifier, fit it on the full training frame and print a report.

    Adapted from
    https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

    Parameters
    ----------
    alg : estimator with fit / predict / predict_proba.
    dtrain : DataFrame holding the predictors and the target.
    predictors : list of feature column names.
    target : name of the target column.
    performCV : run repeated cross-validation and print its summary.
    printFeatureImportance : plot `alg.feature_importances_` when the fitted model has it.
    cv_folds : number of folds, or any cv object/iterable accepted by cross_val_score.
    repeat : number of cross-validation repeats.
    scoring : scoring name passed to cross_val_score.
    only_top_x_feature : number of features shown in the importance bar chart.

    Returns
    -------
    (alg, feat_imp) : the fitted estimator and a Series of feature importances
    sorted in descending order, or an empty list when no importances are plotted.
    """
    # Function-scope import: the notebook's import cell only brings in cross_val_score.
    from sklearn.model_selection import StratifiedKFold

    X = dtrain[predictors]
    y = dtrain[target]

    # Perform cross-validation:
    cv_score=[]
    if performCV:
        for rep in range(repeat):
            if isinstance(cv_folds,(int,np.integer)):
                # With a plain integer, cross_val_score builds unshuffled folds, so every
                # repeat would score the very same splits. Shuffle with a per-repeat seed
                # so the repeats are different but still reproducible.
                splitter=StratifiedKFold(n_splits=int(cv_folds),shuffle=True,random_state=rep)
            else:
                # caller supplied its own splitter / iterable: use it untouched
                splitter=cv_folds
            cv_score.extend(cross_val_score(alg, X, y, cv=splitter, scoring=scoring))

    # Fit the algorithm on the data
    alg.fit(X, y)

    # Predict training set:
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:,1]

    # Print model report (in-sample figures, so optimistic by construction):
    print("\nModel Report")
    print("Accuracy : " + str(round(metrics.accuracy_score(
        y.values, dtrain_predictions),4)))
    print("AUC Score (Train): " + str(round(
        metrics.roc_auc_score(y, dtrain_predprob),4)))

    # cv_score is empty when repeat == 0; np.min/np.max would raise on it
    if performCV and len(cv_score)>0:
        print("\n Cross validation summary ("+scoring+")")
        print("Average: "+str(round(np.mean(cv_score),4)))
        print("Std    : "+str(round(np.std(cv_score),4)))
        print("Min    : "+str(round(np.min(cv_score),4)))
        print("Max    : "+str(round(np.max(cv_score),4)))

    # Print Feature Importance:
    if printFeatureImportance and hasattr(alg, "feature_importances_"):
        plt.figure(figsize=(20,6))
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.head(only_top_x_feature).plot(kind='bar', title='Feature Importances',fontsize=12, color="#74B72E")
        plt.ylabel('Feature Importance Score')
        return alg, feat_imp
    else:
        return alg, list()

# 0. Load data and set the colors

In [None]:
# Plot styling shared by the charts below.
PALETTE="Spectral"  # seaborn palette / colormap name passed to the bar, scatter and heatmap plots
col_light="#AEF359"  # light green, used for the second series in overlaid plots
col_dark="#74B72E"  # dark green, used for single-series plots
sns.color_palette("Spectral", as_cmap=True)  # result is discarded: not the last expression of the cell, so nothing is displayed
# model and selection params
prot_goods_min=0.000001  # floor applied to the good/bad distribution shares before the WOE logarithm
scoref="roc_auc"  # scoring name handed to the cross-validation helper
repeat_numb=3  # number of cross-validation repeats handed to the helper
best_feature_numb=100  # top-N cut-off used in the variable selection step
iteration_numb=1  # number of passes of the interaction-feature search
total_trh=300  # minimum row count a feature value needs before it can become a dummy
select_dummy=100  # number of (feature, value) dummies kept, ranked by absolute WOE

In [None]:
# Read the three competition files in one pass (same order as before).
sample_submission, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv",
        "/kaggle/input/tabular-playground-series-may-2022/train.csv",
        "/kaggle/input/tabular-playground-series-may-2022/test.csv",
    )
)

In [None]:
# Summary statistics followed by the first rows of the training data.
display(train.describe(), train.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
train.info()
# no missing values in train
test.info()
# no missing values in test

In [None]:
# generate column names
col_string=["f_27"]
col_category=["f_07","f_08","f_09","f_10","f_11","f_12","f_13","f_14","f_15","f_16","f_17","f_18","f_29","f_30"] # category ~= only a few possible values

# every column except the first one and the last one
col_base=list(train.columns[1:-1])
print("Number of original features: "+str(len(col_base)))

# numeric = everything but the string feature; continuous = numeric minus the categorical ones
col_base_numeric=sorted(set(col_base)-set(col_string))
col_continuous=sorted(set(col_base)-set(col_string)-set(col_category))

# 1. EDA

In [None]:
# Limited info on the distribution nothing really special at this point
# One figure per group of five numeric features, one histogram per panel.
for first_col in range(0,30,5):
    plt.figure(figsize=(20,6))
    for panel, col in enumerate(col_base_numeric[first_col:first_col+5], start=1):
        plt.subplot(1,5,panel)
        plt.hist(train[col], bins=100, density=False, color=col_dark)
        plt.title(col, fontsize=10)
    plt.show()

In [None]:
# Same layout as above, but the two target classes are overlaid in each panel.
for first_col in range(0,30,5):
    plt.figure(figsize=(20,6))
    for panel, col in enumerate(col_base_numeric[first_col:first_col+5], start=1):
        plt.subplot(1,5,panel)
        plt.hist(train.loc[train["target"]<1, col], bins=100, density=False, color=col_dark)
        plt.hist(train.loc[train["target"]>0, col], bins=100, density=False, color=col_light)
        plt.legend(["target = 0", "target = 1"], fontsize=6)
        plt.title(col, fontsize=10)
    plt.show()

In [None]:
# Pairwise correlations of the continuous features and the target.
# Note: calculating correlations with a binary target is not very accurate, but mathematically feasible...
corr_continuous=train[col_continuous + ['target']].corr()
plt.figure(figsize=(18, 14))
sns.heatmap(corr_continuous, center=0, annot=True, fmt='.2f', cmap=PALETTE)
plt.title('Continuous variable correlation with target', fontsize=20)
plt.show()

In [None]:
# Feature 27 must be treated separately

In [None]:
# Per-letter occurrence counts of f_27; a letter is kept only if it occurs in train.
characters_present=[]
for letter in ascii_uppercase:
    train_counts=train["f_27"].str.count(letter)
    if train_counts.sum()>0:
        train[letter]=train_counts
        test[letter]=test["f_27"].str.count(letter)
        characters_present.append(letter)

# Number of distinct characters, and how often the most frequent character occurs.
for frame in (train, test):
    frame['dist_char']=frame["f_27"].map(lambda s: len(set(s)))
    frame['most_common_numb']=frame["f_27"].map(lambda s: collections.Counter(s).most_common(1)[0][1])

from_character_var=characters_present+['dist_char','most_common_numb']

In [None]:
# WOE per (feature, value) and IV per feature for the categorical-like columns.
# Per-feature results are collected in lists and combined once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on every call.
woe_frames=[]
iv_records=[]
for col in col_category+from_character_var:
    # goods = rows with target 1, total = all rows of that feature value
    temp_df=(train.groupby(col)["target"]
                  .agg(goods="sum", total="size")
                  .reset_index()
                  .rename(columns={col:"value"}))
    temp_df["bads"]=temp_df["total"]-temp_df["goods"]
    temp_df["dist_bads"]=temp_df["bads"]/temp_df["bads"].sum()
    temp_df["dist_goods"]=temp_df["goods"]/temp_df["goods"].sum()
    temp_df["feature"]=col
    # floor the shares so the logarithm below never sees zero
    temp_df["dist_goods"]=np.where(temp_df["dist_goods"]<prot_goods_min,prot_goods_min,temp_df["dist_goods"])
    temp_df["dist_bads"]=np.where(temp_df["dist_bads"]<prot_goods_min,prot_goods_min,temp_df["dist_bads"])
    temp_df["WOE"]=np.log(temp_df["dist_goods"]/(temp_df["dist_bads"]))*100
    temp_df["IV_component"]=(temp_df["dist_goods"]-temp_df["dist_bads"])*temp_df["WOE"]
    woe_frames.append(temp_df)
    iv_records.append({"feature":col, "IV":temp_df["IV_component"].sum()})

woe_table=pd.concat(woe_frames)
iv_table=pd.DataFrame(iv_records)

woe_table.sort_values(by=["WOE"], inplace=True)
# we also generate a WOE label
woe_table["feature_with_value"]=woe_table["feature"]+"_value_"+woe_table["value"].astype(str)

In [None]:
# We can plot the highlights here
fig, ax = plt.subplots(figsize=(20,6))
sns.barplot(x="feature", y="IV", data=iv_table, palette=PALETTE, ax=ax)
ax.set_title('Comparison of categorical features by their information values', fontsize=20)
ax.set_ylabel('Information Value for categorical feature', fontsize=14)
ax.set_xlabel('Feature name', fontsize=14)
ax.tick_params(axis="x", labelrotation=90);

In [None]:
# We can plot the highlights here
# Same chart restricted to the original categorical columns.
iv_original_only=iv_table[iv_table["feature"].isin(col_category)]
fig, ax = plt.subplots(figsize=(20,6))
sns.barplot(x="feature", y="IV", data=iv_original_only, palette=PALETTE, ax=ax)
ax.set_title('Comparison of categorical features by their information values (excluding f_27 related features)', fontsize=20)
ax.set_ylabel('Information Value for categorical feature', fontsize=14)
ax.set_xlabel('Feature name', fontsize=14)
ax.tick_params(axis="x", labelrotation=90);

In [None]:
# woe_table is sorted by WOE ascending, so head(30) holds the most negative values.
woe_lowest=woe_table[woe_table["total"]>50].head(30)
fig, ax = plt.subplots(figsize=(20,6))
sns.barplot(x="feature_with_value", y="WOE", data=woe_lowest, palette=PALETTE, ax=ax)
ax.set_title('Comparison of categorical feature values where WOE is small (higher prob. of target=0)', fontsize=20)
ax.set_ylabel('WOE', fontsize=14)
ax.set_xlabel('Feature name with value', fontsize=14)
ax.tick_params(axis="x", labelrotation=90);

In [None]:
# woe_table is sorted by WOE ascending, so tail(30) holds the largest values.
woe_highest=woe_table[woe_table["total"]>50].tail(30)
fig, ax = plt.subplots(figsize=(20,6))
sns.barplot(x="feature_with_value", y="WOE", data=woe_highest, palette=PALETTE, ax=ax)
ax.set_title('Comparison of categorical feature values where WOE is large (higher prob. of target=1)', fontsize=20)
ax.set_ylabel('WOE', fontsize=14)
ax.set_xlabel('Feature name with value', fontsize=14)
ax.tick_params(axis="x", labelrotation=90);

In [None]:
# WOE of every character at each of the first ten positions of f_27.
# Results are gathered in a list and concatenated once (DataFrame.append was
# removed in pandas 2.0).
position_frames=[]
for i in range(0,10):
    # helper column holding the single character at position i
    train["pos_candidate"]=train["f_27"].str.slice(i,i+1)
    woe_res=WOE_based_IV_category(df=train,target="target",cont_var="pos_candidate")
    woe_res["position"]=i
    position_frames.append(woe_res)
all_calc=pd.concat(position_frames)
# only the strength of the signal matters here, not its direction
all_calc["WOE"]=abs(all_calc["WOE"])
all_calc.sort_values(by=["WOE"],inplace=True)
all_calc["total"]=all_calc["No_events"]+all_calc["No_nonevents"]

In [None]:
# We can generate dummies using these variables
# The WOE is decent in some case, so might have information value
# Keep the 60 strongest (character, position) pairs that have enough rows.
min_rows=total_trh*2/3
selected_indicators=all_calc.loc[all_calc["total"]>min_rows].tail(60).copy()
display(selected_indicators)

In [None]:
# One 0/1 flag per selected (character, position) pair, built on train and test alike.
position_chars=[]
for char, position in zip(selected_indicators["bin"], selected_indicators["position"]):
    flag_name=char+"_pos_"+str(position)
    for frame in (train, test):
        frame[flag_name]=np.where(frame["f_27"].str.slice(position,position+1)==char,1,0)
    position_chars.append(flag_name)

# 2. Feature generation

In [None]:
# Now we need an algorithm to create new categorical variables
# We create dummy variables for the top x feature - value pairs based on WOE
woe_table["WOE_abs"]=woe_table["WOE"].abs()
woe_table=woe_table.sort_values(by=["WOE_abs"])
# strongest pairs sit at the end after the ascending sort
select_dummies=woe_table.loc[woe_table["total"]>total_trh].tail(select_dummy)

In [None]:
# 0/1 indicator for every selected (feature, value) pair, on train and test.
dummy_names=list(select_dummies["feature_with_value"])
for raw_value, feature, dummy_name in zip(select_dummies["value"],
                                          select_dummies["feature"],
                                          dummy_names):
    level=int(raw_value)
    train[dummy_name]=np.where(train[feature]==level,1,0)
    test[dummy_name]=np.where(test[feature]==level,1,0)

In [None]:
# Here I attempt to generate new features, I consider:
# 1) multiplicative relationship
# 2) a pairwise additive relationship based on some common factor (I deploy a PCA here)
# A candidate is kept when its ANOVA F-statistic against the target beats the
# better of its two parent features by more than 3%.

train_add=train[["id","target"]+col_continuous].copy()
test_add=test[["id"]+col_continuous].copy()
col_continuous_corr=col_continuous.copy()
target_values=train_add["target"].to_numpy()

def anova_f(values):
    """ANOVA F-statistic of one 1-D feature against the training target."""
    f_val, _p_val = f_classif(np.asarray(values).reshape(-1,1), target_values)
    return f_val[0]

for k in range(0,iteration_numb):
    print("iteration (mults) : "+str(k))
    # single-feature scores are computed once per column, not once per pair
    base_scores={col: anova_f(train_add[col]) for col in col_continuous_corr}
    additional_cols=list()
    for i, col_i in enumerate(col_continuous_corr):
        for col_j in col_continuous_corr[i+1:]:
            candidate=train_add[col_i]*train_add[col_j]
            if anova_f(candidate)>(1.03*max(base_scores[col_i],base_scores[col_j])):
                new_col=col_i+"_"+col_j+"_m"
                train_add[new_col]=candidate
                test_add[new_col]=test_add[col_i]*test_add[col_j]
                additional_cols.append(new_col)
    # Order-preserving extension: rebuilding the list from a set made the column
    # order (and the names of later pairwise features) depend on the hash seed.
    col_continuous_corr=list(dict.fromkeys(col_continuous_corr+additional_cols))

    pca = PCA(n_components=2)
    additional_cols=list()
    print("iteration (PCA) : "+str(k))
    if k==0:
        base_scores={col: anova_f(train_add[col]) for col in col_continuous_corr}
        for i, col_i in enumerate(col_continuous_corr):
            for col_j in col_continuous_corr[i+1:]:
                pair=[col_i, col_j]
                pca.fit(train_add[pair])
                # weights of the first principal component, fitted on train only
                direction=pca.components_[0]
                # raw (un-centred) values are projected, as before
                candidate=train_add[pair].to_numpy() @ direction
                if anova_f(candidate)>(1.03*max(base_scores[col_i],base_scores[col_j])):
                    new_col=col_i+"_"+col_j+"_pca"
                    train_add[new_col]=candidate
                    test_add[new_col]=test_add[pair].to_numpy() @ direction
                    additional_cols.append(new_col)
        col_continuous_corr=list(dict.fromkeys(col_continuous_corr+additional_cols))

In [None]:
# I add a constant temporarily, so we can estimate logits with one variable.
# Despite its name, the "zero" column holds the value 1 on every row.
train_add["zero"]=1

In [None]:
# We assess the predictive power of the new variables
# One record per feature, turned into a frame once at the end (DataFrame.append
# was removed in pandas 2.0 and re-copied the table on every call).
anova_records=[]
for col in col_continuous_corr:
    fval, _=f_classif(np.array(train_add[col]).reshape(-1,1),np.array(train_add["target"]))
    # single-feature logit: the constant column plus the feature itself
    clf=LogisticRegression(random_state=0).fit(train_add[["zero",col]],np.array(train_add["target"]))
    accuracy=clf.score(train_add[["zero",col]],np.array(train_add["target"]))

    anova_records.append({"feature":col,
                          "anova":fval[0],
                          "accuracy":accuracy,
                          "coefficient":clf.coef_[0][1]
                         })
anova_result=pd.DataFrame(anova_records)
anova_result.sort_values(by=["anova"],ascending=False,inplace=True)
print(len(anova_result))
anova_result=anova_result.drop_duplicates(subset=["anova"]) # I'm not proud of this, but some metrics are symmetrical so we drop
# We can see how many variables did we drop...
print(len(anova_result))

In [None]:
# All numeric features: F-statistic against single-feature logit accuracy.
fig, ax = plt.subplots(figsize=(20,10))
sns.scatterplot(data=anova_result, x="anova", y="accuracy", color=col_dark, ax=ax)
ax.set_title('Comparison of numeric features', fontsize=20)
ax.set_xlabel('Anova F-statistic for feature', fontsize=14)
ax.set_ylabel('Naive Logistic regression accuracy with single feature', fontsize=14)
plt.show()

# only top features
anova_result_top=anova_result.head(10).copy()
fig, ax = plt.subplots(figsize=(20,10))
sns.scatterplot(data=anova_result_top, x="anova", y="accuracy", color=col_dark, ax=ax)
ax.set_title('Comparison of numeric features (only top 10)', fontsize=20)
ax.set_xlabel('Anova F-statistic for feature', fontsize=14)
ax.set_ylabel('Naive Logistic regression accuracy with single feature', fontsize=14)
# annotate each of the ten points with its feature name
label_point(anova_result_top["anova"], anova_result_top["accuracy"], anova_result_top["feature"], ax)
plt.show()

In [None]:
# Correlations of the top engineered features, the original continuous features and the target.
# A top feature can also be an original continuous column; dict.fromkeys drops the
# repeat (keeping order) so no column appears twice in the heatmap.
heatmap_cols=list(dict.fromkeys(list(anova_result_top["feature"])+ col_continuous + ['target']))
plt.figure(figsize=(18, 14))
sns.heatmap(train_add[heatmap_cols].corr(), center=0, annot=True, fmt='.2f',cmap=PALETTE)
plt.title('Continuous variable correlation with target (with top features)', fontsize=20);
plt.show()

In [None]:
# Run a garbage-collection pass; the trailing semicolon hides the returned count.
gc.collect();

In [None]:
# Based on the discussion posts by AmbrusM and WTI 200, I create some combined features manually and apply a WOE methodology
# similar to the one used earlier, so that I can derive bounds.
# https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323892
# https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323766    
# Quote from AmbrusM:
# the projection to f_02 and f_21
# the projection to f_05 and f_22
# the projection to f_00+f_01 and f_26

In [None]:
# Combined feature f_00 + f_01, used for the projection against f_26 below.
train["f_00_f_01_p"]=train["f_00"].add(train["f_01"])

In [None]:
# Scatter of each feature pair coloured by target, on a fixed [-10, 10] window.
for x_col, y_col in [("f_02","f_21"), ("f_05","f_22"), ("f_00_f_01_p","f_26")]:
    plt.figure(figsize=(10,7))
    sns.scatterplot(data=train, x=x_col, y=y_col, s=1, hue="target", palette=PALETTE)
    plt.title(x_col+" and "+y_col)
    plt.ylim(-10, 10)
    plt.xlim(-10, 10)
    plt.show()
# We can see certain lines appearing, which indicates that, based on the weighted sum of the feature values, we have different regions...
# E.g. f_02 and f_21: if the sum of the two features is smaller than around -5 we have a different target prob...
# We can try to determine the slope (f_02+y*f_21) or we can assume y (the slope) is simply 1... which is more or less in line with our observations
# Right now it seems the slope is indeed 1, so an x+y<k rule can be used here.

In [None]:
train["f_21_f_02_p"]=1.0*train["f_21"]+1.0*train["f_02"]
train["f_22_f_05_p"]=1.0*train["f_22"]+1.0*train["f_05"]
train["f_00_f_01_p_f26_p"]=1.0*train["f_00_f_01_p"]+1.0*train["f_26"]

# I try to determine the optimal cuts...
# I did this manually based on the charts and some WOE calcs...
# f_21_f_02_p : -5.3 and 5.2 when x+y
# f_22_f_05_p : -5.4 and 5.1 when x+y
# f_00_f_01_p_f26_p: -5.0 and 5.0 when x+y

# Records are collected in lists and turned into frames once (DataFrame.append was
# removed in pandas 2.0). The per-cut table gets its own name so the categorical
# `woe_table` built earlier is no longer overwritten by this scan.

# Scan the lower cut (upper cut held at 5.2) and track the WOE of the lowest bin.
lower_records=[]
for cut in np.arange(-12,-3,0.1):
    cut_table=WOE_based_IV(train,target="target",cont_var="f_22_f_05_p", limits=[-100,cut,5.2,100])
    lower_records.append({"value":cut, "WOE":cut_table["WOE"].iloc[0]})
optimal_cut_l=pd.DataFrame(lower_records)

# Scan the upper cut (lower cut held at -5.3) and track the WOE of the highest bin.
upper_records=[]
for cut in np.arange(3,12,0.1):
    cut_table=WOE_based_IV(train,target="target",cont_var="f_22_f_05_p", limits=[-100,-5.3,cut,100])
    upper_records.append({"value":cut, "WOE":cut_table["WOE"].iloc[-1]})
optimal_cut_u=pd.DataFrame(upper_records)

In [None]:
# WOE of the outer bin as a function of the cut; lower cuts are mirrored onto the positive axis.
fig, ax = plt.subplots(figsize=(14,7))
ax.plot(optimal_cut_u["value"], optimal_cut_u["WOE"], color=col_dark, label="upper cut value")
ax.plot(optimal_cut_l["value"].abs(), optimal_cut_l["WOE"], color=col_light, label="lower cut value")
ax.legend()
ax.set_xlabel("Cut value")
ax.set_ylabel("WOE score")
ax.set_title("WOE and cut value choice (for lower cut value the abs(cut value) is presented, f_22_f_05_p)");

In [None]:
# Three-bin WOE table for f_00 + f_01 + f_26 with cuts at -5.0 and 5.0.
woe_table=WOE_based_IV(
    df=train,
    target="target",
    cont_var="f_00_f_01_p_f26_p",
    limits=[-100,-5.0,5.0,100],
)
display(woe_table)

In [None]:
# let us extend the feature list
cont_features_interact=["f_00_f_01_p_f26_p","f_22_f_05_p","f_21_f_02_p"]

# the sums already exist on train; build them on test as well
test["f_00_f_01_p"]=test["f_00"]+test["f_01"]
test["f_21_f_02_p"]=test["f_21"]+test["f_02"]
test["f_22_f_05_p"]=test["f_22"]+test["f_05"]
test["f_00_f_01_p_f26_p"]=test["f_00_f_01_p"]+test["f_26"]

# (sum feature, lower cut, upper cut) when x+y
cut_points=[
    ("f_21_f_02_p", -5.3, 5.2),
    ("f_22_f_05_p", -5.4, 5.1),
    ("f_00_f_01_p_f26_p", -5.0, 5.0),
]

# 0/1 flags for "at or above the upper cut" and "at or below the lower cut"
for sum_col, lower_cut, upper_cut in cut_points:
    for frame in (train, test):
        frame[sum_col+"_UPPER"]=np.where(frame[sum_col]>=upper_cut,1,0)
    for frame in (train, test):
        frame[sum_col+"_LOWER"]=np.where(frame[sum_col]<=lower_cut,1,0)
    cont_features_interact=cont_features_interact+[sum_col+"_UPPER", sum_col+"_LOWER"]

In [None]:
# Explore other variables
# We were able to reconcile AmbrusM's estimates by simple WOE, we can extend the heuristic to look for similar pairs
# For every pair of continuous features, bin their sum at -5.1 / 5.1 and record the
# WOE of the two outer bins. Records are collected in a list and framed once
# (DataFrame.append was removed in pandas 2.0); the per-pair table has its own
# name so the global `woe_table` is not overwritten.
pair_records=[]
for i in range(0,len(col_continuous)):
    print(col_continuous[i])
    for j in range(i,len(col_continuous)):
        train["candidate"]=train[col_continuous[i]]+train[col_continuous[j]]
        pair_table=WOE_based_IV(train,target="target",cont_var="candidate", limits=[-100,-5.1,5.1,100])
        pair_records.append({
            "variables":col_continuous[i]+"__"+col_continuous[j],
            "WOE_lower":pair_table["WOE"].iloc[0],
            "WOE_upper":pair_table["WOE"].iloc[-1]
        })
optimal_cut_all=pd.DataFrame(pair_records)

In [None]:
# The pairs identified earlier are here...
strong_lower=optimal_cut_all["WOE_lower"].abs()>1
strong_upper=optimal_cut_all["WOE_upper"].abs()>1
optimal_cut_all[strong_lower|strong_upper]

In [None]:
# We see patterns with f_21, but this seems to only impact f_21...
for x_col in ("f_01", "f_00", "f_05"):
    sns.scatterplot(data=train, x=x_col, y="f_21", s=1, hue="target", palette=PALETTE)
    plt.title(x_col+" and f_21")
    plt.show()

In [None]:
# However there are patterns with other features
for x_col, y_col in (("f_22", "f_24"), ("f_02", "f_26")):
    sns.scatterplot(data=train, x=x_col, y=y_col, s=1, hue="target", palette=PALETTE)
    plt.title(x_col+" and "+y_col)
    plt.show()

In [None]:
# Same scan as for the sums, but on the difference of every pair of continuous features.
# Records are collected in a list and framed once (DataFrame.append was removed in
# pandas 2.0); the per-pair table has its own name so the global `woe_table` is
# not overwritten.
difference_records=[]
for i in range(0,len(col_continuous)):
    print(col_continuous[i])
    for j in range(i,len(col_continuous)):
        train["candidate"]=train[col_continuous[i]]-train[col_continuous[j]]
        pair_table=WOE_based_IV(train,target="target",cont_var="candidate", limits=[-100,-5.1,5.1,100])
        difference_records.append({
            "variables":col_continuous[i]+"__"+col_continuous[j],
            "WOE_lower":pair_table["WOE"].iloc[0],
            "WOE_upper":pair_table["WOE"].iloc[-1]
        })
optimal_cut_substr=pd.DataFrame(difference_records)

In [None]:
# Pairs whose difference shows an outer-bin WOE above 0.8 in absolute value.
strong_lower=optimal_cut_substr["WOE_lower"].abs()>0.8
strong_upper=optimal_cut_substr["WOE_upper"].abs()>0.8
optimal_cut_substr[strong_lower|strong_upper]

In [None]:
# Candidate sum of f_03 and f_19, inspected next to its two components.
train["f_03_f_19_cand"]=train["f_03"]+train["f_19"]

for x_col, y_col in (("f_03", "f_21"),
                     ("f_19", "f_21"),
                     ("f_03_f_19_cand", "f_21"),
                     ("f_03", "f_19")):
    sns.scatterplot(data=train, x=x_col, y=y_col, s=1, hue="target", palette=PALETTE)
    plt.title(x_col+" and "+y_col)
    plt.show()


In [None]:
# Conclusion: there seem to be only weaker relationships here, so there is no need to add new dummy variables.

# 3. Assemble train and test dataset

In [None]:
# Columns engineered directly on train/test that are joined onto the *_add frames by id.
extra_cols=["id"]+dummy_names+position_chars+col_category+from_character_var+cont_features_interact

train_final=train_add.drop(columns=["zero"]).merge(train[extra_cols], how="left", on="id")
del train, train_add

test_final=test_add.merge(test[extra_cols], how="left", on="id")
# everything after the first two columns of train_final
feature_columns=list(train_final.columns[2:])
del test, test_add

In [None]:
# Standardise every feature; the scaler is fitted on train only and reused for test.
scaler = StandardScaler().fit(train_final[feature_columns])
train_final[feature_columns]=scaler.transform(train_final[feature_columns])
test_final[feature_columns]=scaler.transform(test_final[feature_columns])

# 4. Initial model estimation

In [None]:
# Logistic regression baseline on every assembled feature.
log, feat_imp = fit_model_using_classifier(
    alg=LogisticRegression(verbose=False, C=0.1, max_iter=1000),
    dtrain=train_final,
    predictors=feature_columns,
    scoring=scoref,
    repeat=repeat_numb,
)
gc.collect();

In [None]:
# CatBoost with its default settings on every assembled feature.
clf, feat_imp = fit_model_using_classifier(
    alg=CatBoostClassifier(verbose=False),
    dtrain=train_final,
    predictors=feature_columns,
    scoring=scoref,
    repeat=repeat_numb,
)
gc.collect();

In [None]:
# Probability of target=1 from the first CatBoost model, written as a submission file.
catboost_submission=test_final[["id"]].assign(
    target=clf.predict_proba(test_final[feature_columns])[:,1]
)
catboost_submission.to_csv("catboost_submission.csv",index=False)

# 5. Variable selection and refined model estimation

In [None]:
# Variable selection using initial model estimate and permutation score
# (the permutation importance is measured on the training data itself)
perm1 = PermutationImportance(clf, random_state=1).fit(train_final[feature_columns], train_final["target"])
feature_importance_recalc=pd.DataFrame({
                        "feature": feature_columns,
                        "feature_importance":perm1.feature_importances_*100
                    })
feature_importance_recalc.sort_values(by=["feature_importance"],ascending=False,inplace=True)

# Union of the top features by permutation importance and by the model's own importance.
# sorted() gives a reproducible column order; list(set(...)) changed order between kernels.
top_by_permutation=feature_importance_recalc.head(best_feature_numb)["feature"]
top_by_model=feat_imp.index[0:best_feature_numb]
best_features=sorted(set(top_by_permutation).union(top_by_model))

In [None]:
# Union of the top features by permutation importance and by the model's own importance.
# sorted() gives a reproducible column order; list(set(...)) changed order between kernels.
best_features=sorted(
    set(feature_importance_recalc.head(best_feature_numb)["feature"])
    .union(feat_imp.index[0:best_feature_numb])
)

In [None]:
# Last 30 rows (lowest scores, given the descending sort) among features scoring above 0.01.
importance_to_plot=feature_importance_recalc[feature_importance_recalc["feature_importance"]>0.01].tail(30)
fig, ax = plt.subplots(figsize=(20,6))
sns.barplot(x="feature", y="feature_importance", data=importance_to_plot, palette=PALETTE, ax=ax)
ax.set_title('Comparison of feature importance scores', fontsize=20)
ax.set_ylabel('Feature importance', fontsize=14)
ax.set_xlabel('Feature name', fontsize=14)
ax.tick_params(axis="x", labelrotation=90);

In [None]:
# Number of features kept after the selection step (shown as the cell output)
len(best_features)

In [None]:
# CatBoost with default settings, restricted to the selected features.
clf2, feat_imp = fit_model_using_classifier(
    alg=CatBoostClassifier(verbose=False),
    dtrain=train_final,
    predictors=best_features,
    scoring=scoref,
    repeat=repeat_numb,
)
gc.collect();

In [None]:
# Probability of target=1 from the second CatBoost model, written as a submission file.
catboost_submission2=test_final[["id"]].assign(
    target=clf2.predict_proba(test_final[best_features])[:,1]
)
catboost_submission2.to_csv("catboost_submission2.csv",index=False)

In [None]:
# CatBoost with explicit iteration count and learning rate, on the selected features.
cat_params=dict(
    iterations=1200,
    learning_rate=0.025,
    loss_function='Logloss',
    eval_metric="AUC",
    verbose=False,
)

clf3, feat_imp = fit_model_using_classifier(
    alg=CatBoostClassifier(**cat_params),
    dtrain=train_final,
    predictors=best_features,
    scoring=scoref,
    repeat=repeat_numb,
)
gc.collect();

In [None]:
# Probability of target=1 from the third CatBoost model, written as a submission file.
catboost_submission3=test_final[["id"]].assign(
    target=clf3.predict_proba(test_final[best_features])[:,1]
)
catboost_submission3.to_csv("catboost_submission3.csv",index=False)