# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')

import warnings 
warnings.filterwarnings('ignore')

from sklearn.metrics import make_scorer

# Preprocessing

In [3]:
# def_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/defaulter_data_13364.csv", index_col=[0])
# pay_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/payer_data_41940.csv", index_col=[0])

def_df = pd.read_parquet("/Users/sjoerddewit/Desktop/Programming/6 Le Wagon Data Science/final_project/defaulter_data_6k_ids_compress.parquet")
pay_df = pd.read_parquet("/Users/sjoerddewit/Desktop/Programming/6 Le Wagon Data Science/final_project/payer_data_20k_ids.parquet")

def_df['default'] = 1
pay_df['default'] = 0

df = pd.concat([def_df, pay_df])

df.reset_index(inplace=True)

y = df['default'].reset_index(drop=True)

X = df.drop(columns=['default']).reset_index(drop=True)

cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']

X_corr = X.corr()

X_corr = X_corr.unstack().reset_index() # Unstack correlation matrix 
X_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_corr = X_corr[X_corr['feature_1'] != X_corr['feature_2']] # Remove self correlation
X_corr = X_corr.drop_duplicates(subset='correlation_all')

red_features = list(X_corr[abs(X_corr['correlation_all'])>=.95]['feature_1']) ## abs so we also consider the negative corrs

X_red = X.drop(columns=red_features) ## dropping the highly correlated columns

## checking whether the high correlations are gone
X_red_corr = X_red.corr()
X_red_corr = X_red_corr.unstack().reset_index() # Unstack correlation matrix 
X_red_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_red_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_red_corr = X_red_corr[X_red_corr['feature_1'] != X_red_corr['feature_2']] # Remove self correlation
X_red_corr = X_red_corr.drop_duplicates(subset='correlation_all')


## drop columns with nans if in both groups > 80% nans

nan_threshold= 0.8 ## adjust the hardcoded values
def_nans = def_df.isna().sum()/len(def_df) 
def_nans_80 = def_nans[def_nans >= nan_threshold].index
pay_nans = pay_df.isna().sum()/len(pay_df)
pay_nans_80 = pay_nans[pay_nans>= nan_threshold].index
nans_80 = [feature for feature in pay_nans_80 if feature in def_nans_80]

## check whether features were already removed
red_features_nan = [feature for feature in nans_80 if feature not in red_features] 
X_red = X_red.drop(columns=red_features_nan)
dropped_columns = red_features + red_features_nan

## Builsing the pipeline
num_vars = [feature for feature in X_red.columns[2:] if feature not in cat_vars] ## exclude dates and IDs (first two columns)
str_vars = [feature for feature in X_red.columns[2:] if not pd.api.types.is_numeric_dtype(X_red[feature])] ## columns that are not numeric at all 
#red_cat_vars = [feature for feature in cat_vars if feature not in dropped_columns + str_vars] ## remaining categorical variables that have no string values
red_cat_vars = [feature for feature in cat_vars if feature not in dropped_columns + str_vars] ## remaining categorical variables 


# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_imputer = SimpleImputer(strategy='mean')
num_scaler = RobustScaler()

#num_imputer = KNNImputer(n_neighbors=2) ## KNNIMputer is computationally demanding
## should come AFTER SCALING

num_pipe = make_pipeline(num_imputer, num_scaler)

#str_trans = OrdinalEncoder() # is only needed if one wants to do knnimputer

#nan_trans = FunctionTransformer(nan_imp)

#nan_trans = FunctionTransformer(lambda X: X.applymap(lambda x: np.nan if x in [-1,-1.0, "-1.0", "-1"] else x))

cat_imputer = SimpleImputer(strategy="most_frequent") ## replace with KNNimputer on one neighbour, after transforming to numericals
#cat_imputer = KNNImputer(n_neighbors=1) # introducing it did not improve performance, but is computationally demanding
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore') ## what happens to the old columns?
cat_pipe = make_pipeline(cat_imputer, cat_encoder)
#str_pipe = make_pipeline(cat_imputer, str_trans, cat_encoder)
#str_pipe = make_pipeline(cat_imputer, cat_encoder)
# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_imputer = SimpleImputer(strategy='mean')
num_scaler = RobustScaler()

#num_imputer = KNNImputer(n_neighbors=2) ## KNNIMputer is computationally demanding
## should come AFTER SCALING

num_pipe = make_pipeline(num_imputer, num_scaler)

#str_trans = OrdinalEncoder() # is only needed if one wants to do knnimputer

#nan_trans = FunctionTransformer(nan_imp)

#nan_trans = FunctionTransformer(lambda X: X.applymap(lambda x: np.nan if x in [-1,-1.0, "-1.0", "-1"] else x))

cat_imputer = SimpleImputer(strategy="most_frequent") ## replace with KNNimputer on one neighbour, after transforming to numericals
#cat_imputer = KNNImputer(n_neighbors=1) # introducing it did not improve performance, but is computationally demanding
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore') ## what happens to the old columns?
cat_pipe = make_pipeline(cat_imputer, cat_encoder)
#str_pipe = make_pipeline(cat_imputer, str_trans, cat_encoder)
#str_pipe = make_pipeline(cat_imputer, cat_encoder)

print('done ✅')


# A self-contained alternative Nans imputer

In [5]:
def alt_nan_imp(X):
    
    cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']
    
    alt_nan_list = [-1,-1.0, "-1.0", "-1"]
    
    cat_columns = [column for column in X.columns if column in cat_vars]
    
    X[cat_columns] = X[cat_columns].applymap(lambda x: np.nan if x in alt_nan_list else x)

alt_nan_imp(X_red)

preprocessor = ColumnTransformer([
    ('num_pip', num_pipe, num_vars),
    ('cat_pip', cat_pipe, red_cat_vars)],
    remainder='drop' ## all columns not in num_vars and red_cat_vars are dropped.
)

alt_nan_imp(X)
preprocessor.fit(X)

X_pp = pd.DataFrame(preprocessor.fit_transform(X_red))
X_pp['customer_ID'] = X_red['customer_ID']
X_pp_avg = X_pp.groupby('customer_ID').mean()
y_ID = pd.DataFrame(y)
y_ID['customer_ID'] = X_red['customer_ID']
y_unique = y_ID.groupby('customer_ID').mean().astype(int) ## actually, this data is just in train_labels
X_pp_avg_train, X_pp_avg_val, y_unique_train, y_unique_val = train_test_split(X_pp_avg, y_unique, test_size=0.3) 


if 'customer_ID' in X_pp.columns:
    X_pp.drop(columns='customer_ID', inplace=True)
else:
    pass

# # Train_test_split needs to be on preprocessed data
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)


# pp_pred_pipe = make_pipeline(preprocessor, mod)
# pp_pred_pipe
# pp_pred_pipe.fit(X, y)

print('done ✅')


done ✅


# Creating custom amex scoring metric

In [6]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    
    ## TWEAK
    y_true = pd.DataFrame(y_true.reset_index(drop=True))
    y_pred = pd.DataFrame(y_pred)
    
    y_true = y_true.rename(columns={y_true.columns[0]:'target'})
    y_pred = y_pred.rename(columns={y_pred.columns[0]:'prediction'})
    ##
    
    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

print('done ✅')


done ✅


In [7]:
amex_metric_scorer = make_scorer(amex_metric)

## dict of scoring metrics one might want to pass into cross validation
scorings = {'recall':'recall',
            'f1':'f1',
           'amex': amex_metric_scorer}

print('done ✅')

done ✅


# Averaging preprocessed X and y

In [8]:
X_pp['customer_ID'] = X_red['customer_ID']
X_avg_pp = X_pp.groupby('customer_ID').mean()
y_ID = pd.DataFrame(y)
y_ID['customer_ID'] = X_red['customer_ID']
y_unique = y_ID.groupby('customer_ID').mean().astype(int) ## actually, this data is just in train_labels

print('done ✅')


done ✅


# XG Boost & Grid Search

In [12]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold 
import xgboost as xgb
print('XGB Version',xgb.__version__)

print('done ✅')


XGB Version 2.0.0-dev
done ✅


In [None]:
%%time

xgb_class = XGBClassifier()

# Hyperparameter Grid
grid = {'max_depth': [3, 5, 7], 
        'n_estimators': [88, 90, 92],
        'learning_rate': [0.09, 0.1, 0.11]
         }

# Instanciate Grid Search
search = GridSearchCV(xgb_class, 
                      grid, 
#                       scoring = 'r2',
                      cv = 5,
                      n_jobs=-1 # paralellize computation
                     ) 


search.fit(X_pp_avg_train, y_unique_train,
    # evaluate loss at each iteration
    eval_set=[(X_pp_avg_train,y_unique_train), (X_pp_avg_val, y_unique_val)],  
    # stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=4
)

print('done ✅')

In [None]:
search.best_score_


In [None]:
search.best_params_


In [None]:
search.best_estimator_


In [None]:
xgb_best_reg = XGBClassifier(max_depth= 5,
                             n_estimators= 90, 
                             learning_rate= 0.1 )

xgb_best_reg.fit(X_pp_avg_train, y_unique_train,
    # evaluate loss at each iteration
    eval_set=[(X_pp_avg_train,y_unique_train), (X_pp_avg_val, y_unique_val)],  
    # stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=7
)



In [None]:
y_pred = xgb_best_reg.predict(X_pp_avg_val)


In [None]:
amex_metric(y_unique_val, y_pred)

In [10]:
# VERSION NAME FOR SAVED MODEL FILES
VER = 1

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5


# XGB MODEL PARAMETERS
xgb_parms = { 
    'max_depth':4, 
    'learning_rate':0.05, 
    'subsample':0.8,
    'colsample_bytree':0.6, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    'random_state':SEED
}

In [11]:
xgb_ = XGBClassifier(xgb_parms)

xgb_.fit(X_pp_avg_train, y_unique_train,
    # evaluate loss at each iteration
    eval_set=[(X_pp_avg_train,y_unique_train), (X_pp_avg_val, y_unique_val)],  
    # stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=7
)


XGBoostError: [09:02:25] /Users/sjoerddewit/code/Yuzhe17/AMEX_default_prediction/xgboost/python-package/build/temp.macosx-12.4-x86_64-3.8/xgboost/src/objective/objective.cc:26: Unknown objective function: `{'max_depth': 4, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.6, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', 'random_state': 42}`
Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: reg:pseudohubererror
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie
Objective candidate: reg:absoluteerror

Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001543fefc5 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x00000001545445d5 xgboost::ObjFunction::Create(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, xgboost::GenericParameter const*) + 1061
  [bt] (2) 3   libxgboost.dylib                    0x00000001544ef757 xgboost::LearnerConfiguration::ConfigureObjective(xgboost::LearnerTrainParam const&, std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > >*) + 1767
  [bt] (3) 4   libxgboost.dylib                    0x00000001544e447e xgboost::LearnerConfiguration::Configure() + 1070
  [bt] (4) 5   libxgboost.dylib                    0x00000001543f4146 XGBoosterBoostedRounds + 102
  [bt] (5) 6   libffi.dylib                        0x00007ff8231f8882 ffi_call_unix64 + 82
  [bt] (6) 7   ???                                 0x000000030590e150 0x0 + 12978282832



In [None]:
y_pred = xgb_best_reg.predict(X_pp_avg_val)


In [None]:
amex_metric(y_unique_val, y_pred)

# XG Boost & Optuna

https://optuna.org/

In [13]:
import optuna


# XG Boost & Optuna

In [2]:
import lightgbm as lgb


OSError: dlopen(/Users/sjoerddewit/.pyenv/versions/3.8.12/envs/AMEX_default_prediction/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 0x0006): Library not loaded: '/usr/local/opt/libomp/lib/libomp.dylib'
  Referenced from: '/Users/sjoerddewit/.pyenv/versions/3.8.12/envs/AMEX_default_prediction/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so'
  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)