In [None]:
!mkdir credit_samples

In [None]:
import warnings
warnings.filterwarnings('ignore')
import gc


In [None]:
import pandas as pd

### Application table

This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).
Static data for all applications. One row represents one loan in our data sample.

In [None]:
application_train = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")

In [None]:
application_train[["TARGET"]].value_counts()

In [None]:
application_train.sample()

In [None]:
application_train.info()

In [None]:
application_train['SK_ID_CURR'].duplicated().any()


In [None]:
application_train['SK_ID_CURR'].nunique()

In [None]:
application_sample = application_train.sample(1000, random_state=0)

In [None]:
application_sample.head()

In [None]:
application_sample.to_csv("credit_samples/application_train.csv",index=False)

In [None]:
# Target variable (1 - client with payment difficulties: he/she had late payment more than X days
# on at least one of the first Y installments of the loan in our sample,
# 0 - all other cases)
application_sample[["TARGET"]].value_counts()

In [None]:
# Ten features with the highest Pearson correlation to TARGET.
# Only numeric columns are correlated: DataFrame.corr() raises on string columns
# in pandas >= 2.0, and select_dtypes works on every pandas version.
application_sample.select_dtypes(include='number').corr()['TARGET'].sort_values(ascending=False).head(10)

In [None]:
# show_counts replaces the removed null_counts keyword (same spelling as the dff.info call later on).
application_sample.info(verbose=True, show_counts=True)

In [None]:
# Application columns excluded from modelling (each label listed exactly once).
columns = ['OWN_CAR_AGE','OCCUPATION_TYPE','EXT_SOURCE_1','EXT_SOURCE_3','EXT_SOURCE_2','APARTMENTS_AVG','BASEMENTAREA_AVG',
           'YEARS_BEGINEXPLUATATION_AVG','COMMONAREA_AVG','YEARS_BUILD_AVG','ELEVATORS_AVG','ENTRANCES_AVG','FLOORSMAX_AVG',
           'FLOORSMIN_AVG','LANDAREA_AVG','LIVINGAPARTMENTS_AVG','LIVINGAREA_AVG','NONLIVINGAPARTMENTS_AVG','APARTMENTS_MODE',
           'BASEMENTAREA_MODE','YEARS_BEGINEXPLUATATION_MODE','YEARS_BUILD_MODE','COMMONAREA_MODE','ELEVATORS_MODE',
           'ENTRANCES_MODE','FLOORSMAX_MODE','FLOORSMIN_MODE','LANDAREA_MODE','LIVINGAPARTMENTS_MODE','NONLIVINGAREA_MODE',
           'APARTMENTS_MEDI','BASEMENTAREA_MEDI','YEARS_BEGINEXPLUATATION_MEDI','YEARS_BUILD_MEDI','COMMONAREA_MEDI','ELEVATORS_MEDI','ENTRANCES_MEDI',
           'FLOORSMAX_MEDI','FLOORSMIN_MEDI','LANDAREA_MEDI','LIVINGAPARTMENTS_MEDI','NONLIVINGAPARTMENTS_MEDI','NONLIVINGAREA_MEDI','FONDKAPREMONT_MODE',
           'HOUSETYPE_MODE','TOTALAREA_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
           'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR','NONLIVINGAREA_AVG','LIVINGAREA_MODE','LIVINGAREA_MEDI','NONLIVINGAPARTMENTS_MODE']

# drop() returns a new frame, so application_sample itself is left untouched.
# `columns=` already selects the axis; no separate axis argument is needed.
selected_application_sample = application_sample.drop(columns=columns)

In [None]:
selected_application_sample.head()

In [None]:
selected_application_sample.info()

In [None]:
selected_application_sample.columns[selected_application_sample.isnull().any()]

In [None]:
# Median-impute AMT_GOODS_PRICE. The result is assigned back: fillna(inplace=True) on a column
# selection is chained assignment, which warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
selected_application_sample['AMT_GOODS_PRICE'] = selected_application_sample['AMT_GOODS_PRICE'].fillna(selected_application_sample['AMT_GOODS_PRICE'].median())

In [None]:
selected_application_sample['NAME_TYPE_SUITE'].value_counts()

In [None]:
# Fill missing NAME_TYPE_SUITE with the constant 'Unaccompanied'; assigned back rather than
# filled in place on the column selection (chained in-place fillna is unreliable in newer pandas).
selected_application_sample['NAME_TYPE_SUITE'] = selected_application_sample['NAME_TYPE_SUITE'].fillna('Unaccompanied')

In [None]:
selected_application_sample[['OBS_30_CNT_SOCIAL_CIRCLE']].head()

In [None]:
# Median-impute; assigned back instead of a chained in-place fillna on the column selection.
selected_application_sample['OBS_30_CNT_SOCIAL_CIRCLE'] = selected_application_sample['OBS_30_CNT_SOCIAL_CIRCLE'].fillna(selected_application_sample['OBS_30_CNT_SOCIAL_CIRCLE'].median())

In [None]:
# Median-impute; assigned back instead of a chained in-place fillna on the column selection.
selected_application_sample['DEF_30_CNT_SOCIAL_CIRCLE'] = selected_application_sample['DEF_30_CNT_SOCIAL_CIRCLE'].fillna(selected_application_sample['DEF_30_CNT_SOCIAL_CIRCLE'].median())

In [None]:
# Median-impute; assigned back instead of a chained in-place fillna on the column selection.
selected_application_sample['OBS_60_CNT_SOCIAL_CIRCLE'] = selected_application_sample['OBS_60_CNT_SOCIAL_CIRCLE'].fillna(selected_application_sample['OBS_60_CNT_SOCIAL_CIRCLE'].median())

In [None]:
# Median-impute; assigned back instead of a chained in-place fillna on the column selection.
selected_application_sample['DEF_60_CNT_SOCIAL_CIRCLE'] = selected_application_sample['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(selected_application_sample['DEF_60_CNT_SOCIAL_CIRCLE'].median())

In [None]:
selected_application_sample.head()

##### Feature engineering/extraction

In [None]:
# credit amount relative to the income of a client
selected_application_sample['CREDIT_INCOME_RATIO'] = selected_application_sample['AMT_CREDIT'] / selected_application_sample['AMT_INCOME_TOTAL']
# loan annuity relative to the income of a client
selected_application_sample['ANNUITY_INCOME_RATIO'] = selected_application_sample['AMT_ANNUITY'] / selected_application_sample['AMT_INCOME_TOTAL']
# length of the loan expressed as the number of annuity payments (credit amount / annuity).
# AMT_ANNUITY / AMT_CREDIT is the payment rate, which is stored separately as PAYMENT_RATE.
selected_application_sample['CREDIT_TERM'] = selected_application_sample['AMT_CREDIT'] / selected_application_sample['AMT_ANNUITY']
# days employed relative to the age of the client
selected_application_sample['DAYS_EMPLOYED_RATIO'] = selected_application_sample['DAYS_EMPLOYED'] / selected_application_sample['DAYS_BIRTH']

In [None]:
# Client income relative to the credit amount, as the column name states.
# AMT_CREDIT / AMT_INCOME_TOTAL is already stored as CREDIT_INCOME_RATIO, so this column holds the inverse.
selected_application_sample['INCOME_CREDIT_RATIO'] = selected_application_sample['AMT_INCOME_TOTAL'] / selected_application_sample['AMT_CREDIT']

In [None]:
# Ratio of annuity to amount of income
# NOTE: ANNUITY_INCOME_RATIO is already assigned with this same expression in the first
# feature-engineering cell, so this re-assignment does not change the column's values.
selected_application_sample['ANNUITY_INCOME_RATIO'] = selected_application_sample['AMT_ANNUITY'] / selected_application_sample['AMT_INCOME_TOTAL']


In [None]:
# Difference between the price of the goods and the loan amount given
# (AMT_GOODS_PRICE minus AMT_CREDIT: negative when the credit exceeds the goods price)
selected_application_sample['GOODS_PRICE_LOAN_DIFFERENCE'] = selected_application_sample['AMT_GOODS_PRICE'] - selected_application_sample['AMT_CREDIT']

In [None]:
# Rate of payment: the annuity expressed as a fraction of the credit amount
selected_application_sample['PAYMENT_RATE'] = selected_application_sample['AMT_ANNUITY'] / selected_application_sample['AMT_CREDIT']


### Bureau table

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).
For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

In [None]:
loan_ids = list(application_sample['SK_ID_CURR'])

In [None]:
bureau = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau.csv")
bureau.sample()

In [None]:
bureau_sample = bureau[ bureau['SK_ID_CURR'].isin(loan_ids)]

In [None]:
bureau_sample.head()

In [None]:
bureau_sample.to_csv("credit_samples/bureau.csv",index=False)

In [None]:
bureau_sample.info()

In [None]:
# Bureau columns left out of the working frame; drop() returns a new frame, bureau_sample is unchanged.
columns = ['AMT_ANNUITY']
selected_bureau_sample = bureau_sample.drop(columns=columns)

In [None]:
selected_bureau_sample.head()

### Bureau balance table

Monthly balances of previous credits in Credit Bureau.
This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e. the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

In [None]:
bureaus = list(bureau_sample['SK_ID_BUREAU'])

In [None]:
bureau_balance = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau_balance.csv")
bureau_balance.sample()

In [None]:
bureau_balance_sample = bureau_balance[ bureau_balance['SK_ID_BUREAU'].isin(bureaus)]

In [None]:
bureau_balance_sample.info()

In [None]:
bureau_balance_sample.tail()

In [None]:
bureau_balance_sample.to_csv("credit_samples/bureau_balance.csv",index=False)

### Previous balance mid-level table

In [None]:
# Attach the monthly balance history to each sampled bureau credit (inner join on SK_ID_BUREAU).
# Join against bureau_balance_sample, which is already restricted to the sampled SK_ID_BUREAU values:
# the inner-join result is the same as with the full bureau_balance table, at a fraction of the cost.
previous_balance = selected_bureau_sample.merge(bureau_balance_sample, on='SK_ID_BUREAU')

In [None]:
previous_balance.head()

In [None]:
previous_balance.info()

In [None]:
previous_balance.columns[previous_balance.isnull().any()]

In [None]:
# Median-impute the numeric bureau columns listed below.
# The filled columns are assigned back: fillna(inplace=True) on a column selection is chained
# assignment, which warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
bureau_median_columns = ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_SUM_DEBT',
                         'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_LIMIT']
# DataFrame.fillna with a Series fills each column with the value stored under that column's label.
previous_balance[bureau_median_columns] = previous_balance[bureau_median_columns].fillna(
    previous_balance[bureau_median_columns].median())

In [None]:
previous_balance.info()

In [None]:
previous_balance.sample(5)

#### Feature engineering / extraction


In [None]:
previous_balance[['AMT_CREDIT_SUM_LIMIT']].describe()

In [None]:
def credit_sum_limit(credit_limit):
  """Flag a strictly positive credit limit.

  Returns 1 when ``credit_limit > 0`` and 0 otherwise (zero, negative and NaN all give 0).
  """
  return 1 if credit_limit > 0 else 0

In [None]:
# 1/0 indicator per row: is the bureau credit limit strictly positive?
previous_balance['CREDIT_LIMIT_ABOVE_ZERO'] = previous_balance['AMT_CREDIT_SUM_LIMIT'].map(credit_sum_limit)

In [None]:
previous_balance['CREDIT_LIMIT_ABOVE_ZERO'].value_counts()

In [None]:
previous_balance[['AMT_CREDIT_SUM_DEBT']].describe()

In [None]:
def has_debt(debt_amount):
  """Flag an outstanding debt.

  Parameters:
    debt_amount: numeric debt amount for one record.

  Returns:
    1 when ``debt_amount > 0``, otherwise 0 (zero, negative and NaN all give 0).
  """
  # Return the flag directly; no local variable reuses the function's own name.
  return int(debt_amount > 0)

In [None]:
# 1/0 indicator per row: is there a strictly positive outstanding bureau debt?
previous_balance["HAS_DEBT"] = previous_balance['AMT_CREDIT_SUM_DEBT'].map(has_debt)

In [None]:
previous_balance['AMT_CREDIT_SUM_OVERDUE'].describe()

In [None]:
def over_due_debt(debt_amount):
  """Flag an overdue amount: 1 when ``debt_amount > 0``, otherwise 0."""
  # Guard clause: anything that is not strictly positive (including NaN) is "no overdue debt".
  if not debt_amount > 0:
    return 0
  return 1

In [None]:
# 1/0 indicator per row: is the overdue amount strictly positive?
previous_balance['HAS_OVERDUE_DEBT'] = previous_balance['AMT_CREDIT_SUM_OVERDUE'].map(over_due_debt)

In [None]:
previous_balance['HAS_OVERDUE_DEBT'].value_counts()

### Credit card balance table

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

In [None]:
credit_card_balance = pd.read_csv("/kaggle/input/home-credit-default-risk/credit_card_balance.csv")
credit_card_balance.sample()

In [None]:
# Keep only the credit-card history rows of the sampled loans.
# .copy() makes the sample an independent frame: later cells fill, rename and add columns on it,
# which on a plain filtered selection raises SettingWithCopyWarning and may not take effect.
credit_card_balance_sample = credit_card_balance[credit_card_balance['SK_ID_CURR'].isin(loan_ids)].copy()
credit_card_balance_sample.head()

In [None]:
credit_card_balance_sample.to_csv("credit_samples/credit_card_balance.csv",index=False)

In [None]:
credit_card_balance_sample.info()

In [None]:
credit_card_balance_sample.columns[credit_card_balance_sample.isnull().any()]

In [None]:
# Median-impute the credit-card columns listed below (each column once).
# The filled columns are assigned back: fillna(inplace=True) on a column selection is chained
# assignment, which warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
credit_card_median_columns = ['AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
                              'AMT_PAYMENT_CURRENT', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT',
                              'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'AMT_DRAWINGS_OTHER_CURRENT']
# DataFrame.fillna with a Series fills each column with the value stored under that column's label.
credit_card_balance_sample[credit_card_median_columns] = credit_card_balance_sample[credit_card_median_columns].fillna(
    credit_card_balance_sample[credit_card_median_columns].median())


In [None]:
# Rename the generically named columns so they stay identifiable next to other tables' columns.
# rename() returns a new frame that is bound back to the name; this avoids inplace=True on a filtered
# selection, and re-running the cell is harmless because labels that are absent are ignored.
# NOTE(review): SK_DPD_DEF gets a "CURRENT_" prefix while the other three get "CREDIT_" — confirm this is intended.
credit_card_balance_sample = credit_card_balance_sample.rename(columns={
    "MONTHS_BALANCE": "CREDIT_MONTHS_BALANCE",
    "NAME_CONTRACT_STATUS": "CREDIT_NAME_CONTRACT_STATUS",
    "SK_DPD": "CREDIT_SK_DPD",
    "SK_DPD_DEF": "CURRENT_SK_DPD_DEF",
})

#### Feature engineering / extraction

In [None]:
credit_card_balance_sample.head()

In [None]:
# Balance minus the actual credit card limit: negative while the balance is below the limit,
# positive when the balance exceeds it.
credit_card_balance_sample['BALANCE_LIMIT_DIFF'] = credit_card_balance_sample['AMT_BALANCE'] - credit_card_balance_sample['AMT_CREDIT_LIMIT_ACTUAL']

In [None]:
credit_card_balance_sample[['BALANCE_LIMIT_DIFF']].info()

In [None]:
credit_card_balance_sample[['AMT_PAYMENT_TOTAL_CURRENT']].info()

In [None]:
credit_card_balance_sample[['AMT_TOTAL_RECEIVABLE']].describe()

In [None]:
# Total monthly receivable minus the client's total payment in the month
# (AMT_TOTAL_RECEIVABLE - AMT_PAYMENT_TOTAL_CURRENT)
credit_card_balance_sample['CREDIT_RECEIVABLE_DIFF'] = credit_card_balance_sample['AMT_TOTAL_RECEIVABLE'] - credit_card_balance_sample['AMT_PAYMENT_TOTAL_CURRENT'] 

In [None]:
credit_card_balance_sample[['AMT_PAYMENT_TOTAL_CURRENT']].describe()

In [None]:
credit_card_balance_sample[['CREDIT_RECEIVABLE_DIFF']].describe()

In [None]:
# Total drawings: sum of the ATM, CURRENT, OTHER and POS drawing amounts.
# NOTE(review): AMT_DRAWINGS_CURRENT may already be the month's total across ATM, POS and other
# drawings; if so this sum double-counts — confirm against the data dictionary.
credit_card_balance_sample['TOTAL_DRAWINGS'] = credit_card_balance_sample['AMT_DRAWINGS_ATM_CURRENT'] + credit_card_balance_sample['AMT_DRAWINGS_CURRENT'] + credit_card_balance_sample['AMT_DRAWINGS_OTHER_CURRENT'] + credit_card_balance_sample['AMT_DRAWINGS_POS_CURRENT']

In [None]:
credit_card_balance_sample['TOTAL_DRAWINGS'].describe()

In [None]:
credit_card_balance_sample.head()

### POS cash balance table

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

In [None]:
POS_CASH_balance = pd.read_csv("/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv")
POS_CASH_balance.sample()

In [None]:
# Keep only the POS/cash history rows of the sampled loans.
# .copy() makes the sample an independent frame, because a later cell fills missing values on it.
POS_CASH_balance_sample = POS_CASH_balance[POS_CASH_balance['SK_ID_CURR'].isin(loan_ids)].copy()
POS_CASH_balance_sample.head()

In [None]:
POS_CASH_balance_sample.info()

In [None]:
POS_CASH_balance_sample.to_csv("credit_samples/POS_CASH_balance.csv",index=False)

In [None]:
POS_CASH_balance_sample.columns[POS_CASH_balance_sample.isnull().any()]

In [None]:
# Median-impute the two instalment-count columns; assigned back instead of a chained in-place fillna
# on a column selection (which warns in pandas 2.x and has no effect under Copy-on-Write).
POS_CASH_balance_sample['CNT_INSTALMENT'] = POS_CASH_balance_sample['CNT_INSTALMENT'].fillna(POS_CASH_balance_sample['CNT_INSTALMENT'].median())
POS_CASH_balance_sample['CNT_INSTALMENT_FUTURE'] = POS_CASH_balance_sample['CNT_INSTALMENT_FUTURE'].fillna(POS_CASH_balance_sample['CNT_INSTALMENT_FUTURE'].median())

#### Feature engineering / extraction

In [None]:
POS_CASH_balance_sample.head()

### Installments payments table

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
There is a) one row for every payment that was made plus b) one row each for missed payment.
One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [None]:
installments_payments = pd.read_csv("/kaggle/input/home-credit-default-risk/installments_payments.csv")
installments_payments.sample()

In [None]:
# Keep only the instalment rows of the sampled loans.
# .copy() makes the sample an independent frame, because later cells fill values and add a column on it.
installments_payments_sample = installments_payments[installments_payments['SK_ID_CURR'].isin(loan_ids)].copy()
installments_payments_sample.head()

In [None]:
installments_payments_sample.info()

In [None]:
installments_payments_sample.to_csv("credit_samples/installments_payments.csv",index=False)

In [None]:
installments_payments_sample.columns[installments_payments_sample.isnull().any()]

In [None]:
# Median-impute the payment date offset and the paid amount; assigned back instead of a chained
# in-place fillna on a column selection (which warns in pandas 2.x and has no effect under Copy-on-Write).
installments_payments_sample['DAYS_ENTRY_PAYMENT'] = installments_payments_sample['DAYS_ENTRY_PAYMENT'].fillna(installments_payments_sample['DAYS_ENTRY_PAYMENT'].median())
installments_payments_sample['AMT_PAYMENT'] = installments_payments_sample['AMT_PAYMENT'].fillna(installments_payments_sample['AMT_PAYMENT'].median())


#### Feature engineering / extraction

In [None]:
installments_payments_sample.head()

In [None]:
# Ratio of the amount actually paid to the prescribed installment amount (AMT_PAYMENT / AMT_INSTALMENT).
# A zero installment amount is treated as missing so the ratio becomes NaN rather than +/-inf,
# which would otherwise make the describe() statistics of this column infinite.
installments_payments_sample['INSTALMENT_PAYMENT_RATIO'] = installments_payments_sample['AMT_PAYMENT'] / installments_payments_sample['AMT_INSTALMENT'].replace(0, float('nan'))

In [None]:
installments_payments_sample['INSTALMENT_PAYMENT_RATIO'].describe()

In [None]:
installments_payments_sample[['INSTALMENT_PAYMENT_RATIO']]

### Previous application table

All previous applications for Home Credit loans of clients who have loans in our sample.
There is one row for each previous application related to loans in our data sample.

In [None]:
previous_application = pd.read_csv("/kaggle/input/home-credit-default-risk/previous_application.csv")
previous_application.sample()

In [None]:
previous_application_sample = previous_application[ previous_application['SK_ID_CURR'].isin(loan_ids)]
previous_application_sample.head()

In [None]:
previous_application_sample.to_csv("credit_samples/previous_application.csv",index=False)

In [None]:
previous_application_sample.info()

In [None]:
# Previous-application columns left out of the working frame.
columns = [
    'AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT',
    'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED',
    'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
    'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL',
]
# drop() returns a new frame; previous_application_sample is unchanged.
selected_previous_application_sample = previous_application_sample.drop(columns=columns)
selected_previous_application_sample.head()

In [None]:
selected_previous_application_sample.columns[selected_previous_application_sample.isnull().any()]

In [None]:
# Median-impute the numeric columns; assigned back instead of a chained in-place fillna on a
# column selection (which warns in pandas 2.x and has no effect under Copy-on-Write).
selected_previous_application_sample['AMT_ANNUITY'] = selected_previous_application_sample['AMT_ANNUITY'].fillna(selected_previous_application_sample['AMT_ANNUITY'].median())
selected_previous_application_sample['CNT_PAYMENT'] = selected_previous_application_sample['CNT_PAYMENT'].fillna(selected_previous_application_sample['CNT_PAYMENT'].median())

selected_previous_application_sample['AMT_GOODS_PRICE'] = selected_previous_application_sample['AMT_GOODS_PRICE'].fillna(selected_previous_application_sample['AMT_GOODS_PRICE'].median())


In [None]:
selected_previous_application_sample['NAME_TYPE_SUITE'].value_counts()

In [None]:
# Fill missing NAME_TYPE_SUITE with the constant 'Unaccompanied'; assigned back rather than filled in place on the column selection.
selected_previous_application_sample['NAME_TYPE_SUITE'] = selected_previous_application_sample['NAME_TYPE_SUITE'].fillna('Unaccompanied')


In [None]:
selected_previous_application_sample['PRODUCT_COMBINATION'].value_counts()

In [None]:
# Fill missing PRODUCT_COMBINATION with the constant 'Cash'; assigned back rather than filled in place on the column selection.
selected_previous_application_sample['PRODUCT_COMBINATION'] = selected_previous_application_sample['PRODUCT_COMBINATION'].fillna('Cash')

#### Feature engineering / extraction

In [None]:
selected_previous_application_sample.head()

In [None]:
# Difference between the awarded amount and the amount the client applied for
# (AMT_CREDIT minus AMT_APPLICATION: positive when the credit exceeds the amount applied for)
selected_previous_application_sample['APPLIED_AWARDED_AMOUNT_DIFF'] = selected_previous_application_sample["AMT_CREDIT"] - selected_previous_application_sample["AMT_APPLICATION"]

In [None]:
selected_previous_application_sample['APPLIED_AWARDED_AMOUNT_DIFF'].describe()

In [None]:
# Difference between the price of goods and the loan amount the client applied for
# (AMT_GOODS_PRICE minus AMT_APPLICATION)
selected_previous_application_sample['GOODS_PRICE_APPLIED_DIFF'] = selected_previous_application_sample["AMT_GOODS_PRICE"] - selected_previous_application_sample["AMT_APPLICATION"]

In [None]:
selected_previous_application_sample['GOODS_PRICE_APPLIED_DIFF'].describe()

In [None]:
application_sample["NAME_TYPE_SUITE"].value_counts()

### Project Formulation



Objective: Predict whether an applicant will be able to repay a loan.

Target: 1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases

Features: Application features, previous application features, bureau features, installment features, credit balance features, and POS cash balance features. 

### Join tables 
installments_payments_sample

POS_CASH_balance_sample

selected_previous_application_sample

credit_card_balance_sample

selected_application_sample

previous_balance

In [None]:
installments_payments_sample.sample()

In [None]:
selected_previous_application_sample.sample()

In [None]:
POS_CASH_balance_sample.sample()

In [None]:
credit_card_balance_sample.sample()

In [None]:
previous_balance.head()

In [None]:
# Exploratory join only: the result is displayed, not stored.
# NOTE(review): both frames hold many rows per SK_ID_CURR (monthly histories), so this is a
# many-to-many merge whose row count multiplies per client — watch memory on larger samples.
credit_card_balance_sample.merge(previous_balance,on="SK_ID_CURR")

In [None]:
# Build the modelling frame with inner joins: instalment rows -> previous application
# -> POS/cash monthly rows (both keyed on SK_ID_PREV and SK_ID_CURR) -> current application (SK_ID_CURR).
# NOTE(review): the instalment table and the POS/cash table each hold many rows per previous credit,
# so every instalment row is paired with every POS month of the same credit. One client therefore
# contributes many rows that all carry the same TARGET; consider aggregating per SK_ID_CURR first.
dff = installments_payments_sample.merge(selected_previous_application_sample, on=['SK_ID_PREV','SK_ID_CURR']).\
                             merge(POS_CASH_balance_sample, on=['SK_ID_PREV','SK_ID_CURR']).\
                             merge(selected_application_sample,on='SK_ID_CURR')

In [None]:
# del installments_payments_sample 
# del selected_previous_application_sample 
# del POS_CASH_balance_sample 
# del selected_application_sample
# gc.collect()

In [None]:
dff.tail()

In [None]:
dff.info(verbose=True, show_counts=True)

In [None]:
# Names of the object-dtype (string) columns of dff; these are the ones that get one-hot encoded.
categories = list(dff.select_dtypes(include='object').columns)

### Model fitting

In [None]:
# pip install --upgrade scikit-learn


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit, train_test_split
import xgboost as xgb
from sklearn.metrics import average_precision_score, roc_auc_score, precision_score, recall_score, f1_score
# This estimator is much faster than GradientBoostingClassifier for big datasets (n_samples >= 10 000).
# https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Label vector, then the feature matrix: every column except the two identifier keys and the label.
y = dff["TARGET"]
X = dff.drop(columns=["SK_ID_PREV", "SK_ID_CURR", "TARGET"])

In [None]:
# Hold out whole clients rather than individual rows. dff contains many rows per SK_ID_CURR that
# all share that client's TARGET, so a row-level random split puts the same client in both the
# train and the test set and inflates every test metric.
# The seed has its own name so that the model-parameter cell (which sets random_state) cannot
# silently change the split when cells are re-run.
SPLIT_SEED = 13
test_size = 0.3  # fraction of clients (groups) held out for testing
splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=SPLIT_SEED)
# X, y and dff share the same index, so the SK_ID_CURR column of dff lines up row by row.
train_idx, test_idx = next(splitter.split(X, y, groups=dff["SK_ID_CURR"]))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
# One-hot encode the string columns and pass every other column through unchanged.
# - No drop='first': combined with handle_unknown='ignore' an unseen category is encoded as all
#   zeros, which is indistinguishable from the dropped category (older scikit-learn releases reject
#   the combination outright). A tree model does not need a dropped reference level.
# - sparse_threshold=0 forces a dense output, because HistGradientBoostingClassifier does not
#   accept sparse matrices.
transformer = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categories)],
    remainder='passthrough',
    sparse_threshold=0,
)


In [None]:
 # Model Parameters
learning_rate = 0.01
max_depth = 6
min_samples_leaf = 10
random_state = 42
max_iter=100

In [None]:
# Model: Define a Gradient Boosting Classifier
model = HistGradientBoostingClassifier(learning_rate=learning_rate,
                               max_depth=max_depth,
                               max_iter=max_iter,
                               min_samples_leaf=min_samples_leaf,
                               random_state=random_state)

# Pipeline fit
pipeline = Pipeline(steps=[('transformer', transformer), ('model', model)])
pipeline.fit(X_train, y_train)

In [None]:
model.classes_[1]

In [None]:
pipeline.classes_[1]

In [None]:
# Predict probabilities of target
# https://scikit-learn.org/stable/modules/model_evaluation.html#roc-metrics
probs = pipeline.predict_proba(X_test)[:,1]
# Calculate average precision and area under the receiver operating characteristic curve (ROC AUC)
# Precision is the ability of the classifier not to label as positive a sample that is negative,
# and recall is the ability of the classifier to find all the positive samples.
# AP summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold,
# with the increase in recall from the previous threshold used as the weight.
avg_precision = average_precision_score(y_test, probs, pos_label=1)
# The roc_auc_score function computes the area under the receiver operating characteristic (ROC) curve,
# which is also denoted by AUC or AUROC. By computing the area under the roc curve, the curve information is summarized in one number.
# A receiver operating characteristic (ROC), or simply ROC curve, is a graphical plot which illustrates the performance of
# a binary classifier system as its discrimination threshold is varied. It is created by plotting the fraction of true positives out
# of the positives (TPR = true positive rate) vs. the fraction of false positives out of the negatives (FPR = false positive rate),
# at various threshold settings. TPR is also known as sensitivity, and FPR is one minus the specificity or true negative rate.
auc = roc_auc_score(y_test, probs)

In [None]:
# Using the non-thresholded decision values given by the classifier.decision_function() method
roc_auc_score(y_test, pipeline.decision_function(X_test))

In [None]:
auc

In [None]:
avg_precision