In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt 
import seaborn as sns

#feature engineering
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer


#model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from imblearn.pipeline import Pipeline, make_pipeline
import xgboost as xgb 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler


#evaluation
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,ConfusionMatrixDisplay


#---
import joblib

In [2]:
df = pd.read_csv('data/train/final_df_outlier.csv')
df.head()

Unnamed: 0,uid,applied_contract_type,target,total_tl,complete_tl,active_tl,amount_overdue_sum,loan_amount_avg,loan_amount_min,loan_amount_max,...,other_loans,enquiry_count,max_enquiry_amount,time_since_last_enquiry,business_loans_enquiries,consumer_loans_enquiries,revolving_loans_enquiries,mortgages_and_real_estate_loans_enquiries,other_loans_enquiries,automobile_loans_enquiries
0,XDA69787158,Cash loans,0,1,1,1,0.0,450000.0,198508.5,450000.0,...,0,2,166000,49,0.0,1.0,0.0,1.0,0.0,0.0
1,BSE47789733,Cash loans,0,1,1,1,0.0,110155.5,110155.5,110155.5,...,0,3,151000,46,0.0,2.0,0.0,0.0,0.0,1.0
2,NTJ92213825,Cash loans,0,1,1,1,0.0,103783.5,103783.5,103783.5,...,0,4,167000,48,3.0,1.0,0.0,0.0,0.0,0.0
3,TCQ47571695,Cash loans,0,3,3,3,0.0,224292.0,63351.0,450000.0,...,0,1,148000,47,1.0,0.0,0.0,0.0,0.0,0.0
4,WJZ68772744,Cash loans,0,0,0,0,0.0,0.0,0.0,0.0,...,0,22,194000,45,5.0,8.5,2.5,1.0,2.0,2.0


In [3]:
df.describe()

Unnamed: 0,target,total_tl,complete_tl,active_tl,amount_overdue_sum,loan_amount_avg,loan_amount_min,loan_amount_max,num_times_delinquent,months_since_recent_delinquency,...,other_loans,enquiry_count,max_enquiry_amount,time_since_last_enquiry,business_loans_enquiries,consumer_loans_enquiries,revolving_loans_enquiries,mortgages_and_real_estate_loans_enquiries,other_loans_enquiries,automobile_loans_enquiries
count,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,...,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0,261383.0
mean,0.08056,4.45361,4.45361,3.100986,191.0237,248113.913086,60849.587485,602869.7,0.55929,0.736241,...,0.002483,7.030105,166747.067958,47.296163,1.841962,2.843525,0.42178,0.672125,0.67215,0.363501
std,0.272159,3.917763,3.917763,1.907505,15733.74,242282.746874,61494.554962,666046.7,0.942141,7.461243,...,0.054955,6.138532,52086.789866,3.495063,1.977951,2.632383,0.700426,0.851597,0.851262,0.639948
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,61500.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,1.0,0.0,68715.84375,16843.5,99211.5,0.0,0.0,...,0.0,2.0,141000.0,45.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,4.0,4.0,4.0,0.0,164534.884615,38880.0,324000.0,0.0,0.0,...,0.0,5.0,179000.0,46.0,1.0,2.0,0.0,0.0,0.0,0.0
75%,0.0,7.0,7.0,5.0,0.0,357057.462214,89509.5,900000.0,1.0,0.0,...,0.0,10.0,194000.0,49.0,3.0,4.0,1.0,1.0,1.0,1.0
max,1.0,15.0,15.0,5.0,3756681.0,789569.889911,198508.5,2101183.0,2.5,140.0,...,4.0,22.0,273500.0,55.0,7.5,8.5,2.5,2.5,2.5,2.5


In [4]:
df.drop(['uid'], inplace=True, axis=1)

In [5]:
y = df['target']
X = df.drop('target',axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.20,stratify=y) 

In [7]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [8]:
cat_columns = list(X_train.select_dtypes(include='object').columns)
num_columns = list(X_train.select_dtypes(exclude='object').columns)

In [9]:
print("categorical columns - ", cat_columns)
print("Numerical columns - ", num_columns)

categorical columns -  ['applied_contract_type']
Numerical columns -  ['total_tl', 'complete_tl', 'active_tl', 'amount_overdue_sum', 'loan_amount_avg', 'loan_amount_min', 'loan_amount_max', 'num_times_delinquent', 'months_since_recent_delinquency', 'automobile_loans', 'business_loans', 'consumer_loans', 'mortgages_and_real_estate_loans', 'other_loans', 'enquiry_count', 'max_enquiry_amount', 'time_since_last_enquiry', 'business_loans_enquiries', 'consumer_loans_enquiries', 'revolving_loans_enquiries', 'mortgages_and_real_estate_loans_enquiries', 'other_loans_enquiries', 'automobile_loans_enquiries']


In [10]:
X_train.shape

(209106, 24)

In [11]:
#feature selection 

In [12]:
#ANOVA

In [13]:
num_data = X_train[num_columns]

In [14]:
from sklearn.feature_selection import f_classif

f_scores, p_values = f_classif(num_data, y_train)

anova_results = pd.DataFrame({
    'Feature': num_columns,
    'F-Score': f_scores,
    'P-Value': p_values
})

significant_features = anova_results[anova_results['P-Value'] < 0.05]['Feature']
print(significant_features)

0                            total_tl
1                         complete_tl
2                           active_tl
3                  amount_overdue_sum
4                     loan_amount_avg
5                     loan_amount_min
6                     loan_amount_max
7                num_times_delinquent
8     months_since_recent_delinquency
9                    automobile_loans
11                     consumer_loans
12    mortgages_and_real_estate_loans
15                 max_enquiry_amount
16            time_since_last_enquiry
19          revolving_loans_enquiries
22         automobile_loans_enquiries
Name: Feature, dtype: object


In [15]:
X_train_significant = X_train[significant_features]
X_test_significant = X_test[significant_features]

In [16]:
X_train_significant

Unnamed: 0,total_tl,complete_tl,active_tl,amount_overdue_sum,loan_amount_avg,loan_amount_min,loan_amount_max,num_times_delinquent,months_since_recent_delinquency,automobile_loans,consumer_loans,mortgages_and_real_estate_loans,max_enquiry_amount,time_since_last_enquiry,revolving_loans_enquiries,automobile_loans_enquiries
0,8,8,5,0.0,317812.500000,67500.0,900000.000,0.0,0,0,8.0,0,130000,55,0.0,1.0
1,5,5,5,841.5,61803.000000,22500.0,101191.500,2.5,0,0,5.0,0,65000,50,0.0,0.0
2,2,2,2,0.0,46777.500000,46777.5,46777.500,0.0,0,0,2.0,0,197000,45,0.0,0.0
3,7,7,5,0.0,191006.530714,21820.5,430484.715,2.5,0,0,7.0,0,187000,44,0.0,0.0
4,0,0,0,0.0,0.000000,0.0,0.000,0.0,0,0,0.0,0,273500,49,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209101,3,3,3,0.0,99313.500000,25915.5,157500.000,0.0,0,0,3.0,0,193000,49,0.0,0.0
209102,3,3,3,0.0,340500.000000,198508.5,580500.000,1.0,0,0,3.0,0,178000,44,1.0,0.0
209103,1,1,1,0.0,42719.400000,42719.4,42719.400,0.0,0,0,1.0,0,183000,44,0.0,1.0
209104,0,0,0,0.0,0.000000,0.0,0.000,0.0,0,0,0.0,0,273500,46,0.0,1.0


In [17]:
#Chi square

In [18]:
from scipy.stats import chi2_contingency

In [19]:
contingency_table = pd.crosstab(X_train['applied_contract_type'], y_train)
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print("P-Value:", p_value)

P-Value: 5.511210803344714e-51


since it is less than 0.05, we will keep it

In [20]:
X_train_significant['applied_contract_type'] = X_train['applied_contract_type']
X_test_significant['applied_contract_type'] = X_test['applied_contract_type']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_significant['applied_contract_type'] = X_train['applied_contract_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_significant['applied_contract_type'] = X_test['applied_contract_type']


In [21]:
X_train_significant.columns

Index(['total_tl', 'complete_tl', 'active_tl', 'amount_overdue_sum',
       'loan_amount_avg', 'loan_amount_min', 'loan_amount_max',
       'num_times_delinquent', 'months_since_recent_delinquency',
       'automobile_loans', 'consumer_loans', 'mortgages_and_real_estate_loans',
       'max_enquiry_amount', 'time_since_last_enquiry',
       'revolving_loans_enquiries', 'automobile_loans_enquiries',
       'applied_contract_type'],
      dtype='object')

In [22]:
num_columns = list(X_train_significant.select_dtypes(exclude='object').columns)
cat_columns = list(X_train_significant.select_dtypes(include='object').columns)

In [29]:
logistic = LogisticRegression(class_weight='balanced')
dtc = DecisionTreeClassifier(max_depth=20, min_samples_split=10, class_weight='balanced')
rfc = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')
xgbc = xgb.XGBClassifier()

In [30]:
models = {
    'Logistic_reg': logistic,
    'Decision_tree': dtc,
    'Random_forest': rfc,
    'xgboost':xgbc}

In [234]:
num_columns = num_columns
ohe_columns = cat_columns


def make_my_pipeline(name,model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = Pipeline(steps=[
        ('normal_dist', FunctionTransformer(np.log1p)),
        ('num_scaling',StandardScaler())
    ])
    
    cat_ohe_pipeline = Pipeline(steps=[
        ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
    
    #using column transformer
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ])
    
    model = model

    pipe = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model)
    ])

    pipe.fit(myX_train,myy_train)
    myy_pred = pipe.predict(myX_test)
    
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test,myy_pred)
    report = classification_report(myy_test,myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)

    

In [235]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

for name,algo in models.items():
    print(name)
    make_my_pipeline(name, algo, X_train_significant, X_test_significant, y_train, y_test)

Logistic_reg
Decision_tree
Random_forest
xgboost


In [236]:
performance_df = pd.DataFrame({'Algorithm':models.keys(),'Accuracy':accuracy,'Precision':precision, 'Recall':recall, "f1":f1, "AUC":roc_auc_scores})
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1,AUC
0,Logistic_reg,0.546646,0.096384,0.5526,0.164139,0.549362
1,Decision_tree,0.624022,0.090909,0.407504,0.148655,0.525248
2,Random_forest,0.877212,0.10029,0.06578,0.079449,0.50704
3,xgboost,0.919276,0.304348,0.001662,0.003307,0.500665


In [146]:
#undersampling with all algos
num_columns = num_columns
ohe_columns = cat_columns

def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = Pipeline(steps=[
        ('normal_dist', FunctionTransformer(np.log1p, validate=True)),
        ('num_scaling', StandardScaler())
    ])
    
    cat_ohe_pipeline = Pipeline(steps=[
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ])
    
    # Define the undersampling step
    undersample = RandomUnderSampler()
    
    # Create the full pipeline with undersampling
    pipe = ImbPipeline([
        ('undersample', undersample),
        ('preprocessing', preprocessing),
        ('model', model)
    ])
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)
    

In [147]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

for name,algo in models.items():
    print(name)
    make_my_pipeline(name, algo,  X_train_significant, X_test_significant, y_train, y_test)

Logistic_reg
Random_forest
xgboost


In [148]:
performance_df = pd.DataFrame({'Algorithm':models.keys(),'Accuracy':accuracy,'Precision':precision, 'Recall':recall, "f1":f1, "AUC":roc_auc_scores})
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1,AUC
0,Logistic_reg,0.549381,0.097059,0.553313,0.165149,0.551175
1,Random_forest,0.548654,0.089183,0.499644,0.151351,0.526296
2,xgboost,0.531266,0.094026,0.558062,0.160937,0.54349


In [212]:
#smote and undersampling with all algos
from imblearn.over_sampling import RandomOverSampler

num_columns = num_columns
ohe_columns = cat_columns

def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')

    
    # Create the full pipeline with undersampling
    pipe = make_pipeline(
        preprocessing,
        SMOTE(random_state=42),
        RandomUnderSampler(random_state=42),
        model)
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)
    

In [213]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

for name,algo in models.items():
    print(name)
    make_my_pipeline(name, algo,  X_train_significant, X_test_significant, y_train, y_test)

Logistic_reg
Decision_tree
Random_forest
xgboost


In [214]:
performance_df = pd.DataFrame({'Algorithm':models.keys(),'Accuracy':accuracy,'Precision':precision, 'Recall':recall, "f1":f1, "AUC":roc_auc_scores})
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1,AUC
0,Logistic_reg,0.533715,0.096329,0.571361,0.164862,0.550889
1,Decision_tree,0.753429,0.096363,0.246022,0.138484,0.521952
2,Random_forest,0.846682,0.092545,0.102588,0.097308,0.50723
3,xgboost,0.830958,0.106766,0.149133,0.124443,0.519913


In [149]:
#balanced xgbc
num_columns = num_columns
ohe_columns = cat_columns

def compute_scale_pos_weight(y):
    # Compute class weights
    class_counts = np.bincount(y)
    scale_pos_weight = class_counts[0] / class_counts[1]  # ratio of negative to positive samples
    return scale_pos_weight
    
def make_my_pipeline(name,model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = Pipeline(steps=[
        ('normal_dist', FunctionTransformer(np.log1p)),
        ('num_scaling',StandardScaler())
    ])
    
    cat_ohe_pipeline = Pipeline(steps=[
        ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
    
    #using column transformer
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ])
    
    scale_pos_weight = compute_scale_pos_weight(myy_train)
    
    # Initialize the model with scale_pos_weight
    model_instance = model(scale_pos_weight=scale_pos_weight)
    
    pipe = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model_instance)
    ])

    pipe.fit(myX_train,myy_train)
    myy_pred = pipe.predict(myX_test)
    
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test,myy_pred)
    report = classification_report(myy_test,myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)

    

In [150]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

xgbc=  xgb.XGBClassifier
make_my_pipeline("xgboost", xgbc, X_train_significant, X_test_significant, y_train, y_test)
# for name,algo in models.items():
#     make_my_pipeline(name, algo, X_train, X_test, y_train, y_test)

In [151]:
# performance_df = pd.DataFrame({'Algorithm':models.keys(),'Accuracy':accuracy,'Precision':precision, 'Recall':recall, "f1":f1, "AUC":roc_auc_scores})
# performance_df

In [152]:
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5483633764754496
f1 -  [0.16398937257306356]
recall -  [0.47637140821657564]
precision -  [0.09904216451071393]
accuracy -  [0.608757197237791]


In [153]:
for i in matrix:
    print(i)

[[29818 18248]
 [ 2205  2006]]


In [175]:
#smote with balanced xgboost
from imblearn.over_sampling import RandomOverSampler

num_columns = num_columns
ohe_columns = cat_columns

def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')
    
    scale_pos_weight = compute_scale_pos_weight(myy_train)
    
    # Initialize the model with scale_pos_weight
    model_instance = model(scale_pos_weight=scale_pos_weight)
    

    
    # Create the full pipeline with undersampling
    pipe = make_pipeline(
        preprocessing,
        SMOTE(random_state=42),
        model_instance)
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)
    

In [176]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

xgbc=  xgb.XGBClassifier
make_my_pipeline("xgboost", xgbc, X_train_significant, X_test_significant, y_train, y_test)
# for name,algo in models.items():
#     make_my_pipeline(name, algo, X_train, X_test, y_train, y_test)

In [177]:
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5187203906272981
f1 -  [0.15270541948593788]
recall -  [0.8387556399905011]
precision -  [0.0839992389649924]
accuracy -  [0.2502438931078677]


In [199]:
#smote and undersampling with balanced xgboost
from imblearn.over_sampling import RandomOverSampler

num_columns = num_columns
ohe_columns = cat_columns

def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')

    
    scale_pos_weight = compute_scale_pos_weight(myy_train)
    
    # Initialize the model with scale_pos_weight
    model_instance = model(scale_pos_weight=scale_pos_weight)
    

    
    # Create the full pipeline with undersampling
    pipe = make_pipeline(
        preprocessing,
        SMOTE(random_state=42),
        RandomUnderSampler(random_state=42),
        model_instance)
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)
    

In [200]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

xgbc=  xgb.XGBClassifier
make_my_pipeline("xgboost", xgbc, X_train_significant, X_test_significant, y_train, y_test)

In [201]:
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5198427095459646
f1 -  [0.15316490897811624]
recall -  [0.8501543576347661]
precision -  [0.0841640022569118]
accuracy -  [0.24274537559538611]


In [205]:
#smote and undersampling with catboost
num_columns = num_columns
ohe_columns = cat_columns


for cat_feature in ohe_columns:
    X_train_significant[cat_feature] = X_train_significant[cat_feature].astype('category')
    X_test_significant[cat_feature] = X_test_significant[cat_feature].astype('category')    


def compute_scale_pos_weight(y):
    class_counts = np.bincount(y)
    return class_counts[0] / class_counts[1]  # Ratio of negative to positive samples


model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    scale_pos_weight=compute_scale_pos_weight(y_train),
    verbose=100
)

def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')
    
    # Create the full pipeline with undersampling
    pipe = make_pipeline(
        preprocessing,
        SMOTE(random_state=42),
        RandomUnderSampler(random_state=42),
        model)
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_significant[cat_feature] = X_train_significant[cat_feature].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_significant[cat_feature] = X_test_significant[cat_feature].astype('category')


In [206]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

make_my_pipeline("catboost", model, X_train_significant, X_test_significant, y_train, y_test)

0:	learn: 0.5972578	total: 131ms	remaining: 2m 10s
100:	learn: 0.2182078	total: 9.86s	remaining: 1m 27s
200:	learn: 0.1897324	total: 19.4s	remaining: 1m 17s
300:	learn: 0.1765414	total: 29.3s	remaining: 1m 8s
400:	learn: 0.1705024	total: 39s	remaining: 58.3s
500:	learn: 0.1660493	total: 48.8s	remaining: 48.6s
600:	learn: 0.1621757	total: 58.4s	remaining: 38.8s
700:	learn: 0.1597101	total: 1m 8s	remaining: 29s
800:	learn: 0.1572952	total: 1m 17s	remaining: 19.3s
900:	learn: 0.1553507	total: 1m 27s	remaining: 9.57s
999:	learn: 0.1537141	total: 1m 32s	remaining: 0us


In [195]:
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5235391650538928
f1 -  [0.1539026292846327]
recall -  [0.7964853953930183]
precision -  [0.08518095238095238]
accuracy -  [0.2945654876905714]


In [198]:
#without smote
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5526072566669812
f1 -  [0.16673552011897877]
recall -  [0.4792210876276419]
precision -  [0.10092523130782696]
accuracy -  [0.6141706677888937]


# hyperparameter tuning

## ```lightgbm```

In [225]:
#smote and undersampling with lightgbm randomizedcv

num_columns = num_columns
ohe_columns = cat_columns

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__num_leaves': [31, 63, 127],
    'model__max_depth': [-1, 10, 20],
    'model__min_child_samples': [5, 10, 20],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.8, 0.9, 1.0]
}


model = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss'
)


def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')
    
    # Create the full pipeline with undersampling
    pipe = ImbPipeline([
        ('preprocessing', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('under', RandomUnderSampler(random_state=42)),
        ('model', model)
    ])
    

    grid_search = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
    # Fit the pipeline
    grid_search.fit(myX_train, myy_train)
    print("Best parameters found: ", grid_search.best_params_)
    print("Best ROC AUC score found: ", grid_search.best_score_)
    

In [226]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

make_my_pipeline("lightgbm", model, X_train_significant, X_test_significant, y_train, y_test)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 192260, number of negative: 192260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3914
[LightGBM] [Info] Number of data points in the train set: 384520, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best parameters found:  {'model__subsample': 0.9, 'model__num_leaves': 63, 'model__n_estimators': 300, 'model__min_child_samples': 10, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}
Best ROC AUC score found:  0.5477230817255077


In [229]:
#using tuned model

#smote and undersampling with lightgbm

num_columns = num_columns
ohe_columns = cat_columns

model = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    subsample= 0.9, 
    num_leaves=63,
    model__n_estimators=300,
    model__min_child_samples=10,
    model__max_depth=10,
    model__learning_rate=0.01,
    model__colsample_bytree=0.8
)


def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')
    
    # Create the full pipeline with undersampling
    pipe = ImbPipeline([
        ('preprocessing', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('under', RandomUnderSampler(random_state=42)),
        ('model', model)
    ])
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)
    

In [230]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

make_my_pipeline("lightgbm", model, X_train_significant, X_test_significant, y_train, y_test)

[LightGBM] [Info] Number of positive: 192260, number of negative: 192260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3914
[LightGBM] [Info] Number of data points in the train set: 384520, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [231]:
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5098900365199781
f1 -  [0.08971047981514205]
recall -  [0.07836618380432202]
precision -  [0.1048951048951049]
accuracy -  [0.8718939495380378]


## ``` xgboost ```

In [18]:
#RandomizedsearchCV on balanced xgboost
num_columns = num_columns
ohe_columns = cat_columns

def compute_scale_pos_weight(y):
    # Compute class weights
    class_counts = np.bincount(y)
    scale_pos_weight = class_counts[0] / class_counts[1]  # ratio of negative to positive samples
    return scale_pos_weight
    
def make_my_pipeline(name,model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = Pipeline(steps=[
        ('normal_dist', FunctionTransformer(np.log1p)),
        ('num_scaling',StandardScaler())
    ])
    
    cat_ohe_pipeline = Pipeline(steps=[
        ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
    
    #using column transformer
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ])
    
    scale_pos_weight = compute_scale_pos_weight(myy_train)
    
    # Initialize the model with scale_pos_weight
    model_instance = model(scale_pos_weight=scale_pos_weight)
    
    pipe = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model_instance)
    ])

    kf = StratifiedKFold(n_splits=5, shuffle=False)
    rscv = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, cv=kf, scoring='roc_auc', n_jobs=-1, verbose=3,n_iter=100)
    rscv.fit(myX_train,myy_train)

    print("Best parameter is  - ",rscv.best_params_)

    print("Best auc score is - ",rscv.best_score_)


xgbc=  xgb.XGBClassifier
make_my_pipeline("xgboost", xgbc, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameter is  -  {'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.9, 'model__alpha': 10}
Best auc score is -  0.6233287025288309


In [29]:
#using tuned model

num_columns = num_columns
ohe_columns = cat_columns

def compute_scale_pos_weight(y):
    # Compute class weights
    class_counts = np.bincount(y)
    scale_pos_weight = class_counts[0] / class_counts[1]  # ratio of negative to positive samples
    return scale_pos_weight
    
def make_my_pipeline(name,model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = Pipeline(steps=[
        ('normal_dist', FunctionTransformer(np.log1p)),
        ('num_scaling',StandardScaler())
    ])
    
    cat_ohe_pipeline = Pipeline(steps=[
        ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
    
    #using column transformer
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ])
    
    scale_pos_weight = compute_scale_pos_weight(myy_train)
    
    # Initialize the model with scale_pos_weight
    model_instance = model(scale_pos_weight=scale_pos_weight, n_estimators=100, max_depth=3,learning_rate=0.1,colsample_bytree=0.9,alpha=10)
    
    pipe = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model_instance)
    ])

    pipe.fit(myX_train,myy_train)
    myy_pred = pipe.predict(myX_test)
    
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test,myy_pred)
    report = classification_report(myy_test,myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)

    

In [30]:
accuracy = []
precision = []
recall = []
f1 = []
matrix = []
reports = []
roc_auc_scores = []

xgbc=  xgb.XGBClassifier
make_my_pipeline("xgboost", xgbc, X_train, X_test, y_train, y_test)

In [31]:
print("auc_roc - ",roc_auc_scores[0])
print("f1 - ",f1)
print("recall - ",recall)
print("precision - ",precision)
print("accuracy - ",accuracy)

auc_roc -  0.5633715833003822
f1 -  [0.17271368913068466]
recall -  [0.5352647827119449]
precision -  [0.1029693924166286]
accuracy -  [0.5869502840637374]


In [32]:
for i in matrix:
    print(i)

[[28430 19636]
 [ 1957  2254]]


## ```randomforest classifier```

In [25]:
#smote and undersampling with weighted randomforest randomizedcv

num_columns = num_columns
ohe_columns = cat_columns

    
param_grid = {
    'model__n_estimators': [100, 200, 300, 400],
    'model__max_features': ['sqrt', 'log2'],
    'model__max_depth': [None, 10, 20, 30, 40],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

model = RandomForestClassifier(
    class_weight='balanced'
)

def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    # Create the ColumnTransformer for preprocessing
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')
    
    # Create the full pipeline with undersampling
    pipe = ImbPipeline([
        ('preprocessing', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('under', RandomUnderSampler(random_state=42)),
        ('model', model)
    ])
    

    grid_search = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=2,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
    # Fit the pipeline
    grid_search.fit(myX_train, myy_train)
    print("Best parameters found: ", grid_search.best_params_)
    print("Best ROC AUC score found: ", grid_search.best_score_)

make_my_pipeline("randomforest", model, X_train_significant, X_test_significant, y_train, y_test)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best parameters found:  {'model__n_estimators': 200, 'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__max_features': 'sqrt', 'model__max_depth': 10, 'model__bootstrap': False}
Best ROC AUC score found:  0.5528651496643232


### ``` voting classifier ```

In [30]:
#using tuned model - voting classifier
#with smote and undersampling 

num_columns = num_columns
ohe_columns = cat_columns

def compute_scale_pos_weight(y):
    # Compute class weights
    class_counts = np.bincount(y)
    scale_pos_weight = class_counts[0] / class_counts[1]  # ratio of negative to positive samples
    return scale_pos_weight
    

#lightgbm
lgbm = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    subsample= 0.9, 
    num_leaves=63,
    n_estimators=300,
    min_child_samples=10,
    max_depth=10,
    learning_rate=0.01,
    colsample_bytree=0.8
)

#xgboost
xgboost = xgb.XGBClassifier(scale_pos_weight=compute_scale_pos_weight(y_train),
                            n_estimators=100,
                            max_depth=3,
                            learning_rate=0.1,
                            colsample_bytree=0.9,
                            alpha=10)

#randomforest
rfc = RandomForestClassifier(
        n_estimators=200,
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='sqrt',
        max_depth=10,
        bootstrap=False)


voting_clf = VotingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('xgboost', xgboost),
        ('rfc', rfc)
    ],
    voting='soft' 
)


def make_my_pipeline(name, model, myX_train, myX_test, myy_train, myy_test):
    num_pipeline = make_pipeline(
        FunctionTransformer(np.log1p, validate=True),
        StandardScaler())
    
    cat_ohe_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    
    
    preprocessing = ColumnTransformer([
        ('one_hot', cat_ohe_pipeline, ohe_columns),
        ('num', num_pipeline, num_columns)
    ], remainder='passthrough')
    
    pipe = ImbPipeline([
        ('preprocessing', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('under', RandomUnderSampler(random_state=42)),
        ('model', model)
    ])
    
    # Fit the pipeline
    pipe.fit(myX_train, myy_train)
    myy_pred = pipe.predict(myX_test)
    
    # Compute metrics
    acc = accuracy_score(myy_test, myy_pred)
    prec, rec, fscore, support = score(myy_test, myy_pred)
    cm = confusion_matrix(myy_test, myy_pred)
    report = classification_report(myy_test, myy_pred)
    roc_auc = roc_auc_score(myy_test, myy_pred)
    
    # Append results
    accuracy.append(acc)
    precision.append(prec[1])
    recall.append(rec[1])
    f1.append(fscore[1])
    matrix.append(cm)
    reports.append(report)
    roc_auc_scores.append(roc_auc)

    # Save the pipeline
    joblib.dump(pipe, 'voting_clf_pipeline.pkl')
    
accuracy, precision, recall, f1, matrix, reports, roc_auc_scores = [], [], [], [], [], [], []

make_my_pipeline('voting_clf', voting_clf, X_train_significant, X_test_significant, y_train, y_test)

# Output the results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix: {matrix}')
print(f'Classification Report: {reports}')
print(f'ROC AUC Score: {roc_auc_scores}')

[LightGBM] [Info] Number of positive: 192260, number of negative: 192260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3914
[LightGBM] [Info] Number of data points in the train set: 384520, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy: [0.28645484629951984]
Precision: [0.08690984445609568]
Recall: [0.8266445024934694]
F1 Score: [0.15728357129947587]
Confusion Matrix: [array([[11494, 36572],
       [  730,  3481]], dtype=int64)]
Classification Report: ['              precision    recall  f1-score   support\n\n           0       0.94      0.24      0.38     48066\n           1       0.09      0.83      0.16      4211\n\n    accuracy                           0.29     52277\n   macro avg       0.51      0.53      