In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import os
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, plot_confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from lightgbm import LGBMClassifier

In [2]:
df = pd.read_csv(r'C:\Users\Dell\Desktop\Rakesh_project_pipeline\data\Rakesh_Modeling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
df.shape

(10000, 14)

In [4]:
# Are there missing values?
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
df['Geography'].value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [6]:
cat_cols = ['Geography', 'Gender']
num_cols = ['CreditScore', 'Age', 'Tenure', 
               'Balance', 'NumOfProducts', 'HasCrCard',
               'IsActiveMember', 'EstimatedSalary']
targ_col = 'Exited'

In [7]:
X, y = df[cat_cols + num_cols], df[targ_col]

In [8]:
y.mean()

0.2037

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
random_state = 42
train_params = {'n_estimators': 100, 'max_depth': 10}

clf = LGBMClassifier(random_state=random_state, 
                                **train_params)

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer()),
        ("scaler", StandardScaler())
        ]
    )
categorical_transformer = OrdinalEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)
model = Pipeline(
    steps=[("preprocessor", preprocessor), ("clf", clf)]
    )

model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1540, number of negative: 5960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 858
[LightGBM] [Info] Number of data points in the train set: 7500, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.205333 -> initscore=-1.353288
[LightGBM] [Info] Start training from score -1.353288


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['CreditScore', 'Age',
                                                   'Tenure', 'Balance',
                                                   'NumOfProducts', 'HasCrCard',
                                                   'IsActiveMember',
                                                   'EstimatedSalary']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['Geography', 'Gender'])])),
                ('clf', LGBMClassifier(max_depth=10, random_s

In [11]:
import eli5
from sklearn.metrics import make_scorer
from eli5.sklearn import PermutationImportance


preprocessor = model.named_steps['preprocessor']
clf = model.named_steps['clf']
X_test_transformed = preprocessor.transform(X_test)

perm = PermutationImportance(clf, scoring=make_scorer(f1_score)).fit(X_test_transformed, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))
# list(zip(X_test.columns.tolist(), perm.feature_importances_))

Collecting eli5
  Using cached eli5-0.14.0-py2.py3-none-any.whl (106 kB)
Collecting tabulate>=0.7.7
  Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting jinja2>=3.0.0
  Using cached jinja2-3.1.6-py3-none-any.whl (134 kB)
Collecting scikit-learn>=1.6.0
  Using cached scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
Collecting graphviz
  Using cached graphviz-0.20.3-py3-none-any.whl (47 kB)
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl (15 kB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: threadpoolctl, MarkupSafe, joblib, tabulate, scikit-learn, jinja2, graphviz, eli5
  Attempting uninstall: threadpoolctl
    Found existing installation: threadpoolctl 2.2.0
    Uninstalling threadpoolctl-2.2.0:


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\__pycache__\\threadpoolctl.cpython-39.pyc'
Consider using the `--user` option or check the permissions.



ModuleNotFoundError: No module named 'eli5'

In [None]:
plot_confusion_matrix(model, X_test, y_test, normalize='true', cmap=plt.cm.Blues)

In [None]:
y_prob = model.predict_proba(X_test)
y_pred = y_prob[:, 1] >= 0.5

In [None]:
f1_score(y_test, y_pred)

In [None]:
roc_auc_score(y_test, y_prob[:, 1])

In [None]:
df_test = X_test.copy(deep=True)
df_test['true'] = y_test
df_test['pred'] = y_pred
df_test['prob'] = y_prob[:, 1]
df_test