## Scikit Learn Tutorial

### Imports

In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [166]:
PATH_CSV ='/home/ramses2099/Sources/IAProject/machine_learning/data/Default.csv'
df = pd.read_csv(PATH_CSV)
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


## EDA

In [167]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  10000 non-null  str    
 1   student  10000 non-null  str    
 2   balance  10000 non-null  float64
 3   income   10000 non-null  float64
dtypes: float64(2), str(2)
memory usage: 312.6 KB


In [168]:
df.columns = df.columns.str.upper()
COLUMNS = df.columns
# ['DEFAULT', 'STUDENT', 'BALANCE', 'INCOME']

In [169]:
df['STUDENT'].unique()
df['DEFAULT'].unique()
# ['No', 'Yes']
yes_or_no = {'Yes':1, 'No':0}
# tranform
df_final = df.copy()

for col in ['DEFAULT', 'STUDENT']:
    df_final[col] = df_final[col].map(yes_or_no)

df_final.head()

Unnamed: 0,DEFAULT,STUDENT,BALANCE,INCOME
0,0,0,729.526495,44361.625074
1,0,1,817.180407,12106.1347
2,0,0,1073.549164,31767.138947
3,0,0,529.250605,35704.493935
4,0,0,785.655883,38463.495879


In [170]:
df_final.describe()

Unnamed: 0,DEFAULT,STUDENT,BALANCE,INCOME
count,10000.0,10000.0,10000.0,10000.0
mean,0.0333,0.2944,835.374886,33516.981876
std,0.179428,0.455795,483.714985,13336.639563
min,0.0,0.0,0.0,771.967729
25%,0.0,0.0,481.731105,21340.462903
50%,0.0,0.0,823.636973,34552.644802
75%,0.0,1.0,1166.308386,43807.729272
max,1.0,1.0,2654.322576,73554.233495


In [171]:
df_final.corr()['STUDENT'].sort_values(ascending=False)

STUDENT    1.000000
BALANCE    0.203578
DEFAULT    0.035420
INCOME    -0.753985
Name: STUDENT, dtype: float64

In [172]:
y = df_final['STUDENT']
# X = df_final.drop(columns='INCOME')
X = df_final[['BALANCE','INCOME']]

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [174]:
print(f"train data {X_train.shape}")
print(f"test data {X_test.shape}")

train data (8000, 2)
test data (2000, 2)


### Pipelines: chaining pre-processors and estimators

In [179]:
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ]    
)

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print(f"Score : {(accuracy*100):.2f}%")
class_rep = classification_report(y_pred, y_test)
print("Classification Report;\n",class_rep)
conf_matrix = confusion_matrix(y_pred, y_test)
print(f"Confusion Matrix:\n{conf_matrix}")
auc =roc_auc_score(y_pred, y_test)
print(f"\nAUC Score: {auc:.2f}")

Score : 93.25%
Classification Report;
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      1380
           1       0.91      0.87      0.89       620

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000

Confusion Matrix:
[[1326   54]
 [  81  539]]

AUC Score: 0.92
