## Scikit Learn Tutorial

### Imports

In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (accuracy_score, mean_squared_error, r2_score)
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [82]:
PATH_CSV ='/home/ramses2099/Sources/IAProject/machine_learning/data/Default.csv'
df = pd.read_csv(PATH_CSV)
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


## EDA

In [83]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  10000 non-null  str    
 1   student  10000 non-null  str    
 2   balance  10000 non-null  float64
 3   income   10000 non-null  float64
dtypes: float64(2), str(2)
memory usage: 312.6 KB


In [84]:
df.columns = df.columns.str.upper()
COLUMNS = df.columns
# ['DEFAULT', 'STUDENT', 'BALANCE', 'INCOME']

In [85]:
df['STUDENT'].unique()
df['DEFAULT'].unique()
# ['No', 'Yes']
yes_or_no = {'Yes':1, 'No':0}
# tranform
df_final = df.copy()

for col in ['DEFAULT', 'STUDENT']:
    df_final[col] = df_final[col].map(yes_or_no)

df_final.head()

Unnamed: 0,DEFAULT,STUDENT,BALANCE,INCOME
0,0,0,729.526495,44361.625074
1,0,1,817.180407,12106.1347
2,0,0,1073.549164,31767.138947
3,0,0,529.250605,35704.493935
4,0,0,785.655883,38463.495879


In [86]:
df_final.describe()

Unnamed: 0,DEFAULT,STUDENT,BALANCE,INCOME
count,10000.0,10000.0,10000.0,10000.0
mean,0.0333,0.2944,835.374886,33516.981876
std,0.179428,0.455795,483.714985,13336.639563
min,0.0,0.0,0.0,771.967729
25%,0.0,0.0,481.731105,21340.462903
50%,0.0,0.0,823.636973,34552.644802
75%,0.0,1.0,1166.308386,43807.729272
max,1.0,1.0,2654.322576,73554.233495


In [87]:
df_final.corr()['INCOME'].sort_values(ascending=False)

INCOME     1.000000
DEFAULT   -0.019871
BALANCE   -0.152243
STUDENT   -0.753985
Name: INCOME, dtype: float64

In [88]:
y = df_final['INCOME']
X = df_final.drop(columns='INCOME')

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
print(f"train data {X_train.shape}")
print(f"test data {X_test.shape}")


train data (8000, 3)
test data (2000, 3)


### Scalar

In [91]:
scaler = StandardScaler()
X_trian_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.fit_transform(X_test)

print(f"X_trian_scaler {X_trian_scaler}")
print(f"X_test_scaler {X_test_scaler}")

X_trian_scaler [[-0.18473268 -0.64512003  0.38108223]
 [-0.18473268 -0.64512003 -1.59177438]
 [-0.18473268 -0.64512003  0.43920243]
 ...
 [-0.18473268 -0.64512003 -0.23061793]
 [-0.18473268  1.55009913 -1.38184384]
 [-0.18473268 -0.64512003 -0.40531852]]
X_test_scaler [[-0.18903116 -0.64920267  1.23960228]
 [-0.18903116 -0.64920267 -0.14558539]
 [-0.18903116 -0.64920267 -1.7559418 ]
 ...
 [-0.18903116 -0.64920267 -1.69897579]
 [-0.18903116  1.54035103  1.05868641]
 [-0.18903116 -0.64920267 -1.13265638]]


### Pipelines: chaining pre-processors and estimators

In [None]:
pipe = make_pipeline(
    LinearRegression()
)

pipe.fit(X_trian_scaler, y_train)

y_pred = pipe.predict(X_test_scaler)

r2_sco = r2_score(y_pred, y_test)
print(f"R2 Score: {r2_sco:.2f}")

result = cross_validate(pipe, X_trian_scaler, y_train)
print(f"Result {result['test_score']}")


R2 Score: 0.23
Result [0.56795804 0.57296789 0.5711505  0.56322251 0.58531248]
