# Prepare Credit Risk Data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import sqlite3


In [5]:
# Establish connection
# Use the documented sqlite3.connect() factory and open the database
# read-only through a URI: a missing or mistyped path then fails right here
# instead of silently creating an empty database file (which would only
# surface later as a "no such table" error), and the notebook cannot
# accidentally modify the source data.
conn = sqlite3.connect("file:Resources/credit_risk.sqlite?mode=ro", uri=True)

In [6]:
# Read every row of the credit_risk table into a DataFrame
# (conn is the SQLite connection opened in the previous cell)
df = pd.read_sql(
    sql='SELECT * FROM credit_risk',
    con=conn,
)

# Preview the first five rows
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [7]:
# Separate the target column (loan_status) from the predictor columns
y = df['loan_status']
X = df.drop(columns='loan_status')

# Show both pieces so the split can be checked visually
display(X)
display(y)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...
28631,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30
28632,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19
28633,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28
28634,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26


0        0
1        1
2        1
3        1
4        1
        ..
28631    0
28632    0
28633    1
28634    0
28635    0
Name: loan_status, Length: 28636, dtype: int64

In [8]:
# Preprocessing for numerical features
# Every numeric predictor must be listed here: ColumnTransformer drops any
# column that is not assigned to a transformer (remainder='drop' is the
# default), so an omitted column is silently lost. person_emp_length was
# previously missing from both the numeric and the categorical list.
numeric_features = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# Standardise each numeric column with StandardScaler
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])


In [9]:
# Preprocessing for categorical features
categorical_features = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# One-hot encode each category; handle_unknown='ignore' makes the encoder
# tolerate categories at transform time that were not seen during fit
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [10]:
# Combine the numeric and categorical pipelines into one transformer,
# routing each pipeline to its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Sanity check: (rows, columns) of the training features
X_train.shape

(22908, 11)

In [14]:
# Sanity check: (rows, columns) of the held-out test features
X_test.shape

(5728, 11)

# Train Model


In [16]:
# Import tensorflow
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'