# Explainable AI

In [3]:
import os
import kaggle
import pandas as pd
import io
import zipfile
import pandas as pd
import scipy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
# Import credit risk analysis data from kaggle https://www.kaggle.com/datasets/nanditapore/credit-risk-analysis
kaggle.api.authenticate()
dataset_name = "nanditapore/credit-risk-analysis"
tmp_data_save_path = r"C:\Users\praka\Acads\M_Tech\new_git_test\mlops_asmt\test-repo-asmt2\tmp"
kaggle.api.dataset_download_files(dataset_name,
    path=tmp_data_save_path, unzip=True)
df = pd.read_csv('tmp/credit_risk.csv')

Dataset URL: https://www.kaggle.com/datasets/nanditapore/credit-risk-analysis


In [20]:
df.head()  # 1.a) data collection complete

Unnamed: 0,Id,Age,Income,Home,Emp_length,Intent,Amount,Rate,Status,Percent_income,Default,Cred_length
0,0,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,0.59,Y,3
1,1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4


In [16]:
# data eda with sweetviz - autoeda tool
import sweetviz as sv
report = sv.analyze(df)
report.show_html('sweetviz_report.html')

                                             |          | [  0%]   00:00 -> (? left)

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [21]:
# Preprocess the data
df = df.drop(columns=['Id'])  # Removing the unique identifier

# Numeric feature standard scaling
numerical_features = list(df.select_dtypes(include=['float64', 'int64']).columns)
numerical_features.remove('Status')
print('numeric features in df {}'.format(numerical_features))
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# One-hot encoding categorical variables
df = pd.get_dummies(df, columns=['Home', 'Intent', 'Default'])
print('string features in df {}'.format(['Home', 'Intent', 'Default']))

# Handling missing values with SimpleImputer
imputer = SimpleImputer(strategy='mean')  # Using mean for imputation
data_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Splitting data into features (X) and target (y)
X = data_imputed.drop(columns=['Status'])
y = data_imputed['Status']
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric features in df ['Age', 'Income', 'Emp_length', 'Amount', 'Rate', 'Percent_income', 'Cred_length']
string features in df ['Home', 'Intent', 'Default']


In [22]:
X_train.head()

Unnamed: 0,Age,Income,Emp_length,Amount,Rate,Percent_income,Cred_length,Home_MORTGAGE,Home_OTHER,Home_OWN,Home_RENT,Intent_DEBTCONSOLIDATION,Intent_EDUCATION,Intent_HOMEIMPROVEMENT,Intent_MEDICAL,Intent_PERSONAL,Intent_VENTURE,Default_N,Default_Y
32377,5.712903,-0.323881,-0.67342,-0.757573,0.024165,-0.657458,4.487315,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1338,-0.273252,-0.646554,-1.156213,-0.172315,1.67828,1.496501,-0.691554,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7047,-0.745843,-0.243213,-0.432024,1.014021,0.647544,1.3092,-0.691554,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8225,-0.903374,-0.16248,0.292165,-0.56776,-0.966452,-0.563808,-0.444942,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7178,-0.588313,0.20853,-0.432024,-0.409582,0.47164,-0.751109,-0.691554,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [23]:
y_train.head()

32377    0.0
1338     1.0
7047     0.0
8225     0.0
7178     0.0
Name: Status, dtype: float64

In [24]:
X_test.head()

Unnamed: 0,Age,Income,Emp_length,Amount,Rate,Percent_income,Cred_length,Home_MORTGAGE,Home_OTHER,Home_OWN,Home_RENT,Intent_DEBTCONSOLIDATION,Intent_EDUCATION,Intent_HOMEIMPROVEMENT,Intent_MEDICAL,Intent_PERSONAL,Intent_VENTURE,Default_N,Default_Y
14668,-0.588313,-0.614287,0.292165,0.064952,-0.1980293,1.777452,-0.938167,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
24614,-0.115722,-0.033475,-1.156213,0.064952,1.314128,-0.095556,1.034735,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
11096,-0.273252,0.095594,1.257751,1.014021,3.067403e-16,0.466346,-0.691554,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
10424,-0.745843,-0.614352,0.533562,0.064952,3.067403e-16,1.777452,-0.938167,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
26007,0.356869,-0.348082,-0.67342,0.539487,1.638161,1.121899,0.048284,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [25]:
y_test.head()

14668    0.0
24614    0.0
11096    0.0
10424    1.0
26007    1.0
Name: Status, dtype: float64

In [34]:
y_train_df = y_train.to_frame(name='y_train')
y_test_df = y_test.to_frame(name='y_test')

In [35]:
X_train.to_parquet('tmp/X_train.parquet', index=False)
X_test.to_parquet('tmp/X_test.parquet', index=False)
y_train_df.to_parquet('tmp/y_train.parquet', index=False)
y_test_df.to_parquet('tmp/y_test.parquet', index=False)