# Dummy variables

## Load data

In [1]:
import pandas as pd

path = '../../../data/default_credit_card/output/simplified_features_cat.csv'
df = pd.read_csv(path)
df

Unnamed: 0,Industry,Ethnicity,Gender,Age,CivilStatus,YearsEmployed,Income,Approved
0,Industrials,White,Male,30,Married,1.25,0.000000,1
1,Materials,Black,Female,58,Married,3.04,632.793678,1
...,...,...,...,...,...,...,...,...
687,ConsumerStaples,White,Male,17,Married,0.04,662.007321,0
688,Energy,Black,Male,35,Married,8.29,0.000000,0


## Feature selection

In [2]:
target = 'Approved'

y = df[target]
X = df.drop(columns=target)

## Train test split

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

## Data preprocessing

**One hot encoding (dummy variables)**

<table>
    <tr>
        <!-- Table headers -->
        <th>Original</th>
        <th>Dummy</th>
    </tr>
    <tr>
        <!-- Table data cells containing images -->
        <td>
            <img src="src/dummy-1.png" height="150"/>
        </td>
        <td>
            <img src="src/dummy-2.png" height="150"/>
        </td>
    </tr>
</table>


In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
features = X_train.dtypes

features_categorical = features[features == 'object'].index
features_categorical

Index(['Industry', 'Ethnicity', 'Gender', 'CivilStatus'], dtype='object')

In [6]:
transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), features_categorical)
], remainder='passthrough')

In [7]:
X_train_ohe = transformer.fit_transform(X_train)

## Modelling

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(X_train_ohe, y_train)

In [9]:
X_test_ohe = transformer.transform(X_test)

In [10]:
model.score(X_test_ohe, y_test)

0.7198067632850241

In [11]:
model.score(X_train_ohe, y_train)

0.9024896265560166