In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'DejaVu Sans Mono'
input_path = '../../input'

In [3]:
from sklearn.model_selection import train_test_split

df = pd.read_csv(f'../../input/default of credit card clients.csv', header=1)
df = df.rename(columns={'PAY_0': 'PAY_1'})
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['default payment next month'], random_state=42)
df_train = df_train.sort_values(by='ID').reset_index(drop=True)
df_test = df_test.sort_values(by='ID').reset_index(drop=True)

In [4]:
col_names = df_train.columns[1:-1]

X_train, X_test = df_train[col_names], df_test[col_names]
y_train, y_test = df_train['default payment next month'], df_test['default payment next month']

In [5]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin



class MyFeatureTransformer(BaseEstimator, TransformerMixin):
    pow_tfm_names = [
        'LIMIT_BAL', 'AGE',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    ]

    def __init__(self):
        super(MyFeatureTransformer, self).__init__()
        self.power_transformer1 = PowerTransformer()
        self.power_transformer2 = PowerTransformer()

    def fit(self, X, y=None):
        self.power_transformer1.fit(X[self.pow_tfm_names], y)
        delta = X.loc[:, 'BILL_AMT1':'BILL_AMT6'].values - X.loc[:, 'PAY_AMT1':'PAY_AMT6'].values
        self.power_transformer2.fit(delta)
        return self
    
    def transform(self, X, y=None):
        zero_or_not = (X.loc[:, 'BILL_AMT1':'PAY_AMT6'] == 0)
        pow_tfmed = self.power_transformer1.transform(X[self.pow_tfm_names])
        delta = X.loc[:, 'BILL_AMT1':'BILL_AMT6'].values - X.loc[:, 'PAY_AMT1':'PAY_AMT6'].values
        delta_pow_tfmed = self.power_transformer2.transform(delta)
        return np.concatenate([X, zero_or_not, pow_tfmed, delta, delta_pow_tfmed], axis=1)

model = make_pipeline(
    MyFeatureTransformer(),
    StandardScaler(),
    SVC(
        random_state=42
    )
)
model.fit(X_train, y_train)
print(accuracy_score(y_train, model.predict(X_train)))
print(accuracy_score(y_test, model.predict(X_test)))

0.8262083333333333
0.814


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import make_pipeline

class MyFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X_new = dict()
        X_new['PAY_mean'] = X.loc[:, 'PAY_1':'PAY_6'].mean(axis=1)
        X_new['PAY_std'] = X.loc[:, 'PAY_1':'PAY_6'].std(axis=1)

        pay_diff = X.loc[:, 'PAY_1':'PAY_6'].diff(axis=1).iloc[:, 1:]
        pay_diff_sign = np.sign(pay_diff)
        X_new['PAY_diff_mean'] = pay_diff.mean(axis=1)
        X_new['PAY_inc_cnt'] = (pay_diff_sign == 1.).sum(axis=1)
        X_new['PAY_dec_cnt'] = (pay_diff_sign == -1.).sum(axis=1)

        bill_amt = X.loc[:, 'BILL_AMT1':'BILL_AMT6']
        bill_amt_diff = bill_amt.diff(axis=1).iloc[:, 1:]
        X_new['BILL_AMT_diff_mean'] = bill_amt_diff.mean(axis=1)
        X_new['BILL_AMT_diff_abs_mean'] = bill_amt_diff.abs().mean(axis=1)
        X_new['BILL_AMT_net_delta'] = X['BILL_AMT1'] - X['BILL_AMT6']

        pay_amt = X.loc[:, 'PAY_AMT1':'PAY_AMT6']
        pay_amt_diff = pay_amt.diff(axis=1).iloc[:, 1:]
        X_new['PAY_AMT_net_delta'] = X['PAY_AMT1'] - X['PAY_AMT6']
        X_new['PAY_AMT_sum'] = X.loc[:, 'PAY_AMT1':'PAY_AMT6'].sum(axis=1)
        # X_new['PAY_AMT_cumsum'] = np.cumsum(pay_amt, axis=1)
        X_new['PAY_diff_mean'] = pay_amt_diff.mean(axis=1)
        X_new['PAY_diff_abs_mean'] = pay_amt_diff.abs().mean(axis=1)

        return pd.concat([X, pd.DataFrame(X_new)], axis=1)

model = make_pipeline(
    MyFeatures(),
    RandomForestClassifier(
        random_state=42
    ),
)
model.fit(X_train, y_train)
print(accuracy_score(y_train, model.predict(X_train)))
print(accuracy_score(y_test, model.predict(X_test)))

0.9994583333333333
0.8138333333333333


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous

param_grid = {
    'randomforestclassifier__n_estimators': Integer(10, 50, random_state=42),
    'randomforestclassifier__criterion': Categorical(['gini', 'entropy'], random_state=42),
    'randomforestclassifier__max_depth': Integer(2, 20, random_state=42),
    'randomforestclassifier__min_samples_split': Integer(2, 20, random_state=42),
    'randomforestclassifier__class_weight': Categorical(['balanced', None], random_state=42)
}

model = make_pipeline(
    MyFeatures(),
    RandomForestClassifier(
        random_state=42
    ),
)
ga_cv = GASearchCV(
    model,
    param_grid=param_grid,
    population_size=30,
    generations=10,
    scoring='accuracy'
)
ga_cv.fit(X_train, y_train)
accuracy_score(y_test, ga_cv.best_estimator_.predict(X_test))

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.801293	0.025092   	0.820708   	0.747833   
1  	60    	0.818135	0.00876014 	0.820708   	0.773917   
2  	60    	0.820519	0.0003703  	0.82125    	0.819375   
3  	60    	0.820769	0.000385611	0.821333   	0.819875   
4  	60    	0.821047	0.000435775	0.821708   	0.819708   
5  	60    	0.82141 	0.000240583	0.821917   	0.820958   
6  	60    	0.821579	0.000257604	0.821917   	0.82125    
7  	60    	0.82166 	0.000379919	0.821958   	0.820417   
8  	60    	0.821835	0.000168216	0.821958   	0.821167   
9  	60    	0.8219  	4.38326e-05	0.821917   	0.821708   
10 	60    	0.821849	0.000211773	0.821917   	0.820792   


GASearchCV(estimator=Pipeline(steps=[('myfeatures', MyFeatures()),
                                     ('randomforestclassifier',
                                      RandomForestClassifier(max_depth=5,
                                                             min_samples_split=10,
                                                             n_estimators=33,
                                                             random_state=42))]),
           generations=10,
           param_grid={'randomforestclassifier__class_weight': <sklearn_genetic.space.space.Categorical object at 0x0000020E864AC340>,
                       'randomforestclassifier__criterio...
                       'randomforestclassifier__max_depth': <sklearn_genetic.space.space.Integer object at 0x0000020EDB371F10>,
                       'randomforestclassifier__min_samples_split': <sklearn_genetic.space.space.Integer object at 0x0000020E8B1F61C0>,
                       'randomforestclassifier__n_estimators': <sk

In [None]:
accuracy_score(y_test, ga_cv.best_estimator_.predict(X_test))

0.8181666666666667

In [None]:
(1-y_test).mean()

0.7788333333333334