# Feature Engineering from my Feature Engineering notebook

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import h2o
# Connect to the H2O cluster; strict_version_check is switched off for this call.
h2o.init(strict_version_check=False)

In [None]:
# Location of the credit-card default CSV under the Kaggle input directory.
data_csv = "/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv"
# Import the CSV through H2O; `data` is what the following cells inspect and rename.
data = h2o.import_file(data_csv)

In [None]:
# Summarise the freshly imported frame before any column is renamed or encoded.
data.describe()

In [None]:
# Tidy two column names in a single call:
#   PAY_0                      -> PAY_1    (consistent with PAY_2 ... PAY_6)
#   default.payment.next.month -> DEFAULT  (easier to type)
data.rename(columns={"PAY_0": "PAY_1", 'default.payment.next.month': "DEFAULT"})

# Keep the column names around; the next cell splits them into numeric and categorical.
cols_names = data.columns
cols_names

In [None]:
# Columns that are NOT treated as categorical: the identifier, the credit limit,
# the age and the six monthly bill / payment amounts.
not_categorical = (
    ['ID', 'LIMIT_BAL', 'AGE']
    + ['BILL_AMT' + str(month) for month in range(1, 7)]
    + ['PAY_AMT' + str(month) for month in range(1, 7)]
)

target = "DEFAULT"

# Everything that is neither listed above nor the target is one-hot encoded later.
categorical = [
    name for name in cols_names
    if not (name in not_categorical or name == target)
]
categorical

In [None]:
# Preview the first rows of the H2O frame with the renamed columns.
data.head()

In [None]:
# One-hot encode the categorical columns. Their labels are already numeric codes,
# so the H2O frame is converted to pandas and expanded with get_dummies.
data_pd = data.as_data_frame()
data_onehot = pd.get_dummies(data_pd, columns=categorical)
data_onehot.head()

In [None]:
# Drop the ID column: it only identifies a row.
# errors='ignore' keeps this cell idempotent. Without it, running the cell a second
# time raises KeyError because `data_onehot` has already been rebound without 'ID'.

data_onehot = data_onehot.drop(columns=['ID'], errors='ignore')

In [None]:
# List the columns of the one-hot encoded frame.
data_onehot.columns

In [None]:
# Bucket AGE into five quantile-based bins (pd.qcut), after printing its summary
# statistics for reference.
print(data_onehot['AGE'].describe())

data_onehot['AGE_BINS'] = pd.qcut(data_onehot['AGE'], q=5)

# Placeholder indicator columns for ages outside the observed range, (0, 20.999] and
# (79.0, ). They are all zero here and are named in the same format that one-hot
# encoding AGE_BINS produces later, so future data with such ages has a column to use.
for placeholder_col in ('AGE_BINS_(0, 20.999]', 'AGE_BINS_(79.0, )'):
    data_onehot[placeholder_col] = 0



In [None]:
# Preview the frame to confirm AGE_BINS and the two placeholder columns were added.
data_onehot.head()

In [None]:
# One-hot encode the AGE_BINS categories, then drop the raw AGE column they replace.
data_age = (
    pd.get_dummies(data_onehot, columns=['AGE_BINS'])
    .drop(columns=['AGE'])
)
data_age.head()

In [None]:
# Row-wise statistical features over the six monthly bill and payment amounts.

bill_amt_cols = ['BILL_AMT' + str(month) for month in range(1, 7)]
pay_amt_cols = ['PAY_AMT' + str(month) for month in range(1, 7)]

# (column suffix, DataFrame method) pairs, in the order the new columns are appended:
# mean, max, min, median, standard deviation, variance.
row_stats = [('MEAN', 'mean'), ('MAX', 'max'), ('MIN', 'min'),
             ('MED', 'median'), ('STD', 'std'), ('VAR', 'var')]
amount_groups = [('BILL_AMT', bill_amt_cols), ('PAY_AMT', pay_amt_cols)]

for stat_suffix, stat_method in row_stats:
    for group_prefix, group_cols in amount_groups:
        # Only the six source columns are selected, so features added earlier in
        # this loop never feed into later statistics.
        row_values = getattr(data_age[group_cols], stat_method)(axis=1)
        data_age[group_prefix + '_' + stat_suffix] = row_values


data_age.head()

In [None]:
# Ratio features.

# Fraction of each month's bill that was paid (PAY_AMTi / BILL_AMTi).
# Missing values are set to 0; infinite ratios are left as-is and dealt with in the next cell.
for month, (pay_col, bill_col) in enumerate(zip(pay_amt_cols, bill_amt_cols), start=1):
    data_age['PAY_FRAC_' + str(month)] = data_age[pay_col] / data_age[bill_col]
data_age = data_age.fillna(0)


# Fraction of the credit limit in use each month (BILL_AMTi / LIMIT_BAL).
for month, bill_col in enumerate(bill_amt_cols, start=1):
    data_age['USED_CREDIT' + str(month)] = data_age[bill_col] / data_age['LIMIT_BAL']
data_age = data_age.fillna(0)


data_age.head()

In [None]:
data_age['PAY_FRAC_1'].max()



# The author counted 540 infinite payment fractions. Three simple options exist:
# delete the feature, delete the rows, or set the values to zero. Which is best
# still has to be tested.

# Option used here: set them to zero. Only +inf is replaced; the author's check
# found no -inf values.

for month in range(1, 7):
    frac_col = 'PAY_FRAC_' + str(month)
    data_age[frac_col] = data_age[frac_col].replace(np.inf, 0)

In [None]:
#Scaling

#Using standard scalar scaling
#Multiple methods such as min-max scaling, standard scaling, etc. All have different advantages and depend on the distribution of data.
#Can always change this in the next iterations of the ML pipeline. Trial and error process.

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Work on a copy so the unscaled frame `data_age` stays available for comparison.
scaled_features = data_age.copy()

# Continuous columns to standardise; the one-hot indicators and the target are left alone.
col_names = (
    ['LIMIT_BAL']
    + ['BILL_AMT%d' % k for k in range(1, 7)]
    + ['PAY_AMT%d' % k for k in range(1, 7)]
    + ['PAY_FRAC_%d' % k for k in range(1, 7)]
    + ['USED_CREDIT%d' % k for k in range(1, 7)]
    + [prefix + suffix
       for suffix in ('_MEAN', '_MAX', '_MIN', '_MED', '_STD', '_VAR')
       for prefix in ('BILL_AMT', 'PAY_AMT')]
)

# NOTE(review): the scaler is fitted on every row, before any train/test split,
# so test rows contribute to the mean and variance used for scaling.
scaler = StandardScaler()
features = scaler.fit_transform(scaled_features[col_names].values)

scaled_features[col_names] = features
scaled_features

In [None]:
# Compare the distribution of four representative columns before and after scaling,
# using kernel density estimates (a non-parametric estimate of the probability density).
plot_cols = ['LIMIT_BAL', 'BILL_AMT1', 'PAY_AMT1', 'USED_CREDIT1']
scaled_df = pd.DataFrame(scaled_features, columns=plot_cols)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(6, 5))

panels = ((ax1, data_age, 'Before Scaling'), (ax2, scaled_df, 'After Standard Scaler'))
for axis, frame, heading in panels:
    axis.set_title(heading)
    for column in plot_cols:
        sns.kdeplot(frame[column], ax=axis)
plt.show()

The plots above show the effect of scaling. From here on, we work with the scaled dataframe *scaled_features*.

In [None]:
# List the columns of the scaled frame that the models below are trained on.
scaled_features.columns

# Model development

### Logistic Regression

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Separate the target (DEFAULT) from the predictors. drop() already returns a new
# frame, so no explicit copy is needed.
y_data = scaled_features['DEFAULT']
X_data = scaled_features.drop(columns=['DEFAULT'])

# Split into a 70% training set and a 30% test set.
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each run. 1234 matches the seed used for the H2O splits below.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=1234
)

In [None]:
# Fit a logistic regression with default settings on the training set.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# The fit call is the last expression, so the fitted estimator is shown as the cell output.
classifier.fit(X_train, y_train)

In [None]:
# Predict class labels for the test set; the evaluation cells below reuse y_pred.
y_pred = classifier.predict(X_test)

In [None]:
# 10-fold cross-validation of the classifier on the training set.
# The mean and spread of the fold scores are stored but not displayed.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
model_accuracy, model_standard_deviation = accuracies.mean(), accuracies.std()

In [None]:
# Confusion matrix for the test-set predictions.
# The result is stored under its own name. Assigning it to `confusion_matrix` would
# shadow the sklearn function, so any later call to confusion_matrix(...) would fail
# with "'numpy.ndarray' object is not callable".
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Labelled cross-tabulation with row/column totals, shown as the cell output.
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Print the classification report for the test-set predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score the ROC curve AND its area from the same positive-class probabilities.
# Passing hard 0/1 predictions to roc_auc_score collapses the curve to a single
# operating point, so the area in the legend would not match the plotted curve
# and would understate the model's ranking ability.
positive_proba = classifier.predict_proba(X_test)[:, 1]
area_under_curve = roc_auc_score(y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % area_under_curve)
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")

plt.show()

Incredibly small area under the ROC curve. This means the model isn't that good at discriminating, which is concerning.


### H2O Rulefit Analysis

In [None]:
# H2O RuleFit: build an H2O frame from a copy of the scaled pandas features.
# This rebinds `data`, replacing the frame imported at the top of the notebook.

data = h2o.H2OFrame(scaled_features.copy())
# Show the column types H2O assigned to the new frame.
data.types

Looks like all the 'int' variables are actually the one-hot-encoded variables. Converting them to factor:

In [None]:
# Turn every column that H2O typed as 'int' into a factor (categorical) column.
for col, h2o_type in data.types.items():
    if h2o_type == 'int':
        data[col] = data[col].asfactor()

# Display the types again to confirm the conversion.
data.types

In [None]:
# Split the H2O frame 70/30 into training and test frames; the seed fixes the split.
train, test = data.split_frame(ratios = [0.7], destination_frames=["train", "test"], seed = 1234)

In [None]:
from h2o3_rule_fit import H2ORuleFit

# Predictor columns: every column except the target.
cols = [name for name in data.columns if name != 'DEFAULT']

# RuleFit with DRF as the rule-generating algorithm; seeded for repeatable results.
# NOTE(review): the original note here reads "GLM model with Lasso regularization",
# presumably describing how RuleFit weights the extracted rules - confirm against
# the h2o3_rule_fit package.
rulefit_model = H2ORuleFit(seed=1234, algorithm="DRF")

rulefit_model.train(x=cols, y='DEFAULT', training_frame=train)

In [None]:
# Print the model intercept, then every rule with its coefficient.
intercept_value = rulefit_model.intercept.get("Intercept")
print("Intercept: " + str(round(intercept_value, 10)))
print("\n\n")

rules = rulefit_model.rule_importance
for row_idx in range(len(rules)):
    rule_row = rules.iloc[row_idx]
    coefficient_text = str(round(rule_row["coefficient"], 15))
    print("Coefficient:" + coefficient_text + "\nRule: " + rule_row["rule"] + "\n\n")

In [None]:
# Plot the variable importances reported by the RuleFit model.
rulefit_model.varimp_plot()

In [None]:
# Coverage table computed on the full frame `data` (training and test rows together).
# NOTE(review): presumably reports how many rows each rule applies to - confirm.
rulefit_model.coverage_table(data)

In [None]:
# Score the test frame, then put the true DEFAULT column beside the model output
# so both can be compared row by row.
rulefit_scores = rulefit_model.predict(test)
predictions = test["DEFAULT"].cbind(rulefit_scores)
predictions.head()

In [None]:
# Rows the model predicted as default ("1") and as non-default ("0").
positives = predictions[predictions["predict"] == "1"]
negatives = predictions[predictions["predict"] == "0"]
# Display the predicted non-default rows.
negatives

In [None]:
def _share_correct(frame):
    """Return the fraction of rows in `frame` where DEFAULT equals predict.

    Returns NaN for an empty frame, so a model that never predicts one of the
    classes produces 'nan%' in the report instead of a ZeroDivisionError.
    """
    if frame.nrow == 0:
        return float("nan")
    return frame[frame["DEFAULT"] == frame["predict"]].nrow / frame.nrow


# Of the rows predicted as default / non-default, the share where the true label agrees.
print("How many times we correctly predicted defaulted: {:.2%}".format(_share_correct(positives)))
print("How many times we correctly predicted not defaulted: {:.2%}".format(_share_correct(negatives)))

In [None]:
# Overall accuracy of the RuleFit predictions on the test frame.
print("Accuracy with RuleFit Model: {:.2%}".format(predictions[predictions["DEFAULT"] == predictions["predict"]].nrow/predictions.nrow))
# Baseline: the share of test rows whose true label is "0", i.e. the accuracy of always predicting "0".
print("Accuracy with Constant Model: {:.2%}".format(predictions[predictions["DEFAULT"] == "0"].nrow/predictions.nrow))

Some important conclusions:
1. The GLM is weak when trying to predict defaulting (low recall).
2. RuleFit doesn't seem to help much.

## Other H2O Models

### GLM

### Binomial Classification

In [None]:
# Rebuild the H2O frame from the scaled pandas features for the GLM section.
data = h2o.H2OFrame(scaled_features.copy())

# Columns H2O typed as 'int' become factors (categoricals).
for col, h2o_type in data.types.items():
    if h2o_type == 'int':
        data[col] = data[col].asfactor()

# H2O estimators take response / predictor column names rather than separate frames.
data_y = 'DEFAULT'
data_X = [name for name in data.columns if name != data_y]

# 70/30 train/test split with a fixed seed.
train, test = data.split_frame(ratios=[0.7], destination_frames=["train", "test"], seed=1234)

In [None]:
# Binomial GLM with the solver chosen automatically. No model_id is given, so the
# default is used (an explicit id would make the model easier to find in Flow).
# No validation frame is defined either, matching the default used for the other models.

from h2o.estimators.glm import H2OGeneralizedLinearEstimator

binom_model = H2OGeneralizedLinearEstimator(solver='AUTO', family='binomial')
binom_model.train(x=data_X, y=data_y, training_frame=train)

In [None]:
# Evaluate the binomial GLM on the held-out test frame.
binom_model.model_performance(test)

# TODO: work out which classification threshold the reported metrics use.

### Gradient Boosting Machine - Automatically selecting best distribution

In [None]:
# Rebuild the H2O frame from the scaled pandas features for the GBM section.
data = h2o.H2OFrame(scaled_features.copy())

# Columns H2O typed as 'int' become factors (categoricals).
for col, h2o_type in data.types.items():
    if h2o_type == 'int':
        data[col] = data[col].asfactor()

# H2O estimators take response / predictor column names rather than separate frames.
data_y = 'DEFAULT'
data_X = [name for name in data.columns if name != data_y]

# 70/30 train/test split with a fixed seed.
train, test = data.split_frame(ratios=[0.7], destination_frames=["train", "test"], seed=1234)

In [None]:
# The number of trees, max_depth and learning rate (the usual tuning parameters) are
# left at their defaults on purpose. The distribution is also left to H2O via 'AUTO'.

from h2o.estimators.gbm import H2OGradientBoostingEstimator

gb_model = H2OGradientBoostingEstimator(distribution='AUTO')
gb_model.train(x=data_X, y=data_y, training_frame=train)

In [None]:
# Score the test frame with the GBM and compute F1 / ROC AUC with scikit-learn.
predict_gb = gb_model.predict(test)

from sklearn.metrics import f1_score

# True labels as a pandas frame, shared by both metrics.
actual_default = test['DEFAULT'].as_data_frame()

f1_sc = f1_score(actual_default, predict_gb['predict'].as_data_frame())
roc_score = roc_auc_score(actual_default, predict_gb['p1'].as_data_frame())

print("F1 Score :", f1_sc)
# The second number is H2O's own AUC on the same test frame, printed for comparison.
print("ROC Score :", roc_score, gb_model.model_performance(test).auc())

In [None]:
# Full H2O performance report for the GBM on the test frame.
gb_model.model_performance(test)

In [None]:
# F1 as reported by the model itself.
# NOTE(review): no frame is passed, so this presumably reflects training metrics
# rather than the test frame - confirm.
gb_model.F1()

In [None]:
#Find out what distribution is selected - for some reason it has been remarkably difficult

### H2O's AutoML

In [None]:
# Rebuild the H2O frame from the scaled pandas features for the AutoML section.
data = h2o.H2OFrame(scaled_features.copy())

# Columns H2O typed as 'int' become factors (categoricals).
for col, h2o_type in data.types.items():
    if h2o_type == 'int':
        data[col] = data[col].asfactor()

# H2O estimators take response / predictor column names rather than separate frames.
data_y = 'DEFAULT'
data_X = [name for name in data.columns if name != data_y]

# 70/30 train/test split with a fixed seed.
train, test = data.split_frame(ratios=[0.7], destination_frames=["train", "test"], seed=1234)

In [None]:
# Run AutoML with at most 20 base models (per the author, the default runtime cap is 1 hour).
# Observed run time: about 12 minutes.

from h2o.automl import H2OAutoML

# Seeded so that repeated runs are comparable.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=data_X, y=data_y, training_frame=train)

In [None]:
# View the AutoML leaderboard of trained models.
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Show every row rather than the default first 10

In [None]:
# Keep a handle on the leaderboard's top model for the evaluation cells below.
best_aml = aml.leader

In [None]:
# Score the test frame with the AutoML leader and compute F1 / ROC AUC with scikit-learn.
predict_aml = best_aml.predict(test)

from sklearn.metrics import f1_score

# True labels as a pandas frame, shared by both metrics.
actual_default = test['DEFAULT'].as_data_frame()

f1_sc = f1_score(actual_default, predict_aml['predict'].as_data_frame())
roc_score = roc_auc_score(actual_default, predict_aml['p1'].as_data_frame())

print("F1 Score :", f1_sc)
print("ROC Score :", roc_score)

In [None]:
# F1 of the AutoML leader on the held-out test frame.
# The previous call referenced `test_data`, which is never defined (NameError), and
# passed it where the model's F1() expects thresholds. Computing the test-frame
# metrics first and asking those for F1 gives the intended number.
best_aml.model_performance(test).F1()

Some ideas for going forward:

- Compare the different models (not going to be that useful though)
- Tune the best models
- Add to sheet

In [None]:
# scikit-learn gradient boosting on the same scaled features, as a comparison
# with the H2O models above.
from sklearn.ensemble import GradientBoostingClassifier
# roc_auc_score is imported here as well, so this cell does not depend on an
# import made in a different cell.
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

y_data1 = scaled_features['DEFAULT']
X_data1 = scaled_features.drop(columns=['DEFAULT'])

# 70/30 split. random_state makes the split, and the scores below, repeatable.
# Note that this rebinds X_train / X_test / y_train / y_test / y_pred from the
# logistic-regression section.
X_train, X_test, y_train, y_test = train_test_split(
    X_data1, y_data1, test_size=0.3, random_state=1234
)

# Seed the model too, so repeated runs give the same fit.
model = GradientBoostingClassifier(random_state=1234)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
f1_sc = f1_score(y_test, y_pred)

# ROC AUC is scored on the predicted probability of the positive class.
proba = model.predict_proba(X_test)
roc_score = roc_auc_score(y_test, proba[:, 1])

print("F1 Score :", f1_sc)
print("ROC Score :", roc_score)

In [None]:
from sklearn.metrics import roc_auc_score

# Recompute the positive-class probabilities and the ROC AUC for the sklearn model.
proba = model.predict_proba(X_test)
roc_score = roc_auc_score(y_test, proba[:,1])

# Classification report for the test-set predictions (zero_division is set to 1).
print(classification_report(y_test,y_pred, zero_division=1))
# f1_sc is reused from the previous cell.
print("F1 Score :", f1_sc)
print("ROC Score :", roc_score)

# **WIP** - to be continued