In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import Data

In [None]:
# Load the BankChurners table from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/credit-card-customers/BankChurners.csv')

In [None]:
# Remove the client identifier and the two columns whose names start with
# "Naive_Bayes_Classifier_"; they are not used as features in this notebook.
data = data.drop(
    columns=[
        "CLIENTNUM",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    ]
)

data.head()

Feature Engineering

In [None]:
# Binary-encode gender: 'M' becomes 1 and 'F' becomes 0.
# Any other value is not in the mapping and therefore becomes NaN.
gender_codes = {'M': 1, 'F': 0}
data['Gender'] = data['Gender'].map(gender_codes)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Turn the Attrition_Flag labels into integer codes stored in a new 'class' column,
# then remove the original text column.
# NOTE(review): LabelEncoder numbers the labels in sorted order - confirm which
# flag value ends up as 1 before interpreting the model metrics.
le = LabelEncoder()
le.fit(data['Attrition_Flag'])
data['class'] = le.transform(data['Attrition_Flag'])
data = data.drop(columns=['Attrition_Flag'])

In [None]:
# Numerical features: every int64/float64 column except the target 'class'.
numerical_columns = data.drop(columns='class').select_dtypes(include=['int64', 'float64']).columns
# Categorical features: every column that is neither int64 nor float64.
categorical_columns = data.select_dtypes(exclude=['int64', 'float64']).columns
categorical_columns

In [None]:
# One hot encoding independent variable x
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one column of a DataFrame with its one-hot indicator columns.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``feature_to_encode`` removed and one
        ``<feature>_<level>`` column of 0/1 integers appended per level.

    Raises
    ------
    KeyError
        If ``feature_to_encode`` is not a column of ``original_dataframe``.
    """
    # `columns=` forces the encoding even for dtypes get_dummies skips by default
    # (e.g. bool); otherwise such a column would be duplicated by the concat below
    # and then silently dropped without any replacement.
    # `dtype='uint8'` keeps the indicators numeric: pandas >= 2.0 defaults to bool,
    # and mixing bool with int/float columns turns `.values` into an object array.
    dummies = pd.get_dummies(
        original_dataframe[[feature_to_encode]],
        columns=[feature_to_encode],
        dtype='uint8',
    )
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res

In [None]:
# Replace each categorical column, one at a time, with its indicator columns.
for column_name in categorical_columns:
    data = encode_and_bind(data, column_name)

data.head()

In [None]:
# Target vector: the 'class' column (kept as a pandas Series).
y = data['class']
# Feature matrix: every remaining column, converted to a NumPy array.
x = data.drop(columns='class').to_numpy()

Modelling

In [None]:
# Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the seed makes the split repeatable.
# `y` is the same 'class' Series created together with `x`.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
!pip install h2o

import h2o
from h2o.automl import H2OAutoML

h2o.init()

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler

# Two-stage rebalancing. With a float, `sampling_strategy` is the
# minority/majority ratio targeted after resampling:
#   1. oversample the minority class with Borderline-SMOTE up to a 0.3 ratio,
#   2. randomly undersample the majority class until the ratio reaches 0.6.
# Both samplers are stochastic, so they are seeded like the rest of the notebook
# (1234) to make the resampled training set reproducible across runs.
over = BorderlineSMOTE(sampling_strategy=0.3, random_state=1234)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=1234)

# Named steps in execution order: oversample first, then undersample.
steps = [('o', over), ('u', under)]

In [None]:
from imblearn.pipeline import Pipeline

# Chain the samplers so they are applied in the order given in `steps`.
pipeline = Pipeline(steps)

# Resample the training split only; the test split is not passed to the pipeline.
x_sm_us, y_sm_us = pipeline.fit_resample(x_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the resampled training data; its feature
# importances are read in a later cell.
tree_clf = DecisionTreeClassifier(random_state=1234)
tree_clf = tree_clf.fit(x_sm_us, y_sm_us)
tree_clf

In [None]:
# Column names of the feature matrix, in the same order as the columns of `x`.
feature_names = data.drop(columns='class').columns.tolist()

In [None]:
# How many of the most important features to keep.
features_to_plot = 25

# Feature positions sorted by ascending importance.
importances = tree_clf.feature_importances_
indices = np.argsort(importances)

# The tail of the ascending order holds the most important features.
top_indices = indices[-features_to_plot:]
best_vars = np.asarray(feature_names)[top_indices]
values = importances[top_indices]
best_vars

In [None]:
# Stack the resampled training rows first, followed by the untouched test rows.
# NOTE(review): the row order is the only thing separating the two sets from here
# on; a random split of the combined data would mix test rows into training.
sm_us_x = np.concatenate([x_sm_us, x_test], axis=0)
sm_us_y = np.concatenate([y_sm_us, y_test], axis=0)

In [None]:
# Build one table with the target in the first column and the features after it.
sm_us_df = pd.DataFrame(
    np.column_stack((sm_us_y, sm_us_x)),
    columns=['class', *feature_names],
)
sm_us_df.head()

In [None]:
# Send only the target and the selected top features to H2O.
hf = h2o.H2OFrame(sm_us_df[['class', *best_vars]])
hf.head()

In [None]:
# Name of the target column.
response = 'class'
# Convert the target to a factor (categorical) column.
hf[response] = hf[response].asfactor()
# Every remaining column is used as a predictor.
predictors = hf.drop(response).columns

In [None]:
# Split into train and validation by row position instead of at random.
# `hf` was built from the resampled training rows followed by the untouched test
# rows. A random split would leak test rows into training and put synthetic,
# resampled rows into validation, which inflates the validation metrics.
# NOTE(review): relies on the H2OFrame keeping the row order of `sm_us_df`.
n_train_rows = len(x_sm_us)
train = hf[0:n_train_rows, :]          # resampled training rows only
valid = hf[n_train_rows:hf.nrows, :]   # original held-out test rows only

In [None]:
# Stopping criteria: at most 20 models and at most 300 seconds of run time.
# DeepLearning is excluded because it is too slow for this budget.
aml = H2OAutoML(
    seed=1234,
    max_models=20,
    max_runtime_secs=300,
    exclude_algos=["DeepLearning"],
)

In [None]:
# Run AutoML: fit on `train`, passing `valid` as the validation frame.
aml.train(
    training_frame=train,
    validation_frame=valid,
    x=predictors,
    y=response,
)

In [None]:
# Leaderboard of the models AutoML trained; show its first 5 rows.
lb = aml.leaderboard
lb.head(5)

In [None]:
# Report the leader model's accuracy and AUC on the validation frame.
best_model = aml.leader
print('The model performance in Accuracy: {}'.format(best_model.accuracy(valid=True)))
print('The model performance in AUC: {}'.format(best_model.auc(valid=True)))

In [None]:
# Fetch the model in the third leaderboard row (row index 2) by its id.
third_model_id = lb[2, "model_id"]
m = h2o.get_model(third_model_id)

In [None]:
# Show the variable importances of model `m`, requested in pandas form.
# NOTE(review): some model types may not expose variable importances - confirm for the model picked above.
m.varimp(use_pandas=True)