# **Training Notebook for Santander Dataset with AutoML**

## **1. Required Libraries & Setup**

In [None]:
# General Data Manipulation Libraries
import numpy as np; print('Numpy Version:', np.__version__)
import pandas as pd; print('Pandas Version:', pd.__version__)

# Model & Helper Libraries
from sklearn.model_selection import train_test_split

## **2. Results with Different Classifier models**

In [None]:
# Read the competition training table from the Kaggle input directory
input_dir = '/kaggle/input/santander-customer-transaction-prediction/'
df_train = pd.read_csv(input_dir + 'train.csv')

# Predictors are every column except the row identifier and the label
var_colums = [col for col in df_train.columns if col not in ('ID_code', 'target')]
X = df_train[var_colums]
y = df_train['target']

# Hold out 20% of the rows for validation (an 80-20 split); the fixed
# random_state keeps the partition identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

# Display the shapes of the four resulting pieces as a sanity check
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

### **2.1 Using LightGBM**

In [None]:
# Import LightGBM 
import lightgbm as lgb

In [None]:
# Wrap the raw numpy arrays in LightGBM's native Dataset containers.
# The validation set points at the training set through `reference`.
lgb_train = lgb.Dataset(data=X_train.values, label=y_train.values)
lgb_eval = lgb.Dataset(
    data=X_valid.values,
    label=y_valid.values,
    reference=lgb_train,
)

In [None]:
# Build the LightGBM classifier and apply the hyper-parameters used for
# the binary `target` prediction task.
clf = lgb.LGBMClassifier()
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 40,
    'learning_rate': 0.05,
    'feature_fraction': 0.5,
    'metric': 'auc',
    'bagging_fraction': 0.5,
    # LightGBM ignores bagging_fraction unless bagging_freq is non-zero,
    # so re-sample the rows at every boosting iteration.
    'bagging_freq': 1,
    # A real boolean rather than the string 'true'.
    'is_unbalance': True,
    'n_estimators': 200,
}
clf.set_params(**params)

# Show the full, effective parameter set as the cell output
clf.get_params()

In [None]:
# Train the classifier configured in the previous cell. `clf` is deliberately
# NOT re-created here: building a fresh LGBMClassifier() would silently throw
# away every hyper-parameter applied with set_params().
#
# Early stopping and per-iteration logging are passed as callbacks; the
# `early_stopping_rounds` / `verbose` keyword arguments of fit() are no longer
# accepted by LightGBM 4.x (log_evaluation needs LightGBM >= 3.3).
clf.fit(
    X_train,
    y_train,
    # Same container type as the training data (DataFrame / Series)
    eval_set=[(X_valid, y_valid)],
    eval_metric='auc',
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),  # stop after 5 rounds without AUC improvement
        lgb.log_evaluation(period=1),           # print the validation metric every iteration
    ],
)

In [None]:
# Predictions
# Predict a class label for every validation row with the fitted classifier.
y_pred=clf.predict(X_valid)
# Cell output: the distinct predicted labels and how many rows received each.
np.unique(y_pred, return_counts=True)

**View Accuracy**

In [None]:
# view accuracy
from sklearn.metrics import accuracy_score

# Score once, in scikit-learn's (y_true, y_pred) argument order, and reuse
# the stored value for the printed report line.
accuracy = accuracy_score(y_valid, y_pred)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

**Classification Metrics**

In [None]:
from sklearn.metrics import classification_report
# Print scikit-learn's text classification report for the validation labels
# versus the predictions (true labels first, predictions second).
print(classification_report(y_valid, y_pred))

**Confusion-matrix**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# scikit-learn lays the matrix out with TRUE labels on the rows and PREDICTED
# labels on the columns, ordered by label value. Passing labels=[0, 1] pins
# that order explicitly so the axis names below always line up with the data.
cm = confusion_matrix(y_valid, y_pred, labels=[0, 1])

# visualize confusion matrix with seaborn heatmap
# Rows = actual class, columns = predicted class, negative (0) first.
cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual Negative:0', 'Actual Positive:1'],
                         columns=['Predict Negative:0', 'Predict Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

### **2.2 Using H2O AutoML**

In [None]:
# Install Dependencies
!pip install requests
!pip install tabulate
!pip install "colorama>=0.3.8"
!pip install future

In [None]:
# Install and import h2o
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o
import h2o

In [None]:
from h2o.automl import H2OAutoML
# Start (or connect to) the H2O backend, requesting a 16G memory cap for it.
# NOTE(review): confirm the host has this much free RAM alongside the pandas data.
h2o.init(max_mem_size='16G')

In [None]:
# Load the training CSV into H2O as its own frame object
train_csv_path = '/kaggle/input/santander-customer-transaction-prediction/train.csv'
df_h2o = h2o.import_file(train_csv_path)

# Cell output: the Python type of the loaded frame
type(df_h2o)

In [None]:
# For binary classification, response should be a factor
df_h2o["target"] = df_h2o["target"].asfactor()

# Setting up Response and Predictor Columns
y_col = "target"
# Predictors are every column except the response and the row identifier.
# Filtering (instead of calling .remove() on the list returned by .columns)
# mirrors how the pandas feature list is built earlier in the notebook and
# does not depend on mutating a library-returned list in place.
x_col = [col for col in df_h2o.columns if col not in (y_col, "ID_code")]

In [None]:
# Configure the AutoML search: capped at 20 models and 3600 seconds of
# runtime, seeded, with info-level logging and class balancing enabled.
aml = H2OAutoML(
    max_models=20,
    max_runtime_secs=3600,
    seed=11,
    verbosity="info",
    balance_classes=True,
)
aml.train(x=x_col, y=y_col, training_frame=df_h2o)

# View the AutoML Leaderboard
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))  # rows=lb.nrows prints every row instead of the default 10

In [None]:
# Get model ids for all models in the AutoML Leaderboard
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])

# Get the "All Models" Stacked Ensemble model.
# AutoML may finish without building it (e.g. the time budget ran out), so
# fail with an explicit message instead of an opaque IndexError from `[0]`.
all_models_se_id = next(
    (mid for mid in model_ids if "StackedEnsemble_AllModels" in mid),
    None,
)
if all_models_se_id is None:
    raise RuntimeError(
        "No 'StackedEnsemble_AllModels' model found on the AutoML leaderboard; "
        "available models: {}".format(model_ids)
    )
se = h2o.get_model(all_models_se_id)

# Get the Stacked Ensemble metalearner model.
# Older h2o releases return a dict-like key (looked up via its 'name');
# newer releases return the metalearner model object itself.
metalearner_ref = se.metalearner()
if isinstance(metalearner_ref, dict):
    metalearner = h2o.get_model(metalearner_ref['name'])
else:
    metalearner = metalearner_ref

In [None]:
# Plot the metalearner's standardized coefficients.
# NOTE(review): presumably these show how strongly each base model is weighted
# in the stacked ensemble — confirm the metalearner type exposes std_coef_plot().
metalearner.std_coef_plot()