## Import Libraries

In [None]:
import pandas as pd
import numpy as np

#from sklearn.model_selection import StratifiedKFold,KFold,GroupKFold
#from sklearn.metrics import accuracy_score

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#For Missing Value and Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, f_classif, VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier

import time


## Import Data

In [None]:
# Read the competition's train and test CSV files into DataFrames in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/santander-customer-satisfaction/train.csv",
        "../input/santander-customer-satisfaction/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

## Divide Dataset into X and Y

In [None]:
# Build the label vector and the feature matrix used for the split below.
# The features are every column except TARGET (the label) and ID.
y = train['TARGET']
X = train.drop(columns=['ID', 'TARGET'])

In [None]:
# Column labels of the feature matrix, as a pandas Index.
all_features = X.columns

In [None]:
# Convert the column Index into a plain Python list of column names.
all_features = list(all_features)

In [None]:
# Partition the feature columns by dtype so each group can be routed to its
# own preprocessing branch.
#   * numerical: signed ints, unsigned ints and floats (dtype.kind i / u / f)
#   * categorical: every remaining column
# The previous "c != 'PassengerId'" filter was removed: it belongs to a
# different dataset, and a numeric column with that name would have been
# left out of both lists.
numerical_features = [
    column for column, dtype in X.dtypes.items() if dtype.kind in 'iuf'
]
_numerical_lookup = set(numerical_features)  # O(1) membership checks
categorical_features = [
    column for column in X.columns if column not in _numerical_lookup
]

In [None]:
# Display the list of columns classified as numerical.
numerical_features

In [None]:
# Display the list of columns classified as categorical.
categorical_features

In [None]:
# Splitting helper from scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Setup Pipeline 

In [None]:
# Numerical branch: impute gaps from the 2 nearest neighbours (uniform
# weights), then rescale with MinMaxScaler.
numeric_steps = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    KNNImputer(weights="uniform", n_neighbors=2),
    MinMaxScaler(),
)

# Categorical branch: replace gaps with the literal 'missing', then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = make_pipeline(
    SimpleImputer(fill_value='missing', strategy='constant'),
    OneHotEncoder(handle_unknown='ignore', categories='auto'),
)

# Route each column group to its branch.
preprocessor = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
)

In [None]:
# Full feature-preparation chain, applied in this order:
#   1. column-wise imputation / scaling / encoding (preprocessor)
#   2. VarianceThreshold with its default settings
#   3. keep the 15 features with the best f_classif score
#   4. project the kept features onto 4 principal components
feature_steps = (
    preprocessor,
    VarianceThreshold(),
    SelectKBest(score_func=f_classif, k=15),
    PCA(n_components=4),
)
preprocessor_best = make_pipeline(*feature_steps)

In [None]:
from sklearn.base import clone

# One model pipeline per classifier. Each pipeline gets its OWN unfitted copy
# of the preprocessing chain: sharing a single preprocessor_best instance
# would mean that fitting any one of p1/p2/p3 directly silently re-fits the
# transformer embedded in the other two.
p1 = make_pipeline(clone(preprocessor_best), XGBClassifier())
p2 = make_pipeline(clone(preprocessor_best), RandomForestClassifier())
p3 = make_pipeline(clone(preprocessor_best), ExtraTreesClassifier())

# Soft-voting ensemble over the three model pipelines (voting='soft' combines
# their predicted probabilities), wrapped in a pipeline of its own.
p4 = make_pipeline(
    VotingClassifier(
        estimators=[
            ("xgbclassifier", p1),
            ("randomforestclassifier", p2),
            ('extratreesclassifier', p3),
        ],
        voting='soft',
    )
)

## Grid Search

- https://stackoverflow.com/questions/46793110/using-votingclassifier-in-sklearn-pipeline/46793305

In [None]:
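# Create the param grid - xgbclassifier
# NOTE: the keys below address a pipeline whose own step is named
# "xgbclassifier" (such as p1). To tune the XGBoost model nested inside p4,
# each key needs the prefix "votingclassifier__xgbclassifier__xgbclassifier__".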
#param_grid = {
# "xgbclassifier__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
# "xgbclassifier__max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
# "xgbclassifier__min_child_weight" : [ 1, 3, 5, 7 ],
# "xgbclassifier__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
# "xgbclassifier__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]   
#}


In [None]:
#from sklearn.model_selection import RandomizedSearchCV
#voting_classfier = RandomizedSearchCV(estimator = p4, 
 #                                   param_distributions = param_grid,
 #                                   cv = 5, verbose=3, n_jobs = -1, scoring = 'roc_auc', n_iter = 5)

In [None]:
# Train the voting-ensemble pipeline on the training portion of the split.
p4.fit(X_train, y_train)

In [None]:
#p4.best_estimator_

## Accuracy

In [None]:
# Score of the fitted ensemble on each split (p4.score), reported to three
# decimals.
train_accuracy = p4.score(X_train, y_train)
test_accuracy = p4.score(X_test, y_test)
print(f'Train : {train_accuracy:.3f}')
print(f'Test : {test_accuracy:.3f}')

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC on each split. Column 1 of predict_proba is used as the score
# (presumably the probability of TARGET == 1 -- confirm against p4.classes_).
train_proba = p4.predict_proba(X_train)[:, 1]
test_proba = p4.predict_proba(X_test)[:, 1]
print(f'Train AUC : {roc_auc_score(y_train, train_proba):.3f}')
# This line used to be labelled "Train AUC" although it reports the held-out split.
print(f'Test AUC : {roc_auc_score(y_test, test_proba):.3f}')

## Submission 

In [None]:
# Select the competition test set's columns in the same order as the training
# features, then keep column 1 of the predicted probabilities.
test_features = test[X.columns]
test_pred = p4.predict_proba(test_features)[:, 1]
#test_pred = rf_RandomGrid.predict(test[X.columns])

In [None]:
# Submission table: the test-set ID column followed by the predicted score
# in a TARGET column.
AllSub = test[['ID']].assign(TARGET=test_pred)

In [None]:
#AllSub['TARGET'] = AllSub['TARGET'].apply(lambda x: 1 if x > 0.5 else 0)

In [None]:
# Write the submission table to CSV without the DataFrame index column.
AllSub.to_csv('Santander_Voting.csv', index = False)