## Import Libraries

In [None]:
import pandas as pd
import numpy as np
#from catboost import CatBoostClassifier
#from sklearn.model_selection import StratifiedKFold,KFold,GroupKFold
#from sklearn.metrics import accuracy_score

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#For Missing Value and Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, f_classif, VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import time


## Import Data

In [None]:
# Locations of the competition's training and test CSV files.
train_csv_path = "../input/santander-customer-satisfaction/train.csv"
test_csv_path = "../input/santander-customer-satisfaction/test.csv"

# Read both files into DataFrames.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the training frame (shown as the notebook cell output).
train.head()

## Divide Dataset into X and Y

In [None]:
# Separate the label column from the predictors.
# 'ID' and 'TARGET' are both removed from the feature matrix.
y = train['TARGET']
X = train.drop(columns=['ID', 'TARGET'])

In [None]:
# Column labels of the feature matrix (still a pandas Index at this point).
all_features = X.columns

In [None]:
# Materialise the column labels as a plain Python list.
all_features = list(all_features)

In [None]:
# Partition the feature columns by dtype kind: signed-integer ('i') and
# float ('f') columns are numerical, everything else is categorical.
# NOTE(review): the 'PassengerId' exclusion looks like a leftover from another
# dataset; it is kept so the resulting lists are unchanged.
numeric_kinds = ('i', 'f')
numerical_features = []
categorical_features = []
for col_name, col_dtype in X.dtypes.items():
    if col_dtype.kind not in numeric_kinds:
        categorical_features.append(col_name)
    elif col_name != 'PassengerId':
        numerical_features.append(col_name)

In [None]:
# Display the columns classified as numerical (notebook cell output).
numerical_features

In [None]:
# Display the columns classified as categorical (notebook cell output).
categorical_features

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# partition reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Setup Pipeline 

In [None]:
# Numerical columns: impute gaps from the 2 nearest neighbours (uniform
# weights), then rescale with MinMaxScaler.
numeric_steps = make_pipeline(
    KNNImputer(n_neighbors=2, weights="uniform"),
    MinMaxScaler(),
)

# Categorical columns: fill gaps with the constant 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

# Route each column group through its own sub-pipeline.
preprocessor = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
)

In [None]:
# Feature-reduction stages applied after the column-wise preprocessing:
# drop constant features, keep the 10 best by ANOVA F-score, then project
# onto 3 principal components.
reduction_steps = [
    VarianceThreshold(),
    SelectKBest(f_classif, k=10),
    PCA(n_components=3),
]
preprocessor_best = make_pipeline(preprocessor, *reduction_steps)

In [None]:
# Full model: preprocessing/reduction pipeline followed by a random forest.
# make_pipeline names the final step 'randomforestclassifier', which is the
# prefix the hyperparameter search keys rely on.
base_forest = RandomForestClassifier(n_estimators=100)
RF_Model = make_pipeline(preprocessor_best, base_forest)

## Randomized Hyperparameter Search

In [None]:
# Candidate values for the random-forest hyperparameter search.

# Number of trees in random forest: 50 evenly spaced values from 100 to 1000,
# truncated to integers.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 50)]
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3 (for classifiers
# it was an alias of 'sqrt'), so only currently valid options are listed.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2, 4, 6, 8]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Search space for the forest step. Each key is the pipeline step name
# ('randomforestclassifier') joined to the parameter name with a double
# underscore, so the values are routed to the final estimator.
param_grid = dict(
    randomforestclassifier__n_estimators=n_estimators,
    randomforestclassifier__max_features=max_features,
    randomforestclassifier__max_depth=max_depth,
    randomforestclassifier__min_samples_split=min_samples_split,
    randomforestclassifier__min_samples_leaf=min_samples_leaf,
    randomforestclassifier__bootstrap=bootstrap,
)
print(param_grid)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the forest hyperparameters: 5 sampled candidates,
# each evaluated with 5-fold cross-validation and ranked by ROC AUC, using
# all available cores.
rf_RandomGrid = RandomizedSearchCV(
    estimator=RF_Model,
    param_distributions=param_grid,
    n_iter=5,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the randomized hyperparameter search on the training split.
rf_RandomGrid.fit(X_train, y_train)

In [None]:
# Display the best pipeline found by the search (notebook cell output).
rf_RandomGrid.best_estimator_

## Model Score (ROC AUC)

In [None]:
# Report the search object's score on each split. The search was configured
# with scoring='roc_auc', so these figures are ROC AUC values.
train_score = rf_RandomGrid.score(X_train, y_train)
print(f'Train : {train_score:.3f}')
test_score = rf_RandomGrid.score(X_test, y_test)
print(f'Test : {test_score:.3f}')

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC on each split, computed from the predicted probability in the
# second class column of predict_proba.
train_auc = roc_auc_score(y_train, rf_RandomGrid.predict_proba(X_train)[:, 1])
print(f'Train AUC : {train_auc:.3f}')
# The held-out split was previously mislabelled as "Train AUC".
test_auc = roc_auc_score(y_test, rf_RandomGrid.predict_proba(X_test)[:, 1])
print(f'Test AUC : {test_auc:.3f}')

## Submission 

In [None]:
# Score the competition test set using the same feature columns, in the same
# order, as the training matrix. The probability from the second class column
# is kept; hard labels could be taken from .predict() instead.
submission_features = test[X.columns]
test_pred = rf_RandomGrid.predict_proba(submission_features)[:, 1]

In [None]:
# Submission table: one row per test ID with its predicted probability.
submission_columns = {'ID': test['ID'], 'TARGET': test_pred}
AllSub = pd.DataFrame(submission_columns)

In [None]:
#AllSub['TARGET'] = AllSub['TARGET'].apply(lambda x: 1 if x > 0.5 else 0)

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
AllSub.to_csv('Santander_RF_Better_Pipe.csv', index = False)