In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split from the competition input folder.
df_train=pd.read_csv('/kaggle/input/santander-customer-satisfaction/train.csv')
# Trailing expression: renders the first rows as the cell output.
df_train.head()

In [None]:
# Distinct values present in the training TARGET column.
df_train['TARGET'].unique()

In [None]:
# Load the test split from the competition input folder.
df_test=pd.read_csv('/kaggle/input/santander-customer-satisfaction/test.csv')
# Trailing expression: renders the first rows as the cell output.
df_test.head()

In [None]:
# Stack test and train into a single frame for shared preprocessing. Each row is
# tagged with its origin in an "ind" column so the splits can be separated later.
df_merge = pd.concat(
    frame.assign(ind=origin)
    for origin, frame in (("test", df_test), ("train", df_train))
)
df_merge.head()

In [None]:
# (rows, columns) of the combined frame; the column count includes the helper "ind" column.
df_merge.shape

In [None]:
#Get count of missing values in each column
def get_cols_with_missing_values(DataFrame):
    """Return the per-column count of missing values, keeping only columns that have any.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the number of null entries in
        that column. Columns without nulls are omitted.
    """
    null_counts = DataFrame.isna().sum()
    has_missing = null_counts.gt(0)
    return null_counts.loc[has_missing]

# Show which columns of the combined frame contain missing values, with their counts.
get_cols_with_missing_values(df_merge)

In [None]:
# Columns stored with the generic "object" dtype are treated as categorical.
cat_cols = [name for name, dtype in df_merge.dtypes.items() if dtype == "object"]
print(cat_cols)

In [None]:
# Every column whose dtype is not "object" is treated as numerical.
num_cols = [name for name, dtype in df_merge.dtypes.items() if dtype != "object"]
print(num_cols)

In [None]:
# Split the combined frame back into its test and train parts using the "ind" tag.
# Boolean-mask selection returns a derived frame; dropping columns from it with
# inplace=True can raise SettingWithCopyWarning, so new frames are built instead.
is_test_row = df_merge["ind"].eq("test")
is_train_row = df_merge["ind"].eq("train")

# The test part loses TARGET together with the helper column; train keeps TARGET.
test = df_merge.loc[is_test_row].drop(columns=["TARGET", "ind"])
train = df_merge.loc[is_train_row].drop(columns=["ind"])

In [None]:
# Preview the training rows recovered from the combined frame.
train.head()

In [None]:
# Preview the test rows recovered from the combined frame.
test.head()

In [None]:
# Separate the label column from the predictors.
y = train['TARGET']
X = train.drop(columns=['TARGET'])

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=0
)

**Variance Threshold - Feature Selection**

In [None]:
from sklearn.feature_selection import VarianceThreshold as VT
# threshold=0 keeps only features whose variance on the fitted data is non-zero.
var_thres = VT(threshold=0)
# Fitted on the training split only; validation and test frames are not used here.
var_thres.fit(X_train)

In [None]:
# get_support() marks the columns the selector keeps; its complement is the
# list of constant-valued features.
kept_mask = var_thres.get_support()
const_cols = X_train.columns[~kept_mask].tolist()
print(const_cols)

In [None]:
# Drop the constant-valued features from every split.
# Reassigning instead of using inplace=True avoids mutating frames that were
# derived by slicing (which can raise SettingWithCopyWarning). `columns=` already
# selects the column axis, so the redundant `axis=1` is removed.
X_train = X_train.drop(columns=const_cols)
X_valid = X_valid.drop(columns=const_cols)
test = test.drop(columns=const_cols)

In [None]:
# Shapes after removing constant features; the three frames are expected to share a column count.
X_train.shape, X_valid.shape, test.shape

**Pearson Correlation - Feature Selection**

In [None]:
def correlation(dataset, threshold, absolute=False):
    """Return the names of columns that are highly correlated with an earlier column.

    For every column, its pairwise correlation (as computed by
    ``dataset.corr()``) with each column that precedes it is compared against
    ``threshold``. A column is reported when at least one of those values
    exceeds the threshold, so of any correlated pair only the later column is
    flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        A correlation strictly greater than this value counts as "high".
    absolute : bool, default False
        When True the magnitude of the coefficient is compared, so strongly
        negative correlations are flagged as well. The default keeps the
        historical behaviour of only flagging positive correlations.

    Returns
    -------
    set
        Column names to consider dropping.
    """
    corr_matrix = dataset.corr()
    if absolute:
        corr_matrix = abs(corr_matrix)

    names = list(corr_matrix.columns)
    # Row i restricted to columns [0, i) covers exactly the pairs (i, j) with
    # j < i, one slice per column instead of one lookup per pair. NaN entries
    # compare as False and are therefore never flagged.
    return {
        names[i]
        for i in range(len(names))
        if (corr_matrix.iloc[i, :i] > threshold).any()
    }

# Flag every training feature whose correlation with an earlier column exceeds 0.85.
corr_features = correlation(dataset=X_train, threshold=0.85)
print("Features with high correlation ", corr_features)

In [None]:
# Drop the highly correlated features from every split.
# Reassigning instead of using inplace=True avoids mutating frames that were
# derived by slicing (which can raise SettingWithCopyWarning). `columns=` already
# selects the column axis, so the redundant `axis=1` is removed.
corr_cols_to_drop = list(corr_features)
X_train = X_train.drop(columns=corr_cols_to_drop)
X_valid = X_valid.drop(columns=corr_cols_to_drop)
test = test.drop(columns=corr_cols_to_drop)

In [None]:
# Shapes after removing correlated features; the three frames are expected to share a column count.
X_train.shape, X_valid.shape, test.shape

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Mutual information between each remaining feature and the label.
# The estimator uses random numbers internally; fixing random_state makes the
# scores repeatable across kernel restarts.
mutual_info = mutual_info_classif(X_train, y_train, random_state=0)
mutual_info

In [None]:
# Label each score with its feature name and list the strongest features first.
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)

In [None]:
#select top 10 features
from sklearn.feature_selection import SelectKBest


def _seeded_mutual_info(X, y):
    """Score features with mutual information using a fixed seed.

    mutual_info_classif is randomised; without a seed the ten selected
    features can change between runs of this cell.
    """
    return mutual_info_classif(X, y, random_state=0)


sel_ten_features = SelectKBest(_seeded_mutual_info, k=10)
# Missing values are replaced by 0 before scoring.
sel_ten_features.fit(X_train.fillna(0), y_train)
# Names of the ten selected features (cell output).
X_train.columns[sel_ten_features.get_support()]

In [None]:
# The ten features used for modelling, defined once and applied to every split
# so the three frames cannot drift apart.
# NOTE(review): this list looks like it was copied from the SelectKBest output;
# re-check it whenever the feature selection is re-run.
selected_features = [
    'var15', 'ind_var5_0', 'ind_var5', 'ind_var30_0', 'ind_var30',
    'num_var4', 'saldo_var5', 'saldo_var30', 'saldo_medio_var5_hace2',
    'saldo_medio_var5_hace3',
]

X_train2 = X_train[selected_features]
X_valid2 = X_valid[selected_features]
test2 = test[selected_features]

In [None]:
# Sanity check: all three frames were built from the same column list, so their column counts should match.
X_train2.shape, X_valid2.shape, test2.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature range on the training split only, then apply that same
# mapping to both the training and the validation features.
scaler = MinMaxScaler()
scaler.fit(X_train2)
X_train_scaled = scaler.transform(X_train2)
X_valid_scaled = scaler.transform(X_valid2)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.ensemble import GradientBoostingClassifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [None]:
# Candidate values for the hyper-parameter search, keyed by XGBoost parameter name.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=list(range(5, 21)),            # 5 through 20
    min_child_weight=list(range(1, 10, 2)),  # odd values 1 through 9
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.6, 0.7],
)

In [None]:
# XGBoost classifier created with no explicit arguments; it is the base estimator handed to the hyper-parameter search.
classifier=xgboost.XGBClassifier()

In [None]:
# Randomized search: 5 sampled parameter settings, each evaluated with 5-fold
# cross-validation, using all available cores.
# random_state pins which settings are sampled so the search is repeatable;
# without it every run explores a different set of candidates.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    cv=5,
    n_jobs=-1,
    verbose=0,
    random_state=0,
)

In [None]:
# Run the hyper-parameter search on the scaled training features.
random_search.fit(X_train_scaled, y_train)

In [None]:
# Score the search's best model on the training split and on the held-out
# validation split (the second line is labelled "test score" in the output).
train_score = random_search.score(X_train_scaled, y_train)
valid_score = random_search.score(X_valid_scaled, y_valid)
print(f"train score: {train_score!s}")
print(f"test score: {valid_score!s}")

In [None]:
# Best model found by the search (rendered as the cell output).
random_search.best_estimator_

In [None]:
# Parameter combination selected by the search (rendered as the cell output).
random_search.best_params_

In [None]:
# Final model with fixed hyper-parameters.
# The previous call was a pasted estimator repr. It carried arguments that only
# restate library defaults as None/empty values or select the device/predictor
# (enable_categorical, gpu_id, importance_type, interaction_constraints,
# monotone_constraints, predictor, validate_parameters, verbosity). Those are
# version-specific and are omitted so the library defaults apply.
classifier = xgboost.XGBClassifier(
    # The five parameters covered by the search grid.
    colsample_bytree=0.6,
    gamma=0.0,
    learning_rate=0.05,
    max_depth=10,
    min_child_weight=1,
    # Remaining settings that influence training, kept explicit.
    base_score=0.5,
    booster='gbtree',
    tree_method='exact',
    n_estimators=100,
    subsample=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    max_delta_step=0,
    num_parallel_tree=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    # Execution settings.
    n_jobs=4,
    random_state=0,
)

In [None]:
# Train the final classifier on the scaled training features.
classifier.fit(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict labels for the validation split and tabulate them against the true labels.
y_pred = classifier.predict(X_valid_scaled)
valid_confusion = confusion_matrix(y_valid, y_pred)
print(valid_confusion)

In [None]:
#Get test predictions
# The classifier was fitted on MinMax-scaled features, so the test features must
# pass through the same fitted scaler; predicting on the raw values would feed
# the model inputs on a different scale from the one it was trained on.
test_scaled = scaler.transform(test2)
pred_test = classifier.predict(test_scaled)
# NOTE(review): this writes hard class labels; if the competition is scored on a
# ranking metric, classifier.predict_proba(test_scaled)[:, 1] may be the better
# submission. Confirm against the competition rules.

# Save predictions in format used for competition scoring
# The identifier comes from the ID column of the raw test file. test2 keeps the
# row order of df_test, and its positional index is not the record ID.
# NOTE(review): assumes test.csv has an "ID" column - confirm.
assert len(df_test) == len(pred_test), "prediction count does not match test rows"
output = pd.DataFrame({'ID': df_test['ID'].to_numpy(),
                       'TARGET': pred_test})
output.to_csv('submission.csv', index=False)