## Importing required libraries and data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# Read the competition's training CSV into a DataFrame and preview the
# first five rows as the cell output.
df = pd.read_csv(
    '../input/santander-customer-satisfaction/train.csv',
)
df.head(n=5)

In [None]:
# Remove the 'ID' column from the training frame in place, then display the
# resulting (rows, columns) tuple as the cell output.
df.drop(columns=['ID'], inplace=True)
df.shape

## Sampling and Feature Processing
- Using a mixed sampling approach this time: first under-sample the rows with target=0 down to 15,000, then over-sample the rows with target=1 up to 15,000.
- Also applying PCA that retains 99% of the variance, because the number of columns is too large to handle otherwise.

In [None]:
# Split the training frame by class label.
df0 = df.loc[df['TARGET'] == 0]
df1 = df.loc[df['TARGET'] == 1]

# Seed NumPy's global RNG so the undersample below is reproducible.
# `np.random.seed` must be *called*: assigning to it (`np.random.seed = 2021`)
# seeds nothing and replaces the function with an int for the whole session.
np.random.seed(2021)

# Undersample the TARGET == 0 rows: pick 15000 distinct index labels.
idx_0 = np.random.choice(df0.index, size=15000, replace=False)
len(idx_0)

In [None]:
# Keep only the sampled TARGET == 0 rows, stack them on top of all
# TARGET == 1 rows, and separate the features from the label column.
df0 = df0.loc[idx_0]
df_balanced = pd.concat([df0, df1], axis=0)
y_bal = df_balanced['TARGET']
X_bal = df_balanced.drop(columns='TARGET')

In [None]:
# Oversample with SMOTE (fixed seed, default sampling strategy) and replace
# the features/labels with the resampled versions. The class counts are
# displayed as the cell output to check the resulting balance.
smote = SMOTE(random_state=2021)
X_bal, y_bal = smote.fit_resample(
    X_bal,
    y_bal,
)
y_bal.value_counts()

In [None]:
# Standardise the resampled features. The fitted scaler `sc` is reused later
# to transform the test set with the same statistics.
sc = StandardScaler()
sc.fit(X_bal)
X_sc_bal = sc.transform(X_bal)

In [None]:
# Project the scaled features with PCA. A float n_components of 0.99 asks
# for as many components as are needed to reach that fraction of explained
# variance. The fitted `pca` is reused later on the test set.
pca = PCA(n_components=0.99)
X_sc_bal_pca = pca.fit_transform(X_sc_bal)
X_sc_bal_pca.shape

## Importing and processing Test Data

In [None]:
# Load the test set. `test_df` keeps the 'ID' column because it is needed
# for the submission file; the features go through the scaler and PCA that
# were already fitted on the training data (transform only, no refit).
test_df = pd.read_csv(
    '../input/santander-customer-satisfaction/test.csv',
)
X_test = test_df.drop(columns='ID')
X_sc_test = sc.transform(X_test)
X_sc_test_pca = pca.transform(X_sc_test)
X_sc_test_pca.shape

## Fitting model and predicting values

In [None]:
# Stratified 5-fold split, shuffled with a fixed seed.
kfold = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)

# Search grid: 4 learning rates x 5 depths (10, 8, 6, 4, 2) x 2 ensemble
# sizes = 40 candidates, each fitted once per fold.
parameters = {'learning_rate' : [0.4,0.3,0.2,0.1],
              'max_depth' : np.arange(10,1,-2),
              'n_estimators' : [100,50]}

# `verbosity=0` is what silences XGBoost's own logging; the legacy `silent`
# flag it supersedes is no longer passed.
clf = XGBClassifier(random_state=2021,
                    n_jobs=8,
                    use_label_encoder=False,
                    verbosity=0)

# Exhaustive grid search scored by ROC AUC; verbose=3 prints per-fit progress.
cv = GridSearchCV(clf,
                  param_grid=parameters,
                  cv=kfold,
                  scoring='roc_auc',
                  verbose=3)

# NOTE(review): the training matrix was oversampled with SMOTE *before* this
# cross-validation split, so validation folds contain synthetic rows built
# from the full resampled set. `best_score_` is therefore likely optimistic
# compared with the score on untouched data.
cv.fit(X_sc_bal_pca,y_bal)

# Full per-candidate results, kept in a DataFrame for inspection.
df_cv = pd.DataFrame(cv.cv_results_)
print(cv.best_params_)
print(cv.best_score_)

In [None]:
# Class probabilities from the refitted best estimator; column 1 is used as
# the predicted TARGET value.
y_pred = cv.predict_proba(X_sc_test_pca)

# Assemble the two-column submission: stack IDs and probabilities as rows,
# transpose them into columns, restore integer IDs, and write the CSV.
out = pd.DataFrame([test_df['ID'].values, y_pred[:, 1]]).transpose()
out.columns = ['ID', 'TARGET']
out = out.astype({'ID': 'int'})
out.to_csv('submission.csv', index=False)