In [None]:
!pip install feature-engine

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from feature_engine.selection import DropDuplicateFeatures
from feature_engine.selection import DropCorrelatedFeatures
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import time
from imblearn.over_sampling import RandomOverSampler

# Seeded NumPy generator; handed to train_test_split as its random_state.
rng = np.random.RandomState(seed=42)

# Locations of the competition CSV files (train and submission/test sets).
_data_dir = '../input/santander-customer-satisfaction'
train_path = f'{_data_dir}/train.csv'
test_path = f'{_data_dir}/test.csv'

preprocessing

In [None]:
# Load the training data; the ID column is dropped before any further processing.
data_train = pd.read_csv(train_path, sep=',').drop(columns=['ID'])

# 1) Drop exact duplicate rows.
data_train = data_train.drop_duplicates()
# 2) Drop *every* row whose values in all-but-the-last column occur more than once
#    (keep=False). After step 1 such rows can differ only in the last column
#    (presumably TARGET -- confirm column order), i.e. they carry conflicting labels.
data_train = data_train.drop_duplicates(
    subset=data_train.columns[:-1], keep=False)
# Replace the -999999 sentinel in var3 with 2.
data_train['var3'] = data_train['var3'].replace(-999999, 2)

# Stratified 80/20 split on TARGET.
X_train, X_test, y_train, y_test = train_test_split(data_train.drop('TARGET', axis=1), data_train.TARGET, train_size=0.8,
                                                    stratify=data_train.TARGET, random_state=rng)

# Drop features whose pairwise Pearson correlation exceeds 0.9.
# The selector is fitted on the training split only and then applied to both splits.
correlated = DropCorrelatedFeatures(
    variables=None, method='pearson', threshold=0.9)
correlated.fit(X_train)
X_train = correlated.transform(X_train)
X_test = correlated.transform(X_test)
print('delete columns with high corr: done!')

# Drop low-variance features. The threshold 0.9 * (1 - 0.9) = 0.09 equals the
# variance p * (1 - p) of a Bernoulli variable with p = 0.9.
var = VarianceThreshold(threshold=.9 * (1 - .9))
var.fit(X_train)
# Select by column label so both splits stay DataFrames with named columns;
# the fitted support mask is computed once and reused for both splits.
kept_columns = X_train.columns[var.get_support()]
X_train = X_train[kept_columns]
X_test = X_test[kept_columns]
print('delete columns with low var: done!')

# Keep unscaled DataFrame copies (with column names) for later column look-ups.
X_train_clean = X_train.copy()
X_test_clean = X_test.copy()

# Standardize: fit the scaler on the training split only, then apply it to both.
# From here on X_train / X_test are NumPy arrays, not DataFrames.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

SelectKBest

In [None]:
# Keep the 60 features with the highest ANOVA F-score (f_classif),
# scored on the standardized training matrix.
selector_fc = SelectKBest(score_func=f_classif, k=60)
selector_fc.fit(X_train, y_train)
mask_selected = selector_fc.get_support()
# The mask is positional: X_train was produced by scaling the frame that
# X_train_clean is a copy of, so its columns map one-to-one onto the names here.
selected_col = X_train_clean.columns[mask_selected]


# Data for final training: unscaled values of the selected features.
X_train_k = X_train_clean[selected_col]
y_train_k = y_train

# Submission set: keep the IDs for the output file and the same selected features.
df_test = pd.read_csv(test_path)
id_test = df_test['ID']
# Apply the same var3 sentinel cleanup that was applied to the training data,
# so the model never sees the raw -999999 value at prediction time.
if 'var3' in df_test.columns:
    df_test['var3'] = df_test['var3'].replace(-999999, 2)
x_test = df_test[selected_col]

XGB

In [None]:
# Train a gradient-boosted tree model on the selected features and write the submission.
d_train = xgb.DMatrix(X_train_k, label=y_train_k)
# Only the training set is monitored, so the printed AUC is a training metric.
watchlist = [(d_train, 'train')]

# Booster parameters. The former 'verbose' and 'maximise' entries were removed:
# they are not XGBoost booster parameters, so they had no effect on training
# (recent versions only emit a "might not be used" warning for them).
params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'eta': 0.03,
    'max_depth': 4,
    'min_child_weight': 3,
    'subsample': 0.65,
    'colsample_bytree': 0.65,
    'alpha': 1e-2,             # L1 regularization on leaf weights
    'scale_pos_weight': 24,    # up-weights the positive class
}

clf = xgb.train(params, d_train, num_boost_round=250, evals=watchlist)

# Predict on the submission set and save it in ID/TARGET format.
d_test = xgb.DMatrix(x_test)
y_pred = clf.predict(d_test)
submission = pd.DataFrame({"ID": id_test, "TARGET": y_pred})
submission.to_csv("submissionhy-pure-code.csv", index=False)