In [None]:
# Third-party libraries used throughout the notebook.
import numpy as np
import pandas as pd
import pylab as plot
import seaborn as sns
from matplotlib import pyplot as plt

# Show up to 100 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 100)

# Apply seaborn's default plot theme.
sns.set()

Load and check data

In [None]:
# Read the three competition CSV files: training set, test set and sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/santander-customer-transaction-prediction/train.csv',
        '../input/santander-customer-transaction-prediction/test.csv',
        '../input/santander-customer-transaction-prediction/sample_submission.csv',
    )
)


Below are the first 5 rows of the train dataset:

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

The dimensions and the number of missing values in the train dataset are shown below:

In [None]:
# Report the row count, column count and total number of NaN cells in `train`.
# The inner .sum() counts NaNs per column; the outer sum() adds those up.
print(f'Number of rows: {train.shape[0]};  Number of columns: {train.shape[1]}; No of missing values: {sum(train.isna().sum())}')

Dataset info

In [None]:
# Print pandas' concise summary of the training frame.
train.info()

Summary statistics


In [None]:
# Descriptive statistics, transposed so each original column becomes a row.
train.describe().transpose()

In [None]:
# Absolute count of each target class, ordered by class label.
target_count = train['target'].value_counts().sort_index()

# Same counts as a one-column frame, plus each class's share of all rows in percent.
target_count_df = target_count.to_frame()
target_count_df['target(%)'] = target_count / target_count.sum() * 100

# Largest class first.
target_count_df = target_count_df.sort_values('target(%)', ascending=False)
display(target_count_df)

In [None]:
# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(1, 1, figsize=(17, 8))

target_count = train['target'].value_counts().sort_index()

# Alternate two shades of blue, with exactly one colour per class present.
bar_colors = ['#1520E6' if i % 2 == 0 else '#93D1FF' for i in range(len(target_count))]

ax.bar(target_count.index, target_count, color=bar_colors,
       width=0.55,
       edgecolor='black',
       linewidth=0.7)

ax.margins(0.02, 0.05)

# Label every bar with its share of all rows (in percent), placed just above the bar top.
for class_label, class_count in target_count.items():
    ax.annotate(f'{class_count/len(train)*100:.3}', xy=(class_label, class_count+1000),
                va='center', ha='center')

# One tick per class so the x axis shows the class labels only.
ax.set_xticks(list(target_count.index))
ax.set_xlabel('target')
ax.set_ylabel('count')
ax.set_title('target Distribution', weight='bold', fontsize=15)
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.tight_layout()
plt.show()

In [None]:
# Remove the identifier column in place. errors="ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because the
# column is already gone.
train.drop(columns=["ID_code"], inplace=True, errors="ignore")


In [None]:
# Feature matrix: every column except the label.
X = train.drop(columns=['target'])
# Label vector.
y = train['target']

1. Feature Selection - Dropping constant features
In this step we remove features whose value is constant across all rows; such features carry no information and cannot help solve the problem.

In [None]:
# Fit a variance filter with threshold 0 on the feature matrix,
# so that zero-variance (constant) features can be identified.
from sklearn.feature_selection import VarianceThreshold
var_thres = VarianceThreshold(threshold=0).fit(X)
var_thres

In [None]:
# Show the fitted selector's support as a boolean mask (one entry per feature).
var_thres.get_support(indices=False)

In [None]:
# Number of True entries in the support mask, i.e. non-constant features.
var_thres.get_support().sum()

In [None]:
# Same count, obtained by selecting the kept column labels and measuring their number.
kept_columns = X.columns[var_thres.get_support()]
len(kept_columns)

In [None]:
# Columns rejected by the variance filter (the constant features), in their
# original order. Inverting the support mask once avoids recomputing the mask
# and re-scanning the kept-column index for every single column.
constant_columns = list(X.columns[~var_thres.get_support()])

print(len(constant_columns))

2. Feature Selection - With Correlation
In this step we identify features that are highly correlated with one another so that the redundant ones can be removed.

In [None]:
# Pairwise correlation matrix of all feature columns.
X.corr()

In [None]:
import seaborn as sns
# Heatmap of the feature-by-feature correlation matrix.
plt.figure(figsize=(12,10))
cor = X.corr()
# Per-cell text annotations are switched off: the matrix has one cell for every
# pair of features, and writing a number into each cell makes the figure
# unreadable and very slow to draw at this figure size.
sns.heatmap(cor, annot=False, cmap=plt.cm.CMRmap_r)
plt.show()

In [None]:
# Helper for correlation-based feature selection.

def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is flagged when the absolute value of its correlation (as computed
    by ``dataset.corr()``) with at least one column positioned before it exceeds
    ``threshold``. For every correlated pair it is therefore the later column
    that is reported; the earlier one is kept.

    Parameters
    ----------
    dataset : DataFrame whose columns are compared pairwise.
    threshold : absolute correlation above which a pair counts as correlated
        (strict ``>`` comparison).

    Returns
    -------
    set
        Names of the flagged columns.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only the columns to the left of `position` are inspected, so each
        # pair is examined once and the diagonal is never compared.
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged

In [None]:
# Features whose absolute correlation with an earlier feature exceeds 0.7.
corr_features = correlation(X, 0.7)

# Print the count explicitly: a bare expression that is not the last line of a
# cell produces no output.
print(len(corr_features))

corr_features

Below are the first 5 rows of the test dataset:

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

The dimensions and the number of missing values in the test dataset are shown below:


In [None]:
# Report the row count, column count and total number of NaN cells in `test`.
# The inner .sum() counts NaNs per column; the outer sum() adds those up.
print(f'Number of rows: {test.shape[0]};  Number of columns: {test.shape[1]}; No of missing values: {sum(test.isna().sum())}')

Summary statistics

In [None]:
# Descriptive statistics, transposed so each original column becomes a row.
test.describe().transpose()

In [None]:
# Remove the identifier column in place. errors="ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because the
# column is already gone.
test.drop(columns=["ID_code"], inplace=True, errors="ignore")

In [None]:
# Alias for the test features; this binds the same DataFrame object, not a copy.
x_test = test

In [None]:
# Preview the first five rows of the sample submission.
submission.head(n=5)

In [None]:
# Remove the identifier column in place. errors="ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because the
# column is already gone.
submission.drop(columns=["ID_code"], inplace=True, errors="ignore")

In [None]:
# Take the `target` column of the sample submission as y_test.
# NOTE(review): a sample submission presumably holds placeholder targets rather
# than ground-truth labels - verify before using y_test to score predictions.
y_test = submission['target']

In [None]:
# Classifier families
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Cross-validation utilities
from sklearn.model_selection import KFold, cross_val_score

# 10-fold splitter, shuffled with a fixed seed.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# calculate manually
def my_function(y, y_preds):
    """Print and return MAE, MSE and RMSE between targets and predictions.

    Both arguments are converted to float NumPy arrays first, so plain lists
    and tuples are accepted as well as arrays and pandas Series, and the
    subtraction is purely positional (no pandas index alignment).

    Parameters
    ----------
    y : array-like of true values.
    y_preds : array-like of predicted values, broadcastable against ``y``.

    Returns
    -------
    dict
        ``{"MAE": ..., "MSE": ..., "RMSE": ...}`` holding the printed values.
    """
    d = np.asarray(y, dtype=float) - np.asarray(y_preds, dtype=float)
    mse_f = np.mean(d**2)
    mae_f = np.mean(np.abs(d))
    rmse_f = np.sqrt(mse_f)

    print("Results by manual calculation:")
    print("MAE:",mae_f)
    print("MSE:", mse_f)
    print("RMSE:", rmse_f)

    return {"MAE": mae_f, "MSE": mse_f, "RMSE": rmse_f}


In [None]:
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    cross_val_score,
    train_test_split,
)
from lightgbm import LGBMClassifier
from matplotlib import pyplot

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))


In [None]:
import lightgbm
from sklearn.metrics import roc_auc_score

# Train a LightGBM booster on the training split and score it by AUC.
# The native training API consumes LightGBM's own Dataset container.
train_data = lightgbm.Dataset(X_train, label=y_train)
valid_data = lightgbm.Dataset(X_valid, label=y_valid)

parameters = {'objective': 'binary',
              'metric': 'auc',
              'is_unbalance': 'true',
              'boosting': 'gbdt',
              'num_leaves': 63,
              'feature_fraction': 0.5,
              'bagging_fraction': 0.5,
              'bagging_freq': 20,
              'learning_rate': 0.01,
              'verbose': -1
             }

# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of lightgbm.train() no longer exists in
# LightGBM 4, whereas the callback is available in older releases as well.
model_lgbm = lightgbm.train(parameters,
                            train_data,
                            valid_sets=[valid_data],
                            num_boost_round=5000,
                            callbacks=[lightgbm.early_stopping(stopping_rounds=50)])
y_train_pred = model_lgbm.predict(X_train)
y_valid_pred = model_lgbm.predict(X_valid)

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_valid, y_valid_pred)))


In [None]:
import xgboost

# Gradient-boosted tree classifier, monitored by AUC on the validation split.
# `early_stopping_rounds` is passed to the constructor, next to `eval_metric`:
# supplying it to fit() is deprecated in recent XGBoost releases and rejected
# by the newest ones.
model_xgboost = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      early_stopping_rounds=10,
                                      verbosity=1)

eval_set = [(X_valid, y_valid)]

model_xgboost.fit(X_train,
                  y_train,
                  eval_set=eval_set,
                  verbose=True)

In [None]:
# CatBoost: import the library and fit the model, evaluating on the validation split.
# NOTE(review): this is a regressor with an RMSE loss, fitted on the same
# y_train that the classifier above is trained on - confirm a regressor
# (rather than a classifier) is intended.
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=50,
    depth=5,
    learning_rate=0.1,
    loss_function='RMSE',
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    verbose=True,
    plot=True,
)

In [None]:
# Compare the three models with the manual error metrics.
#
# Scoring uses the hold-out validation split (X_valid / y_valid), which has
# real labels. The competition test set has no labels, and the `target` column
# of a sample submission is, as far as this notebook shows, only a placeholder,
# so errors measured against it would not reflect model quality.
print('lightgbm model')
y_preds_lgbm = model_lgbm.predict(X_valid)
my_function(y_valid, y_preds_lgbm)
print('*********************************')
print(" xgboost model")
# Use the positive-class probability so the XGBoost numbers are on the same
# scale as the continuous outputs of the other two models; predict() would
# return hard class labels instead. The result gets its own variable instead
# of overwriting the LightGBM predictions.
y_preds_xgboost = model_xgboost.predict_proba(X_valid)[:, 1]
my_function(y_valid, y_preds_xgboost)

print('*********************************')
print(" castboost   model")
y_preds_catboost = model.predict(X_valid)
# Trailing semicolon: keep the cell from echoing a return value.
my_function(y_valid, y_preds_catboost);