In [None]:
!pip install feature-engine

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection
from feature_engine import transformation as vt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/santander-customer-satisfaction/train.csv",
        "../input/santander-customer-satisfaction/test.csv",
    )
)

In [None]:
# (rows, columns) of the raw training and test frames.
(train.shape, test.shape)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# How many columns there are of each dtype.
train.dtypes.value_counts()

In [None]:
# Count missing values per column, then tally how many columns share each count
# (a single entry at 0 means no column has missing values).
missing_per_column = train.isna().sum()
missing_per_column.value_counts()

In [None]:
# Separate the label from the features by column name rather than by position,
# so the split stays correct even if TARGET is not the last column.
# NOTE(review): any identifier column (e.g. `ID`, which the test set is later
# indexed by) stays among the features — confirm that is intended.
x_train = train.drop(columns="TARGET")
y_train = train["TARGET"]

In [None]:
# Hold out 30% of the labelled rows for validation. The split is stratified on
# the target so that both parts keep the same class proportions, which matters
# because the target is unbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=42,
    stratify=y_train,
)

In [None]:
# (rows, columns) of the training and validation feature frames.
(X_train.shape, X_test.shape)

# Target Distribution

In [None]:
# Histogram of the training labels.
Y_train.plot(kind="hist")

In [None]:
# Number of training rows per target class.
Y_train.value_counts()

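The target classes are highly imbalanced, so plain accuracy would be misleading. The scikit-learn models below use `class_weight="balanced"`, and every model is scored with ROC-AUC.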

## Dropping Constant and Quasi-Constant Features

In [None]:
# Learn on the training split which features are (quasi-)constant at tol=0.95,
# then remove those same columns from every data set.
disc = DropConstantFeatures(tol=0.95)
disc.fit(X_train)
X_train, X_test, test = (
    disc.transform(frame) for frame in (X_train, X_test, test)
)

In [None]:
# Shape of the training features after the constant-feature filter.
X_train.shape

## Dropping Duplicate Features

In [None]:
# Identify duplicated columns on the training split and drop them from all
# three data sets so their columns stay aligned.
disc1 = DropDuplicateFeatures()
disc1.fit(X_train)
X_train, X_test, test = [
    disc1.transform(frame) for frame in (X_train, X_test, test)
]

In [None]:
# Shape of the training features after the duplicate-feature filter.
X_train.shape

## Dropping Correlated Features (Smart)

In [None]:
# Selector configured with selection_method="model_performance" and a logistic
# regression estimator; it is fitted on the training split and labels only.
clf = LogisticRegression(random_state=42)
disc2 = SmartCorrelatedSelection(
    selection_method="model_performance",
    estimator=clf,
)
disc2.fit(X_train, Y_train)

# Apply the fitted selection to every data set.
X_train, X_test, test = (
    disc2.transform(frame) for frame in (X_train, X_test, test)
)

In [None]:
# Shape of the training features after the correlated-feature selection.
X_train.shape

# Finding Skewness

In [None]:
# Features whose absolute skewness exceeds 1, most positively skewed first.
skewness = X_train.skew()
skewness[skewness.abs() > 1].sort_values(ascending=False)

## Reducing skewness with log(x + 1) (features with values >= 0)

In [None]:
# Features that receive a log(x + 1) transform.
# Each name must appear exactly once: a repeated entry (previously
# "num_var43_emit_ult1" was listed twice) is transformed twice, which yields
# log(log(x + 1) + 1) for that column instead of log(x + 1).
log_features = [
    "saldo_var37",
    "imp_op_var41_efect_ult3",
    "imp_trans_var37_ult1",
    "imp_op_var39_efect_ult1",
    "num_var43_emit_ult1",
    "num_var43_recib_ult1",
    "num_var30_0",
    "num_var37_0",
    "imp_op_var41_comer_ult1",
    "saldo_var13",
    "num_op_var41_hace2",
    "num_op_var41_efect_ult3",
    "num_var22_ult1",
    "num_var45_hace3",
    "num_op_var41_ult1",
    "num_med_var22_ult3",
    "num_var22_hace2",
    "num_var13_0",
    "num_var22_hace3",
]

# np.log1p computes log(1 + x) on the whole column at once (no per-element
# Python lambda). The same transform is applied to all three data sets.
for frame in (X_train, X_test, test):
    for feature in log_features:
        frame[feature] = np.log1p(frame[feature])

## Reducing skewness with Box-Cox (features with values > 0)

In [None]:
# Box-Cox transform of `var38`: the transformer is fitted on the training split
# and then applied unchanged to the validation and test sets.
tf = vt.BoxCoxTransformer(variables=["var38"])
tf.fit(X_train)
X_train, X_test, test = (
    tf.transform(frame) for frame in (X_train, X_test, test)
)

In [None]:
# Re-check after the transforms: features whose absolute skewness still exceeds 1.
remaining_skew = X_train.skew()
remaining_skew[remaining_skew.abs() > 1].sort_values(ascending=False)

Some features are still skewed.

In [None]:
# Keep the test-set IDs for the submission file. They are taken here because a
# later cell reassigns `test` to the scaler's output.
submission_col = test.loc[:, "ID"]

# Standardizing the Data

In [None]:
# Fit the scaler on the training split only, then apply that same scaling to
# the training, validation and test data.
scaler = StandardScaler().fit(X_train)
X_train, X_test, test = (
    scaler.transform(part) for part in (X_train, X_test, test)
)

# Logistic Regression

In [None]:
# Class-weighted logistic regression.
clf1 = LogisticRegression(random_state=42, solver='liblinear', class_weight="balanced")
clf1.fit(X_train, Y_train)

# ROC-AUC measures how well the model *ranks* observations, so it must be fed
# the predicted probability of the positive class (column 1 of predict_proba),
# not the hard 0/1 labels from predict(), which collapse the ROC curve to a
# single threshold.
y_pred = clf1.predict_proba(X_test)[:, 1]
ras = roc_auc_score(Y_test, y_pred)
print("Logistic Regression ROC-AUC Score:", ras)

# Random Forest Classifier

In [None]:
# Class-weighted random forest with trees limited to depth 8.
gclf = RandomForestClassifier(class_weight="balanced", random_state=42, max_depth=8)
gclf.fit(X_train, Y_train)

# Score with positive-class probabilities: ROC-AUC needs a continuous ranking
# score, whereas the hard labels from predict() give only one ROC operating point.
y_pred = gclf.predict_proba(X_test)[:, 1]
ras = roc_auc_score(Y_test, y_pred)
print("Random Forest Classifier ROC-AUC Score:", ras)

# ANN

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# Fully connected binary classifier: three ReLU hidden layers (12, 8 and 4
# units) followed by a single sigmoid output unit.
n_inputs = X_train.shape[1]
model = Sequential([
    Dense(12, input_dim=n_inputs, activation="relu"),
    Dense(8, activation="relu"),
    Dense(4, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# Binary cross-entropy loss with the Adam optimizer; accuracy is reported during
# training.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, Y_train, epochs=50, batch_size=64)

In [None]:
# Score the network on the validation split; the sigmoid outputs are passed
# directly to ROC-AUC as ranking scores.
y_pred = model.predict(X_test)
ras = roc_auc_score(y_true=Y_test, y_score=y_pred)
print("ANN ROC-AUC Score:", ras)

# Creating Submission File

In [None]:
# Predict on the preprocessed competition test set with the neural network.
submission_pred = model.predict(test)

In [None]:
# Wrap the predictions in a one-column frame named TARGET.
submission_df = pd.DataFrame(data=submission_pred, columns=["TARGET"])

In [None]:
# Place the saved IDs and the predictions side by side (aligned on index),
# then preview the result.
submission_parts = [submission_col, submission_df]
submission = pd.concat(submission_parts, axis="columns")
submission.head()

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv("submission.csv", index=False)