## Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Flatten, Conv1D, MaxPool1D

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split


In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train = pd.read_csv(
    '../input/santander-customer-satisfaction/train.csv'
)
test = pd.read_csv(
    '../input/santander-customer-satisfaction/test.csv'
)

# Data Preprocessing, Feature Engineering and transformation

In [None]:
# checking the shape of the data

train.shape, test.shape

In [None]:
# checking for null values
train.isnull().sum().sum() , test.isnull().sum().sum()

In [None]:
train.head()

In [None]:
train['TARGET'].value_counts()

In [None]:
test.head()

In [None]:
# Pull the TARGET column out of train as a separate series, and drop the
# ID column (plus TARGET for train) from the feature frames.
y_train_full = train['TARGET']
x_train_full = train.drop(columns=['ID', 'TARGET'])
x_test_final = test.drop(columns=['ID'])

## Oversampling using SMOTE



In [None]:
# Checking shape before any action
x_train_full.shape, y_train_full.shape

In [None]:
smt=SMOTE()
x_train_full, y_train_full = smt.fit_sample(x_train_full, y_train_full)

In [None]:
x_train_full.shape, y_train_full.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)


In [None]:
# checking the shape of the datasets
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_test_final.shape

## Feature selection methods: Filtering method

In [None]:
quasi_filter=VarianceThreshold(0.01)
x_train=quasi_filter.fit_transform(x_train)
x_test=quasi_filter.transform(x_test)
x_test_final=quasi_filter.transform(x_test_final)

In [None]:
x_train.shape, x_test.shape, x_test_final.shape

The variance filter gives a significant reduction in the number of features by dropping the low-variance (constant and quasi-constant) ones.

In [None]:
# Let's check duplicated features now

x_train_T=x_train.T
x_test_T = x_test.T
x_test_final_T=x_test_final.T

In [None]:
x_train_T=pd.DataFrame(x_train_T)
x_test_T=pd.DataFrame(x_test_T)
x_test_final_T=pd.DataFrame(x_test_final_T)

In [None]:
x_train_T.shape, x_test_T.shape, x_test_final_T.shape

In [None]:
x_train_T.duplicated().sum()

In [None]:
duplicated_features=x_train_T.duplicated()
duplicated_features

In [None]:
# features to keep will be inverse of duplicatd features
features_to_keep=[not index for index in duplicated_features]

In [None]:
x_train=x_train_T[features_to_keep].T
x_test=x_test_T[features_to_keep].T
x_test_final=x_test_final_T[features_to_keep].T

In [None]:
x_train.shape, x_test.shape, x_test_final.shape

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then reused for the validation split and the competition test set.
sc = StandardScaler()
x_train_tx = sc.fit_transform(x_train)
x_test_tx, x_test_final_tx = (
    sc.transform(part) for part in (x_test, x_test_final)
)

In [None]:
# Container type of each array/series that will be fed to the network
tuple(
    type(obj)
    for obj in (x_train_tx, x_test_tx, y_train, y_test, x_test_final_tx)
)

### As shown above, `y_train` and `y_test` are pandas Series rather than NumPy arrays, so we convert them to NumPy arrays before moving on to the neural network

In [None]:
y_train=y_train.to_numpy()
y_test=y_test.to_numpy()

In [None]:
type(y_train), type(y_test)

In [None]:
x_train_tx=x_train_tx.reshape(x_train_tx.shape[0], x_train_tx.shape[1], 1)
x_test_tx=x_test_tx.reshape(x_test_tx.shape[0], x_test_tx.shape[1], 1)
x_test_final_tx=x_test_final_tx.reshape(x_test_final_tx.shape[0], x_test_final_tx.shape[1], 1)

In [None]:
x_train_tx[0].shape

In [None]:
# 1D CNN for binary classification: three Conv1D -> BatchNorm -> MaxPool stages
# (32, 64, 128 filters, kernel size 3), followed by three dense layers and a
# single sigmoid output unit. Dropout follows the first two conv stages and
# every hidden dense layer.
model = Sequential([
    # convolutional stage 1 (input is one sample: x_train_tx.shape[1:])
    Conv1D(32, 3, activation='relu', input_shape=x_train_tx.shape[1:]),
    BatchNormalization(),
    MaxPool1D(pool_size=2),
    Dropout(0.2),

    # convolutional stage 2
    Conv1D(64, 3, activation='relu'),
    BatchNormalization(),
    MaxPool1D(pool_size=2),
    Dropout(0.3),

    # convolutional stage 3 (no dropout here)
    Conv1D(128, 3, activation='relu'),
    BatchNormalization(),
    MaxPool1D(pool_size=2),

    # dense head
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),

    Dense(256, activation='relu'),
    Dropout(0.2),

    Dense(256, activation='relu'),
    Dropout(0.4),

    # one sigmoid unit -> a value in [0, 1] per sample
    Dense(1, activation='sigmoid'),
])


In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

In [None]:
history=model.fit(x_train_tx, y_train, validation_data=(x_test_tx, y_test), epochs=20, verbose=1)

In [None]:
y_pred=model.predict(x_test_final_tx)

In [None]:
y_pred.shape

In [None]:
y_pred


In [None]:
y_pred=pd.DataFrame(y_pred, columns=['TARGET'])

In [None]:
y_pred['TARGET']=np.where(y_pred['TARGET']>0.5, 1, 0)

In [None]:
test_new=pd.concat([test, y_pred], axis=1)

In [None]:
submission=test_new[['ID', 'TARGET']]

In [None]:
submission.to_csv('submission4.csv', index=None)

In [None]:
# Kaggle score recorded when this notebook was originally submitted: 0.71813