# NOTE:
## This is not the EXACT "Portuguese Bank Marketing" dataset. It contains a few different columns; however, it is **very similar**, and all the concepts translate directly because most of the columns are identical.  
## The dataset used can be found here - https://www.kaggle.com/jinxzed/av-hacklive-guided-hackathon
## In-depth visualisations, analysis, insights and strong predictive models.

## This dataset was provided in “FactElytics” organized by Drishti, the Annual fest of SIOM Nashik.
## Out of a total of 330 participants, 87 teams were shortlisted after a quiz round held on Dare2Compete. This entry reached the top 8 teams as the only individual participant, ranked by the F1 score of a model built on a banking institution’s campaign dataset.
#### If this notebook helps you, an upvote would be huge!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Read both splits and strip the `id` column straight away; the ids are
# re-read from the CSV later when the submission file is assembled.
train = pd.read_csv('../input/factelytics-siom/Train data.csv').drop(columns='id')
test = pd.read_csv('../input/factelytics-siom/Test data.csv').drop(columns='id')

In [None]:
# Row counts of the two splits.
print(f"Total records in train is {train.shape[0]} , and in test is {test.shape[0]}.")

In [None]:
# Number of missing values per training column.
train.isna().sum()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Quick baseline: fit a random forest on crudely encoded raw data purely to
# rank the features by importance.
y = train.term_deposit_subscribed
X = train.drop('term_deposit_subscribed', axis=1)

# Missing values get a sentinel so the trees can still split on them.
X = X.fillna(-999)

# Integer-code every text column.
for c in train.columns[train.dtypes == 'object']:
    X[c] = X[c].factorize()[0]

# Fixed seed: without it the importances (and the conclusions drawn from the
# plot below) change on every run.
rf = RandomForestClassifier(random_state=0)
rf.fit(X, y)

plt.plot(rf.feature_importances_)
plt.ylabel('Importance of Feature')
plt.xticks(np.arange(X.shape[1]), X.columns.tolist(), rotation=90);

## Few features are very dominant in the model, namely, Last contact duration, month, previous campaign outcome, day of month, balance and customer age. 

In [None]:
# Absolute Pearson correlation of every numeric feature with the target.
# The frame still holds text columns at this point, and recent pandas versions
# raise on them instead of silently skipping them, so restrict to
# numeric/bool columns explicitly (works on old and new pandas alike).
corr = train.select_dtypes(include=['number', 'bool']).corr().abs()
core = corr.term_deposit_subscribed.sort_values(ascending=False)
core

In [None]:
# Target value counts within each previous-campaign outcome.
by_prev_outcome = train.groupby('prev_campaign_outcome')
by_prev_outcome.term_deposit_subscribed.value_counts()

## Customers who subscribed in the previous campaign are most likely to subscribe again. 

In [None]:
# Target value counts within each marital status.
by_marital = train.groupby('marital')
by_marital.term_deposit_subscribed.value_counts()

## Single customers are more inclined towards taking up term deposit subscription.

In [None]:
# Target value counts within each job type.
by_job = train.groupby('job_type')
by_job.term_deposit_subscribed.value_counts()

## Customers with job types as management, student and unemployed have a higher chance of taking the subscription. 

In [None]:
# How many accounts are overdrawn (True) versus not (False).
train.balance.lt(0).value_counts()

## Around 3900 bank accounts have balances below 0, implying that these customers tried to make payments larger than the amount of money in their account.

In [None]:
# Target counts for overdrawn accounts versus accounts with a strictly
# positive balance (zero balances fall in neither group).
overdrawn = train.balance < 0
in_credit = train.balance > 0

print("Subsctiption breakdown of poeple with negative balance\n",
      train.term_deposit_subscribed[overdrawn].value_counts(), '\n')

print("Subsctiption breakdown of poeple with positive balance\n",
      train.term_deposit_subscribed[in_credit].value_counts())

## Only about 5% of customers with a negative bank balance subscribe to a term deposit, whereas customers with a positive balance have a subscription rate of about 11%.

In [None]:
# Target counts for customers above two balance thresholds.
for threshold in (1000, 2000):
    above = train.balance > threshold
    print(f"Subsctiption breakdown of poeple with balancemore than {threshold}\n",
          train.term_deposit_subscribed[above].value_counts())

## Further, subscription percentage rate gradually increases with increase in balance.


In [None]:
# Target counts per contact month, in order of first appearance in the data.
for m in train.month.unique().tolist():
    target_in_month = train.term_deposit_subscribed[train.month == m]
    print(m, '\t')
    print(target_in_month.value_counts(), '\t')

## Month has a significant impact on the **reach** as well as the success of the campaign.


In [None]:
# Target value counts within each communication channel.
by_channel = train.groupby('communication_type')
by_channel['term_deposit_subscribed'].value_counts()

## Cellular and telephone communication types are the most efficient with 13% success rate. "Unknown" has an efficiency of only 3.5%

In [None]:
# Communication channel counts within each contact month.
by_month = train.groupby('month')
by_month['communication_type'].value_counts()

## A very peculiar insight here is that "unknown" is very prominent in the months of May and June. For the remaining months, it has a negligible contribution in terms of campaign reach.


In [None]:
# Share of each target class as a function of call duration; calls of
# 1200 s or more are left out of the plot.
plt.figure(figsize=(10, 5))
short_calls = train.loc[train.last_contact_duration < 1200, 'last_contact_duration']
sns.kdeplot(data=train, x=short_calls, hue='term_deposit_subscribed', multiple='fill')
plt.xlabel('Time in seconds')

## If the last contact duration was high, the chances of the customer subscribing is high, as longer duration implies interest of the customer in services offered by the bank. 

# -----------------------------------------------------------------------------------------------------------

# Comparing the distribution of test and training set.
### If the distributions differ, improving our model on the training set will not result in any improvement on the test set.

In [None]:
# Side-by-side train/test histograms for balance, age and month.
plt.figure(figsize=(15, 4))

plt.subplot(1, 3, 1)
# Log-scaled counts: the balance distribution has a long tail.
plt.hist((train.balance, test.balance), range=(-10000, 30000), bins=10, log=True)
plt.title('Balance Distribution')
plt.xlabel('Balance')

plt.subplot(1, 3, 2)
plt.hist((train.customer_age, test.customer_age), bins=10)
plt.title('Age Distribution')
plt.xlabel('Age')

plt.subplot(1, 3, 3)
plt.hist((train.month, test.month), bins=10)
plt.title('Month Distribution')
plt.xlabel('Month')

# tight_layout only has something to arrange once the axes exist, so it is
# called after plotting rather than right after creating the empty figure.
plt.tight_layout()

plt.show()

In [None]:
# Side-by-side train/test histograms for the three campaign-contact features.
plt.figure(figsize=(13, 4))
plt.tight_layout()

# (column, title, x-label, extra hist options) for each panel, left to right.
panels = [
    ('days_since_prev_campaign_contact', 'Days since previous contact distribution', 'Days',
     {'bins': 10}),
    ('num_contacts_prev_campaign', 'Number of contacts distribution', 'Count',
     {'bins': 15, 'log': 1}),
    ('last_contact_duration', 'Call Duration distribution', 'Time in Seconds',
     {'bins': 10, 'log': 1}),
]

for position, (column, title, xlabel, hist_options) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.hist((train[column], test[column]), **hist_options)
    plt.title(title)
    plt.xlabel(xlabel)

plt.tight_layout()

plt.show()

In [None]:
# Compare category frequencies between train and test for every column that
# was not already compared with a histogram above.
already_plotted = {
    'day_of_month',
    'month',
    'balance',
    'customer_age',
    'days_since_prev_campaign_contact',
    'num_contacts_prev_campaign',
    'last_contact_duration',
}
columns = [name for name in test.columns.to_list() if name not in already_plotted]
for c in columns:
    print(train[c].value_counts().sort_index(), "\n", test[c].value_counts().sort_index(), "\n")

## The distribution of train and test appears to be the same. Thus, working on training set will eventually improve the test set. 

# -----------------------------------------------------------------------------------------------------------

# Data preprocessing 
## Working with Nan values

In [None]:
# Absolute Pearson correlation of every numeric feature with `balance`.
# Text columns are excluded explicitly: recent pandas versions raise on them
# instead of silently skipping them.
corr = train.select_dtypes(include=['number', 'bool']).corr().abs()
core = corr.balance.sort_values(ascending=False)
core

## Balance and customer age have highest correlation. We will use either of these to fill NaN values of the other. This is more effective and gives higher accuracy than filling the NaN values of any feature with its Mean or Median. 

### NOTE : We are not considering the feature "days_since_prev_campaign_contact" because about 80% of its values in the dataset are NaN, hence the correlation value is not an accurate representation for this particular feature.

In [None]:
# Rows lacking BOTH age and balance cannot be imputed from one another.
both_missing = train.balance.isnull() & train.customer_age.isnull()
print("Rows where both customer age and account balance are not avaiable in : ", both_missing.sum(), "\nWe will drop these rows.")
# Keep every row that has at least one of the two values, then renumber the
# index from 0.
train = train.loc[~both_missing].reset_index(drop=True)
print('')

In [None]:
def _impute_age_from_balance_band(frame, centres, half_width=7500):
    """Fill missing `customer_age` with the rounded mean age of the balance band.

    Bands are the open intervals (centre - half_width, centre + half_width) and
    are visited in the order given. A row is filled by the first band that
    contains its balance, since later bands no longer see it as missing. The
    frame is modified in place; each frame supplies its own band means.
    """
    for centre in centres:
        in_band = (frame.balance > (centre - half_width)) & (frame.balance < (centre + half_width))
        band_mean_age = round(frame.loc[in_band, 'customer_age'].mean(), 0)
        needs_age = frame.customer_age.isnull() & in_band
        frame.loc[needs_age, 'customer_age'] = band_mean_age


balances = [-5000, 0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, 70000, 80000, 90000, 100000, 110000]
_impute_age_from_balance_band(train, balances)
_impute_age_from_balance_band(test, balances)
del balances

In [None]:
def _impute_balance_from_age(frame):
    """Fill missing `balance` with the mean balance of customers of the same age.

    The frame is modified in place and uses its own per-age means. Rows whose
    age is itself missing are left untouched (NaN never compares equal).
    """
    for age in frame.customer_age.unique().tolist():
        same_age = frame.customer_age == age
        mean_balance_at_age = frame.loc[same_age, 'balance'].mean()
        frame.loc[same_age & frame.balance.isnull(), 'balance'] = mean_balance_at_age


_impute_balance_from_age(train)
_impute_balance_from_age(test)

In [None]:
# Test rows still missing BOTH fields cannot be dropped (every test row needs
# a prediction), so they receive the test-wide mean balance and rounded mean age.
still_missing = test.balance.isnull() & test.customer_age.isnull()
i = test.index[still_missing]
mean_balance = test.balance.mean()
mean_age = round(test.customer_age.mean(), 0)
test.loc[i, ['balance', 'customer_age']] = mean_balance, mean_age

__________________________________________________________________________________________________________________________

# Feature generation
## Combining "personal_loan" and "housing_loan" to generate a new feature "loan"

In [None]:
from sklearn.impute import KNNImputer

# Binary-encode the flag so that it can take part in a distance-based imputer.
yes_no = {'no': 0, 'yes': 1}
for frame in (train, test):
    frame['personal_loan'] = frame['personal_loan'].replace(yes_no)

# Impute missing flags from the 10 nearest neighbours, fitted on train only.
col_pl = ['balance', 'personal_loan']
knn = KNNImputer(n_neighbors=10)
knn.fit(train[col_pl])

for frame in (train, test):
    # The imputer averages neighbours, so round back to a 0/1 flag.
    filled = pd.DataFrame(np.round(knn.transform(frame[col_pl]), 0), columns=col_pl)
    frame['personal_loan'] = filled['personal_loan'].astype('int64')

In [None]:
# `loan` is 1 when the customer has a personal loan OR a housing loan; the two
# source columns are dropped once they have been merged.
for frame in (train, test):
    frame['housing_loan'] = frame['housing_loan'].replace({'no': 0, 'yes': 1})
    frame['loan'] = (frame.personal_loan | frame.housing_loan).astype('int64')
    frame.drop(['personal_loan', 'housing_loan'], axis=1, inplace=True)

## Data scaling and imputing 

### For a Neural Network architecture, One-Hot encoding is more effective in recognizing patterns than Label/Frequency Encoding. 
### Features are scaled about mean=0 and unit variance. Gradients in Neural Networks tend to explode or vanish on unscaled features.
### Normalising helps Neural Networks converge faster towards a minimum of the loss.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Mean-impute the call duration, each split with its own mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that is chained assignment, which pandas warns
# about and which no longer writes through to the frame under Copy-on-Write.
train['last_contact_duration'] = train['last_contact_duration'].fillna(train['last_contact_duration'].mean())
test['last_contact_duration'] = test['last_contact_duration'].fillna(test['last_contact_duration'].mean())

In [None]:
# Education is ordinal, so map it to increasing integers ('unknown' lowest).
education_rank = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
for frame in (train, test):
    frame['education'] = frame['education'].replace(education_rank)

In [None]:
# Standardise education using statistics learned from train only.
scaler = StandardScaler().fit(train.education.values.reshape(-1, 1))
for frame in (train, test):
    frame['education'] = scaler.transform(frame.education.values.reshape(-1, 1))

In [None]:
# Cap extreme call durations at 3000 before standardising.
for frame in (train, test):
    frame['last_contact_duration'] = np.clip(frame.last_contact_duration, a_min=None, a_max=3000)

# Scaler statistics come from train only.
scaler = StandardScaler().fit(train.last_contact_duration.values.reshape(-1, 1))
for frame in (train, test):
    frame['last_contact_duration'] = scaler.transform(frame.last_contact_duration.values.reshape(-1, 1))

In [None]:
# Cap the previous-campaign contact count at 26 before standardising.
for frame in (train, test):
    frame['num_contacts_prev_campaign'] = np.clip(frame.num_contacts_prev_campaign, a_min=None, a_max=26)

# Scaler statistics come from train only.
scaler = StandardScaler().fit(train.num_contacts_prev_campaign.values.reshape(-1, 1))
for frame in (train, test):
    frame['num_contacts_prev_campaign'] = scaler.transform(
        frame.num_contacts_prev_campaign.values.reshape(-1, 1)
    )

In [None]:
# Cap balances at 40000 before standardising.
train['balance'] = np.clip(train.balance, a_min=None, a_max=40000)
# Clip the TEST balances here. The previous code clipped `train.balance` into
# `test['balance']`, overwriting the test feature with index-aligned training
# values.
test['balance'] = np.clip(test.balance, a_min=None, a_max=40000)

# Scaler statistics come from train only and are applied to both splits.
scaler = StandardScaler()
scaler.fit(train.balance.values.reshape(-1, 1))

new_balance = scaler.transform(train.balance.values.reshape(-1, 1))
train['balance'] = new_balance

new_balance = scaler.transform(test.balance.values.reshape(-1, 1))
test['balance'] = new_balance

In [None]:
# Standardise age using statistics learned from train only.
scaler = StandardScaler().fit(train.customer_age.values.reshape(-1, 1))
for frame in (train, test):
    frame['customer_age'] = scaler.transform(frame.customer_age.values.reshape(-1, 1))

In [None]:
# Integer-code the month with an encoder fitted on train, then standardise.
# NOTE(review): LabelEncoder orders its classes by sorted label, so if months
# are stored as names the codes will not follow calendar order - confirm this
# is acceptable.
le = LabelEncoder().fit(train.month)
for frame in (train, test):
    frame['month'] = le.transform(frame.month)

scaler = StandardScaler().fit(train.month.values.reshape(-1, 1))
for frame in (train, test):
    frame['month'] = scaler.transform(frame.month.values.reshape(-1, 1))

In [None]:
# Integer-code the job type with an encoder fitted on train, then standardise.
le = LabelEncoder().fit(train.job_type)
for frame in (train, test):
    frame['job_type'] = le.transform(frame.job_type)

scaler = StandardScaler().fit(train.job_type.values.reshape(-1, 1))
for frame in (train, test):
    frame['job_type'] = scaler.transform(frame.job_type.values.reshape(-1, 1))

In [None]:
# Standardise day-of-month using statistics learned from train only.
scaler = StandardScaler().fit(train.day_of_month.values.reshape(-1, 1))
for frame in (train, test):
    frame['day_of_month'] = scaler.transform(frame.day_of_month.values.reshape(-1, 1))

In [None]:
# Temporary numeric codes so that `marital` can go through the KNN imputer.
marital_codes = {'single': 3, 'married': 1, 'divorced': 2}
for frame in (train, test):
    frame['marital'] = frame['marital'].replace(marital_codes)

In [None]:
# Impute missing marital codes from the 10 nearest neighbours in
# (age, education) space; the imputer is fitted on train only.
col_marital = ['customer_age', 'education', 'marital']
knn = KNNImputer(n_neighbors=10)
knn.fit(train[col_marital])

for frame in (train, test):
    # Neighbour averages are rounded back to a whole code.
    filled = pd.DataFrame(np.round(knn.transform(frame[col_marital]), 0), columns=col_marital)
    frame['marital'] = filled['marital']

In [None]:
# One-hot encode `marital`.
marital_dummies = pd.get_dummies(train.marital, prefix='marital')
train = train.join(marital_dummies).drop('marital', axis=1)

# Force the test encoding onto the exact train columns: a category absent from
# test becomes an all-zero column and one unseen in train is dropped, so both
# frames keep the same feature set and order.
test = test.join(
    pd.get_dummies(test.marital, prefix='marital')
    .reindex(columns=marital_dummies.columns, fill_value=0)
).drop('marital', axis=1)

In [None]:
# One-hot encode `default`. The indicator columns need no rounding, so train
# is now encoded the same way as test (the stray np.round is gone).
default_dummies = pd.get_dummies(train.default, prefix='default')
train = train.join(default_dummies).drop('default', axis=1)

# Force the test encoding onto the exact train columns so both frames keep the
# same feature set and order even if a category is missing from one split.
test = test.join(
    pd.get_dummies(test.default, prefix='default')
    .reindex(columns=default_dummies.columns, fill_value=0)
).drop('default', axis=1)

In [None]:
# One-hot encode `communication_type`.
communication_dummies = pd.get_dummies(train.communication_type, prefix='communication_type')
train = train.join(communication_dummies).drop('communication_type', axis=1)

# Force the test encoding onto the exact train columns so both frames keep the
# same feature set and order even if a category is missing from one split.
test = test.join(
    pd.get_dummies(test.communication_type, prefix='communication_type')
    .reindex(columns=communication_dummies.columns, fill_value=0)
).drop('communication_type', axis=1)

In [None]:
# One-hot encode `prev_campaign_outcome`.
outcome_dummies = pd.get_dummies(train.prev_campaign_outcome, prefix='prev_campaign_outcome')
train = train.join(outcome_dummies).drop('prev_campaign_outcome', axis=1)

# Force the test encoding onto the exact train columns so both frames keep the
# same feature set and order even if a category is missing from one split.
test = test.join(
    pd.get_dummies(test.prev_campaign_outcome, prefix='prev_campaign_outcome')
    .reindex(columns=outcome_dummies.columns, fill_value=0)
).drop('prev_campaign_outcome', axis=1)

In [None]:
# Impute `days_since_prev_campaign_contact` from the 10 nearest neighbours in
# the already-scaled feature space; the imputer is fitted on train only.
col_days = ['month', 'balance', 'customer_age', 'education', 'days_since_prev_campaign_contact']
knn = KNNImputer(n_neighbors=10)
knn.fit(train[col_days])

for frame in (train, test):
    filled = pd.DataFrame(knn.transform(frame[col_days]), columns=col_days)
    frame['days_since_prev_campaign_contact'] = filled['days_since_prev_campaign_contact']

In [None]:
# Impute `num_contacts_in_campaign` from the 10 nearest neighbours by
# day-of-month; the imputer is fitted on train only.
col_nums = ['day_of_month', 'num_contacts_in_campaign']
knn = KNNImputer(n_neighbors=10)
knn.fit(train[col_nums])

for frame in (train, test):
    filled = pd.DataFrame(knn.transform(frame[col_nums]), columns=col_nums)
    frame['num_contacts_in_campaign'] = filled['num_contacts_in_campaign']

In [None]:
# Cap the in-campaign contact count at 30 before standardising.
for frame in (train, test):
    frame['num_contacts_in_campaign'] = np.clip(frame.num_contacts_in_campaign, a_min=None, a_max=30)

# Scaler statistics come from train only.
scaler = StandardScaler().fit(train.num_contacts_in_campaign.values.reshape(-1, 1))
for frame in (train, test):
    frame['num_contacts_in_campaign'] = scaler.transform(
        frame.num_contacts_in_campaign.values.reshape(-1, 1)
    )

In [None]:
# Cap days-since-previous-contact at 600 before standardising.
for frame in (train, test):
    frame['days_since_prev_campaign_contact'] = np.clip(
        frame.days_since_prev_campaign_contact, a_min=None, a_max=600
    )

# Scaler statistics come from train only.
scaler = StandardScaler().fit(train.days_since_prev_campaign_contact.values.reshape(-1, 1))
for frame in (train, test):
    frame['days_since_prev_campaign_contact'] = scaler.transform(
        frame.days_since_prev_campaign_contact.values.reshape(-1, 1)
    )

In [None]:
# Sanity check: total number of missing cells left in train and in test.
print(train.isna().sum().sum(), test.isna().sum().sum())

## Converting dataset to feed into Neural Network

In [None]:
# Split the processed training frame into a feature matrix and a target vector.
X = train.drop(columns='term_deposit_subscribed')
y = train.term_deposit_subscribed.values
x = X.values
x.shape

In [None]:
# Class balance of the target.
class_counts = train.term_deposit_subscribed.value_counts()
print(class_counts.to_list())
# The quotient is count(1) / count(0), i.e. subscribed over not subscribed;
# the label previously claimed the opposite direction.
print("Ratio of Yes (1) to No (0) is ", class_counts[1] / class_counts[0])

## Because the dataset is skewed (the ratio of term deposit subscribed to term deposit not subscribed is around 0.1), we will use a stratified shuffle split so that the distribution of target values is the same in the train and validation sets. This is very significant when working with skewed data.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=14)

# All five splits are generated, but only the last one is kept as the
# train/validation partition.
*_, (train_index, test_index) = sss.split(x, y)
xtrain, xval = x[train_index], x[test_index]
ytrain, yval = y[train_index], y[test_index]

## Building Neural Network model.

In [None]:
import tensorflow as tf 

In [None]:
# Fully connected binary classifier: a funnel of ReLU layers ending in a
# single sigmoid unit. The input width is taken from the data rather than
# hard-coded, so the model still builds if the number of engineered/one-hot
# columns changes.
n_features = xtrain.shape[1]

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    # No dropout after the two narrowest layers.
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """Return the F1 score of `y_pred` against `y_true` as a backend tensor.

    Both inputs are clipped to [0, 1] and rounded, so predictions are
    thresholded at 0.5 before counting. Epsilon terms guard every division
    against a zero denominator.

    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, which can differ from the F1 of the whole dataset -
    confirm before relying on the reported value.
    """
    eps = K.epsilon()

    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2 * (precision * recall) / (precision + recall + eps)


# Binary cross-entropy loss with the Adam optimizer (default settings); the
# custom `f1` metric and AUC are tracked alongside the loss.
model.compile(loss='binary_crossentropy', optimizer= "adam", metrics=[f1,'AUC'])

In [None]:
# Halve the learning rate whenever val_loss has not improved for 3 epochs.
# The floor must sit below the optimizer's starting rate: the model is
# compiled with default Adam (learning rate 1e-3), so the previous
# min_lr=1e-3 clamped every reduction back to the initial value and the
# callback never had any effect.
annealer = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, min_lr=1e-5)

In [None]:
# Train for a fixed 200 epochs, evaluating on the held-out validation split
# after each epoch; the annealer callback adjusts the learning rate.
model.fit(xtrain, ytrain, epochs=200, validation_data=(xval, yval), shuffle=True, callbacks=[annealer])

In [None]:
# Re-read the raw test file to recover the `id` column dropped earlier; it
# becomes the first column of the submission frame.
xtest = pd.read_csv('../input/factelytics-siom/Test data.csv')
sample = xtest['id'].to_frame()

In [None]:
# Feed the network exactly the columns it was trained on, in the same order.
# Train and test were one-hot encoded separately, so a missing or extra
# category would otherwise shift or resize the input; absent columns are
# filled with 0.
test_features = test.reindex(columns=X.columns, fill_value=0)

p_y = model.predict(test_features)
p_y = p_y.flatten()
print(p_y)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
p_y = np.round(p_y, 0)
print(p_y)
sample['term_deposit_subscribed'] = p_y

In [None]:
# Write the id/prediction pairs to the submission file, without the index.
sample.to_csv('Predictions.csv', index=False)