In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Load the bank-marketing data set and preview its last ten rows.
DATA_PATH = "../input/bank-marketing-dataset/bank.csv"
df = pd.read_csv(DATA_PATH)
df.tail(10)

In [None]:
# Print column dtypes and non-null counts; used below to check for missing values.
df.info()

No missing data.

In [None]:
# Encode the target as 0/1 ('yes' -> 1, anything else -> 0).
# The dtype guard keeps the cell idempotent: re-running it on the already
# encoded column would otherwise compare integers with 'yes' and silently
# turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['deposit']):
    df['deposit'] = (df['deposit'] == 'yes').astype(int)

*Visualization of some important aspects within the data.*

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Share of each job category in the data set, drawn as a pie chart.
job_counts = df['job'].value_counts()
fig, ax = plt.subplots(figsize=(20, 16))
job_counts.plot.pie(autopct='%1.1f%%', ax=ax)

**Blue-collar, management and technician jobs constitute 60% of the population.**

In [None]:
# Number of clients per job, split by deposit outcome.
sns.set(style="darkgrid")
plt.figure(figsize=(15,12))
# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so `sns.countplot(df['job'], ..., data=df)` raises a TypeError
# there ("multiple values for argument 'data'").
sns.countplot(x='job', hue='deposit', data=df)
plt.show()

In [None]:
# Education level counts, coloured by marital status, one panel per deposit outcome.
sns.set(style="darkgrid")
# catplot is a figure-level function that creates its own figure (sized through
# `height`/`aspect`), so no plt.figure() call is made here: it would only
# produce an extra empty figure in the output.
sns.catplot(x="education", hue="marital", col="deposit",
                data=df, kind="count",
                height=10, aspect=.7);
plt.show()

In [None]:
# Fraction of rows in each target class.
df['deposit'].value_counts().div(df.shape[0])

**As can be seen, the dataset is balanced.**

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

**Let's detect outliers in the numerical features of the dataset. We will treat as outliers those values that are at least 3 standard deviations away from the mean.**

In [None]:
# Names of all non-object (numeric) columns, as a plain list.
non_object_columns = df.select_dtypes(exclude=['object']).columns
numerical_features = [column for column in non_object_columns]

# The target 'deposit' is not a feature, so take it out of the list.
numerical_features.remove('deposit')

In [None]:
# For every numerical feature, count the values lying at least three standard
# deviations away from the mean (on either side) and print the count.
for feature in numerical_features:
    values = df[feature]
    mean = values.mean()
    std = values.std()
    upper_thres = mean + 3 * std
    lower_thres = mean - 3 * std
    # A vectorised boolean mask replaces the row-by-row .iloc loop; the count
    # is the same, without one Python-level lookup per row.
    is_outlier = (values >= upper_thres) | (values <= lower_thres)
    outliers = int(is_outlier.sum())
    print("{features}->>  outliers: {outlier}".format(features=feature, outlier=outliers))

In [None]:
# Pairwise correlation of the numeric columns (including the encoded target).
# The frame still contains string columns; recent pandas versions no longer
# drop them silently in DataFrame.corr() and raise instead, so select the
# numeric columns explicitly (this works on old and new pandas alike).
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(14,8))
sns.heatmap(corr,annot=True)

**Most of the numerical features, except duration, are uncorrelated. The target perhaps depends more on the categorical features, or the relationships are non-linear.**

In [None]:
# Day and month columns appear to be uncorrelated.
# Contact-month frequencies among clients who subscribed (deposit == 1).
subscribed_mask = df['deposit'] == 1
df.loc[subscribed_mask, 'month'].value_counts()

In [None]:
# Contact-month frequencies among clients who did not subscribe (deposit == 0).
not_subscribed_mask = df['deposit'] == 0
df.loc[not_subscribed_mask, 'month'].value_counts()

In [None]:
import warnings
# Kept from the original cell: this silences every warning for the rest of the
# notebook, which later cells may rely on.
warnings.simplefilter("ignore")


# Build one combined boolean mask per group instead of chaining
# df[mask_a][mask_b]: the chained form forces pandas to re-align the second
# mask against the filtered frame and emits a "Boolean Series key will be
# reindexed" warning. The resulting counts are the same.
is_subscriber = df['deposit'] == 1
subscribers_w_loan = int((is_subscriber & (df['loan'] == 'yes')).sum())
subscribers_wo_loan = int((is_subscriber & (df['loan'] == 'no')).sum())

subscribers = df[is_subscriber]

# Share of subscribers with / without a personal loan.
print("subscribers_w_loan ->>",subscribers_w_loan/subscribers.shape[0])
print("\n")
print("subscribers_wo_loan ->>",subscribers_wo_loan/subscribers.shape[0])

**90% of subscribers are without a loan.**

# Another very crucial aspect is the mode of communication.
 

In [None]:
# Frequency of each contact mode over the whole data set.
df['contact'].value_counts()

In [None]:
# subscribers who are contacted by various modes.
cellular=0
unknown=0
telephone=0
for x in range(df.shape[0]):
    if df['deposit'].iloc[x]==1:
        if df['contact'].iloc[x]=='cellular':
            cellular+=1
        if df['contact'].iloc[x]=='unknown':
            unknown+=1
        if df['contact'].iloc[x]=='telephone':
            telephone+=1


plt.figure(figsize=(15,8))
contacts = ['cellular', 'unknown', 'telephone']
sizes = [cellular,unknown,telephone]
plt.pie(sizes, labels = contacts,autopct='%1.2f%%')
plt.show()

**Most of the people who subscribed were contacted via cellular (82.61%).**

In [None]:
# Balance distribution per job, with one violin per deposit outcome.
fig, ax = plt.subplots(figsize=(18, 10))
sns.violinplot(data=df, x="balance", y="job", hue='deposit', ax=ax)
ax.set_title("Job distribution with Deposit status", fontsize=16)
plt.show()

In [None]:
# Frequency of each distinct value in the 'campaign' column.
df['campaign'].value_counts()

**Most of the population is targeted in the first 10 campaigns.**

In [None]:
# Duration of the campaign also plays a great role,has a good correlation with deposit subscription.

# this code snippet is forked.

sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style('whitegrid')
avg_duration = df['duration'].mean()

# Label every row relative to the mean duration in one vectorised step.
# The original initialised the column with float NaN and then wrote strings
# into it (a dtype-mismatch warning on recent pandas) and left rows whose
# duration equals the mean unlabelled, so they dropped out of the crosstab.
# Here such rows are counted as "above_average".
df["duration_status"] = np.where(
    df["duration"] < avg_duration, "below_average", "above_average"
)

# Row-wise percentages of each deposit outcome per duration group. Scaling to
# percent before rounding avoids artefacts such as 56.99999999999999 that
# round(x, 2) * 100 produces in the bar labels.
pct_term = (
    pd.crosstab(df['duration_status'], df['deposit'], normalize='index')
    .mul(100)
    .round()
)


ax = pct_term.plot(kind='bar', stacked=False, cmap='RdBu')
plt.title("The Impact of Duration \n in Opening a Term Deposit", fontsize=18)
plt.xlabel("Duration Status", fontsize=18);
plt.ylabel("Percentage (%)", fontsize=18)

# Write each bar's percentage just above its top-left corner.
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.02, p.get_height() * 1.02))


plt.show()

**Above-average duration clearly has a greater impact than below-average duration (77% versus 23%).**

# Does the previous marketing campaign have any impact on this campaign?

In [None]:
# Frequency of each previous-campaign outcome.
df['poutcome'].value_counts()    

*But sadly, this data is not reliable, as most of the previous campaign outcomes are unknown.*

In [None]:
# housing with marital status
# catplot is figure-level and builds its own figure (sized via
# `height`/`aspect`), so no plt.figure() call is made here: it would only
# leave an extra empty figure in the output.
sns.catplot(x="housing", hue="marital", col="deposit",
                data=df, kind="count",
                height=10, aspect=.7);
plt.show()

*Let us see the distribution of balance across jobs in the population, split by deposit status.*

In [None]:
# Balance per job as box plots, with separate boxes for each deposit outcome.
balance_fig = plt.figure(figsize=(16, 20))
sns.set_style("darkgrid")
sns.boxplot(data=df, x='job', y='balance', hue='deposit', whis=1.5)
plt.show()

**A closer look at the output shows that, across the different jobs, subscribers have a higher bank balance than non-subscribers. Look closely at the 75th-percentile mark in the boxplot for each job.**

In [None]:
# Age distribution for each deposit outcome.
age_fig = plt.figure(figsize=(15, 8))
sns.set_style("darkgrid")
sns.boxplot(data=df, x='deposit', y='age', whis=1.5)
plt.show()

In [None]:
# Remove the columns judged irrelevant above and preview the result.
irrelevant_columns = ['day', 'month', 'poutcome']
df = df.drop(columns=irrelevant_columns)
df.head()