In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from matplotlib import cbook, rc_params_from_file, rcParamsDefault
import plotly.express as px

#classifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn import metrics

from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import warnings
warnings.filterwarnings('ignore')

# Bank Client Data:
1. age (numeric)
2. job : type of job (categorical: ‘admin.’,’blue-collar’,’entrepreneur’,’housemaid’,’management’,’retired’,’self-employed’,’services’,’student’,’technician’,’unemployed’,’unknown’)
3. marital : marital status (categorical: ‘divorced’,’married’,’single’,’unknown’; note: ‘divorced’ means divorced or widowed)
4. education (categorical:‘basic.4y’,’basic.6y’,’basic.9y’,’high.school’,’illiterate’,’professional.course’,
’university.degree’,’unknown’)
5. default: has credit in default? (categorical: ‘no’,’yes’,’unknown’)
6. housing: has housing loan? (categorical: ‘no’,’yes’,’unknown’)
7. loan: has personal loan? (categorical: ‘no’,’yes’,’unknown’)
# Related to the last contact of the current campaign:
8. contact: contact communication type (categorical: ‘cellular’,’telephone’)
9. month: last contact month of year (categorical: ‘jan’, ‘feb’, ‘mar’, …, ‘nov’, ‘dec’)
10. day_of_week: last contact day of the week (categorical: ‘mon’,’tue’,’wed’,’thu’,’fri’)
11. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=’no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14. previous: number of contacts performed before this campaign and for this client (numeric)
15. poutcome: outcome of the previous marketing campaign (categorical: ‘failure’,’nonexistent’,’success’)
# social and economic context attributes
16. emp.var.rate: employment variation rate — quarterly indicator (numeric)
17. cons.price.idx: consumer price index — monthly indicator (numeric)
18. cons.conf.idx: consumer confidence index — monthly indicator (numeric)
19. euribor3m: euribor 3 month rate — daily indicator (numeric)
20. nr.employed: number of employees — quarterly indicator (numeric)

In [None]:
df = pd.read_csv(r"/kaggle/input/d/depekha/bank-marketing-campaigns-dataset/Bank Additional Full.csv")

In [None]:
print(df.shape)
df.head()

In [None]:
df.isnull().head()

In [None]:
df.notnull().head()

In [None]:
df.shape

In [None]:
df.dropna(how = 'any').shape

In [None]:
df.duplicated().sum()

In [None]:
df.loc[df.duplicated(keep = 'last'), :]

In [None]:
df.loc[df.duplicated(keep = False), :]

In [None]:
df.drop_duplicates(keep = 'first').shape

In [None]:
df.drop_duplicates(keep = 'last').shape

In [None]:
df.drop_duplicates(keep = False).shape

In [None]:
df.drop_duplicates(subset = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 
                             'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 
                             'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']).shape

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
df.nunique()

In [None]:
total_miss = df.isnull().sum()
perc_miss = total_miss/df.isnull().count()*100

missing_data = pd.DataFrame({'Total missing':total_miss,'% missing':perc_miss})

missing_data.sort_values(by='Total missing',ascending=False).head(3)

In [None]:
#Checking for percentage of missing values in each columns
(df.isnull().sum()/len(df))*100

In [None]:
df.dropna(subset=['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 
                  'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 
                  'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'])

In [None]:
# find the unique values from categorical features
for col in df.select_dtypes(include='object').columns:
    print(col)
    print(df[col].unique())

In [None]:
for column in df.columns:
    print(column,df[column].nunique())

In [None]:
# Object-dtype feature columns, excluding the target column 'y'.
# (The previous exclusion list named 'deposit', which is not among the columns
# this notebook lists for the dataset, so the target slipped into the features.)
categorical_features = [feature for feature in df.columns
                        if df[feature].dtypes == 'O' and feature != 'y']
categorical_features

In [None]:
for feature in categorical_features:
    print('The feature is {} and number of categories are {}'.format(feature,len(df[feature].unique())))

In [None]:
print('Min Age :',df['age'].min())
print('Max Age :',df['age'].max())

In [None]:
numerical_data = df.select_dtypes(include=np.number) # select_dtypes selects data with numeric features
numerical_col = numerical_data.columns 

print("Numeric Features:")
print(numerical_data.head())
print("===="*20)

In [None]:
categorical_data = df.select_dtypes(exclude=np.number) # we will exclude data with numeric features
categorical_col = categorical_data.columns                          # we will store the categorical features in a variable

print("Categorical Features:")
print(categorical_data.head())
print("===="*20)

In [None]:
### numerical 
numerical_cols = list(df.select_dtypes(exclude=['object']))
numerical_cols

In [None]:
### categorical
categorical_cols = list(df.select_dtypes(include=['object']))
categorical_cols

In [None]:
#Check target label split over categorical features and find the count
for categorical_feature in categorical_features:
    print(df.groupby(['y',categorical_feature]).size())

In [None]:
# list of numerical variables
numerical_features = [feature for feature in df.columns if ((df[feature].dtypes != 'O') & (feature not in ['y']))]
print('Number of numerical variables: ', len(numerical_features))

# visualise the numerical variables
df[numerical_features].head()

In [None]:
#Discrete Numerical Features
discrete_feature=[feature for feature in numerical_features if len(df[feature].unique())<25]
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
#Continuous Numerical Features
# Everything numeric that was not classified as discrete; the target 'y' is
# already excluded when numerical_features is built, so no extra exclusion is needed.
continuous_features=[feature for feature in numerical_features if feature not in discrete_feature]
print("Continuous feature Count: {}".format(len(continuous_features)))

In [None]:
cols_with_missing = [col for col in df.columns 
                                 if df[col].isnull().any()]
cols_with_missing

In [None]:
df.describe()

In [None]:
# Pairwise Pearson correlation of the numeric columns only; calling corr() on a
# frame that still holds string columns raises TypeError on pandas >= 2.0.
df.select_dtypes(include=np.number).corr()

In [None]:
df.hist(figsize=(20,20))
plt.show()

In [None]:
# Annotated correlation heatmap. Restrict to numeric columns first: corr() on
# string columns raises TypeError on pandas >= 2.0.
matrix = df.select_dtypes(include=np.number).corr()
f, ax = plt.subplots(figsize=(25, 12))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(matrix, vmax=.8, square=True, cmap="RdYlGn",annot = True, ax=ax);

In [None]:
sns.pairplot(df)

In [None]:
def bar_plot(variable):
    """Draw a frequency bar chart for one column of the global ``df`` and print the counts.

    Args:
        variable: name of the column in ``df`` whose value frequencies are plotted.
    """
    counts = df[variable].value_counts()
    bar_colors = ['#00008b','#00e5ee','#cd1076', '#008080','#cd5555','red','blue']

    plt.figure(figsize=(15,3))
    plt.bar(counts.index, counts, color=bar_colors)
    # Label every bar with its category value.
    plt.xticks(counts.index, counts.index.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    # Echo the same frequencies as text below the figure.
    print("{}: \n {}".format(variable,counts))

In [None]:
categorical_cols = ["job", "marital", "default", "education", "housing", "loan", "contact", "day_of_week", 
                    "poutcome", "month", "y"]
for c in categorical_cols:
    bar_plot(c)

In [None]:
# For every categorical column, plot (share of the category among 'yes' clients)
# minus (share of the category among 'no' clients). Positive bars mark categories
# that are over-represented among subscribers.
categorcial_variables = ['job', 'marital', 'education', 'default', 'loan', 'contact', 'month', 'day_of_week', 'poutcome','y']

#Counts of how often each outcome was recorded (loop-invariant, so computed once).
freq_pos = (df.y.values == 'yes').sum()
freq_neg = (df.y.values == 'no').sum()

for col in categorcial_variables:
    plt.figure(figsize=(10,4))
    #Returns counts of unique values for each outcome for each feature.
    pos_counts = df.loc[df.y.values == 'yes', col].value_counts().to_dict()
    neg_counts = df.loc[df.y.values == 'no', col].value_counts().to_dict()

    # Sorted union of categories gives a stable bar order between runs.
    all_index = sorted(set(pos_counts) | set(neg_counts))
    all_counts = [pos_counts.get(k, 0) / freq_pos - neg_counts.get(k, 0) / freq_neg for k in all_index]

    # x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
    sns.barplot(x=all_counts, y=all_index)
    plt.title(col)
    plt.tight_layout()

In [None]:
#check count based on categorical features
plt.figure(figsize=(15,80), facecolor='white')
plotnumber =1
for categorical_feature in categorical_features:
    ax = plt.subplot(11,2,plotnumber)
    sns.countplot(y=categorical_feature,data=df)
    plt.xlabel(categorical_feature)
    plt.title(categorical_feature)
    plotnumber+=1
plt.show()

In [None]:
#check target label split over categorical features
#Find out the relationship between categorical variable and dependent variable
for categorical_feature in categorical_features:
    sns.catplot(x='y', col=categorical_feature, kind='count', data= df)
plt.show()

In [None]:
#boxplot to show target distribution with respect numerical features
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for feature in continuous_features:
    ax = plt.subplot(12,3,plotnumber)
    sns.boxplot(x = "y", y = df[feature], data = df)
    plt.xlabel(feature)
    plotnumber+=1
plt.show()

In [None]:
fig = plt.figure(figsize = [15,20])
cols = ['marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'job', 'y']
cnt = 1
for col in cols :
    ax = plt.subplot(5,2,cnt)
    sns.countplot(data = df, x = col, order = df[col].value_counts().index)
    if col == 'job' :
        plt.xticks(rotation = 90)
    cnt+=1
    plot_name = "Countplot for column : "+col
    ax.set_title(plot_name,fontsize = 15)
plt.tight_layout()
plt.show()  

In [None]:
def Count_categorcial_variables(df):
    """Print the value counts and the number of categories of every object-dtype column.

    Args:
        df: DataFrame to summarise; only columns with ``object`` dtype are reported.
    """
    object_columns = list(df.select_dtypes(include=['object']).columns)

    # First pass: full frequency table for each column.
    for name in object_columns:
        print("------------", name, " value counts---------------------")
        print(df[name].value_counts())

    print("\n\n------------Number of categories in each columns---------------------")
    # Second pass: one summary line per column (unique() also counts NaN as a value).
    for name in object_columns:
        n_categories = len(df[name].unique())
        print("There are {} categories in {}".format(n_categories, name))
Count_categorcial_variables(df)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(20, 8)
sns.countplot(x = 'age',data = df)
ax.set_xlabel('age',fontsize = 15)
ax.set_ylabel('count',fontsize = 15)
ax.set_title('age count distribution',fontsize = 10)
sns.despine()

In [None]:
# What kind of jobs clients this bank have, if you cross jobs with default, loan or housing, there is no relation
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.countplot(x = 'job', data = df)
ax.set_xlabel('Job', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Job Count Distribution', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
# What kind of 'marital clients' this bank have, if you cross marital with default, loan or housing, there is no relation
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
sns.countplot(x = 'marital', data = df)
ax.set_xlabel('Marital', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Marital Count Distribution', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
# What kind of 'education clients this bank have, if you cross education with default, loan or housing, there is no relation
fig, ax = plt.subplots()
fig.set_size_inches(20, 5)
sns.countplot(x = 'education', data = df)
ax.set_xlabel('Education', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Education Count Distribution', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
# Default, has credit in default ?
fig, (ax1, ax2, ax3) = plt.subplots(nrows = 1, ncols = 3, figsize = (20,8))
sns.countplot(x = 'default', data = df, ax = ax1, order = ['no', 'unknown', 'yes'])
ax1.set_title('Default', fontsize=15)
ax1.set_xlabel('')
ax1.set_ylabel('Count', fontsize=15)
ax1.tick_params(labelsize=15)

# Housing, has housing loan ?
sns.countplot(x = 'housing', data = df, ax = ax2, order = ['no', 'unknown', 'yes'])
ax2.set_title('Housing', fontsize=15)
ax2.set_xlabel('')
ax2.set_ylabel('Count', fontsize=15)
ax2.tick_params(labelsize=15)

# Loan, has personal loan ?
sns.countplot(x = 'loan', data = df, ax = ax3, order = ['no', 'unknown', 'yes'])
ax3.set_title('Loan', fontsize=15)
ax3.set_xlabel('')
ax3.set_ylabel('Count', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.25)

In [None]:
print('Default:\n No credit in default:'     , df[df['default'] == 'no']     ['age'].count(),
              '\n Unknown credit in default:',df[df['default'] == 'unknown']['age'].count(),
              '\n Yes to credit in default:' , df[df['default'] == 'yes']    ['age'].count())

In [None]:
print('Housing:\n No housing in loan : '     , df[df['housing'] == 'no']     ['age'].count(),
              '\n Unknown housing in loan : ', df[df['housing'] == 'unknown']['age'].count(),
              '\n Yes to housing in loan : ' , df[df['housing'] == 'yes']    ['age'].count())

In [None]:
# Personal-loan status counts. The header previously read 'Housing:' although
# every row filters on the 'loan' column.
print('Loan:\n No to personal loan:'     , df[df['loan'] == 'no']     ['age'].count(),
              '\n Unknown to personal loan:', df[df['loan'] == 'unknown']['age'].count(),
              '\n Yes to personal loan:'    , df[df['loan'] == 'yes']    ['age'].count())

In [None]:
print('Default:\n No credit in default:'     , df[df['default'] == 'no']     ['age'].count(),
            '\n Unknown to credit in default:', df[df['default'] == 'unknown']['age'].count(),
              '\n Yes to credit in default:' , df[df['default'] == 'yes']    ['age'].count())

In [None]:
# Housing-loan status counts. The 'unknown' row previously filtered on the
# 'default' column and therefore reported the wrong number.
print('Housing:\n No housing in loan:'     , df[df['housing'] == 'no']     ['age'].count(),
               '\n Unknown to housing in loan:', df[df['housing'] == 'unknown']['age'].count(),
               '\n Yes to housing in loan:' , df[df['housing'] == 'yes']    ['age'].count())

In [None]:
# Personal-loan status counts. The 'unknown' row previously filtered on the
# 'default' column, and the header said 'Housing:' instead of naming the loan column.
print('Loan:\n No to personal loan:'     , df[df['loan'] == 'no']     ['age'].count(),
            '\n Unknown to personal loan:', df[df['loan'] == 'unknown']['age'].count(),
              '\n Yes to personal loan:'    , df[df['loan'] == 'yes']    ['age'].count())

In [None]:
print("Kind of Contact: \n", df['contact'].unique())
print("\nWhich Months this Campaing Work: \n", df['month'].unique())
print("\nWhich Days of Week this Campaing Work: \n", df['day_of_week'].unique())

In [None]:
# Summary statistics of the last-contact call duration, converted from seconds to minutes.
print("Max duration  call in minutes:  ", round((df['duration'].max()/60),1))
print("Min duration  call in minutes:   ", round((df['duration'].min()/60),1))
print("Mean duration call in minutes:   ", round((df['duration'].mean()/60),1))
print("STD duration  call in minutes:   ", round((df['duration'].std()/60),1))
# NOTE: a standard deviation about as large as the mean indicates widely spread
# (typically right-skewed) durations, not values clustered close to the mean.

In [None]:
# Quartiles
print('1º Quartile: ', df['duration'].quantile(q = 0.25))
print('2º Quartile: ', df['duration'].quantile(q = 0.50))
print('3º Quartile: ', df['duration'].quantile(q = 0.75))
print('4º Quartile: ', df['duration'].quantile(q = 1.00))
#Calculate the outliers:
  # Interquartile range, IQR = Q3 - Q1
  # lower 1.5*IQR whisker = Q1 - 1.5 * IQR 
  # Upper 1.5*IQR whisker = Q3 + 1.5 * IQR
    
print('Duration calls above: ', df['duration'].quantile(q = 0.75) + 
                      1.5*(df['duration'].quantile(q = 0.75) - df['duration'].quantile(q = 0.25)), 'are outliers')

In [None]:
# Count calls longer than the upper box-plot whisker (Q3 + 1.5 * IQR). The
# threshold is derived from the data instead of the hard-coded 644.5, so it
# stays correct if the data set changes.
q1 = df['duration'].quantile(q = 0.25)
q3 = df['duration'].quantile(q = 0.75)
upper_whisker = q3 + 1.5 * (q3 - q1)
n_outliers = (df['duration'] > upper_whisker).sum()

print('Number of outliers: ', n_outliers)
print('Number of clients: ', len(df))
#Outliers in %
print('Outliers are:', round(n_outliers*100/len(df),2), '%')

In [None]:
bank_se = df.loc[: , ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']]
bank_se.head()

In [None]:
bank_o = df.loc[: , ['campaign', 'pdays','previous', 'poutcome']]
bank_o.head()

In [None]:
pd.crosstab(index=df['job'],columns=df['y'])

In [None]:
# Mean of each numeric column per job; numeric_only avoids the TypeError that
# pandas >= 2.0 raises when averaging string columns.
df.groupby('job').mean(numeric_only=True)

In [None]:
# Mean of each numeric column per marital status; numeric_only avoids the
# TypeError that pandas >= 2.0 raises when averaging string columns.
df.groupby('marital').mean(numeric_only=True)

In [None]:
# Mean of each numeric column per education level; numeric_only avoids the
# TypeError that pandas >= 2.0 raises when averaging string columns.
df.groupby('education').mean(numeric_only=True)

In [None]:
plt.figure(figsize=(10,10))
pd.crosstab(df['job'], df['y']).plot(kind='bar')
plt.title('Purchase Frequency for Job Title')
plt.xlabel('Job')
plt.ylabel('Frequency of Purchase')
plt.savefig('purchase_fre_job')

In [None]:
table=pd.crosstab(df['marital'], df['y'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Marital Status vs Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Customers')
plt.savefig('mariral_vs_pur_stack')

In [None]:
table=pd.crosstab(df['education'], df['y'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')

In [None]:
pd.crosstab(df['day_of_week'], df['y']).plot(kind='bar')
plt.title('Purchase Frequency for Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_dayofweek_bar')

In [None]:
pd.crosstab(df['month'], df['y']).plot(kind='bar')
plt.title('Purchase Frequency for Month')
plt.xlabel('Month')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_fre_month_bar')

In [None]:
df['age'].hist()
plt.title('Histogram of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.savefig('hist_age')

In [None]:
pd.crosstab(df['poutcome'], df['y']).plot(kind='bar')
plt.title('Purchase Frequency for Poutcome')
plt.xlabel('Poutcome')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_fre_pout_bar')

# Univariate Analysis 

In [None]:
df['age'].value_counts()

In [None]:
# Number of clients per age. The column is passed as `x=` because seaborn >= 0.12
# interprets the first positional argument as `data`, which changes the plot.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['age'])

In [None]:
df['job'].value_counts()

In [None]:
# Number of clients per job; `x=` keeps the column on the x axis under seaborn >= 0.12,
# where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['job'])

In [None]:
plt.figure(figsize=(15,10))
df['job'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['marital'].value_counts()

In [None]:
# Number of clients per marital status; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['marital'])

In [None]:
plt.figure(figsize=(15,10))
df['marital'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['education'].value_counts()

In [None]:
# Number of clients per education level; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['education'])

In [None]:
plt.figure(figsize=(15,10))
df['education'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['default'].value_counts()

In [None]:
# Number of clients per credit-default status; `x=` keeps the column on the x axis
# under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(10,7))
sns.countplot(x=df['default'])

In [None]:
plt.figure(figsize=(5,5))
df['default'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['housing'].value_counts()

In [None]:
# Number of clients per housing-loan status; `x=` keeps the column on the x axis
# under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['housing'])

In [None]:
plt.figure(figsize=(10,10))
df['housing'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['loan'].value_counts()

In [None]:
# Number of clients per personal-loan status; `x=` keeps the column on the x axis
# under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['loan'])

In [None]:
plt.figure(figsize=(10,10))
df['loan'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['contact'].value_counts()

In [None]:
# Number of clients per contact channel; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['contact'])

In [None]:
plt.figure(figsize=(15,10))
df['contact'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['month'].value_counts()

In [None]:
# Number of contacts per month; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['month'])

In [None]:
plt.figure(figsize=(15,10))
df['month'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['day_of_week'].value_counts()

In [None]:
# Number of contacts per weekday; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['day_of_week'])

In [None]:
plt.figure(figsize=(15,10))
df['day_of_week'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['campaign'].value_counts()

In [None]:
# Number of clients per campaign-contact count; `x=` keeps the column on the x axis
# under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['campaign'])

In [None]:
df['previous'].value_counts()

In [None]:
# Number of clients per count of previous contacts; `x=` keeps the column on the
# x axis under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['previous'])

In [None]:
plt.figure(figsize=(15,10))
df['previous'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['poutcome'].value_counts()

In [None]:
# Number of clients per previous-campaign outcome; `x=` keeps the column on the
# x axis under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['poutcome'])

In [None]:
df['emp.var.rate'].value_counts()

In [None]:
# Number of rows per employment-variation-rate value; `x=` keeps the column on the
# x axis under seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['emp.var.rate'])

In [None]:
plt.figure(figsize=(15,10))
df['emp.var.rate'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['cons.price.idx'].value_counts()

In [None]:
df['nr.employed'].value_counts()

In [None]:
# Number of rows per nr.employed value; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(14,7))
sns.countplot(x=df['nr.employed'])

In [None]:
plt.figure(figsize=(17,12))
df['nr.employed'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['y'].value_counts()

In [None]:
# Class balance of the target; `x=` keeps the column on the x axis under
# seaborn >= 0.12, where the first positional argument is `data`.
plt.subplots(figsize=(5,5))
sns.countplot(x=df['y'])

In [None]:
plt.figure(figsize=(17,12))
df['y'].value_counts().plot.pie(autopct="%0.2f%%")

# Count Plot

In [None]:
# Three side-by-side count plots placed in the top row of a 3x3 subplot grid.
fig=plt.figure(figsize=(20,20))

ax1=fig.add_subplot(331)
ax2=fig.add_subplot(332)
ax3=fig.add_subplot(333)

sns.countplot(x = 'loan',data = df,ax = ax1)
ax1.set_title('Loan Taken ')

sns.countplot(x = 'contact',data = df,ax = ax2)
ax2.set_title('Contact Medium ')

# Target ax3 explicitly; without `ax=` the plot only landed there because ax3
# happened to be the current axes.
sns.countplot(x='marital',data = df,ax = ax3)
ax3.set_title('Marital Status')

In [None]:
#Marital, education and contact, Default, housing and loan vs Y
plt.figure(figsize = [20,10])

plt.subplot(231)
sns.countplot(x = 'marital', hue = 'y', data = df)

plt.subplot(232)
sns.countplot(x = 'education', hue = 'y', data = df)

plt.subplot(233)
sns.countplot(x = 'contact', hue = 'y', data = df)

plt.subplot(234)
sns.countplot(x = 'default', hue = 'y', data = df)

plt.subplot(235)
sns.countplot(x = 'housing', hue = 'y', data = df)

plt.subplot(236)
sns.countplot(x = 'loan', hue = 'y', data = df)

In [None]:
#Job and Month vs y
plt.figure(figsize=(14,12))

plt.subplot(211)
sns.countplot(y = 'job', data = df, hue = 'y')
plt.title('Job vs Term Deposit')

plt.subplot(212)
sns.countplot(x = 'month', data = df, hue = 'y')
plt.title('Last contact month vs Term Deposit')

In [None]:
#Poutcome vs Y
plt.figure(figsize=(17,5))
sns.countplot(x = 'poutcome', data = df, hue = 'y')
plt.title('Outcome of the previous campaign vs Y')

In [None]:
#Age against Y
g = sns.FacetGrid(data = df, hue = 'y', height = 4, aspect = 2)
g.map(sns.kdeplot,'age', shade = True, legend = True)
g.add_legend()
plt.title('Age against Y')

In [None]:
plt.figure(figsize = (15,8))
sns.countplot(x = df['job'], hue = df['y'])

In [None]:
plt.figure(figsize = (15,8))
sns.countplot(x = df['marital'],hue = df['y'])

In [None]:
plt.figure(figsize = (15,8))
sns.countplot(x=df['education'],hue=df['y'])

In [None]:
plt.figure(figsize = (15,8))
sns.countplot(x=df['default'],hue=df['y'])

In [None]:
sns.countplot(x=df['housing'],hue=df['y'])

In [None]:
sns.countplot(x=df['loan'],hue=df['y'])

In [None]:
sns.countplot(x=df['contact'],hue=df['y'])

In [None]:
plt.figure(figsize=(15,8))
sns.countplot(x=df['month'],hue=df['y'])

In [None]:
f, ax = plt.subplots(figsize = (15, 4))
sns.countplot(y = "education", hue = 'marital', data = df).set_title('Education Level vs Marital Status Distribution')

In [None]:
plt.subplots(figsize=(10,5))
sns.countplot(x = "day_of_week",hue = "y",data = df)
plt.show()

In [None]:
plt.figure(figsize = (10,5))
df["pdays_bin"] = pd.cut(df.pdays,bins = 5,labels = ["c1","c2","c3","c4","c5"])
sns.countplot(x = "pdays_bin",hue = "y",data = df)
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x = "poutcome",hue = "y",data = df)

# Cross Tab Plot

In [None]:
pd.crosstab(index=df['job'],columns=df['education'])

In [None]:
# Y VS JOB
pd.crosstab(df['job'],df['y']).style.background_gradient(cmap='winter')

In [None]:
# Y VS MARITAL
pd.crosstab(df['marital'],df['y']).style.background_gradient(cmap='spring')

In [None]:
# Y VS EDUCATION
pd.crosstab(df['education'],df['y']).style.background_gradient(cmap='autumn')

In [None]:
# Y VS LOAN DEFAULT
pd.crosstab(df['default'],df['y']).style.background_gradient(cmap='cool')

In [None]:
# y VS HOUSING LOAN
pd.crosstab(df['housing'],df['y']).style.background_gradient(cmap='Wistia')

In [None]:
# Y VS PERSONAL LOAN
pd.crosstab(df['loan'],df['y']).style.background_gradient(cmap='bwr')

In [None]:
# Y VS CONTACT
pd.crosstab(df['contact'],df['y']).style.background_gradient(cmap='seismic')

In [None]:
# Y VS MONTH
pd.crosstab(df['month'],df['y']).style.background_gradient(cmap='PRGn')

In [None]:
#Analysis of job and education
pd.crosstab(index = df['job'],columns = df['education'])

# Pivot Table Plot

In [None]:
# Mean of every numeric column for each target class. `values` is limited to the
# numeric columns: passing the whole frame made pivot_table average string
# columns too, which raises TypeError on pandas >= 2.0.
df.pivot_table(values=list(df.select_dtypes(include=np.number).columns), index=["y"], aggfunc="mean")

In [None]:
# Age and Y

a_df = pd.DataFrame()
a_df['age_yes'] = (df[df['y'] == 'yes'][['y','age']].describe())['age']
a_df['age_no'] = (df[df['y'] == 'no'][['y','age']].describe())['age']

a_df

In [None]:
a_df.drop(['count', '25%', '50%', '75%']).plot.bar(title = 'Age and Y Statistics')

In [None]:
# Number of contacts performed during this campaign ('campaign') and y

c_df = pd.DataFrame()
c_df['campaign_yes'] = (df[df['y'] == 'yes'][['y','campaign']].describe())['campaign']
c_df['campaign_no'] = (df[df['y'] == 'no'][['y','campaign']].describe())['campaign']

c_df

In [None]:
c_df.drop(['count', '25%', '50%', '75%']).plot.bar(title = 'Number of Contacts Performed during this Campaign and Y Statistics')

In [None]:
# Number of contacts performed during previous campaign ('previous') and y

p_df = pd.DataFrame()
p_df['previous_yes'] = (df[df['y'] == 'yes'][['y','previous']].describe())['previous']
p_df['previous_no'] = (df[df['y'] == 'no'][['y','previous']].describe())['previous']

p_df

In [None]:
p_df.drop(['count', '25%', '50%', '75%']).plot.bar(title = 'Number of Contacts performed during previous Campaign and Y Statistics')

In [None]:
plt.subplots(figsize=(10,5))
sns.countplot(x = "day_of_week",hue = "y",data = df)
plt.show()

In [None]:
plt.figure(figsize = (10,5))
df["pdays_bin"] = pd.cut(df.pdays,bins = 5,labels = ["c1","c2","c3","c4","c5"])
sns.countplot(x = "pdays_bin",hue = "y",data = df)
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x = "poutcome",hue = "y",data = df)

# Line Plot

In [None]:
# Target against age. x and y are passed by keyword because seaborn >= 0.12
# accepts only `data` positionally and raises TypeError otherwise.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['age'], y=df['y'])

In [None]:
# Target against job; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['job'], y=df['y'])

In [None]:
# Target against contact month; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['month'], y=df['y'])

In [None]:
# Target against contact weekday; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['day_of_week'], y=df['y'])

In [None]:
# Target against number of campaign contacts; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['campaign'], y=df['y'])

In [None]:
# Target against pdays; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['pdays'], y=df['y'])

In [None]:
# Age against job; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['job'], y=df['age'])

In [None]:
# Age against marital status; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['marital'], y=df['age'])

In [None]:
# Age against education level; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['education'], y=df['age'])

In [None]:
# Age against credit-default status; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['default'], y=df['age'])

In [None]:
# Age against contact month; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['month'], y=df['age'])

In [None]:
# Age against contact weekday; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['day_of_week'], y=df['age'])

In [None]:
# Age against number of campaign contacts; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['campaign'], y=df['age'])

# Violin Plot

In [None]:
plt.figure(figsize=(10,10))
sns.violinplot(x = df['age'], y = df['y'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['job'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.violinplot(x = df['marital'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['education'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.violinplot(x = df['housing'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.violinplot(x = df['loan'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.violinplot(x = df['contact'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['month'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['day_of_week'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['campaign'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['pdays'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['previous'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['poutcome'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['emp.var.rate'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.violinplot(x = df['nr.employed'], y = df['age'])

# Bar Plot

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x = df['age'],y = df['y'])

In [None]:
plt.figure(figsize=(15,15))
sns.barplot(x = df['job'],y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x = df['marital'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['education'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x = df['housing'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x = df['loan'], y = df['age'])

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x = df['contact'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['month'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['day_of_week'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['campaign'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['pdays'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['previous'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['poutcome'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['emp.var.rate'], y = df['age'])

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x = df['nr.employed'], y = df['age'])

# Box Plot

In [None]:
# Age distribution per job. x and y are passed by keyword because seaborn >= 0.12
# accepts only `data` positionally and raises TypeError otherwise.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['job'], y=df['age'])

In [None]:
# Age distribution per marital status; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['marital'], y=df['age'])

In [None]:
# Age distribution per education level; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['education'], y=df['age'])

In [None]:
# Age distribution per housing-loan status; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (10, 10))
sns.boxplot(x=df['housing'], y=df['age'])

In [None]:
# Age distribution per personal-loan status; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (10, 10))
sns.boxplot(x=df['loan'], y=df['age'])

In [None]:
# Age distribution per contact channel; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (10, 10))
sns.boxplot(x=df['contact'], y=df['age'])

In [None]:
# Age distribution per contact month; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['month'], y=df['age'])

In [None]:
# Age distribution per contact weekday; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['day_of_week'], y=df['age'])

In [None]:
# Age distribution per number of campaign contacts; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['campaign'], y=df['age'])

In [None]:
# Age distribution per pdays value; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['pdays'], y=df['age'])

In [None]:
# Age distribution per count of previous contacts; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['previous'], y=df['age'])

In [None]:
# Age distribution per previous-campaign outcome; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (10, 10))
sns.boxplot(x=df['poutcome'], y=df['age'])

In [None]:
# Age distribution per employment-variation-rate value; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (10, 10))
sns.boxplot(x=df['emp.var.rate'], y=df['age'])

In [None]:
# Age distribution per consumer-confidence-index value; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['cons.conf.idx'], y=df['age'])

In [None]:
# Age distribution per nr.employed value; keyword x/y are required by seaborn >= 0.12.
plt.figure(figsize = (15, 10))
sns.boxplot(x=df['nr.employed'], y=df['age'])

In [None]:
plt.figure(figsize = (5, 5))
sns.boxplot(df['y'], df['age'])

# GroupBy Summary Tables

In [None]:
# Summary statistics of the target 'y' within each level of every feature.
# One table is shown per feature, in the order listed here.
groupby_columns = [
    "age", "job", "marital", "education", "default", "housing", "loan",
    "contact", "month", "day_of_week", "duration", "campaign", "pdays",
    "previous", "poutcome", "emp.var.rate", "cons.price.idx",
    "cons.conf.idx", "euribor3m", "nr.employed",
]

for column in groupby_columns:
    # `display` is the rich-output helper that Jupyter/IPython kernels expose
    # as a built-in; it renders each table as the per-feature cells used to.
    display(df.groupby([column])[["y"]].describe())

Here, 'y' is the target variable. The following cells convert all categorical columns ("job", "marital", "default", "education", "housing", "loan", "contact", "day_of_week", "poutcome", "month", "y") to numerical values.

In [None]:
# Integer codes for every categorical column, keyed by column name.
# Series.map turns any value that is missing from its dictionary into NaN.
category_codes = {
    'job': {'admin.': 0, 'blue-collar': 1, 'technician': 2, 'services': 3, 'management': 4,
            'retired': 5, 'entrepreneur': 6, 'self-employed': 7, 'housemaid': 8, 'unemployed': 9,
            'student': 10, 'unknown': 11},
    'marital': {'married': 0, 'single': 1, 'divorced': 2, 'unknown': 3},
    'education': {'university.degree': 0, 'high.school': 1, 'basic.9y': 2,
                  'professional.course': 3, 'basic.4y': 4, 'basic.6y': 5,
                  'unknown': 6, 'illiterate': 7},
    'default': {'no': 0, 'unknown': 1, 'yes': 2},
    'housing': {'yes': 0, 'no': 1, 'unknown': 2},
    'loan': {'no': 0, 'yes': 1, 'unknown': 2},
    'contact': {'cellular': 0, 'telephone': 1},
    'month': {'may': 0, 'jul': 1, 'aug': 2, 'jun': 3, 'nov': 4, 'apr': 5, 'oct': 6,
              'sep': 7, 'mar': 8, 'dec': 9},
    'day_of_week': {'thu': 0, 'mon': 1, 'wed': 2, 'tue': 3, 'fri': 4},
    'poutcome': {'nonexistent': 0, 'failure': 1, 'success': 2},
    'y': {'no': 0, 'yes': 1},
    # NOTE(review): 'pdays_bin' is created earlier in the notebook; the labels
    # 'c1'..'c5' are assumed to match that cell -- confirm.
    'pdays_bin': {'c5': 0, 'c1': 1, 'c4': 2, 'c3': 3, 'c2': 4},
    # NOTE(review): these keys are strings, so they only match if 'nr.employed'
    # was converted to strings earlier in the notebook -- confirm.
    'nr.employed': {'4964': 0, '4992': 1, '5009': 2, '5018': 3, '5024': 4,
                    '5076': 5, '5099': 6, '5176': 7, '5191': 8, '5196': 9,
                    '5228': 10},
}

for column, codes in category_codes.items():
    # Every key above is a string, so mapping a column that is already numeric
    # (this cell re-run, or a column that was never string-typed) would replace
    # all of its values with NaN. Leave such columns untouched.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    df[column] = df[column].map(codes)

df.head()

# Violin Plot

In [None]:
# Violin plot of the encoded target 'y' for each level of a feature.
# Each entry is (column, figure size); sizes match the former per-feature cells.
target_violin_specs = [
    ('job', (15, 10)),
    ('marital', (10, 10)),
    ('education', (15, 10)),
    ('housing', (10, 10)),
    ('loan', (10, 10)),
    ('contact', (10, 10)),
    ('month', (10, 10)),
    ('day_of_week', (10, 10)),
    ('campaign', (10, 10)),
    ('pdays', (10, 10)),
    ('previous', (15, 10)),
    ('poutcome', (15, 10)),
    ('emp.var.rate', (15, 10)),
]

for column, figsize in target_violin_specs:
    plt.figure(figsize=figsize)
    ax = sns.violinplot(x=df[column], y=df['y'])
    ax.set_title('y by {}'.format(column))
    plt.show()

# Bar Plot

In [None]:
# Mean of the encoded target 'y' for each level of a feature.
# With y encoded as no -> 0 / yes -> 1, the bar height is the share of 'yes'.
# Each entry is (column, figure size).
target_bar_specs = [
    ('age', (15, 15)),
    ('job', (10, 10)),
    ('marital', (10, 10)),
    ('education', (10, 10)),
    ('default', (10, 10)),
    ('housing', (10, 10)),
    ('loan', (10, 10)),
    ('contact', (10, 10)),
    ('month', (10, 10)),
    ('day_of_week', (10, 10)),
    ('campaign', (10, 10)),
    ('pdays', (10, 10)),
    ('previous', (10, 10)),
    ('poutcome', (10, 10)),
]

for column, figsize in target_bar_specs:
    plt.figure(figsize=figsize)
    ax = sns.barplot(x=df[column], y=df['y'])
    ax.set_title('Mean of y by {}'.format(column))
    plt.show()

# Box Plot

In [None]:
# Box plot of each listed feature, split by the target class on the x axis.
# 'job', 'month' and 'day_of_week' hold the integer codes assigned in the
# encoding step, so their boxes summarize codes rather than a real scale.
target_box_columns = [
    "age", "job", "month", "day_of_week", "duration", "campaign",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m",
]

for column in target_box_columns:
    plt.figure(figsize=(10, 10))
    ax = sns.boxplot(data=df, x="y", y=column)
    ax.set_title('{} by y'.format(column))
    plt.show()

# Scatter Plot

In [None]:
# Scatter plots with age on the x axis.
# Each entry is (y-axis column, column used for the point colour).
age_scatter_specs = [
    ("campaign", "job"),
    ("job", "y"),
    ("marital", "y"),
]

for y_column, hue_column in age_scatter_specs:
    plt.figure(figsize=(12, 8))
    ax = sns.scatterplot(data=df, x="age", y=y_column, hue=hue_column)
    ax.set_title('{} vs age, coloured by {}'.format(y_column, hue_column))
    plt.show()

# Strip Plot

In [None]:
# Strip plot of age for each level of a feature, one figure per feature.
# 'education' appears once here; the notebook previously drew the identical
# education plot in two consecutive cells.
age_strip_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'month', 'day_of_week', 'campaign', 'pdays', 'poutcome', 'emp.var.rate',
    'cons.price.idx', 'cons.conf.idx',
]

for column in age_strip_columns:
    plt.figure(figsize=(15, 10))
    ax = sns.stripplot(x=column, y="age", data=df)
    ax.set_title('Age by {}'.format(column))
    plt.show()

# Dist Plot

In [None]:
# Univariate distribution of each listed column, one figure per column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot; it is kept so the figures look the same as before.
dist_columns = [
    "age", "job", "marital", "education", "default", "housing", "loan",
    "contact", "duration", "campaign", "pdays", "poutcome", "emp.var.rate",
    "cons.price.idx", "cons.conf.idx", "euribor3m",
]

for column in dist_columns:
    plt.figure(figsize=(10, 10))
    ax = sns.distplot(df[column])
    ax.set_title('Distribution of {}'.format(column))
    plt.show()

# Training and Testing Data

In [None]:
# Predictor columns fed to the models; 'y' (the target) and 'nr.employed'
# are not part of this list.
feature_columns = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
    'cons.conf.idx', 'euribor3m', 'pdays_bin',
]
X = df.loc[:, feature_columns]
X.head()

In [None]:
# Target kept as a one-column DataFrame (not a Series).
target_columns = ['y']
Y = df.loc[:, target_columns]
Y.head()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the shuffled
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

In [None]:
# Standardize the features with a scaler fitted on the training split only.
# NOTE: X_train and X_test are overwritten with the scaled arrays, so re-running
# this cell without re-running the split fits the scaler on already-scaled data.
sc = StandardScaler()
# Learn the scaling statistics from the training rows and apply them.
X_train = sc.fit_transform(X_train)
# Apply the same training statistics to the test rows (no re-fitting).
X_test = sc.transform(X_test)

# Linear Regression

In [None]:
# Ordinary least squares on the scaled training split (fit returns the estimator).
regressor = LinearRegression().fit(X_train, Y_train)

# Fitted intercept first, then the per-feature slopes.
print(regressor.intercept_)
print(regressor.coef_)

In [None]:
# Score the linear model on the held-out rows; the squared error is computed
# once and reused for the root.
Y_pred = regressor.predict(X_test)
linreg_mae = metrics.mean_absolute_error(Y_test, Y_pred)
linreg_mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', linreg_mae)
print('Mean Squared Error:', linreg_mse)
print('Root Mean Squared Error:', np.sqrt(linreg_mse))

In [None]:
# Fit a linear regression on the full X / Y and evaluate it on the same rows,
# so the reported scores are in-sample.
regression_model = LinearRegression()
regression_model.fit(X, Y)

# NOTE: this rebinds Y_pred, replacing the predictions of the previous cell.
Y_pred = regression_model.predict(X)

# Model evaluation. The root is taken here so that `rmse` really is the root
# mean squared error it is printed as; it previously held the plain MSE.
rmse = np.sqrt(mean_squared_error(Y, Y_pred))
r2 = r2_score(Y, Y_pred)

# Printing values
print('Slope:' ,regression_model.coef_)
print('Intercept:', regression_model.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
import statsmodels.api as sm

# Self-contained OLS illustration on synthetic data (not the bank data set).
# The demo uses its own variable names so the notebook's X / Y are left alone,
# and a seeded generator so the summary is the same on every run.
demo_rng = np.random.default_rng(42)
x_demo = demo_rng.random(100)
y_demo = x_demo + demo_rng.random(100) * 0.1

results = sm.OLS(y_demo, sm.add_constant(x_demo)).fit()

print(results.summary())

plt.scatter(x_demo, y_demo)

# add_constant prepends the constant column, so params[0] is the intercept and
# params[1] is the slope; the fitted line is intercept + slope * x.
demo_intercept, demo_slope = results.params
x_line = np.linspace(0, 1, 100)
plt.plot(x_line, demo_intercept + demo_slope * x_line)

plt.show()

In [None]:
# Single-feature linear regression between the first two columns of df,
# selected by position and reshaped to column vectors.
# NOTE(review): presumably column 0 is 'age' and column 1 is 'job' -- confirm
# against df.columns.
# NOTE: this rebinds X and Y; the cells below read these new values.
X = df.iloc[:, 0].values.reshape(-1, 1)
Y = df.iloc[:, 1].values.reshape(-1, 1)
linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)
# In-sample predictions, used to draw the fitted line.
Y_pred = linear_regressor.predict(X)

In [None]:
# Observed points with the fitted regression line drawn over them in red.
fit_ax = plt.gca()
fit_ax.scatter(X, Y)
fit_ax.plot(X, Y_pred, color='red')
plt.show()

In [None]:
from sklearn import linear_model

# Same regression again through the sklearn.linear_model namespace;
# fit() returns the fitted estimator.
regr = linear_model.LinearRegression().fit(X, Y)

for label, value in (('Intercept: \n', regr.intercept_),
                     ('Coefficients: \n', regr.coef_)):
    print(label, value)

# Logistic Regression

In [None]:
# Logistic regression on the scaled training split.
logreg = LogisticRegression()
# Y_train is a one-column frame; scikit-learn classifiers expect 1-D labels and
# otherwise emit a DataConversionWarning (hidden here by the warnings filter).
logreg.fit(X_train, np.ravel(Y_train))

In [None]:
# Hard class predictions for the held-out rows, then the mean test accuracy.
y_pred = logreg.predict(X_test)
logreg_test_accuracy = logreg.score(X_test, Y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_test_accuracy))

In [None]:
# Regression-style error metrics applied to the predicted class labels;
# the squared error is computed once and reused for the root.
logreg_mae = metrics.mean_absolute_error(Y_test, y_pred)
logreg_mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Absolute Error:', logreg_mae)
print('Mean Squared Error:', logreg_mse)
print('Root Mean Squared Error:', np.sqrt(logreg_mse))

In [None]:
# Confusion matrix of the test-set predictions in y_pred.
# The result gets its own name: assigning it to `confusion_matrix` would shadow
# the imported function, and running the cell a second time would then fail
# with "'numpy.ndarray' object is not callable".
logreg_confusion = confusion_matrix(Y_test, y_pred)
print(logreg_confusion)

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred
# (the logistic-regression labels when the cells are run in order).
print(classification_report(Y_test, y_pred))

In [None]:
# ROC curve for the logistic regression on the held-out rows.
# Both the curve and the area in the legend are computed from the predicted
# probability of class 1. Scoring the AUC on hard 0/1 predictions, as was done
# before, gives the area of a single-threshold curve, not of the curve drawn.
logreg_positive_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(Y_test, logreg_positive_proba)
fpr, tpr, thresholds = roc_curve(Y_test, logreg_positive_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: a classifier with no skill.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Written to the working directory before the figure is shown.
plt.savefig('Log_ROC')
plt.show()

# Random Forest Classifier Model

In [None]:
# Create and fit the RandomForestClassifier model.
# random_state pins the forest's randomness so the scores below are the same
# on every run; 42 matches the seed used for the train/test split.
rfc = RandomForestClassifier(random_state=42)
# Pass the labels as a 1-D array; a one-column frame triggers a
# DataConversionWarning (hidden here by the warnings filter).
rfc.fit(X_train, np.ravel(Y_train))

In [None]:
# Predict class labels for the held-out rows with the random forest.
# NOTE: this rebinds y_pred, replacing the previous model's predictions.
y_pred = rfc.predict(X_test)
y_pred

In [None]:
# Regression-style error metrics for the labels in y_pred; the squared error
# is computed once and reused for the root.
rfc_mae = metrics.mean_absolute_error(Y_test, y_pred)
rfc_mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Absolute Error:', rfc_mae)
print('Mean Squared Error:', rfc_mse)
print('Root Mean Squared Error:', np.sqrt(rfc_mse))

In [None]:
# Share of held-out rows whose label in y_pred is correct; kept in rfc_acc
# for the model comparison chart.
rfc_acc = accuracy_score(y_true=Y_test, y_pred=y_pred)
print('The accuracy score using the RandomForestClassifier (befor resample) is :',rfc_acc)

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred
# (the random-forest labels when the cells are run in order).
print(classification_report(Y_test, y_pred))

# Decision Tree Classifier

In [None]:
# Create and fit the DecisionTreeClassifier model.
# random_state makes the fitted tree reproducible between runs; 42 matches
# the seed used for the train/test split.
dtc = DecisionTreeClassifier(random_state=42)
# Pass the labels as a 1-D array rather than a one-column frame.
dtc.fit(X_train, np.ravel(Y_train))

In [None]:
# Predict class labels for the held-out rows with the decision tree.
# NOTE: this rebinds y_pred, replacing the previous model's predictions.
y_pred = dtc.predict(X_test)
y_pred

In [None]:
# Regression-style error metrics for the labels in y_pred; the squared error
# is computed once and reused for the root.
dtc_mae = metrics.mean_absolute_error(Y_test, y_pred)
dtc_mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Absolute Error:', dtc_mae)
print('Mean Squared Error:', dtc_mse)
print('Root Mean Squared Error:', np.sqrt(dtc_mse))

In [None]:
# Share of held-out rows whose label in y_pred is correct; kept in dtc_acc
# for the model comparison chart.
dtc_acc = accuracy_score(y_true=Y_test, y_pred=y_pred)
print('The accuracy score with using the decision tree classifier is :',dtc_acc)

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred
# (the decision-tree labels when the cells are run in order).
print(classification_report(Y_test, y_pred))

# K Nearest Neighbors Classifier

In [None]:
# Create and fit the KNeighborsClassifier model (3 nearest neighbours).
knn = KNeighborsClassifier(n_neighbors=3)
# Pass the labels as a 1-D array; a one-column frame triggers a
# DataConversionWarning (hidden here by the warnings filter).
knn.fit(X_train, np.ravel(Y_train))

In [None]:
# Predict class labels for the held-out rows with the KNN model.
# NOTE: the result is stored in Y_pred (capital Y), not in y_pred as for the
# other classifiers; y_pred still holds the previous model's labels.
Y_pred = knn.predict(X_test)

In [None]:
# Regression-style error metrics for the KNN labels in Y_pred; the squared
# error is computed once and reused for the root.
knn_mae = metrics.mean_absolute_error(Y_test, Y_pred)
knn_mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', knn_mae)
print('Mean Squared Error:', knn_mse)
print('Root Mean Squared Error:', np.sqrt(knn_mse))

In [None]:
# KNN accuracy score, computed from a fresh prediction on the held-out rows
# and kept in Knn_acc for the model comparison chart.
knn_test_labels = knn.predict(X_test)
Knn_acc = accuracy_score(Y_test, knn_test_labels)
print('The accuracy socre using the KNeighborsClassifier is :',Knn_acc)

In [None]:
# Per-class precision / recall / F1 for the KNN predictions held in Y_pred.
print(classification_report(Y_test, Y_pred))

# Evaluation
Comparing Model Accuracy

In [None]:
# Bar chart of the test accuracy of the three classifiers, one bar per model.
model_labels = ('RFC', 'DTC', 'KNN')
accuracy_frame = pd.DataFrame([rfc_acc, dtc_acc, Knn_acc])
accuracy_frame.plot.bar()
# Label the bars with the model names and keep the labels horizontal.
plt.xticks(np.arange(len(model_labels)), model_labels, rotation=0)
plt.legend().remove()
plt.ylim(0, 1)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Comparing Model Accuracy');

In [None]:
# ROC/AUC curves of the three classifiers drawn on one shared axis.
plt.figure(figsize = (10,10))
ax = plt.gca()
ax.set_title('Receiver Operating Characteristic',size = 15)

# plot_roc_curve was removed in scikit-learn 1.2. RocCurveDisplay.from_estimator
# is its replacement and takes the same arguments; fall back to the old helper
# on releases that predate from_estimator.
try:
    draw_roc_curve = RocCurveDisplay.from_estimator
except (NameError, AttributeError):
    draw_roc_curve = plot_roc_curve

rfc_disp = draw_roc_curve(rfc, X_test, Y_test, ax = ax, alpha = 0.8, name = 'RandomForestClassifier')
tree_disp = draw_roc_curve(dtc, X_test, Y_test,  ax = ax, alpha = 0.8, name = 'DecisionTreeClassifier')
knn_disp =  draw_roc_curve(knn, X_test, Y_test,  ax = ax, alpha = 0.8, name = 'KNeighborsClassifier')