In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.feature_selection import SelectKBest,chi2,f_regression
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score,confusion_matrix,classification_report
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config Completer.use_jedi = False

# Scaler instance shared by the notebook.
scale = StandardScaler()

# The CSV is read with ';' as the field separator.
csv_path = '/kaggle/input/bank-marketing-campaigns-dataset/bank-additional-full.csv'
df = pd.read_csv(csv_path, sep=";")
df.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show only the object-typed columns.
df.select_dtypes(include='object')

In [None]:
# Frequency of each distinct value in the target column 'y'.
df['y'].value_counts()

# Feature Encoding

In [None]:
# Turn the 'unknown' placeholder into a real missing value.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
df = df.replace('unknown', np.nan)

# Encode the target as 1 ('yes') / 0 ('no'); any other value becomes NaN.
# The dtype guard keeps the cell safe to re-run: mapping an already encoded
# column a second time would turn every label into a missing value.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
def one_hot_encoder(df,columns,prefixes):
    """Return a copy of ``df`` with each listed column replaced by dummy columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to expand.
        prefixes: Prefix for the dummy column names, paired with ``columns``
            by position.

    Returns:
        A new DataFrame in which every listed column has been dropped and its
        dummy columns appended on the right, in the order of ``columns``.
    """
    encoded = df.copy()
    for source_column, dummy_prefix in zip(columns, prefixes):
        indicator_columns = pd.get_dummies(encoded[source_column], prefix=dummy_prefix)
        # Append the indicators first, then remove the source column.
        widened = pd.concat([encoded, indicator_columns], axis=1)
        encoded = widened.drop(columns=source_column)
    return encoded

def ordinal_encoder(df,columns,orderings):
    """Return a copy of ``df`` with each listed column replaced by rank positions.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to encode.
        orderings: One sequence per column, paired by position; each value is
            replaced by its position in that sequence.

    Returns:
        A new DataFrame with the listed columns encoded.

    Raises:
        ValueError: If a value in a column does not occur in its ordering.
    """
    encoded = df.copy()
    for name, ranked_values in zip(columns, orderings):
        # The sequence's own ``index`` method does the value -> position lookup.
        encoded[name] = encoded[name].apply(ranked_values.index)
    return encoded
    
#binary encoder
def binary_encoder(df, columns, positive_values):
    """Encode two-valued columns as 1/0 while keeping missing entries missing.

    The previous implementation first wrote 1 for the positive value and then
    overwrote every non-missing entry (including those 1s) with 0, so the
    positive class was lost. The comparison is now done in a single step.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to encode.
        positive_values: Value to encode as 1 for each column, paired with
            ``columns`` by position.

    Returns:
        A new DataFrame whose listed columns are float columns holding 1.0
        for the positive value, 0.0 for any other non-missing value and NaN
        where the original entry was missing.
    """
    df = df.copy()
    for column, positive_value in zip(columns, positive_values):
        values = df[column]
        is_positive = (values == positive_value).astype(float)
        # `where` keeps the encoded value only where the source was present.
        df[column] = is_positive.where(values.notna())
    return df

In [None]:
# Nominal column -> prefix used for its dummy columns.
nominal_prefix_by_feature = {
    'job': 'j',
    'marital': 'm',
    'education': 'e',
    'day_of_week': 'd',
    'month': 'mo',
    'poutcome': 'p',
}
nominal_features = list(nominal_prefix_by_feature.keys())
prefixes = list(nominal_prefix_by_feature.values())

df = one_hot_encoder(df, nominal_features, prefixes)

In [None]:
binary_features = ['default','housing','loan','contact']
positive_values = ['yes','yes','yes','cellular']
binVal = {'yes':1,'no':0}
contVal = {'cellular':1,'telephone':0}

# Assign each encoded column back to the frame. Calling
# `df[col].replace(..., inplace=True)` works on a temporary Series, and under
# pandas Copy-on-Write that change never reaches `df`.
# `mapping.get(value, value)` keeps the semantics of `replace`: values that are
# not keys of the mapping (NaN included) pass through untouched, so re-running
# this cell leaves already encoded columns as they are.
for column, mapping in zip(binary_features, [binVal, binVal, binVal, contVal]):
    df[column] = df[column].map(lambda value: mapping.get(value, value))

In [None]:
# Preview the first rows of the frame.
df.head()

# Filling Missing Values

In [None]:
# Only the final expression of a cell is displayed automatically, so the
# per-column null counts are printed explicitly; previously they were computed
# and then discarded.
print(df.isna().sum())
print(df['housing'].unique())

In [None]:
# Replace any NaN in these columns with that column's own mean.
mean_imputed_columns = ['default','housing','loan']
df[mean_imputed_columns] = df[mean_imputed_columns].fillna(df[mean_imputed_columns].mean())

In [None]:
# Total number of missing cells across the whole frame.
total_missing = df.isna().sum().sum()
print(f"missing values are: {total_missing}")

# Splitting / Scaling Data

In [None]:
# Separate the target from the features.
y = df['y']
X = df.drop(columns='y').copy()

# Stratified 75/25 split with a fixed seed.
trainX,testX,trainY,testY = train_test_split(X,y,random_state=36,stratify=y,test_size=0.25)

# Scaling X: the scaler learns its mean and standard deviation from the
# training split only, and the test split is transformed with those same
# statistics. Re-fitting on the test split (as `fit_transform` did) leaks test
# information and scales the two splits inconsistently.
trainX = scale.fit_transform(trainX)
testX = scale.transform(testX)

# Feature Selection

In [None]:
# Class balance of the target. The column is passed by keyword because newer
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='y', data=df);

In [None]:
# Score every feature against the target (k="all" keeps all of them).
best_feature = SelectKBest(k="all").fit(trainX, trainY)

# Pair each feature name with its score and list the highest scores first.
bestFeatures = (
    pd.DataFrame({'Feature': X.columns, 'Score': best_feature.scores_})
    .sort_values(by="Score", ascending=False)
)
bestFeatures

# Applying ML Algorithms

In [None]:
# Metrics collected per model name; filled in by predictionResult().
model_acc_scores = {}
def predictionResult(testY,pred,model):
    """Print and record evaluation metrics for one set of predictions.

    Args:
        testY: True labels.
        pred: Predicted labels, aligned with ``testY``.
        model: Name used as the key in ``model_acc_scores`` and in the
            printed header.

    Side effects:
        Stores a dict with the keys 'correct', 'wrong', 'mae' and
        'accuracy_score' under ``model_acc_scores[model]`` and prints a short
        report.
    """
    conf_mat = confusion_matrix(testY,pred)
    # Correct predictions lie on the diagonal. Summing the whole diagonal is
    # right for any number of classes and also works when the matrix is 1x1,
    # where indexing [1,1] would raise IndexError.
    correct = np.trace(conf_mat)
    wrong = conf_mat.sum() - correct
    mae = mean_absolute_error(testY,pred)
    acc_score = accuracy_score(testY,pred)
    model_acc_scores[model] = {'correct':correct,'wrong':wrong,'mae':mae,'accuracy_score':acc_score}
    print("{} {} {}".format("-"*20,model,"-"*20))
    print("Model predicted {} correct and {} wrong".format(correct,wrong))
    print("Mean Absolute Error is: {}".format(round(mae*100,2)))
    print("Accuracy Score is: {}".format(round(acc_score*100,2)))

In [None]:
# Logistic regression: fit on the training split, predict the test split,
# and report the mean 10-fold cross-validation score on the training split.
model = LogisticRegression(max_iter=200)
model.fit(trainX, trainY)
pred = model.predict(testX)
cv_score = cross_val_score(model, trainX, trainY, cv=10)
print(f"Cross val score is: {round(cv_score.mean()*100,2)}%")
predictionResult(testY, pred, "LogisticRegression")

In [None]:
# Logistic regression with built-in 10-fold cross-validation.
model = LogisticRegressionCV(cv=10, max_iter=320)
model.fit(trainX, trainY)
pred = model.predict(testX)
predictionResult(testY, pred, "LogisticRegressionCV")

# Per-class precision / recall / F1 on the test split.
report = classification_report(testY, pred)
print(report)

In [None]:
# Random forest. A fixed random_state (the same seed as the train/test split)
# makes the forest, its cross-validation score and its test metrics
# reproducible from run to run; without it every run gives different numbers.
model = RandomForestClassifier(random_state=36).fit(trainX,trainY)
cv_score = cross_val_score(model,trainX,trainY,cv=10)
pred = model.predict(testX)
print("Cross val score is: {}%".format(round(cv_score.mean()*100,2)))
predictionResult(testY,pred,"RandomForestClassifier")
report = classification_report(testY,pred)
print(report)

In [None]:
# Decision tree. A fixed random_state (the same seed as the train/test split)
# makes the fitted tree and its scores reproducible from run to run.
model = DecisionTreeClassifier(random_state=36).fit(trainX,trainY)
cv_score = cross_val_score(model,trainX,trainY,cv=10)
pred = model.predict(testX)
print("Cross val score is: {}%".format(round(cv_score.mean()*100,2)))
predictionResult(testY,pred,"DecisionTreeClassifier")

In [None]:
# k-nearest neighbours with 23 neighbours: fit, predict, cross-validate, report.
model = KNeighborsClassifier(n_neighbors=23)
model.fit(trainX, trainY)
pred = model.predict(testX)
cv_score = cross_val_score(model, trainX, trainY, cv=10)
print(f"Cross val score is: {round(cv_score.mean()*100,2)}%")
predictionResult(testY, pred, "KNeighborsClassifier")

In [None]:
# Gaussian naive Bayes: fit, predict, cross-validate, report.
model = GaussianNB()
model.fit(trainX, trainY)
pred = model.predict(testX)
cv_score = cross_val_score(model, trainX, trainY, cv=10)
print(f"Cross val score is: {round(cv_score.mean()*100,2)}%")
predictionResult(testY, pred, "GaussianNB")

In [None]:
# One column per model, one row per recorded metric.
res = pd.DataFrame.from_dict(model_acc_scores)
res.head()