In [1]:
##importing necessary libraries


import numpy as np
import pandas as pd
import time
import math
import sklearn.tree
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neighbors import DistanceMetric




## RETURNS percentage% OF DATAFrame AFTER SAMPLING

def sampling(dataframe,percentage):
    
    return dataframe.sample(frac=percentage).reset_index(drop=True)
    

## PRE-PROCESSING:
## CONVERTS WORD CLASSES TO NUMERIC CLASSES:
def preprocess_word_to_num(dataframe):
    df_copy=dataframe.copy(deep=True)
       
    for column in df_copy.columns:
        if column=="y":
            df_copy[column]=df_copy[column].replace({"yes":1,"no":0}) 
        if column=="job":
            df_copy["job"].replace({"management":1,
                                    "blue-collar":2,
                                    "admin":3,
                                    "entrepreneur":4,
                                    "services":5,
                                    "self-employed":6,
                                    "retired":7,
                                    "technician":8,
                                    "housemaid":9,
                                    "student":10,
                                    "unemployed":11,
                                    "unknown":12})
        value=df_copy[column][df_copy.first_valid_index()]
   
        if not (isinstance(value, np.int64) or isinstance(value, np.float64)):
            l1=list(set(df_copy[column]))
            l2=list(range(0,len(list(set(df_copy[column])))))
            df_copy[column]=df_copy[column].replace(l1,l2)
    return df_copy


##NORMALIZES THE FEATURE DATA POINTS BY REPLACING X WITH (X-X_MEAN)/X_STD

def normalize(dataframe):
        df_copy=dataframe.copy(deep=True)
        for column in df_copy.columns:
            std_dev=np.std(df_copy[column])
            mean=np.mean(df_copy[column])
            l1=list(set(df_copy[column]))
            l2=list((item-mean)/std_dev for item in l1)
            df_copy[column]=df_copy[column].replace(l1,l2)
        return df_copy

    
##DROPS UNNECESSARY FEARURE FFROM TRAINING DATA:

def drop_column(dataframe,col):
    return dataframe.drop(col,axis=1)


#counting time from now

start=time.time()

#redaing data from csv

url="https://github.com/shadow23-cmi/DT-SVM-NB/raw/master/bank-additional-full.csv"
df1=pd.read_csv(url,sep=";")

#storing all the distinct months and distinct consumer price index
# as, dividing the data points on maothly basis needs these 2 parameters
#  as, there are more than 1 year and no 2 same month of diff yaer have same same consumer price index

month_unique=list(set(df1["month"]))
cons_price_idx_unique=list(set(df1["cons.price.idx"]))



#seperating on basis of outcome
#so that the majority class can be undersampled for better fitting

df_no=df1[df1["y"]=="no"]
df_yes=df1[df1["y"]=="yes"]

print("Seperating by outcme: Done")


#reseting df_no indices

df_no=df_no.reset_index(drop=True)

#taking only the following features while calculating data point distances
# as, taking monthly data for clustering the rest of the features are of no need as they are monthly data
#  so, no difference is made

df_no_undersampling_clustered=df_no[["age", 'job', 'marital', 'education', 'default', 'housing', 'loan',
                                    'contact', 'day_of_week', "month", 'campaign', 'pdays',"cons.price.idx"]]

print("starting clustering by month:")


#this object will give distance matrix based on "taxi cab " matrix:

dist = DistanceMetric.get_metric('manhattan')

# contains the ps than can be deleted for under-sampling
# and their centers

pts_to_delete=[]
centers=[]

# Here we are calculating the distance matrix of monthly data pts
#  since, the distance matrix is symmetric we are considering only the upper triangular part
#   there, the pts that  have >= 4 pts in therir 2 radius neighbourhood are only selected 
#    and, the rst of the pts in its neighbourhood are discarded
for month in month_unique:
    for index in cons_price_idx_unique:
        
        
        df_month_wise=df_no_undersampling_clustered.loc[(df_no_undersampling_clustered["month"] == month )
                                                        &(df_no_undersampling_clustered["cons.price.idx"]==index)]
        start_index=df_month_wise.first_valid_index()
               
        if not df_month_wise.empty:
            
            df_month_wise=preprocess_word_to_num(df_month_wise)
            distance_matrix=dist.pairwise(df_month_wise)
            
            for i in range(len(distance_matrix)):
                
                temp=[]
                
                for j in range(i+1,len(distance_matrix)):
                    
                    if distance_matrix[i][j]==0:
                        pts_to_delete.append(j+start_index)
                    
                    if distance_matrix[i][j]<=2:
                        temp.append(j+start_index)        
                
                if len(temp)>=4:
                    pts_to_delete += temp
                    centers.append(i+start_index)



print("done")


#storing unnecessary data pts

drop=list((set(pts_to_delete)))


print("started droping unnecessary rows:")

#Dropping the unnecesary data pts after clustering

df_no.drop(df_no.index[drop],inplace=True)
print("done")
df_no=df_no.reset_index(drop=True)

#taking 80% the no clss data

df_no=sampling(df_no,0.8)

#creating mixed data types by combining all yes class and no clss data
#randomsampling them

df=df_no.append(df_yes)
df=df.reset_index(drop=True)


#Pre processing data by sampling and converting str data types to numerical types:

df=sampling(df,1)
df=preprocess_word_to_num(df)

print("started droping unnecessary columns:")

#Dropping unnecessary features

df2=drop_column(df,"y")

#Normalizing the features to mean 0 standerd deviation 1

df2=normalize(df2)


# Final Data for fitting:

data=df2.to_numpy()
target=df["y"].to_numpy()

#Fitting naive bayes classifire
naive_bayes=GaussianNB()

data_train,data_test,target_train,target_test=train_test_split(data,target,test_size=.1,shuffle=True)


crs=cross_validate(naive_bayes, 
                     data, 
                     target,
                     cv=10,
                     scoring=["recall","precision","accuracy","f1"],
                     n_jobs=2)

#printing output:

print("For Naive bayesian ,\n")
print("\t After cross validation the  different scores are as follows:")
print("recall:",crs["test_recall"],"\n",
      "precision:",crs["test_precision"],"\n",
      "accuracy:",crs["test_accuracy"],"\n",
      "fscore:",crs["test_f1"],"\n")
print("recall mean   :", crs["test_recall"].mean(),"with starderd deviation:",crs["test_recall"].std())
print("precision mean:",crs["test_precision"].mean(),"with starderd deviation:",crs["test_recall"].std())
print("accuracy mean :",crs["test_accuracy"].mean(),"with starderd deviation:",crs["test_recall"].std())
print("fscore   mean :",crs["test_f1"].mean(),"with starderd deviation:",crs["test_recall"].std())

end=time.time()
total_time=end-start
# out put file:

with open("output_file.txt","a+")as output_file:
    output_file.write("\nFor Naive Bayesian ,\n")
    output_file.write("\t After cross validation the  different scores are as follows:")

    output_file.write("\n\n\t accuracy: \t")
    output_file.write(str(crs["test_accuracy"]))

    output_file.write("\n\n\t recall: \t")

    output_file.write(str(crs["test_recall"])) 
    output_file.write("\n\n\t precision: \t")


    output_file.write(str(crs["test_precision"])) 
    output_file.write("\n\n\t fscore: \t")

    output_file.write(str(str(crs["test_f1"])))                  

    x="\n\t recall mean   :\t"+str(round(crs["test_recall"].mean(),2))+"\t with starderd deviation:\t"+str(round(crs["test_recall"].std(),3))
    output_file.write(x)
    y="\n\t precision mean:\t"+str(round(crs["test_precision"].mean(),1))+"\t with starderd deviation:\t"+str(round(crs["test_recall"].std(),3))
    output_file.write(y)
    z="\n\t accuracy mean :\t"+str(round(crs["test_accuracy"].mean(),2))+"\t with starderd deviation:\t"+str(round(crs["test_recall"].std(),3))
    output_file.write(z)
    f1="\n\t fscore mean   :\t"+str(round(crs["test_f1"].mean(),2))+"\t with starderd deviation:\t"+str(round(crs["test_f1"].std(),3))
    output_file.write(f1)
    output_file.write("\n")
    output_file.write("Time taken: "+str(total_time)+"secs")
    output_file.write("\n ===================================================================")

print("Time taken:",end-start)

Seperating by outcme: Done
starting clustering by month:
done
started droping unnecessary rows:
done
started droping unnecessary columns:
For Decision Tree ,

	 After cross validation the  different scores are as follows:
recall: [0.63577586 0.63577586 0.63577586 0.59482759 0.60344828 0.65086207
 0.62284483 0.62068966 0.63362069 0.61637931] 
 precision: [0.5353902  0.52304965 0.51304348 0.49819495 0.47945205 0.52521739
 0.4923339  0.51428571 0.51488616 0.49225473] 
 accuracy: [0.82247285 0.81704261 0.81244779 0.80526536 0.79607188 0.81821981
 0.80234016 0.8127873  0.81320518 0.80234016] 
 fscore: [0.58128079 0.57392996 0.56785371 0.54223969 0.53435115 0.5813282
 0.54995243 0.5625     0.56811594 0.54736842] 

recall mean   : 0.625 with starderd deviation: 0.016012220362936813
precision mean: 0.5088108228555484 with starderd deviation: 0.016012220362936813
accuracy mean : 0.8102193078461581 with starderd deviation: 0.016012220362936813
fscore   mean : 0.5608920274984014 with starderd dev