In [2]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
spark = SparkSession.builder.master('local[4]').appName('ml').getOrCreate()

# Dataset - 1(bank-full.csv)

In [4]:
data_1 = spark.read.csv('bank-full.csv',sep = ";",header=True,inferSchema=True)

In [5]:
data_1.count()

45211

### Converting to Pandas

In [6]:
pdf_1=data_1.toPandas()

#### to display maximum no. of columns

In [7]:
# Show every column when a wide frame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental
# `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [8]:
pdf_1.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Insert placeholder indicator columns for uniform naming before the concat operation

In [9]:
# Indicator columns added to dataset 1 as NaN placeholders, inserted one after
# another starting at column position 16.
new_cols = ["emp_var_rate", "cons_price_idx", "cons_conf_idx", "euribor_3m", "nr_employed"]
for position, indicator in enumerate(new_cols, start=16):
    pdf_1.insert(position, indicator, value=np.nan)

In [10]:
pdf_1.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,,,,,,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,,,,,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,,,,,,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,,,,,,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,,,,,,no


## Adding year mapper function to add year in dataset - 1(bank - full.csv)

In [14]:
def year_mapper(data, start_yr, end_yr=2010):
    """Return a copy of ``data`` with an inferred ``year`` column inserted first.

    Rows are walked in their stored order. Whenever a row's month comes
    earlier in the calendar than the previous row's month, the year counter
    is incremented.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``month`` column holding lowercase three-letter month
        abbreviations ("jan" .. "dec"). Any index is accepted; rows are
        processed by position.
    start_yr : int or str
        Year assigned to the first row.
    end_yr : int, default 2010
        Last expected year. The first row (after row 0) whose inferred year
        exceeds ``end_yr`` keeps that year; every later row is left at the
        placeholder value 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an integer ``year`` column at position 0.

    Raises
    ------
    ValueError
        If ``month`` contains a value that is not a known abbreviation.
    """
    month_lst = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    month_rank = {name: pos for pos, name in enumerate(month_lst)}

    # Work on a copy so the caller's frame is left untouched
    new_data = data.copy()

    # Calendar position of every row's month; unknown names become NaN
    month_pos = new_data["month"].map(month_rank).astype("float64")
    unknown = month_pos.isna().to_numpy()
    if unknown.any():
        bad = sorted({str(value) for value in new_data["month"].to_numpy()[unknown]})
        raise ValueError(f"unrecognised month value(s): {bad}")

    # A negative step in month position means the calendar wrapped into a new
    # year; the running count of wraps is the offset from the start year.
    rolled_over = month_pos.diff().lt(0).astype("int64")
    years = int(start_yr) + rolled_over.cumsum()

    # Rows after the first out-of-range one stay at 0. Row 0 is never counted
    # as out-of-range, so a start year beyond end_yr still labels two rows.
    past_end = years.gt(int(end_yr)).astype("int64")
    past_end.iloc[:1] = 0
    years = years.mask(past_end.cumsum().gt(1), 0)

    # Insert positionally (plain array) so the result does not depend on the index
    new_data.insert(loc=0, column="year", value=years.to_numpy(dtype="int64"))
    return new_data

In [12]:
result_bankfull = year_mapper( data = pdf_1, start_yr = 2008)
result_bankfull.head()

Unnamed: 0,year,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,2008,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,,,,,,no
1,2008,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,,,,,,no
2,2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,,,,,,no
3,2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,,,,,,no
4,2008,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,,,,,,no


In [13]:
result_bankfull["year"].value_counts()

2008    27729
2009    14862
2010     2620
Name: year, dtype: int64

### Mapper function to insert the monthly indicator values into their columns

In [None]:
def map_index(data):
    """Return a copy of ``data`` with monthly indicator values filled in.

    For every (year, month) pair listed in the tables below, the rows of the
    copy whose ``year`` and ``month`` columns match get the listed value
    written into each listed indicator column. Pairs and columns that are
    not listed are left untouched.
    """
    index_2008 = {
        "may": {"emp_var_rate": 1.1, "cons_price_idx": 93.994, "cons_conf_idx": -36.4, "euribor_3m": 4.85, "nr_employed": 5191},
        "jun": {"emp_var_rate": 1.4, "cons_price_idx": 94.465, "cons_conf_idx": -41.8, "euribor_3m": 4.86, "nr_employed": 5228.1},
        "jul": {"emp_var_rate": 1.4, "cons_price_idx": 93.918, "cons_conf_idx": -42.7, "euribor_3m": 4.96, "nr_employed": 5228.1},
        "aug": {"emp_var_rate": 1.4, "cons_price_idx": 93.444, "cons_conf_idx": -36.1, "euribor_3m": 4.965, "nr_employed": 5228.1},
        "oct": {"emp_var_rate": -0.1, "cons_price_idx": 93.798, "cons_conf_idx": -40.4, "euribor_3m": 5, "nr_employed": 5195.8},
        "nov": {"emp_var_rate": -0.1, "cons_price_idx": 93.2, "cons_conf_idx": -42, "euribor_3m": 4.406, "nr_employed": 5195.8},
        "dec": {"emp_var_rate": -0.2, "cons_price_idx": 92.75, "cons_conf_idx": -45.9, "euribor_3m": 3.563, "nr_employed": 5176.3},
    }

    index_2009 = {
        "jan": {"emp_var_rate": -0.2, "nr_employed": 5176.3},
        "feb": {"emp_var_rate": -0.2, "nr_employed": 5176.3},
        "mar": {"emp_var_rate": -1.8, "cons_price_idx": 92.84, "cons_conf_idx": -50, "euribor_3m": 1.811, "nr_employed": 5099.1},
        "apr": {"emp_var_rate": -1.8, "cons_price_idx": 93.075, "cons_conf_idx": -47.1, "euribor_3m": 1.498, "nr_employed": 5099.1},
        "may": {"emp_var_rate": -1.8, "cons_price_idx": 92.89, "cons_conf_idx": -46.2, "euribor_3m": 1.334, "nr_employed": 5099.1},
        "jun": {"emp_var_rate": -2.9, "cons_price_idx": 92.963, "cons_conf_idx": -40.8, "euribor_3m": 1.26, "nr_employed": 5076.2},
        "jul": {"emp_var_rate": -2.9, "cons_price_idx": 93.469, "cons_conf_idx": -33.6, "euribor_3m": 1.072, "nr_employed": 5076.2},
        "aug": {"emp_var_rate": -2.9, "cons_price_idx": 92.201, "cons_conf_idx": -31.4, "euribor_3m": 0.884, "nr_employed": 5076.2},
        "sep": {"emp_var_rate": -3.4, "cons_price_idx": 92.379, "cons_conf_idx": -29.8, "euribor_3m": 0.813, "nr_employed": 5017.5},
        "oct": {"emp_var_rate": -3.4, "cons_price_idx": 92.431, "cons_conf_idx": -26.9, "euribor_3m": 0.754, "nr_employed": 5017.5},
        "nov": {"emp_var_rate": -3.4, "cons_price_idx": 92.649, "cons_conf_idx": -30.1, "euribor_3m": 0.722, "nr_employed": 5017.5},
        "dec": {"emp_var_rate": -3, "cons_price_idx": 92.713, "cons_conf_idx": -33, "euribor_3m": 0.718, "nr_employed": 5023.5},
    }

    index_2010 = {
        "jan": {"emp_var_rate": -3, "nr_employed": 5023.5},
        "feb": {"emp_var_rate": -3, "nr_employed": 5023.5},
        "mar": {"emp_var_rate": -1.8, "cons_price_idx": 92.369, "cons_conf_idx": -34.8, "euribor_3m": 0.655, "nr_employed": 5008.7},
        "apr": {"emp_var_rate": -1.8, "cons_price_idx": 93.749, "cons_conf_idx": -34.6, "euribor_3m": 0.64, "nr_employed": 5008.7},
        "may": {"emp_var_rate": -1.8, "cons_price_idx": 93.876, "cons_conf_idx": -40, "euribor_3m": 0.668, "nr_employed": 5008.7},
        "jun": {"emp_var_rate": -1.7, "cons_price_idx": 94.055, "cons_conf_idx": -39.8, "euribor_3m": 0.704, "nr_employed": 4991.6},
        "jul": {"emp_var_rate": -1.7, "cons_price_idx": 94.215, "cons_conf_idx": -40.3, "euribor_3m": 0.79, "nr_employed": 4991.6},
        "aug": {"emp_var_rate": -1.7, "cons_price_idx": 94.027, "cons_conf_idx": -38.3, "euribor_3m": 0.898, "nr_employed": 4991.6},
        "sep": {"emp_var_rate": -1.1, "cons_price_idx": 94.199, "cons_conf_idx": -37.5, "euribor_3m": 0.886, "nr_employed": 4963.6},
        "oct": {"emp_var_rate": -1.1, "cons_price_idx": 94.601, "cons_conf_idx": -49.5, "euribor_3m": 0.959, "nr_employed": 4963.6},
        "nov": {"emp_var_rate": -1.1, "cons_price_idx": 94.767, "cons_conf_idx": -50.8, "euribor_3m": 1.05, "nr_employed": 4963.6},
    }

    new_data = data.copy()
    tables_by_year = {2008: index_2008, 2009: index_2009, 2010: index_2010}

    for year, monthly_table in tables_by_year.items():
        in_year = new_data['year'] == year
        for month_name, indicators in monthly_table.items():
            # Rows belonging to this (year, month) pair
            rows = in_year & (new_data['month'] == month_name)
            for indicator, reading in indicators.items():
                new_data.loc[rows, indicator] = reading
    return new_data

In [None]:
bank_full = map_index(data = result_bankfull)
bank_full.head()

In [None]:
bank_full.count()

### Adding date column

In [15]:
def get_date2(month, year, day_of_week_str):
    """Return the day of month of the first given weekday, zero-padded.

    Parameters
    ----------
    month : str
        Lowercase three-letter month abbreviation ("jan" .. "dec").
    year : int or str
        Calendar year; converted with ``int``.
    day_of_week_str : str
        Weekday name; only its first three letters are used, in any case
        ("mon", "Monday", "TUE", ...).

    Returns
    -------
    str
        Two-character day of month, "01" .. "07".

    Raises
    ------
    ValueError
        If the weekday or month is not recognised.
    """
    # `calendar` is not imported at notebook level, so import it here.
    import calendar

    year = int(year)

    # Index 0 is a placeholder so that names map to 1-based month numbers
    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]

    # Fixed English abbreviations, Monday first (matching calendar's weekday
    # numbering), so the lookup does not depend on the process locale.
    weekday_abbrs = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]

    day_of_week = weekday_abbrs.index(day_of_week_str[:3].lower())
    month_num = month_lst.index(month)

    # monthrange gives the weekday of the 1st; the first occurrence of the
    # requested weekday is at most six days later.
    first_weekday = calendar.monthrange(year, month_num)[0]
    day = 1 + (day_of_week - first_weekday) % 7
    return f"{day:02}"



def map_date(data):
    """Return a copy of ``data`` with a ``date`` string column at position 10.

    The date is assembled from the ``year``, ``month`` and ``day`` columns as
    ``"<year>-<month number>-<two-digit day>"`` (for example ``"2008-5-05"``).
    The month number is not zero-padded.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``year``, ``month`` (lowercase three-letter abbreviation) and
        ``day`` columns and at least ten columns overall. Any index is
        accepted; rows are processed by position.

    Raises
    ------
    ValueError
        If ``month`` contains a value that is not a known abbreviation.
    """
    month_lst = ["jan", "feb", "mar", "apr",
                 "may", "jun", "jul", "aug",
                 "sep", "oct", "nov", "dec"]
    month_number = {name: pos for pos, name in enumerate(month_lst, start=1)}

    # Make a copy of the original dataframe
    new_data = data.copy()

    month_num = new_data["month"].map(month_number).astype("float64")
    unknown = month_num.isna().to_numpy()
    if unknown.any():
        bad = sorted({str(value) for value in new_data["month"].to_numpy()[unknown]})
        raise ValueError(f"unrecognised month value(s): {bad}")

    # Build all date strings in one vectorised pass
    date_strings = (
        new_data["year"].astype(str) + "-"
        + month_num.astype(int).astype(str) + "-"
        + new_data["day"].astype(int).map("{:02}".format)
    )

    # Insert the finished strings positionally; no integer placeholder column
    # is created and later overwritten with text.
    new_data.insert(loc=10, column="date", value=date_strings.to_numpy())
    return new_data


In [16]:
res_full_date = map_date(data = bank_full)
res_full_date.head()

NameError: name 'bank_full' is not defined

In [17]:
res_full_date.info()

NameError: name 'res_full_date' is not defined

# Dataset - 2 (bank-additional-full.csv)

In [None]:
dataset_2 = spark.read.csv('bank-additional-full.csv',sep = ";",header=True,inferSchema=True)

In [None]:
dataset_2.count()

### Converting dataset to pandas

In [None]:
pdf_2 = dataset_2.toPandas()

In [None]:
pdf_2.head(5)

### Replacing pdays 999 value to -1

In [None]:
pdf_2["pdays"] = pdf_2["pdays"].replace(999, -1)

In [None]:
pdf_2.head(5)

### Renaming column names

In [None]:
# Rename dataset 2's dotted indicator columns to the underscore names held in
# `new_cols`, pairing the two lists position by position.
old_col_list = ["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
pdf_2.rename(columns=dict(zip(old_col_list, new_cols)), inplace=True)

In [None]:
pdf_2.head(5)

## Adding year mapper function to add year to dataset - 2 (bank-additional-full.csv)

In [None]:
# NOTE(review): `year_mapper` is already defined earlier in this notebook;
# this later definition shadows it once the cell runs.
def year_mapper(data, start_yr, end_yr=2010):
    """Return a copy of ``data`` with an inferred ``year`` column inserted first.

    Rows are walked in their stored order. Whenever a row's month comes
    earlier in the calendar than the previous row's month, the year counter
    is incremented.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``month`` column holding lowercase three-letter month
        abbreviations ("jan" .. "dec"). Any index is accepted; rows are
        processed by position.
    start_yr : int or str
        Year assigned to the first row.
    end_yr : int, default 2010
        Last expected year. The first row (after row 0) whose inferred year
        exceeds ``end_yr`` keeps that year; every later row is left at the
        placeholder value 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an integer ``year`` column at position 0.

    Raises
    ------
    ValueError
        If ``month`` contains a value that is not a known abbreviation.
    """
    month_lst = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    month_rank = {name: pos for pos, name in enumerate(month_lst)}

    # Work on a copy so the caller's frame is left untouched
    new_data = data.copy()

    # Calendar position of every row's month; unknown names become NaN
    month_pos = new_data["month"].map(month_rank).astype("float64")
    unknown = month_pos.isna().to_numpy()
    if unknown.any():
        bad = sorted({str(value) for value in new_data["month"].to_numpy()[unknown]})
        raise ValueError(f"unrecognised month value(s): {bad}")

    # A negative step in month position means the calendar wrapped into a new
    # year; the running count of wraps is the offset from the start year.
    rolled_over = month_pos.diff().lt(0).astype("int64")
    years = int(start_yr) + rolled_over.cumsum()

    # Rows after the first out-of-range one stay at 0. Row 0 is never counted
    # as out-of-range, so a start year beyond end_yr still labels two rows.
    past_end = years.gt(int(end_yr)).astype("int64")
    past_end.iloc[:1] = 0
    years = years.mask(past_end.cumsum().gt(1), 0)

    # Insert positionally (plain array) so the result does not depend on the index
    new_data.insert(loc=0, column="year", value=years.to_numpy(dtype="int64"))
    return new_data

In [None]:
result_addition = year_mapper( data = pdf_2, start_yr = 2008)
result_addition.head()

In [None]:
result_addition["year"].value_counts()

### adding date column to additional 

In [None]:


def get_date(month, year, day_of_week_str):
    """Return the day of month of the first given weekday, zero-padded.

    Parameters
    ----------
    month : str
        Lowercase three-letter month abbreviation ("jan" .. "dec").
    year : int or str
        Calendar year; converted with ``int``.
    day_of_week_str : str
        Weekday name; only its first three letters are used, in any case.

    Returns
    -------
    str
        Two-character day of month, "01" .. "07".

    Raises
    ------
    ValueError
        If the weekday or month is not recognised.
    """
    import calendar
    year = int(year)

    # Index 0 is a placeholder so that names map to 1-based month numbers
    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]

    # Fixed English abbreviations, Monday first (matching calendar's weekday
    # numbering), so the lookup does not depend on the process locale.
    weekday_abbrs = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]

    day_of_week = weekday_abbrs.index(day_of_week_str[:3].lower())
    month_num = month_lst.index(month)

    # monthrange gives the weekday of the 1st; the first occurrence of the
    # requested weekday is at most six days later.
    first_weekday = calendar.monthrange(year, month_num)[0]
    day = 1 + (day_of_week - first_weekday) % 7
    return f"{day:02}"


def map_date2(data):
    """Return a copy of ``data`` with a ``date`` string column at position 10.

    Each date is ``"<year>-<month number>-<dd>"`` where ``dd`` is the first
    day of that month falling on the row's ``day_of_week`` (see ``get_date``).
    The month number is not zero-padded.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``year``, ``month`` and ``day_of_week`` columns and at least
        ten columns overall. Any index is accepted; rows are processed by
        position.
    """
    # Make a copy of the original dataframe
    new_data = data.copy()

    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]

    # One (year, month, weekday) key per row, in row order
    keys = list(zip(new_data["year"], new_data["month"], new_data["day_of_week"]))

    # Resolve each distinct key once instead of once per row
    first_day = {
        key: get_date(month=key[1], year=key[0], day_of_week_str=key[2])
        for key in set(keys)
    }

    dates = [
        f"{year}-{month_lst.index(month)}-{first_day[(year, month, weekday)]}"
        for year, month, weekday in keys
    ]

    # Insert the finished strings positionally; no integer placeholder column
    # is created and later overwritten with text.
    new_data.insert(loc=10, column="date", value=dates)
    return new_data


In [None]:
res_add_date = map_date2(data=result_addition)
res_add_date.head()

In [None]:
# Replace the dotted education labels with underscore-separated ones,
# pairing the two lists position by position.
old_edu = ["basic.4y", "high.school", "basic.6y", "basic.9y", "university.degree", "professional.course"]
new_edu = ["basic_4y", "high_school", "basic_6y", "basic_9y", "university_degree" ,"professional_course"]

for dotted, underscored in zip(old_edu, new_edu):
    has_label = res_add_date['education'] == dotted
    res_add_date.loc[has_label, "education"] = underscored

In [None]:
res_add_date.head()

In [None]:
res_add_date.info()

In [None]:
res_add_date = res_add_date.drop(["day_of_week","year","month"], axis=1)

In [None]:
res_full_date = res_full_date.drop(["balance","year","month","day"], axis=1)

In [None]:
res_add_date.head()

In [None]:
res_full_date.head()

## Concatenating the two datasets into one

In [None]:
# Stack the two prepared frames row-wise. ignore_index=True gives the result a
# fresh 0..n-1 index; otherwise each frame's own row labels are carried over
# and the combined frame ends up with repeated labels.
frames  = [res_add_date, res_full_date]

bank_data = pd.concat(frames, ignore_index=True)

In [None]:
bank_data.head()

In [None]:
bank_data.info()

In [None]:
bank_data.describe()

In [None]:
bank_data.columns

In [None]:
# Column groups used by the value-count and fill-in cells that follow.
# DataFrame.filter keeps only the listed labels that exist and silently skips
# the rest, so a misspelt name drops a column from its group without any error
# ("previus" is corrected to "previous" here).
categories = bank_data.filter(["job","marital","education","default","housing","loan","contact","month","year","poutcome","y"])
continuous = bank_data.filter(["age","duration","campaign","pdays","previous","emp_var_rate","cons_price_idx","cons_conf_idx","euribor_3m","nr_employed"])

In [None]:
for columns in categories:
    print("Column Name", columns)
    print("-----------------------")
    print(categories[columns].value_counts())
    print("     ")
    print("******************************************************")
    print("     ")

In [None]:
bank_data.loc[(bank_data['job'] == "admin."), "job"] = "admin"
bank_data["job"].unique()

In [None]:
bank_data = bank_data.replace(["unknown","nonexistent"], np.nan)

In [None]:
bank_data.head()

In [None]:
bank_data.isnull().sum()

In [None]:
for columns in categories:
    bank_data[columns] = bank_data[columns].fillna(bank_data[columns].mode()[0])
# Replacing null values of categorical columns with the mode

In [None]:
for cols in continuous:
    bank_data[cols] = bank_data[cols].fillna(bank_data[cols].mean())
# Replacing null values of Continuous columns with the mean


In [None]:
bank_data.isnull().sum()

In [None]:
# Correlate the numeric columns only; text columns cannot be correlated and
# make DataFrame.corr() raise on recent pandas versions.
correlation = bank_data.select_dtypes(include="number").corr()

# Draw the heatmap with matplotlib, which is imported at the top of the
# notebook (the seaborn import there is commented out, so `sns` is undefined).
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(correlation.to_numpy(), cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(range(len(correlation.columns)))
ax.set_xticklabels(correlation.columns, rotation=90)
ax.set_yticks(range(len(correlation.index)))
ax.set_yticklabels(correlation.index)
ax.set_title("Correlation between numeric columns")
fig.colorbar(heatmap, ax=ax);

## Hypothesis Testing

### Chi_square test

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between the target `y` and each listed
# categorical column; only the p-value is reported.
hypo_cat_columns = ["default","housing","loan","job","education"]

for feature in hypo_cat_columns:
    contingency = pd.crosstab(bank_data["y"], bank_data[feature])
    _, p_value, _, _ = chi2_contingency(contingency)
    print("p_value for ",feature," ", p_value)
    print("--------------------------------")

### ANOVA test

In [None]:
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
# ANOVA test for age column
agr_ols = ols("age ~ y", data=bank_data).fit()
table = anova_lm(agr_ols, typ=2)
print(table)

In [None]:
# ANOVA test for duration column
agr_ols = ols("duration ~ y", data=bank_data).fit()
table = anova_lm(agr_ols, typ=2)
print(table)

### Tukey's HSD test (post-hoc comparison after ANOVA)

In [None]:
compare = pairwise_tukeyhsd(bank_data['age'], 
                            bank_data['y'], alpha=0.05)

result = pd.DataFrame(compare._results_table.data)
result

In [None]:
compare = pairwise_tukeyhsd(bank_data['duration'], 
                            bank_data['y'], alpha=0.05)

result = pd.DataFrame(compare._results_table.data)
result

## Data_preprocessing

In [None]:
# Creating a dictionary for converting categorical textual data entries
# into categorical numeric on basis of job profile
job_dict = {"entrepreneur":11, "self-employed":10, "admin":9, "management":8, "services":7, 
       "technician":6, "blue-collar":5, "housemaid":4, "retired":3, "student":2, "unemployed":1}

In [None]:
bank_data["jobs"] = bank_data["job"].map(job_dict)
bank_data = bank_data.drop("job", axis=1)

In [None]:
bank_data.head()

In [None]:
marital_dict = {"married":3, "single":2, "divorced":1}
bank_data["maritals"] = bank_data["marital"].map(marital_dict)
bank_data = bank_data.drop("marital", axis=1)
bank_data.head()

In [None]:
edu_dict = {"professional_course":10, "university_degree":9, "tertiary":8, "secondary":7, 
       "high_school":6, "basic_9y":5, "basic_6y":4, "primary":3, "basic_4y":2, "illiterate":1}
bank_data["education"] = bank_data["education"].map(edu_dict)
#new_data = new_data.drop("marital", axis=1)
bank_data.head()

In [None]:
bank_data.head()

In [None]:
# Convert the date column to a datetime object
# Convert the date column to a datetime object
bank_data['date'] = pd.to_datetime(bank_data['date'])

# Add a new column with the quarter ("q1" .. "q4") based on the date values
bank_data['quarter'] = bank_data['date'].apply(lambda x: "q"+str((x.month-1)//3 + 1))

# The month-name -> quarter replacement that used to follow was removed. It
# operated on `new_data`, which no visible cell defines at notebook level, and
# on a "month" column that both frames drop before they are concatenated, so
# it raised NameError. The quarter is already derived from `date` above.
bank_data.head()

In [None]:
# Encode the target as integers: "yes" -> 1, "no" -> 0.
# The explicit astype(int) pins an integer dtype instead of relying on
# replace() to downcast the text column implicitly. A column left as object
# dtype would be one-hot encoded by the get_dummies call in the next cell.
bank_data["y"] = bank_data["y"].replace({"yes": 1, "no": 0}).astype(int)

In [None]:
bank_data = pd.get_dummies(data = bank_data, drop_first = True)
bank_data.head(5)

In [None]:
outliers_columns = ["age","duration","campaign","pdays","previous"]

In [None]:
def plot_box():
    """Draw one box plot per column below from ``bank_data`` in a 3x2 grid.

    Reads the notebook-level ``bank_data`` frame; creates a new 10x10 inch
    figure and fills the first five subplot slots.
    """
    plt.figure(figsize=(10,10))
    boxed_columns = ["age", "duration", "campaign", "pdays", "previous"]
    for slot, name in enumerate(boxed_columns, start=1):
        plt.subplot(3, 2, slot)
        bank_data.boxplot(column=[name])
           
plot_box()

In [None]:
# Compute the 1.5 * IQR fences for every outlier column; the upper fences are
# collected in `max_out_limit`, in the same order as `outliers_columns`.
max_out_limit = []
for name in outliers_columns:
    lower_quartile = bank_data[name].quantile(0.25)
    upper_quartile = bank_data[name].quantile(0.75)
    whisker = (upper_quartile - lower_quartile) * 1.5
    max_limit = upper_quartile + whisker
    min_limit = lower_quartile - whisker
    max_out_limit.append(max_limit)
    print(name, "max_limit: ",max_limit," min_limit: ",min_limit)

print("------------------------------------------")
print(max_out_limit)

In [None]:
# Cap each outlier column at its upper limit from `max_out_limit`: every value
# at or above the limit is set to the limit. No lower limit is applied here.
# NOTE(review): if a column's q1 equals its q3, the IQR is 0 and the limit is
# q3 itself, so every larger value is flattened to q3 — confirm this is
# intended for columns such as pdays / previous.
for i, j in zip(outliers_columns, max_out_limit):
    bank_data.loc[bank_data[i]>=j, i]=j

In [None]:
plot_box()

In [None]:
bank_data = bank_data.drop(["date"],axis = 1)

In [None]:
# Collect the column labels of the prepared frame and show them
features = list(bank_data)
print(features)

In [None]:
bank_data.head()

In [None]:
tele_copy = bank_data.copy()
tele_copy.to_csv("bank_spark.csv")

### Machine Learning Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
df = pd.read_csv("bank_spark.csv")

In [None]:
x = df.drop(["y","Unnamed: 0"], axis=1)
y = df["y"]

In [None]:
x.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column with StandardScaler, keeping the original
# column names.
# NOTE(review): the scaler is fitted on all rows here, before train_test_split
# runs in a later cell, so the rows that become the test set also shape the
# scaling statistics. Consider fitting on x_train only and applying
# scaler.transform to x_test.
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

In [None]:
x.head()

In [None]:
y.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 2022, test_size = 0.25)

### Logistic Regression

In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
y_pred_prob = lr.predict_proba(x_test)[:,1]

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

In [None]:
# Random forest: seeded with the same value as the train/test split and the
# SVC below, so its scores are reproducible across runs.
rfc = RandomForestClassifier(random_state = 2022)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
y_pred_prob = rfc.predict_proba(x_test)[:,1]

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

# Support vector machine with a polynomial kernel; probability=True enables
# predict_proba, which is needed for the ROC AUC below.
svm = SVC(kernel="poly", probability = True,  random_state = 2022)
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
y_pred_prob = svm.predict_proba(x_test)[:,1]

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

In [None]:
# Decision tree: seeded with the same value as the train/test split so its
# scores are reproducible across runs.
dtc = DecisionTreeClassifier(random_state = 2022)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
y_pred_prob = dtc.predict_proba(x_test)[:,1]

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

In [None]:
voting = VotingClassifier([('LR',lr),('RFC',rfc),('DTC',dtc)],voting='soft')
voting.fit(x_train, y_train)
y_pred = voting.predict(x_test)
y_pred_prob = voting.predict_proba(x_test)[:,1]

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))