In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import when
from pyspark.sql.functions import col, sum, count
import pyspark.pandas as ps
import warnings



In [2]:
spark = SparkSession.builder.master('local[4]').appName('ml').getOrCreate()

In [3]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [4]:
pd.pandas.set_option('display.max_columns',None)

# Preprocessing

### Bank Full (dataset-1)

In [5]:
bank_full = spark.read.csv('bank-full.csv',
                           sep = ";",
                           header=True,
                           inferSchema=True)

In [6]:
bank_full.count()

45211

In [7]:
display(bank_full.limit(5))

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
bank_full.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [9]:
new_cols = ["emp_var_rate", "cons_price_idx", "cons_conf_idx", "euribor_3m", "nr_employed"]
for column in new_cols:
    bank_full = bank_full.withColumn(column, bank_full["poutcome"]+1)

In [10]:
display(bank_full.limit(5))

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,,,,,
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,,,,,
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,,,,,
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,,,,,
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,,,,,


In [11]:
def year_mapper(data, start_yr, end_yr):
    month_lst = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

    # Make a copy of the original dataframe
    new_data = data.copy()

    # Insert a new "year" column filled with zeros
    new_data.insert(loc=0, column="year", value=0)

    # Set the first year to the start year
    current_year = int(start_yr)
    new_data.at[0, "year"] = current_year

    # Loop through the rows of the dataframe, updating the year column when the month changes
    for i in range(1, len(new_data)):
        # If the current month is earlier in the year than the previous month, increment the year
        if month_lst.index(new_data["month"][i]) < month_lst.index(new_data["month"][i-1]):
            current_year += 1

        new_data.at[i, "year"] = current_year

        # If the current year exceeds the end year, break out of the loop
        if current_year > end_yr:
            break

    return new_data

In [12]:
# Ignore warnings coming from Arrow optimizations.
warnings.filterwarnings("ignore")

# Use default index prevent overhead.
ps.set_option("compute.default_index_type", "distributed")

# To speed up dataset processing
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [13]:
bank_full_pdf = bank_full.toPandas()

# Apply the function to the Pandas DataFrame
new_bank_full_pdf = year_mapper(bank_full_pdf, 2008, 2010)

# Convert the updated Pandas DataFrame back to a PySpark DataFrame
bank_full = spark.createDataFrame(new_bank_full_pdf)

In [14]:
display(bank_full.limit(5))

year,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
2008,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,,,,,
2008,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,,,,,
2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,,,,,
2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,,,,,
2008,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,,,,,


In [15]:
counts = bank_full.groupBy("year").count()
counts.show()

+----+-----+
|year|count|
+----+-----+
|2008|27729|
|2009|14862|
|2010| 2620|
+----+-----+



### Mapping date to Bank-Full

In [16]:
def map_date(data):
    # Make a copy of the original dataframe
    new_data = data.copy()
    
    # Insert a new "date" column filled with zeros
    new_data.insert(loc=10, column="date", value=0)
    
    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]
    
    # Loop through the rows of the dataframe, updating the date column
    for i in range(0,len(new_data)):
        day = int(new_data["day"][i])
        mnt = int(month_lst.index(new_data["month"][i]))
        new_data.at[i, "date"] = str(new_data["year"][i]) + "-" + \
        str(f"{mnt:02}" ) + "-" + \
        str(f"{day:02}")
    return new_data

In [17]:
bank_full_pdf = bank_full.toPandas()

# Apply the function to the Pandas DataFrame
new_bank_full_pdf = map_date(bank_full_pdf)

# Convert the updated Pandas DataFrame back to a PySpark DataFrame
bank_full = spark.createDataFrame(new_bank_full_pdf)

In [18]:
display(bank_full.limit(5))

year,age,job,marital,education,default,balance,housing,loan,contact,date,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
2008,58,management,married,tertiary,no,2143,yes,no,unknown,2008-05-05,5,may,261,1,-1,0,unknown,no,,,,,
2008,44,technician,single,secondary,no,29,yes,no,unknown,2008-05-05,5,may,151,1,-1,0,unknown,no,,,,,
2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,2008-05-05,5,may,76,1,-1,0,unknown,no,,,,,
2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,2008-05-05,5,may,92,1,-1,0,unknown,no,,,,,
2008,33,unknown,single,unknown,no,1,no,no,unknown,2008-05-05,5,may,198,1,-1,0,unknown,no,,,,,


In [19]:
#to convert "date" column from string to date format
from pyspark.sql.functions import to_date

bank_full = bank_full.withColumn("date", to_date("date", "yyyy-MM-dd"))
display(bank_full.limit(5))

year,age,job,marital,education,default,balance,housing,loan,contact,date,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
2008,58,management,married,tertiary,no,2143,yes,no,unknown,2008-05-05,5,may,261,1,-1,0,unknown,no,,,,,
2008,44,technician,single,secondary,no,29,yes,no,unknown,2008-05-05,5,may,151,1,-1,0,unknown,no,,,,,
2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,2008-05-05,5,may,76,1,-1,0,unknown,no,,,,,
2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,2008-05-05,5,may,92,1,-1,0,unknown,no,,,,,
2008,33,unknown,single,unknown,no,1,no,no,unknown,2008-05-05,5,may,198,1,-1,0,unknown,no,,,,,


In [20]:
bank_full.printSchema()

root
 |-- year: long (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- date: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)
 |-- emp_var_rate: double (nullable = true)
 |-- cons_price_idx: double (nullable = true)
 |-- cons_conf_idx: double (nullable = true)
 |-- euribor_3m: double (nullable = true)
 |-- nr_employed: double (nullable = true)



In [21]:
#
from pyspark.sql.types import IntegerType

### Index mapper

In [22]:
def map_index(data):
    new_data = data
    index_2008 = {"may":{"emp_var_rate":1.1, "cons_price_idx":93.994, "cons_conf_idx":-36.4, "euribor_3m":4.85, "nr_employed":5191},
                 "jun":{"emp_var_rate":1.4, "cons_price_idx":94.465, "cons_conf_idx":-41.8, "euribor_3m":4.86, "nr_employed":5228.1},
                 "jul":{"emp_var_rate":1.4, "cons_price_idx":93.918, "cons_conf_idx":-42.7, "euribor_3m":4.96, "nr_employed":5228.1},
                 "aug":{"emp_var_rate":1.4, "cons_price_idx":93.444, "cons_conf_idx":-36.1, "euribor_3m":4.965, "nr_employed":5228.1},
                 "oct":{"emp_var_rate":-0.1, "cons_price_idx":93.798, "cons_conf_idx":-40.4, "euribor_3m":5, "nr_employed":5195.8},
                 "nov":{"emp_var_rate":-0.1, "cons_price_idx":93.2, "cons_conf_idx":-42, "euribor_3m":4.406, "nr_employed":5195.8},
                 "dec":{"emp_var_rate":-0.2, "cons_price_idx":92.75, "cons_conf_idx":-45.9, "euribor_3m":3.563, "nr_employed":5176.3}}

    index_2009 = {"jan":{"emp_var_rate":-0.2, "nr_employed":5176.3},
                 "feb":{"emp_var_rate":-0.2, "nr_employed":5176.3},
                 "mar":{"emp_var_rate":-1.8, "cons_price_idx":92.84, "cons_conf_idx":-50, "euribor_3m":1.811, "nr_employed":5099.1},
                 "apr":{"emp_var_rate":-1.8, "cons_price_idx":93.075, "cons_conf_idx":-47.1, "euribor_3m":1.498, "nr_employed":5099.1},
                 "may":{"emp_var_rate":-1.8, "cons_price_idx":92.89, "cons_conf_idx":-46.2, "euribor_3m":1.334, "nr_employed":5099.1},
                 "jun":{"emp_var_rate":-2.9, "cons_price_idx":92.963, "cons_conf_idx":-40.8, "euribor_3m":1.26, "nr_employed":5076.2},
                 "jul":{"emp_var_rate":-2.9, "cons_price_idx":93.469, "cons_conf_idx":-33.6, "euribor_3m":1.072, "nr_employed":5076.2},
                 "aug":{"emp_var_rate":-2.9, "cons_price_idx":92.201, "cons_conf_idx":-31.4, "euribor_3m":0.884, "nr_employed":5076.2},
                 "sep":{"emp_var_rate":-3.4, "cons_price_idx":92.379, "cons_conf_idx":-29.8, "euribor_3m":0.813, "nr_employed":5017.5},
                 "oct":{"emp_var_rate":-3.4, "cons_price_idx":92.431, "cons_conf_idx":-26.9, "euribor_3m":0.754, "nr_employed":5017.5},
                 "nov":{"emp_var_rate":-3.4, "cons_price_idx":92.649, "cons_conf_idx":-30.1, "euribor_3m":0.722, "nr_employed":5017.5},
                 "dec":{"emp_var_rate":-3, "cons_price_idx":92.713, "cons_conf_idx":-33, "euribor_3m":0.718, "nr_employed":5023.5}}

    index_2010 = {"jan":{"emp_var_rate":-3, "nr_employed":5023.5},
                 "feb":{"emp_var_rate":-3, "nr_employed":5023.5},
                 "mar":{"emp_var_rate":-1.8, "cons_price_idx":92.369, "cons_conf_idx":-34.8, "euribor_3m":0.655, "nr_employed":5008.7},
                 "apr":{"emp_var_rate":-1.8, "cons_price_idx":93.749, "cons_conf_idx":-34.6, "euribor_3m":0.64, "nr_employed":5008.7},
                 "may":{"emp_var_rate":-1.8, "cons_price_idx":93.876, "cons_conf_idx":-40, "euribor_3m":0.668, "nr_employed":5008.7},
                 "jun":{"emp_var_rate":-1.7, "cons_price_idx":94.055, "cons_conf_idx":-39.8, "euribor_3m":0.704, "nr_employed":4991.6},
                 "jul":{"emp_var_rate":-1.7, "cons_price_idx":94.215, "cons_conf_idx":-40.3, "euribor_3m":0.79, "nr_employed":4991.6},
                 "aug":{"emp_var_rate":-1.7, "cons_price_idx":94.027, "cons_conf_idx":-38.3, "euribor_3m":0.898, "nr_employed":4991.6},
                 "sep":{"emp_var_rate":-1.1, "cons_price_idx":94.199, "cons_conf_idx":-37.5, "euribor_3m":0.886, "nr_employed":4963.6},
                 "oct":{"emp_var_rate":-1.1, "cons_price_idx":94.601, "cons_conf_idx":-49.5, "euribor_3m":0.959, "nr_employed":4963.6},
                 "nov":{"emp_var_rate":-1.1, "cons_price_idx":94.767, "cons_conf_idx":-50.8, "euribor_3m":1.05, "nr_employed":4963.6}}

    indx = [index_2008, index_2009, index_2010]
    years = [2008, 2009, 2010]
    
    for i in range(len(years)):
        for months, indexes in indx[i].items():
            for index, index_val in indexes.items():
                new_data = new_data.withColumn(index, 
                    when((col('year') == years[i]) & (col('month') == months), index_val).otherwise(col(index))) 
    return new_data

In [23]:
bank_full = map_index(data = bank_full)
display(bank_full.limit(5))

year,age,job,marital,education,default,balance,housing,loan,contact,date,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
2008,58,management,married,tertiary,no,2143,yes,no,unknown,2008-05-05,5,may,261,1,-1,0,unknown,no,1.1,93.994,-36.4,4.85,5191.0
2008,44,technician,single,secondary,no,29,yes,no,unknown,2008-05-05,5,may,151,1,-1,0,unknown,no,1.1,93.994,-36.4,4.85,5191.0
2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,2008-05-05,5,may,76,1,-1,0,unknown,no,1.1,93.994,-36.4,4.85,5191.0
2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,2008-05-05,5,may,92,1,-1,0,unknown,no,1.1,93.994,-36.4,4.85,5191.0
2008,33,unknown,single,unknown,no,1,no,no,unknown,2008-05-05,5,may,198,1,-1,0,unknown,no,1.1,93.994,-36.4,4.85,5191.0


In [24]:
def get_date(month, year, day_of_week_str):
    year = int(year)
    
    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]
    
    day_of_week_abbr=day_of_week_str.capitalize()
    # Get the abbreviated name of the day of the week
    day_of_week_abbr = day_of_week_abbr[:3]

    # Get the day of the week number
    day_of_week = list(calendar.day_abbr).index(day_of_week_abbr)
    month_num = month_lst.index(month)
    
    # Iterate over the days in the month and find the first day that matches the day of the week
    for day in range(1, calendar.monthrange(year, month_num)[1] + 1):
        if calendar.weekday(year, month_num, day) == day_of_week:
            return f"{day:02}"
    return None

### Bank Addition Full Dataset (dataset-2)

In [25]:
bank_add_full = spark.read.csv('bank-additional-full.csv',
                               sep = ";",
                               header=True,
                               inferSchema=True)

In [26]:
display(bank_add_full.limit(5))

age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [27]:
bank_add_full_pdf = bank_add_full.toPandas()

# Apply the function to the Pandas DataFrame
new_bank_add_full_pdf = year_mapper(bank_add_full_pdf, 2008, 2010)

In [33]:

def get_date(month, year, day_of_week_str):
    """
        This function acts as a helper function to "map_date2()"
        method.
    """
    import calendar
    year = int(year)
    
    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]
    
    day_of_week_abbr=day_of_week_str.capitalize()
    # Get the abbreviated name of the day of the week
    day_of_week_abbr = day_of_week_abbr[:3]

    # Get the day of the week number
    day_of_week = list(calendar.day_abbr).index(day_of_week_abbr)
    month_num = month_lst.index(month)
    
    # Iterate over the days in the month and find the first day that matches the day of the week
    for day in range(1, calendar.monthrange(year, month_num)[1] + 1):
        if calendar.weekday(year, month_num, day) == day_of_week:
            return f"{day:02}"
    return None


def map_date2(data):
    """
        This function is written to map dates to date column in the
        bank-additional-full.csv file with helper function get_date.
    """
    # Make a copy of the original dataframe
    new_data = data.copy()
    
    # Insert a new "date" column filled with zeros
    new_data.insert(loc=10, column="date", value=0)
    
    month_lst = [ "haha",
               "jan", "feb", "mar", "apr",
               "may", "jun", "jul", "aug",
               "sep", "oct", "nov", "dec"]
    
    # Loop through the rows of the dataframe, updating the date column
    for i in range(0,len(new_data)):
        
        new_data.at[i, "date"] = str(new_data["year"][i]) + "-" + \
        str( month_lst.index(new_data["month"][i]) ) + "-" + \
        str( get_date(month = new_data["month"][i],
                      year = new_data["year"][i],
                      day_of_week_str = new_data["day_of_week"][i]) )
    return new_data

In [34]:
#  Mapping date to bank-additional-full dataset
bank_add_full_pdf = map_date2(bank_add_full_pdf)

KeyError: 'year'

In [None]:
# Convert the updated Pandas DataFrame back to a PySpark DataFrame
bank_add_full = spark.createDataFrame(new_bank_add_full_pdf)

In [None]:
display(bank_add_full.limit(5))

In [None]:
bank_add_full = map_date2(bank_add_full_pdf)

### replace values from 999 to -1

In [None]:
bank_add_full = bank_add_full.withColumn("pdays", when(col("pdays") == 999, -1).otherwise(col("pdays")))

### Renaming columns names and values

In [None]:
old_col_list = ["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
for i in range(0, len(old_col_list)):
    bank_add_full = bank_add_full.withColumnRenamed(old_col_list[i], new_cols[i])

In [None]:
old_edu = ["basic.4y", "high.school", "basic.6y", "basic.9y", "university.degree", "professional.course"]
new_edu = ["basic_4y", "high_school", "basic_6y", "basic_9y", "university_degree" ,"professional_course"]

for i in range(0,6):
    bank_add_full = bank_add_full.withColumn("education", when(col("education") == old_edu[i], new_edu[i]).otherwise(col("education")))

In [None]:
display(bank_add_full.limit(5))

In [None]:
display(bank_full.limit(5))

In [None]:
bank_full = bank_full.drop("balance", "day")

In [None]:
bank_full_tableau2 = nbank_full.drop("day")

In [None]:
bank_add_full = bank_add_full.drop("day_of_week")

## Concat two dataframes

In [None]:
bank_data = bank_add_full.union(bank_full)

In [None]:
bank_data.count()

In [None]:
bank_data.summary()

In [None]:
cat_col = ["job","marital","education","default","housing","loan","contact","month","year","y"]
cont_col = ["age","duration","campaign","pdays","previous","emp_var_rate","cons_price_idx","cons_conf_idx","euribor_3m","nr_employed"]
categories = bank_data.select(cat_col)
continuous = bank_data.select(cont_col)

### value counts

In [None]:
for columns in categories:
    print("Column Name", columns)
    print("-----------------------")
    counts = bank_data.groupBy(columns).count()
    counts.show()
    print("     ")
    print("******************************************************")
    print("     ")

In [None]:
bank_data = bank_data.withColumn("job", when(col("job") == "admin.", "admin").otherwise(col("job")))

In [None]:
counts = bank_data.groupBy("job").count()
counts.show()

In [None]:
for column in bank_data.columns:
    bank_data = bank_data.withColumn(column, when(col(column).isin("unknown", "nonexistent"), None).otherwise(col(column)))

In [None]:
display(bank_add_full.limit(5))

### Checking for null values

In [None]:
bank_data.agg(*[count(when(col(c).isNull(), c)).alias(c) for c in categories.columns]).show()

In [None]:
bank_data.agg(*[count(when(col(c).isNull(), c)).alias(c) for c in continuous.columns]).show()

### Replacing continue variables

In [None]:
from pyspark.sql.functions import mean

# calculate the mean of non-null values in columns col1 and col2
mean_dict = bank_data.select(*(mean(c).alias(c) for c in cont_col)).first().asDict()

# replace null values with the mean in columns col1 and col2
bank_data = bank_data.fillna(mean_dict)

In [None]:
bank_data.agg(*[count(when(col(c).isNull(), c)).alias(c) for c in continuous.columns]).show()

### Replacing categorical variables

In [None]:
bank_data = bank_data.drop("poutcome")

In [None]:
for column in cat_col:
    mode_value = bank_data.first()[column]
    bank_data = bank_data.fillna({column:mode_value})

In [None]:
bank_data.agg(*[count(when(col(c).isNull(), c)).alias(c) for c in categories.columns]).show()

In [None]:
pdf=bank_data.toPandas()

In [None]:
correlation = pdf.corr()
plt.figure(figsize=(10,10))
sns.heatmap(correlation)

## Data Preprocessing

In [None]:
# Creating a dictionary for converting categorical textual data entries
# into categorical numeric on basis of job profile
job_dict = {"entrepreneur":11, "self-employed":10, "admin":9, "management":8, "services":7, 
       "technician":6, "blue-collar":5, "housemaid":4, "retired":3, "student":2, "unemployed":1}

for key, value in job_dict.items():
    bank_data = bank_data.withColumn("job", when(bank_data["job"] == key, int(value)).otherwise(bank_data["job"]))

In [None]:
marital_dict = {"married":3, "single":2, "divorced":1}

for key, value in marital_dict.items():
    bank_data = bank_data.withColumn("marital", when(bank_data["marital"] == key, value).otherwise(bank_data["marital"]))

In [None]:
edu_dict = {"professional_course":10, "university_degree":9, "tertiary":8, "secondary":7, 
       "high_school":6, "basic_9y":5, "basic_6y":4, "primary":3, "basic_4y":2, "illiterate":1}

for key, value in edu_dict.items():
    bank_data = bank_data.withColumn("education", when(bank_data["education"] == key, value).otherwise(bank_data["education"]))

In [None]:
y_dict = {"yes":1, "no":0}

for key, value in y_dict.items():
    bank_data = bank_data.withColumn("y", when(bank_data["y"] == key, value).otherwise(bank_data["y"]))

contact_dict = {"telephone":1, "cellular":0}

for key, value in contact_dict.items():
    bank_data = bank_data.withColumn("contact", when(bank_data["contact"] == key, value).otherwise(bank_data["contact"]))

In [None]:
display(bank_data.limit(5))

In [None]:
quarter_dict = {"jan":"Q1", "feb":"Q1", "mar":"Q1", "apr":"Q2", "may":"Q2", "jun":"Q2", 
                "jul":"Q3", "aug":"Q3", "sep":"Q3", "oct":"Q4", "nov":"Q4", "dec":"Q4"}

for key, value in quarter_dict.items():
    bank_data = bank_data.withColumn("month", when(bank_data["month"] == key, value).otherwise(bank_data["month"]))

In [None]:
bank_data.printSchema()

In [None]:
one_hot_cols = ["contact", "default", "housing", "month", "loan"]

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.functions import col

def one_hot_encode(data, columns):
    new_data = data.select("*")
    for col_name in columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_index")
        new_data = indexer.fit(new_data).transform(new_data)

        encoder = OneHotEncoder(inputCols=[f"{col_name}_index"], outputCols=[f"{col_name}_one_hot"], dropLast=True)
        new_data = encoder.fit(new_data).transform(new_data)

        new_data = new_data.drop(col_name).drop(f"{col_name}_index")

    return new_data

In [None]:
bank_data = one_hot_encode(data = bank_data, columns = one_hot_cols)

In [None]:
column_types = bank_data.dtypes
# Filter the list to only include the string datatype columns

string_columns = [column[0] for column in column_types if column[1] == "string"]
print(string_columns)

In [None]:
for cols in string_columns:
# Change the datatype of the "age" column to integer
    bank_data = bank_data.withColumn(cols, bank_data[cols].cast("int"))

In [None]:
display(bank_data.limit(5))

In [None]:
vectors = [contact_one_hot	default_one_hot	housing_one_hot	month_one_hot	loan_one_hot]

# Define a UDF to convert vector elements to floats
to_float = udf(lambda x: [float(i) for i in x], ArrayType(FloatType()))

# Apply the UDF to the vector column
bank_data = bank_data.withColumn("vector_float", to_float(bank_data["vector"]))

In [None]:
bank_data.printSchema()

### Outliers

In [None]:
out = bank_data.toPandas()

In [None]:
outliers_columns = ["age","duration","campaign","pdays","previous"]

In [None]:
def plot_box():
    plt.figure(figsize=(10,10))
    plt.subplot(3,2,1)
    out.boxplot(column=["age"])

    plt.subplot(3,2,2)
    out.boxplot(column=["duration"])

    plt.subplot(3,2,3)
    out.boxplot(column=["campaign"])

    plt.subplot(3,2,4)
    out.boxplot(column=["pdays"])

    plt.subplot(3,2,5)
    out.boxplot(column=["previous"])
           
plot_box()

In [None]:
max_out_limit = []
for cols in outliers_columns:
    quantiles = bank_data.approxQuantile(cols, [0.25, 0.5, 0.75], 0.01)
    
    q3 = quantiles[2]
    q1 = quantiles[0]
    iqr = q3 - q1
    iqr = iqr*1.5
    max_limit = q3 + iqr
    min_limit = q1 - iqr
    max_out_limit.append(max_limit)
    
    print(cols, "max_limit: ",max_limit,"      min_limit: ",min_limit)
else:
    print("------------------------------------------")
    print(max_out_limit)



In [None]:
for i, j in zip(outliers_columns, max_out_limit):
    bank_data = bank_data.withColumn(i, 
                    when((col(i) >= j), j).otherwise(col(i))) 

In [None]:
out = bank_data.toPandas()

In [None]:
plot_box()

In [None]:
bank_data.printSchema()

In [None]:
display(bank_data.limit(5))

In [None]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler

# Select the numerical columns
numerical_cols = ["year", "age", "duration", "campaign", "pdays", "previous", "cons_price_idx", "cons_conf_idx", "euribor_3m", "nr_employed"]

# Create a vector assembler to combine the numerical columns into a single vector
assembler = VectorAssembler(inputCols=numerical_cols, outputCol="numerical_features")

# Transform the DataFrame to create the numerical features vector
bank_data = assembler.transform(bank_data)

# Apply MinMaxScaler to the numerical features vector
scaler = MinMaxScaler(inputCol="numerical_features", outputCol="scaled_numerical_features")
scaler_model = scaler.fit(bank_data)
df = scaler_model.transform(bank_data)

df = df.drop("numerical_features")

# Drop the original numerical columns and keep only the scaled numerical features
sc_bank_data = df.drop(*numerical_cols).withColumnRenamed("scaled_numerical_features", "features")


In [None]:
display(sc_bank_data.limit(5))

In [None]:
features = []
for columns in sc_bank_data.columns:
    features.append(columns)
else:
    print(features)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, classification_report

In [None]:
assembler = VectorAssembler(inputCols=features, outputCol='features')

In [None]:
sc_bank_data.printSchema()

In [None]:
assembler = VectorAssembler(inputCols=[col("vector_col")], outputCol="features")
df_features = assembler.transform(df)

df_double = df_features.withColumn("double_col", col("features").cast("double")).select("double_col")

In [None]:
train_data, test_data = bank_transform.randomSplit([0.75, 0.25], seed = 14)
display(train_data.limit(5))