### copy for day_of_week in additional

In [1]:
import pandas as pd
import numpy as np

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
spark = SparkSession.builder.master('local[4]').appName('ml').getOrCreate()

# Dataset - 1(bank-full.csv)

In [4]:
data = spark.read.csv('bank-full.csv',sep = ";",header=True,inferSchema=True)

In [5]:
data.count()

45211

### Converting to Pandas

In [6]:
pdf=data.toPandas()

#### to display maximum no. of columns

In [7]:
pd.pandas.set_option('display.max_columns',None)
pd.set_option('display.max_rows', 5000)


In [8]:
pdf.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [9]:
pdf["education"].value_counts()

secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64

### Added empty index columns for uniform naming before the concat operation

In [10]:
# Macro-economic index columns that exist in the second dataset; they are added
# here as empty (NaN) columns so both frames share the same column names.
new_cols = ["emp_var_rate", "cons_price_idx", "cons_conf_idx", "euribor_3m", "nr_employed"]

for column in new_cols:
    # Skip columns that are already present so the cell can be re-run safely
    # (DataFrame.insert raises on a duplicate column name).
    if column in pdf.columns:
        continue
    # Insert immediately before the target column "y"; looking the position up
    # each time keeps the new columns in list order without a hard-coded index.
    col_no = pdf.columns.get_loc("y")
    pdf.insert(col_no, column, value = np.nan)

In [11]:
pdf.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,,,,,,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,,,,,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,,,,,,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,,,,,,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,,,,,,no


## Adding year mapper function to add year in dataset - 1(bank - full.csv)

In [12]:
def year_mapper(data, start_yr, break_yr=2010):
    """Return a copy of ``data`` with a leading ``year`` column inferred from month order.

    The rows are walked top to bottom. The year counter starts at ``start_yr``
    and is incremented every time a row's month comes earlier in the calendar
    than the month of the row before it (for example ``dec`` followed by ``jan``).
    Rows are addressed by position, so the frame's index labels do not matter.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``month`` column of lower-case three-letter month
        abbreviations (``"jan"`` ... ``"dec"``). It is not modified.
    start_yr : int or str
        Year assigned to the first row; converted with ``int()``.
    break_yr : int, optional
        Last expected year (default 2010). The first row whose year exceeds it
        is still labelled with that year, then the walk stops and every later
        row keeps the placeholder year ``0``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an integer ``year`` column inserted at position 0.

    Raises
    ------
    ValueError
        If a compared month value is not one of the twelve abbreviations.
    """
    month_pos = {
        name: pos
        for pos, name in enumerate(
            ["jan", "feb", "mar", "apr", "may", "jun",
             "jul", "aug", "sep", "oct", "nov", "dec"]
        )
    }

    def _position(month):
        # Translate an abbreviation into its calendar position (0-11).
        try:
            return month_pos[month]
        except (KeyError, TypeError):
            raise ValueError(f"{month!r} is not a known month abbreviation") from None

    # Work on a copy so the caller's frame is left untouched.
    new_data = data.copy()
    new_data.insert(loc=0, column="year", value=0)

    months = new_data["month"].tolist()
    if not months:
        # Nothing to label; return the empty frame with its new column.
        return new_data

    current_year = int(start_yr)
    years = [0] * len(months)
    years[0] = current_year

    for i, (month, previous_month) in enumerate(zip(months[1:], months), start=1):
        # A month earlier in the calendar than its predecessor means a new year began.
        if _position(month) < _position(previous_month):
            current_year += 1

        years[i] = current_year

        # Stop once the counter has run past the last expected year.
        if current_year > break_yr:
            break

    # Single column assignment instead of one scalar write per row.
    new_data["year"] = years
    return new_data

In [13]:
# Pass break_yr explicitly (year_mapper takes it as its third argument);
# the year counts shown for this dataset end in 2010.
result_bankfull = year_mapper(data=pdf, start_yr=2008, break_yr=2010)
result_bankfull.head()

Unnamed: 0,year,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,2008,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,,,,,,no
1,2008,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,,,,,,no
2,2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,,,,,,no
3,2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,,,,,,no
4,2008,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,,,,,,no


In [14]:
result_bankfull["year"].value_counts()

2008    27729
2009    14862
2010     2620
Name: year, dtype: int64

### Mapper function to insert index values into columns

In [15]:
def map_index(data):
    """Return a copy of ``data`` with the macro-economic index columns filled in.

    For every (year, month) pair listed in the lookup table below, the rows of
    the copy whose ``year`` and ``month`` columns match receive the listed
    values in the columns ``emp_var_rate``, ``cons_price_idx``,
    ``cons_conf_idx``, ``euribor_3m`` and ``nr_employed``. Pairs that are not
    listed, and indicators missing from a listed pair (e.g. January/February,
    which only carry ``emp_var_rate`` and ``nr_employed``), are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``year`` and ``month`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    # year -> month abbreviation -> {indicator column: value}
    indicators_by_year = {
        2008: {
            "may": {"emp_var_rate": 1.1, "cons_price_idx": 93.994, "cons_conf_idx": -36.4, "euribor_3m": 4.85, "nr_employed": 5191},
            "jun": {"emp_var_rate": 1.4, "cons_price_idx": 94.465, "cons_conf_idx": -41.8, "euribor_3m": 4.86, "nr_employed": 5228.1},
            "jul": {"emp_var_rate": 1.4, "cons_price_idx": 93.918, "cons_conf_idx": -42.7, "euribor_3m": 4.96, "nr_employed": 5228.1},
            "aug": {"emp_var_rate": 1.4, "cons_price_idx": 93.444, "cons_conf_idx": -36.1, "euribor_3m": 4.965, "nr_employed": 5228.1},
            "oct": {"emp_var_rate": -0.1, "cons_price_idx": 93.798, "cons_conf_idx": -40.4, "euribor_3m": 5, "nr_employed": 5195.8},
            "nov": {"emp_var_rate": -0.1, "cons_price_idx": 93.2, "cons_conf_idx": -42, "euribor_3m": 4.406, "nr_employed": 5195.8},
            "dec": {"emp_var_rate": -0.2, "cons_price_idx": 92.75, "cons_conf_idx": -45.9, "euribor_3m": 3.563, "nr_employed": 5176.3},
        },
        2009: {
            "jan": {"emp_var_rate": -0.2, "nr_employed": 5176.3},
            "feb": {"emp_var_rate": -0.2, "nr_employed": 5176.3},
            "mar": {"emp_var_rate": -1.8, "cons_price_idx": 92.84, "cons_conf_idx": -50, "euribor_3m": 1.811, "nr_employed": 5099.1},
            "apr": {"emp_var_rate": -1.8, "cons_price_idx": 93.075, "cons_conf_idx": -47.1, "euribor_3m": 1.498, "nr_employed": 5099.1},
            "may": {"emp_var_rate": -1.8, "cons_price_idx": 92.89, "cons_conf_idx": -46.2, "euribor_3m": 1.334, "nr_employed": 5099.1},
            "jun": {"emp_var_rate": -2.9, "cons_price_idx": 92.963, "cons_conf_idx": -40.8, "euribor_3m": 1.26, "nr_employed": 5076.2},
            "jul": {"emp_var_rate": -2.9, "cons_price_idx": 93.469, "cons_conf_idx": -33.6, "euribor_3m": 1.072, "nr_employed": 5076.2},
            "aug": {"emp_var_rate": -2.9, "cons_price_idx": 92.201, "cons_conf_idx": -31.4, "euribor_3m": 0.884, "nr_employed": 5076.2},
            "sep": {"emp_var_rate": -3.4, "cons_price_idx": 92.379, "cons_conf_idx": -29.8, "euribor_3m": 0.813, "nr_employed": 5017.5},
            "oct": {"emp_var_rate": -3.4, "cons_price_idx": 92.431, "cons_conf_idx": -26.9, "euribor_3m": 0.754, "nr_employed": 5017.5},
            "nov": {"emp_var_rate": -3.4, "cons_price_idx": 92.649, "cons_conf_idx": -30.1, "euribor_3m": 0.722, "nr_employed": 5017.5},
            "dec": {"emp_var_rate": -3, "cons_price_idx": 92.713, "cons_conf_idx": -33, "euribor_3m": 0.718, "nr_employed": 5023.5},
        },
        2010: {
            "jan": {"emp_var_rate": -3, "nr_employed": 5023.5},
            "feb": {"emp_var_rate": -3, "nr_employed": 5023.5},
            "mar": {"emp_var_rate": -1.8, "cons_price_idx": 92.369, "cons_conf_idx": -34.8, "euribor_3m": 0.655, "nr_employed": 5008.7},
            "apr": {"emp_var_rate": -1.8, "cons_price_idx": 93.749, "cons_conf_idx": -34.6, "euribor_3m": 0.64, "nr_employed": 5008.7},
            "may": {"emp_var_rate": -1.8, "cons_price_idx": 93.876, "cons_conf_idx": -40, "euribor_3m": 0.668, "nr_employed": 5008.7},
            "jun": {"emp_var_rate": -1.7, "cons_price_idx": 94.055, "cons_conf_idx": -39.8, "euribor_3m": 0.704, "nr_employed": 4991.6},
            "jul": {"emp_var_rate": -1.7, "cons_price_idx": 94.215, "cons_conf_idx": -40.3, "euribor_3m": 0.79, "nr_employed": 4991.6},
            "aug": {"emp_var_rate": -1.7, "cons_price_idx": 94.027, "cons_conf_idx": -38.3, "euribor_3m": 0.898, "nr_employed": 4991.6},
            "sep": {"emp_var_rate": -1.1, "cons_price_idx": 94.199, "cons_conf_idx": -37.5, "euribor_3m": 0.886, "nr_employed": 4963.6},
            "oct": {"emp_var_rate": -1.1, "cons_price_idx": 94.601, "cons_conf_idx": -49.5, "euribor_3m": 0.959, "nr_employed": 4963.6},
            "nov": {"emp_var_rate": -1.1, "cons_price_idx": 94.767, "cons_conf_idx": -50.8, "euribor_3m": 1.05, "nr_employed": 4963.6},
        },
    }

    filled = data.copy()

    for calendar_year, monthly_values in indicators_by_year.items():
        # The year test is the same for every month of that year.
        in_year = filled["year"] == calendar_year
        for month_name, indicators in monthly_values.items():
            row_mask = in_year & (filled["month"] == month_name)
            for column_name, column_value in indicators.items():
                filled.loc[row_mask, column_name] = column_value

    return filled

In [16]:
bank_full_index = map_index(data = result_bankfull)
bank_full_index.head()

Unnamed: 0,year,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,2008,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
1,2008,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
2,2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
3,2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
4,2008,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no


In [17]:
bank_full_index.count()

year              45211
age               45211
job               45211
marital           45211
education         45211
default           45211
balance           45211
housing           45211
loan              45211
contact           45211
day               45211
month             45211
duration          45211
campaign          45211
pdays             45211
previous          45211
poutcome          45211
emp_var_rate      45211
cons_price_idx    41159
cons_conf_idx     41159
euribor_3m        41159
nr_employed       45211
y                 45211
dtype: int64

### Adding date column

In [18]:
def get_date(month, year, day_of_week_str):
    """Return the day of the month on which a weekday first occurs, as ``"01"``..``"07"``.

    Parameters
    ----------
    month : str
        Lower-case three-letter month abbreviation (``"jan"`` ... ``"dec"``).
    year : int or str
        Calendar year; converted with ``int()``.
    day_of_week_str : str
        English weekday name or abbreviation in any letter case, e.g.
        ``"mon"`` or ``"Monday"``. Only the first three letters are used.

    Returns
    -------
    str
        Zero-padded day number of the first such weekday in that month.

    Raises
    ------
    ValueError
        If the month or the weekday is not recognised.
    """
    # Imported locally: in this notebook the module-level `import calendar`
    # only appears in a later cell.
    import calendar

    month_lst = ["jan", "feb", "mar", "apr",
                 "may", "jun", "jul", "aug",
                 "sep", "oct", "nov", "dec"]
    # Fixed English abbreviations, Monday first to match calendar's weekday
    # numbering; calendar.day_abbr would follow the process locale instead.
    weekday_lst = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    year = int(year)
    month_num = month_lst.index(month) + 1
    day_of_week = weekday_lst.index(day_of_week_str.capitalize()[:3])

    # Weekday (0 = Monday) of the 1st of the month; the requested weekday
    # first occurs that many days (mod 7) later.
    first_weekday = calendar.monthrange(year, month_num)[0]
    day = 1 + (day_of_week - first_weekday) % 7
    return f"{day:02}"



def map_date2(data):
    """Return a copy of ``data`` with an ISO ``date`` column built from year, month and day.

    Each row's ``year``, ``month`` (three-letter abbreviation) and ``day``
    columns are combined into a ``"YYYY-MM-DD"`` string. Rows are handled by
    position, so the frame's index labels do not matter.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``year``, ``month`` and ``day`` columns and at least ten
        columns in total (the new column is inserted at position 10). It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with a string ``date`` column at position 10.

    Raises
    ------
    ValueError
        If a month value is not one of the twelve abbreviations.
    """
    month_lst = ["jan", "feb", "mar", "apr",
                 "may", "jun", "jul", "aug",
                 "sep", "oct", "nov", "dec"]

    # Make a copy of the original dataframe
    new_data = data.copy()

    # Build every date string first, then insert them as one string column;
    # this avoids seeding an integer column and overwriting it cell by cell.
    dates = [
        f"{year}-{month_lst.index(month) + 1:02}-{int(day):02}"
        for year, month, day in zip(
            new_data["year"].tolist(),
            new_data["month"].tolist(),
            new_data["day"].tolist(),
        )
    ]
    new_data.insert(
        loc=10,
        column="date",
        value=pd.Series(dates, index=new_data.index, dtype=object),
    )
    return new_data


In [19]:
res_full_date = map_date2(data = bank_full_index)
res_full_date.head()

Unnamed: 0,year,age,job,marital,education,default,balance,housing,loan,contact,date,day,month,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,2008,58,management,married,tertiary,no,2143,yes,no,unknown,2008-05-05,5,may,261,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
1,2008,44,technician,single,secondary,no,29,yes,no,unknown,2008-05-05,5,may,151,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
2,2008,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,2008-05-05,5,may,76,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
3,2008,47,blue-collar,married,unknown,no,1506,yes,no,unknown,2008-05-05,5,may,92,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no
4,2008,33,unknown,single,unknown,no,1,no,no,unknown,2008-05-05,5,may,198,1,-1,0,unknown,1.1,93.994,-36.4,4.85,5191.0,no


# Dataset - 2 (additional-full.csv)

In [20]:
dataset_2 = spark.read.csv('bank-additional-full.csv',sep = ";",header=True,inferSchema=True)

In [21]:
dataset_2.count()

41188

### Converting dataset to pandas

In [22]:
pdf_2 = dataset_2.toPandas()

In [23]:
pdf_2.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Replacing pdays 999 value to -1

In [24]:
pdf_2["pdays"] = pdf_2["pdays"].replace(999, -1)

In [25]:
pdf_2.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Renaming column names

In [26]:
old_col_list = ["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
for i in range(0, len(old_col_list)):
    pdf_2.rename(columns={old_col_list[i]: new_cols[i]}, inplace=True)


In [27]:
pdf_2.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Adding year mapper function to add year to dataset - 2 (additional-full.csv)

In [28]:
def year_mapper(data, start_yr, break_yr=2010):
    """Return a copy of ``data`` with a leading ``year`` column inferred from month order.

    Starting at ``start_yr``, the year counter is bumped whenever a row's month
    is earlier in the calendar than the month of the preceding row. Rows are
    processed by position, independent of the frame's index labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``month`` column of lower-case three-letter month
        abbreviations. It is not modified.
    start_yr : int or str
        Year given to the first row; converted with ``int()``.
    break_yr : int, optional
        Last expected year, 2010 by default. The first row whose year exceeds
        it is still labelled; after that the walk stops and the remaining rows
        keep the placeholder year ``0``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an integer ``year`` column at position 0.

    Raises
    ------
    ValueError
        If a compared month value is not one of the twelve abbreviations.
    """
    month_lst = ["jan", "feb", "mar", "apr", "may", "jun",
                 "jul", "aug", "sep", "oct", "nov", "dec"]
    month_pos = {name: pos for pos, name in enumerate(month_lst)}

    def _position(month):
        # Calendar position (0-11) of a month abbreviation.
        try:
            return month_pos[month]
        except (KeyError, TypeError):
            raise ValueError(f"{month!r} is not a known month abbreviation") from None

    # Make a copy of the original dataframe and add the placeholder column
    new_data = data.copy()
    new_data.insert(loc=0, column="year", value=0)

    months = new_data["month"].tolist()
    if not months:
        return new_data

    current_year = int(start_yr)
    years = [0] * len(months)
    years[0] = current_year

    for i, (month, previous_month) in enumerate(zip(months[1:], months), start=1):
        # Month moved backwards in the calendar: a new year has started.
        if _position(month) < _position(previous_month):
            current_year += 1

        years[i] = current_year

        # Past the last expected year: leave the remaining rows at 0.
        if current_year > break_yr:
            break

    new_data["year"] = years
    return new_data

In [29]:
import pandas as pd
from pyspark.sql.functions import col
import pyspark.pandas as ps
import warnings

# Remember the current Arrow setting so it can be put back after the conversion.
prev = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")  # Keep its default value.
ps.set_option("compute.default_index_type", "distributed")  # Use default index prevent overhead.
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

try:
    # Apply the function to the Pandas DataFrame
    new_pandas_df = year_mapper(pdf_2, 2008)

    # Convert the updated Pandas DataFrame back to a PySpark DataFrame
    new_df = spark.createDataFrame(new_pandas_df)
finally:
    # Restore the setting saved in `prev`, even if the conversion failed.
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", prev)

new_df.show()



+----+---+-----------+--------+-------------------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+----------+-----------+---+
|year|age|        job| marital|          education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor_3m|nr_employed|  y|
+----+---+-----------+--------+-------------------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+----------+-----------+---+
|2008| 56|  housemaid| married|           basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|   -1|       0|nonexistent|         1.1|        93.994|        -36.4|     4.857|     5191.0| no|
|2008| 57|   services| married|        high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|   -1|       0|nonexis

### dump new_df with year_mapper into csv

In [30]:

# Write DataFrame to CSV. Spark creates a directory named "new_df" holding the
# part files; header=True keeps the column names in the output and
# mode="overwrite" lets this cell be re-run without failing on an existing path.
# NOTE: on Windows the local Hadoop file system needs HADOOP_HOME to point at a
# winutils installation, otherwise the write fails with
# "HADOOP_HOME and hadoop.home.dir are unset".
new_df.write.csv("new_df", header=True, mode="overwrite")


Py4JJavaError: An error occurred while calling o63.csv.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:219)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 22 more


In [None]:
result_addition = year_mapper( data = pdf_2, start_yr = 2008)
result_addition.head()

In [None]:
result_addition["year"].value_counts()

### adding date column to additional 

In [None]:
import calendar

def get_date(month, year, day_of_week_str):
    """Return the zero-padded day number of the first given weekday in a month.

    Parameters
    ----------
    month : str
        Lower-case three-letter month abbreviation (``"jan"`` ... ``"dec"``).
    year : int or str
        Calendar year; converted with ``int()``.
    day_of_week_str : str
        English weekday name or abbreviation, case-insensitive (``"mon"``,
        ``"Tue"``, ``"wednesday"``). Only the first three letters are used.

    Returns
    -------
    str
        ``"01"`` to ``"07"``: the first day of that month falling on the weekday.

    Raises
    ------
    ValueError
        If the month or the weekday is not recognised.
    """
    month_lst = ["jan", "feb", "mar", "apr",
                 "may", "jun", "jul", "aug",
                 "sep", "oct", "nov", "dec"]
    # English abbreviations in calendar's weekday order (Monday = 0). A fixed
    # list is used because calendar.day_abbr is rendered in the current locale.
    weekday_lst = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    year = int(year)
    month_num = month_lst.index(month) + 1

    # Normalise e.g. "mon" / "MONDAY" to "Mon" and get its weekday number
    day_of_week = weekday_lst.index(day_of_week_str.capitalize()[:3])

    # monthrange()[0] is the weekday of the 1st; stepping forward by the
    # difference modulo 7 lands on the first matching day without a scan.
    first_weekday = calendar.monthrange(year, month_num)[0]
    day = 1 + (day_of_week - first_weekday) % 7
    return f"{day:02}"


def map_date(data):
    """Return a copy of ``data`` with a ``date`` column derived from year, month and weekday.

    The dataset only records the weekday of each contact, not the day of the
    month, so the day part of the date is whatever ``get_date`` returns for
    the row's month, year and weekday. The resulting string has the form
    ``"YYYY-MM-DD"``. Rows are handled by position.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``year``, ``month`` and ``day_of_week`` columns and at
        least ten columns in total (the new column goes to position 10). It
        is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with a string ``date`` column at position 10.

    Raises
    ------
    ValueError
        If a month value is not one of the twelve abbreviations.
    """
    month_lst = ["jan", "feb", "mar", "apr",
                 "may", "jun", "jul", "aug",
                 "sep", "oct", "nov", "dec"]

    # Make a copy of the original dataframe
    new_data = data.copy()

    # get_date depends only on (year, month, weekday), so each distinct
    # combination is resolved once and reused for all matching rows.
    day_cache = {}
    dates = []
    for year, month, weekday in zip(
        new_data["year"].tolist(),
        new_data["month"].tolist(),
        new_data["day_of_week"].tolist(),
    ):
        key = (year, month, weekday)
        if key not in day_cache:
            day_cache[key] = str(get_date(month=month,
                                          year=year,
                                          day_of_week_str=weekday))
        dates.append(f"{year}-{month_lst.index(month) + 1:02}-{day_cache[key]}")

    # Insert the finished strings as one object column rather than seeding an
    # integer column and overwriting it cell by cell.
    new_data.insert(
        loc=10,
        column="date",
        value=pd.Series(dates, index=new_data.index, dtype=object),
    )
    return new_data


In [None]:
res_add_date = map_date(data=result_addition)
res_add_date.head()

### rename education field values

In [None]:
# Education labels as they appear in the additional dataset, paired by position
# with the underscore spelling they are rewritten to.
old_edu = ["basic.4y", "high.school", "basic.6y", "basic.9y", "university.degree", "professional.course"]
new_edu = ["basic_4y", "high_school", "basic_6y", "basic_9y", "university_degree" ,"professional_course"]

for dotted_label, underscored_label in zip(old_edu, new_edu):
    has_dotted_label = res_add_date['education'] == dotted_label
    res_add_date.loc[has_dotted_label, "education"] = underscored_label

res_add_date.head()

In [None]:
    
res_add_date.head()

In [None]:
res_full_date.head()

### Final drop column in both dataset

In [None]:
drop_cols_full = ["year","balance","day","month"]
res_full_date.drop(drop_cols_full,axis=1,inplace=True)
res_full_date.head()

In [None]:
drop_cols_add = ["year","day_of_week","month"]
res_add_date.drop(drop_cols_add,axis=1,inplace=True)
res_add_date.head()

|                **additional**                |                    **full**                   |
|:--------------------------------------------:|:---------------------------------------------:|
|                read with spark               |                read with spark                |
|               spark toPandas()               |                spark toPandas()               |
|          replace 999 to -1 in pdays          |            added new index columns            |
|           rename column for index            |      added year column with year mapper       |
|      added year column with year mapper      |   added values to index cols with map index   |
| added date column with get_date and map_date |       added date column with map_date2        |
|         rename education field values        |                                               |
|drop_cols_add = ["year","day_of_week","month"]                                              | drop_cols_full = ["year","balance","day","month"]|

## Concatenating two datasets into one

In [None]:
frames  = [res_full_date, res_add_date]

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each input frame's own row labels.
tele_df = pd.concat(frames, ignore_index=True)
tele_df.head()

In [None]:
tele_df.tail()

In [None]:
tele_df.info()

In [None]:
tele_df.describe()

**Splitting Dataframe into categorical and continuous typed Dataframe**

In [None]:
# DataFrame.filter keeps only the listed labels that exist in the frame, so a
# misspelt column name is dropped silently rather than raising.
categories = tele_df.filter(["job", "marital", "education","default","housing","loan","contact","month","year","poutcome","y"])
continuous = tele_df.filter(["age", "duration", "campaign","pdays","previous", "cons_price_idx", "cons_conf_idx","euribor_3m","nr_employed"])

In [None]:
for column in categories:
    print("Column Name: ", column)
    print("-----------------------")
    print(categories[column].value_counts())
    print("     ")
    print("******************************************************")
    print("     ")
# Printing Unique values from categorical columns

### converts "admin." into "admin"

In [None]:
tele_df.loc[(tele_df['job'] == "admin."), "job"] = "admin"
tele_df["job"].unique()

# **Checking for null values and removing**

In [None]:
tele_df = tele_df.replace(["unknown","nonexistent"], np.nan)

In [None]:
tele_df.head()

In [None]:
tele_df.isnull().sum()

### Replacing null values of categorical columns with the mode

In [None]:
for columns in categories:
    tele_df[columns] = tele_df[columns].fillna(tele_df[columns].mode()[0])

### Replacing null values of Continuous columns with the mean

In [None]:
for cols in continuous:
    tele_df[cols] = tele_df[cols].fillna(tele_df[cols].mean())

In [None]:
tele_df.isnull().sum()

In [None]:
import seaborn as sns
# Correlation is only defined for numeric data; restrict the frame explicitly,
# because the text columns still present here make DataFrame.corr() fail on
# pandas 2.x.
correlation = tele_df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(correlation)

### Creating a dictionary for converting categorical textual data entries into categorical numeric on basis of job profile

In [None]:
job_dict = {"entrepreneur":11, "self-employed":10, "admin":9, "management":8, "services":7, 
       "technician":6, "blue-collar":5, "housemaid":4, "retired":3, "student":2, "unemployed":1}

In [None]:
tele_df["jobs"] = tele_df["job"].map(job_dict)
tele_df = tele_df.drop("job", axis=1)

In [None]:
tele_df.head()

### Creating a dictionary for converting categorical textual data entries into categorical numeric on basis of marital

In [None]:
marital_dict = {"married":3, "single":2, "divorced":1}
tele_df["maritals"] = tele_df["marital"].map(marital_dict)
tele_df = tele_df.drop("marital", axis=1)
tele_df.head()

### Creating a dictionary for converting categorical textual data entries into categorical numeric on basis of education

In [None]:
edu_dict = {"professional_course":10, "university_degree":9, "tertiary":8, "secondary":7, 
       "high_school":6, "basic_9y":5, "basic_6y":4, "primary":3, "basic_4y":2, "illiterate":1}
tele_df["education"] = tele_df["education"].map(edu_dict)
tele_df.head()

In [None]:
tele_df=tele_df.reset_index(drop=True)
tele_df.to_csv("./tele_df.csv")

### adding quarter column

In [None]:
# Convert the date column to a datetime object
tele_df['date'] = pd.to_datetime(tele_df['date'])

# Add a new column with the quarter based on the date values
tele_df['quarter'] = tele_df['date'].apply(lambda x: "q"+str((x.month-1)//3 + 1))

tele_df.head()

In [None]:
tele_df["quarter"].value_counts()

### label encoding 

In [None]:

tele_df["y"] = tele_df["y"].replace("yes", 1)
tele_df["y"] = tele_df["y"].replace("no", 0)

### one hot encoding

In [None]:
tele_df = pd.get_dummies(data = tele_df,
                          drop_first = True)
tele_df.head(10)

In [None]:
tele_df.info()

In [None]:
tele_df.shape

In [None]:
outliers_columns = ["age","duration","campaign","pdays","previous"]

In [None]:
def plot_box():
    """Draw one box plot per numeric column of interest on a 3x2 subplot grid.

    Reads the notebook-level ``tele_df`` frame and plots ``age``, ``duration``,
    ``campaign``, ``pdays`` and ``previous`` in grid slots 1 to 5 of a new
    10x10 inch figure.
    """
    plt.figure(figsize=(10,10))

    plotted_columns = ("age", "duration", "campaign", "pdays", "previous")
    for slot, column_name in enumerate(plotted_columns, start=1):
        plt.subplot(3, 2, slot)
        tele_df.boxplot(column=[column_name])
           
plot_box()

In [None]:
# Upper outlier fence (Q3 + 1.5 * IQR) per column, in the order of outliers_columns.
max_out_limit = []
for column_name in outliers_columns:
    upper_quartile = tele_df[column_name].quantile(0.75)
    lower_quartile = tele_df[column_name].quantile(0.25)
    whisker = (upper_quartile - lower_quartile) * 1.5
    upper_fence = upper_quartile + whisker
    lower_fence = lower_quartile - whisker
    max_out_limit.append(upper_fence)
    print(column_name, "max_limit: ", upper_fence, " min_limit: ", lower_fence)

# Summary printed once after all columns have been processed.
print("------------------------------------------")
print(max_out_limit)

In [None]:
# Cap every value above a column's upper fence at that fence. Series.clip
# replaces the masked .loc assignment, which wrote a float limit into
# integer columns in place.
# NOTE(review): when a column's first and third quartiles are equal the fence
# equals Q3, so every larger value collapses to it - confirm this is intended
# for sparse columns such as pdays and previous.
for i, j in zip(outliers_columns, max_out_limit):
    tele_df[i] = tele_df[i].clip(upper=j)

In [None]:
plot_box()

## Hypothesis required?
### yes!

1. Chi-square: default, housing, loan, jobs, marital, education, contact, quarter
2. ANOVA: age, duration

In [None]:
tele_df["campaign"].value_counts()

In [None]:
tele_df.head(1000)