In [1]:
#importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *

from pyspark.sql.functions import when, col
from pyspark.sql.functions import sum, count
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

# Creating a spark session object

In [2]:
spark = SparkSession.builder.master('local[4]').appName('ProjectLemon').getOrCreate()

# Preprocessing

## Preprocessing on dataset bank-full

In [3]:
# to display dataframe properly in jupyter
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [4]:
bank_full = spark.read.csv('bank-full.csv',
                           sep = ';',
                           header=True,
                           inferSchema=True)

In [5]:
bank_full.count()

45211

In [6]:
bank_full.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [7]:
bank_full.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [8]:
new_cols = ["emp_var_rate", "cons_price_idx", "cons_conf_idx", "euribor_3m", "nr_employed"]

for column in new_cols:
    bank_full = bank_full.withColumn(column, bank_full["poutcome"] + 1)

In [9]:
display(bank_full.limit(7))

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,,,,,
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,,,,,
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,,,,,
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,,,,,
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,,,,,
35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no,,,,,
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no,,,,,


### Dropping the balance column

In [12]:
bank_full.drop("balance").limit(3)

age,job,marital,education,default,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
58,management,married,tertiary,no,yes,no,unknown,5,may,261,1,-1,0,unknown,no,,,,,
44,technician,single,secondary,no,yes,no,unknown,5,may,151,1,-1,0,unknown,no,,,,,
33,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,,,,,


In [13]:
display(bank_full.limit(5))

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,emp_var_rate,cons_price_idx,cons_conf_idx,euribor_3m,nr_employed
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,,,,,
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,,,,,
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,,,,,
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,,,,,
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,,,,,


### Adding year mapper function to add year in DataFrame bank_full

In [18]:
conda install pyarrow>1.0.0


Note: you may need to restart the kernel to use updated packages.




  current version: 22.9.0
  latest version: 23.1.0

Please update conda by running

    $ conda update -n base -c defaults conda




In [14]:
import pandas as pd
from pyspark.sql.functions import col
import pyspark.pandas as ps
import warnings

ImportError: PyArrow >= 1.0.0 must be installed; however, it was not found.

In [12]:
def year_mapper(data, start_yr, end_yr):
    month_lst = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

    # Make a copy of the original dataframe
    new_data = data.copy()

    # Insert a new "year" column filled with zeros
    new_data.insert(loc=0, column="year", value=0)

    # Set the first year to the start year
    current_year = int(start_yr)
    new_data.at[0, "year"] = current_year

    # Loop through the rows of the dataframe, updating the year column when the month changes
    for i in range(1, len(new_data)):
        # If the current month is earlier in the year than the previous month, increment the year
        if month_lst.index(new_data["month"][i]) < month_lst.index(new_data["month"][i-1]):
            current_year += 1

        new_data.at[i, "year"] = current_year

        # If the current year exceeds the end year, break out of the loop
        if current_year > end_yr:
            break

    return new_data

In [13]:
# Use default index prevent overhead.
ps.set_option("compute.default_index_type", "distributed") 

warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

# To speed up dataset processing
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

# Apply the function to the Pandas DataFrame
new_pandas_df = year_mapper(pdf_2, 2008, 2010)

# Convert the updated Pandas DataFrame back to a PySpark DataFrame
bank_full_yr = spark.createDataFrame(new_pandas_df)

bank_full_yr.show()

ImportError: PyArrow >= 1.0.0 must be installed; however, it was not found.

In [None]:
result_bankfull = year_mapper( data = , start_yr = 2008)
result_bankfull.head()