<a href="https://colab.research.google.com/github/sadiksmart0/Bank-Marketing/blob/main/Bank_Marketing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

##Bank Marketing Data Set
##Download: Data Folder, Data Set Description

##Abstract: The data is related to direct marketing campaigns (phone calls) of a Portuguese banking institution.
#The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

In [None]:
#IMPORTING DEPENDENCIES
import pandas as pd
import seaborn as sns
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Obtain the active Spark session, creating one if none exists yet
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# MOUNTING DRIVE
# Uses the google.colab helper to mount Google Drive at /content/drive;
# the dataset is later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the semicolon-separated bank marketing file from the mounted Drive
# into a pandas DataFrame.
df1 = pd.read_csv(
    "/content/drive/MyDrive/bank-full.csv",
    sep=";",
)

In [None]:
# How many columns does the raw frame have?
df1.shape[1]

17

In [None]:
# Peek at the first five rows of the raw data
df1.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Class balance of the target: number of rows per subscription outcome
df1.loc[:, "y"].value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [None]:
# Split the raw frame by target class
df_yes = df1.loc[df1["y"].eq("yes")]
df_no = df1.loc[df1["y"].eq("no")]

# Size of the minority ("yes") class
df_yes.shape[0]

5289

In [None]:
# Balance the classes: draw, without replacement, as many "no" rows
# as there are "yes" rows (fixed seed for reproducibility).
downsampled = df_no.sample(
    n=df_yes.shape[0],
    replace=False,
    random_state=1,
)

In [None]:
# Stack the "yes" rows on top of the downsampled "no" rows
bank_data = pd.concat(objs=[df_yes, downsampled])

# Randomly permute all rows so the two classes are interleaved (seeded)
shuffled_bank_data = bank_data.sample(
    frac=1,
    random_state=21,
)

# Row count of the balanced dataset
bank_data.shape[0]

10578

In [None]:
# Discard the contact-date and contact-channel columns
bank_final = shuffled_bank_data.drop(columns=["month", "day", "contact"])

# Number of columns that remain
bank_final.shape[1]

14

In [None]:
# Convert the pandas frame into a Spark DataFrame
bank_df = spark.createDataFrame(data=bank_final)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [None]:
# Display the first ten rows of the Spark DataFrame
bank_df.show(n=10)

+---+-------------+--------+---------+-------+-------+-------+----+--------+--------+-----+--------+--------+---+
|age|          job| marital|education|default|balance|housing|loan|duration|campaign|pdays|previous|poutcome|  y|
+---+-------------+--------+---------+-------+-------+-------+----+--------+--------+-----+--------+--------+---+
| 67|    housemaid|divorced|secondary|     no|   5275|     no|  no|     222|       4|   -1|       0| unknown|yes|
| 34|   technician| married|secondary|     no|    545|    yes|  no|      96|       4|   -1|       0| unknown| no|
| 29|self-employed|  single| tertiary|     no|   1579|     no|  no|     190|       1|   92|       5| success|yes|
| 36|  blue-collar|  single|secondary|     no|    366|    yes| yes|    1133|       2|   -1|       0| unknown|yes|
| 81|      retired|divorced|  primary|     no|    949|     no|  no|     188|       2|  280|       1|   other|yes|
| 34|   technician| married|secondary|     no|    656|    yes|  no|      10|       7|  3

In [None]:
# Remove every row that contains at least one null value.
# NOTE(review): this rebinds `bank_final` (a pandas frame in the earlier
# cells) to a Spark DataFrame, so the createDataFrame cell above cannot be
# re-run after this one without restarting from the pandas steps.
bank_final = bank_df.na.drop(how="any")

In [None]:
# VIEW SCHEMA
# Print column names, types and nullability of the null-filtered Spark DataFrame.
bank_final.printSchema()

root
 |-- age: long (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: long (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



# One-hot encoding categorical columns


In [None]:
# Categorical (string) columns to encode. The target "y" is part of the list
# so that a numeric "y_Index" column is produced for it as well.
inputCols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome', 'y']
outputCols = [name + "_encoded" for name in inputCols]

# Step 1 - string indexing: one StringIndexer per column, writing "<name>_Index".
# Loop variables are intentionally not named `col`: a top-level `for col in ...`
# would rebind the imported pyspark.sql.functions.col to a plain string for the
# rest of the notebook.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_Index")
    for name in inputCols
]

indexed = bank_final
for indexer in indexers:
    indexed = indexer.fit(indexed).transform(indexed)

# Step 2 - one-hot encoding: one OneHotEncoder per index column, writing
# "<name>_encoded".
encoders = [
    OneHotEncoder(inputCol=name + "_Index", outputCol=encoded_name)
    for name, encoded_name in zip(inputCols, outputCols)
]

encoded = indexed
for encoder in encoders:
    encoded = encoder.fit(encoded).transform(encoded)

# Step 3 - clean-up: for every feature column drop both the raw string column
# and its intermediate index column. "y" and "y_Index" are kept.
feature_names = [name for name in inputCols if name != "y"]
encoded = encoded.drop(
    *feature_names,
    *[name + "_Index" for name in feature_names],
)

# Display the transformed DataFrame
encoded.show()

+---+-------+--------+--------+-----+--------+---+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+-------------+
|age|balance|duration|campaign|pdays|previous|  y|y_Index|    job_encoded|marital_encoded|education_encoded|default_encoded|housing_encoded| loan_encoded|poutcome_encoded|    y_encoded|
+---+-------+--------+--------+-----+--------+---+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+-------------+
| 67|   5275|     222|       4|   -1|       0|yes|    1.0|(11,[10],[1.0])|      (2,[],[])|    (3,[0],[1.0])|  (1,[0],[1.0])|  (1,[0],[1.0])|(1,[0],[1.0])|   (3,[0],[1.0])|    (1,[],[])|
| 34|    545|      96|       4|   -1|       0| no|    0.0| (11,[2],[1.0])|  (2,[0],[1.0])|    (3,[0],[1.0])|  (1,[0],[1.0])|      (1,[],[])|(1,[0],[1.0])|   (3,[0],[1.0])|(1,[0],[1.0])|
| 29|   1579|     190|       1|   92|       5|yes|    1.0| (11,[7],[1.

# Scaling

In [None]:
# Inspect the schema after encoding to see which numeric columns are available for scaling.
encoded.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)
 |-- y_Index: double (nullable = false)
 |-- job_encoded: vector (nullable = true)
 |-- marital_encoded: vector (nullable = true)
 |-- education_encoded: vector (nullable = true)
 |-- default_encoded: vector (nullable = true)
 |-- housing_encoded: vector (nullable = true)
 |-- loan_encoded: vector (nullable = true)
 |-- poutcome_encoded: vector (nullable = true)
 |-- y_encoded: vector (nullable = true)



In [None]:
# Numeric columns that are standardised together as a single vector.
input_cols = ["balance", "duration", "pdays"]
# NOTE(review): output_cols is not consumed anywhere in this cell.
output_cols = [name + "scaled" for name in input_cols]

# Make the cell re-runnable: remove the columns a previous run of this cell
# added. DataFrame.drop ignores names that are not present, so on a first run
# this is a no-op; without it a second run fails because "s_features"
# already exists.
encoded = encoded.drop("s_features", "scaled_features")

# Pack the numeric columns into one vector column, as StandardScaler operates
# on a vector input column.
assembler = VectorAssembler(inputCols=input_cols, outputCol='s_features')
encoded = assembler.transform(encoded)

# Scale each component to unit standard deviation without centring
# (withStd=True, withMean=False).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test rows contribute
# to the scaling statistics - consider fitting on the training split only.
scaler = StandardScaler(
    inputCol="s_features",
    outputCol="scaled_features",
    withStd=True,
    withMean=False,
)
scaled = scaler.fit(encoded)
encoded = scaled.transform(encoded)



In [None]:
# Show the first five rows, now including the assembled and scaled vectors
encoded.show(n=5)

+---+-------+--------+--------+-----+--------+---+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+-------------+-------------------+--------------------+
|age|balance|duration|campaign|pdays|previous|  y|y_Index|    job_encoded|marital_encoded|education_encoded|default_encoded|housing_encoded| loan_encoded|poutcome_encoded|    y_encoded|         s_features|     scaled_features|
+---+-------+--------+--------+-----+--------+---+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+-------------+-------------------+--------------------+
| 67|   5275|     222|       4|   -1|       0|yes|    1.0|(11,[10],[1.0])|      (2,[],[])|    (3,[0],[1.0])|  (1,[0],[1.0])|  (1,[0],[1.0])|(1,[0],[1.0])|   (3,[0],[1.0])|    (1,[],[])|[5275.0,222.0,-1.0]|[1.52399583141426...|
| 34|    545|      96|       4|   -1|       0| no|    0.0| (11,[2],[1.0])|  (2,[0],[1.0])|  

In [None]:
# Feature and Target
# Columns no longer needed: the raw numeric columns now represented inside
# scaled_features, the intermediate s_features vector, and the redundant
# string / one-hot versions of the target (y_Index is kept).
unwanted = ["balance", "duration", "pdays", "y_encoded", "y", "s_features"]

# Drop everything in a single call. The previous per-column loop used `col`
# as its loop variable, which rebound the imported pyspark.sql.functions.col
# to a string at notebook scope.
features = encoded.drop(*unwanted)
encoded = features
features.show(2)

+---+--------+--------+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+--------------------+
|age|campaign|previous|y_Index|    job_encoded|marital_encoded|education_encoded|default_encoded|housing_encoded| loan_encoded|poutcome_encoded|     scaled_features|
+---+--------+--------+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+--------------------+
| 67|       4|       0|    1.0|(11,[10],[1.0])|      (2,[],[])|    (3,[0],[1.0])|  (1,[0],[1.0])|  (1,[0],[1.0])|(1,[0],[1.0])|   (3,[0],[1.0])|[1.52399583141426...|
| 34|       4|       0|    0.0| (11,[2],[1.0])|  (2,[0],[1.0])|    (3,[0],[1.0])|  (1,[0],[1.0])|      (1,[],[])|(1,[0],[1.0])|   (3,[0],[1.0])|[0.15745549348261...|
+---+--------+--------+-------+---------------+---------------+-----------------+---------------+---------------+-------------+----------------+--------------------+
only

In [None]:
# Columns that are assembled into the model's input vector. The label column
# y_Index is not part of this list.
combined = [
    'age',
    'campaign',
    'previous',
    'job_encoded',
    'marital_encoded',
    'education_encoded',
    'default_encoded',
    'housing_encoded',
    'loan_encoded',
    'poutcome_encoded',
    'scaled_features',
]

# Fail fast if an upstream cell was skipped or run out of order. This replaces
# a bare `encoded.columns` expression that was not the last statement of the
# cell and therefore displayed nothing.
missing_features = [name for name in combined if name not in encoded.columns]
assert not missing_features, f"encoded is missing feature columns: {missing_features}"

# SPLITTING TRAIN-TEST DATA

In [None]:

# Assembler that merges every feature column into one 'all_features' vector
assembler = VectorAssembler(inputCols=combined, outputCol='all_features')

# 70/30 seeded random split; each part gets the assembled feature vector
train_data, test_data = (
    assembler.transform(part)
    for part in encoded.randomSplit([0.7, 0.3], seed=42)
)

train_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- y_Index: double (nullable = false)
 |-- job_encoded: vector (nullable = true)
 |-- marital_encoded: vector (nullable = true)
 |-- education_encoded: vector (nullable = true)
 |-- default_encoded: vector (nullable = true)
 |-- housing_encoded: vector (nullable = true)
 |-- loan_encoded: vector (nullable = true)
 |-- poutcome_encoded: vector (nullable = true)
 |-- scaled_features: vector (nullable = true)
 |-- all_features: vector (nullable = true)



# Modeling

In [None]:
# Logistic regression on the assembled feature vector, predicting y_Index
lr = LogisticRegression(
    featuresCol="all_features",
    labelCol="y_Index",
    maxIter=100,
    threshold=0.5,
)

# Fit on the training split
lr_model = lr.fit(train_data)

In [None]:
# Score the held-out test split
predictions = lr_model.transform(test_data)

# Evaluate ranking quality with the area under the ROC curve.
# This metric is NOT classification accuracy, so it is stored and reported
# under its real name (it was previously printed as "Accuracy").
evaluator = BinaryClassificationEvaluator(
    labelCol="y_Index",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(predictions)

# Print the metric
print("Area under ROC: {:.4f}".format(area_under_roc))

Accuracy: 89.08%


In [None]:
# Inspect the scored test rows
predictions.show()

# Classification accuracy on the test split (fraction of rows whose
# thresholded prediction equals the label). The evaluator was previously
# constructed but never used. MulticlassClassificationEvaluator is imported
# in the notebook's import cell.
evaluator = MulticlassClassificationEvaluator(
    labelCol="y_Index",
    predictionCol="prediction",
    metricName="accuracy",
)
test_accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2f}%".format(test_accuracy * 100))

# Compute the confusion matrix: row counts per (actual label, predicted label)
confusion_matrix = (
    predictions
    .groupBy("y_Index", "prediction")
    .count()
    .orderBy("y_Index", "prediction")
)

# Print the confusion matrix
confusion_matrix.show()


+---+--------+--------+-------+--------------+---------------+-----------------+---------------+---------------+-------------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|campaign|previous|y_Index|   job_encoded|marital_encoded|education_encoded|default_encoded|housing_encoded| loan_encoded|poutcome_encoded|     scaled_features|        all_features|       rawPrediction|         probability|prediction|
+---+--------+--------+-------+--------------+---------------+-----------------+---------------+---------------+-------------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
| 18|       4|       0|    1.0|(11,[6],[1.0])|  (2,[1],[1.0])|        (3,[],[])|  (1,[0],[1.0])|  (1,[0],[1.0])|(1,[0],[1.0])|   (3,[0],[1.0])|[0.10054038849898...|(28,[0,1,9,15,19,...|[-1.1216769005214...|[0.24570036821887...|       1.0|
| 19|       1|       3|    0.0|(11,[6],[1.0]