In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Bank Marketing" dataset (repository id 222).
bank_marketing = fetch_ucirepo(id=222)

# Feature table and target table, both exposed as pandas DataFrames.
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Show the dataset metadata first, then the per-variable information.
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [None]:
# Preview the first rows using the rich DataFrame display; printing the whole
# frame as text wraps the columns and truncates the output.
X.head()

       age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married        NaN      no     1506     yes   no   
4       33           NaN    single        NaN      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
45206   51    technician   married   tertiary      no      825      no   no   
45207   71       retired  divorced    primary      no     1729      no   no   
45208   72       retired   married  secondary      no     5715      no   no   
45209   57   blue-collar   married  secondary      no      668      no   no   
45210   37  entrepreneur   married  secondary      no     2971      no   no   

         contact  day_of_week month  duration  camp

In [None]:
import re

import findspark
import pandas as pd
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from pyspark.ml.feature import OneHotEncoder, StandardScaler
# from pyspark.ml import Pipeline

In [None]:
#!pip install findspark
!pip install scikit-learn pandas numpy



In [None]:
# Let findspark locate the Spark installation before a session is requested.
findspark.init()

# Reuse the active Spark session, or start one named "BankMarketingPreprocessing".
spark = (
    SparkSession.builder
    .appName("BankMarketingPreprocessing")
    .getOrCreate()
)

# X and y are expected to be pandas DataFrames; join them column-wise so the
# target travels with the features, then hand the result to Spark.
bank_df = spark.createDataFrame(pd.concat([X, y], axis="columns"))

# Printing a Spark DataFrame shows its schema summary, not its rows.
print(bank_df)

DataFrame[age: bigint, job: string, marital: string, education: string, default: string, balance: bigint, housing: string, loan: string, contact: string, day_of_week: bigint, month: string, duration: bigint, campaign: bigint, pdays: bigint, previous: bigint, poutcome: string, y: string]


In [None]:
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numerical_features = ['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous']

# Split BEFORE fitting any transformer so that test-set statistics (category
# vocabulary, means, standard deviations) never leak into the preprocessing.
# The split uses the same row count, test_size and random_state as before, so the
# same rows should land in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the transformers on the training rows only.
# sparse_output=False -> dense array; handle_unknown='ignore' -> categories that
# were not seen during fit are encoded as all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train_raw[categorical_features])

scaler = StandardScaler()
scaler.fit(X_train_raw[numerical_features])

# One-hot column names, with every character outside [a-zA-Z0-9_] replaced by '_'
# so the names are safe to use as Spark column names.
encoded_columns = [
    re.sub('[^a-zA-Z0-9_]', '_', col)
    for col in encoder.get_feature_names_out(categorical_features)
]


def preprocess_features(raw_features):
    """Apply the train-fitted scaler and one-hot encoder to a raw feature frame.

    Parameters
    ----------
    raw_features : pandas.DataFrame
        Frame containing at least ``numerical_features`` and ``categorical_features``.

    Returns
    -------
    pandas.DataFrame
        Scaled numerical columns followed by the one-hot columns, carrying the
        same index as ``raw_features`` so rows stay aligned with the targets.
    """
    scaled = pd.DataFrame(
        scaler.transform(raw_features[numerical_features]),
        columns=numerical_features,
        index=raw_features.index,
    )
    encoded = pd.DataFrame(
        encoder.transform(raw_features[categorical_features]),
        columns=encoded_columns,
        index=raw_features.index,
    )
    return pd.concat([scaled, encoded], axis=1)


# Full preprocessed frame (transformers fitted on train only), then the two partitions.
X_encoded = preprocess_features(X)
X_train = X_encoded.loc[X_train_raw.index]
X_test = X_encoded.loc[X_test_raw.index]

# getOrCreate() returns the already-active session when there is one.
spark = SparkSession.builder.appName("BankMarketing").getOrCreate()

# Convert pandas DataFrames to PySpark DataFrames (features plus the 'y' target column)
train_df = spark.createDataFrame(pd.concat([X_train, y_train], axis=1))
test_df = spark.createDataFrame(pd.concat([X_test, y_test], axis=1))

In [None]:
# Train and evaluate a multilayer perceptron on the preprocessed bank-marketing data.
#
# MultilayerPerceptronClassifier reads its target from a numeric column (default
# name "label"), while the target here is the string column 'y'. The target is
# therefore indexed into a numeric "label" column before fitting; without this
# step the fit fails with "label does not exist".
LABEL_COL = "label"
FEATURES_COL = "features"

# The model inputs are exactly the preprocessed feature columns (everything except 'y').
# Taking them from X_train keeps this cell re-runnable: the list never picks up the
# "label"/"features" columns added to train_df below.
feature_columns = list(X_train.columns)

# Define layers for the neural network:
# Input layer sized to the number of features
# Two hidden layers of size 10 and 5
# Output layer of size 2 (for binary classification)
layers = [len(feature_columns), 10, 5, 2]

# Create a MultilayerPerceptronClassifier instance
mlp = MultilayerPerceptronClassifier(
    featuresCol=FEATURES_COL,
    labelCol=LABEL_COL,
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Drop columns left over from a previous run of this cell (a no-op on the first run),
# so that the indexer and assembler never hit an existing output column.
train_base = train_df.drop(LABEL_COL, FEATURES_COL)
test_base = test_df.drop(LABEL_COL, FEATURES_COL)

# Map the string target to indices 0.0, 1.0, ...; the mapping is learned on the
# training data only and then applied unchanged to the test data.
label_indexer = StringIndexer(inputCol="y", outputCol=LABEL_COL).fit(train_base)

# Pack the individual feature columns into the single vector column the model expects
assembler = VectorAssembler(inputCols=feature_columns, outputCol=FEATURES_COL)

# Add the "label" and "features" columns to the train and test DataFrames
train_df = assembler.transform(label_indexer.transform(train_base))
test_df = assembler.transform(label_indexer.transform(test_base))

# Train the model
model = mlp.fit(train_df)

# Make predictions on the test data
predictions = model.transform(test_df)

# Evaluate accuracy against the numeric label column
evaluator = MulticlassClassificationEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g" % accuracy)

IllegalArgumentException: label does not exist. Available: age, balance, day_of_week, duration, campaign, pdays, previous, job_admin_, job_blue_collar, job_entrepreneur, job_housemaid, job_management, job_retired, job_self_employed, job_services, job_student, job_technician, job_unemployed, job_nan, marital_divorced, marital_married, marital_single, education_primary, education_secondary, education_tertiary, education_nan, default_no, default_yes, housing_no, housing_yes, loan_no, loan_yes, contact_cellular, contact_telephone, contact_nan, month_apr, month_aug, month_dec, month_feb, month_jan, month_jul, month_jun, month_mar, month_may, month_nov, month_oct, month_sep, poutcome_failure, poutcome_other, poutcome_success, poutcome_nan, y, features