### Run with Spark 3.4 on Python 3.7

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when 
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import matplotlib.pyplot as plt

spark = SparkSession.builder.getOrCreate()

df = spark.read.csv('Social_Network_Ads.csv', header=True, inferSchema=True)

df.show(3)

+--------+------+---+---------------+---------+
| User ID|Gender|Age|EstimatedSalary|Purchased|
+--------+------+---+---------------+---------+
|15624510|  Male| 19|          19000|        0|
|15810944|  Male| 35|          20000|        0|
|15668575|Female| 26|          43000|        0|
+--------+------+---+---------------+---------+
only showing top 3 rows



In [20]:
# splitting the data into training and test sets with random seed 42
df_train, df_test = df.randomSplit([0.75, 0.25], seed=42)

df_train.count(), df_test.count()

(322, 78)

In [21]:
# select the features and target of train and test data then drop the null values
df_train = df_train.select('Gender', 'Age', 'Purchased', 'EstimatedSalary').na.drop()

df_train.show(3)

df_test = df_test.select('Gender', 'Age', 'Purchased', 'EstimatedSalary').na.drop()

df_test.show(3)

+------+---+---------+---------------+
|Gender|Age|Purchased|EstimatedSalary|
+------+---+---------+---------------+
|Female| 35|        0|          57000|
|Female| 58|        1|          95000|
|  Male| 34|        0|         115000|
+------+---+---------+---------------+
only showing top 3 rows

+------+---+---------+---------------+
|Gender|Age|Purchased|EstimatedSalary|
+------+---+---------+---------------+
|Female| 26|        0|          80000|
|  Male| 40|        1|          71000|
|Female| 58|        1|          47000|
+------+---+---------+---------------+
only showing top 3 rows



In [22]:
# change gender column to 0 and 1

df_train = df_train.withColumn('Gender', 
															 when(df_train['Gender'] == 'Male', 0).
                               when(df_train['Gender'] == 'Female', 1)
															)

df_train.show(3)

df_test = df_test.withColumn('Gender',
														 when(df_test['Gender'] == 'Male', 0).
                             when(df_test['Gender'] == 'Female', 1)
														)

df_test.show(3)

+------+---+---------+---------------+
|Gender|Age|Purchased|EstimatedSalary|
+------+---+---------+---------------+
|     1| 35|        0|          57000|
|     1| 58|        1|          95000|
|     0| 34|        0|         115000|
+------+---+---------+---------------+
only showing top 3 rows

+------+---+---------+---------------+
|Gender|Age|Purchased|EstimatedSalary|
+------+---+---------+---------------+
|     1| 26|        0|          80000|
|     0| 40|        1|          71000|
|     1| 58|        1|          47000|
+------+---+---------+---------------+
only showing top 3 rows



In [23]:
# create a vector assembler and standard scaler

cols = df_train.columns
cols.remove('EstimatedSalary')

df_train = VectorAssembler(inputCols=cols, outputCol='features').transform(df_train)

df_train = StandardScaler(inputCol='features', outputCol='features_scaled').fit(df_train).transform(df_train)

df_train.show(3)

+------+---+---------+---------------+--------------+--------------------+
|Gender|Age|Purchased|EstimatedSalary|      features|     features_scaled|
+------+---+---------+---------------+--------------+--------------------+
|     1| 35|        0|          57000|[1.0,35.0,0.0]|[1.99750858138724...|
|     1| 58|        1|          95000|[1.0,58.0,1.0]|[1.99750858138724...|
|     0| 34|        0|         115000|[0.0,34.0,0.0]|[0.0,3.2621352195...|
+------+---+---------+---------------+--------------+--------------------+
only showing top 3 rows



In [24]:
df_test = VectorAssembler(inputCols=cols, outputCol='features').transform(df_test)

df_test = StandardScaler(inputCol='features', outputCol='features_scaled').fit(df_test).transform(df_test)

df_test.show(3)

+------+---+---------+---------------+--------------+--------------------+
|Gender|Age|Purchased|EstimatedSalary|      features|     features_scaled|
+------+---+---------+---------------+--------------+--------------------+
|     1| 26|        0|          80000|[1.0,26.0,0.0]|[1.98713813025616...|
|     0| 40|        1|          71000|[0.0,40.0,1.0]|[0.0,3.7070677011...|
|     1| 58|        1|          47000|[1.0,58.0,1.0]|[1.98713813025616...|
+------+---+---------+---------------+--------------+--------------------+
only showing top 3 rows



In [25]:
# create a logistic regression model
lr = LogisticRegression(featuresCol='features_scaled', labelCol='Purchased', maxIter=100).fit(df_train)

lr

LogisticRegressionModel: uid=LogisticRegression_f84e2555ccac, numClasses=2, numFeatures=3

In [26]:
# make predictions
predictions = lr.transform(df_test)

predictions.select('Purchased', 'prediction').show()

+---------+----------+
|Purchased|prediction|
+---------+----------+
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        1|       1.0|
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
|        1|       1.0|
|        0|       0.0|
|        1|       1.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
+---------+----------+
only showing top 20 rows



In [27]:
# evaluate the model with binary classification evaluator
evaluatorBinary = BinaryClassificationEvaluator(labelCol='Purchased')

evaluatorBinary.evaluate(predictions)

1.0

In [28]:
# evaluate the model with multiclass classification evaluator
evaluatorMulti = MulticlassClassificationEvaluator(labelCol='Purchased')

evaluatorMulti.evaluate(predictions)

1.0