<a href="https://colab.research.google.com/github/safi50/Social-Media-Ad-Purchases-PySpark/blob/main/Predicting_Social_Media_Ad_Purchases_Using_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
# Uncomment the line below when running on Colab or any environment without PySpark installed.
# !pip install pyspark

In [137]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark.ml.feature import MinMaxScaler, IndexToString
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator




## Initialize and Start a Spark Session

In [138]:
# Start the Spark session used by the rest of the notebook, or reuse one that
# is already running.
spark = (
    SparkSession.builder
    .appName('Social Network Ads')
    .getOrCreate()
)

## Reading Data from CSV

In [139]:
# Load the ads dataset: the first row holds the column names and the column
# types are inferred from the values.
data = spark.read.csv(
    'Social_Network_Ads.csv',
    header=True,
    inferSchema=True,
)

In [140]:
# Preview the first rows of the raw dataset.
data.show()

+--------+------+---+---------------+---------+
| User ID|Gender|Age|EstimatedSalary|Purchased|
+--------+------+---+---------------+---------+
|15624510|  Male| 19|          19000|        0|
|15810944|  Male| 35|          20000|        0|
|15668575|Female| 26|          43000|        0|
|15603246|Female| 27|          57000|        0|
|15804002|  Male| 19|          76000|        0|
|15728773|  Male| 27|          58000|        0|
|15598044|Female| 27|          84000|        0|
|15694829|Female| 32|         150000|        1|
|15600575|  Male| 25|          33000|        0|
|15727311|Female| 35|          65000|        0|
|15570769|Female| 26|          80000|        0|
|15606274|Female| 26|          52000|        0|
|15746139|  Male| 20|          86000|        0|
|15704987|  Male| 32|          18000|        0|
|15628972|  Male| 18|          82000|        0|
|15697686|  Male| 29|          80000|        0|
|15733883|  Male| 47|          25000|        1|
|15617482|  Male| 45|          26000|   

In [141]:
# Total number of rows in the dataset.
data.count()

400

## Vectorizing the Input Features (Age, EstimatedSalary) into a Single Column

In [142]:
# Pack the two numeric predictors into a single vector column, the input
# format expected by the scaler and classifiers used below.
vectorizer = VectorAssembler(
    inputCols=['Age', 'EstimatedSalary'],
    outputCol='features',
)

# Drop any 'features' column left over from a previous execution so the cell
# can be re-run: drop() is a no-op when the column is absent, whereas
# transform() raises if its output column already exists.
data = vectorizer.transform(data.drop('features'))
data.show(10, False)

+--------+------+---+---------------+---------+---------------+
|User ID |Gender|Age|EstimatedSalary|Purchased|features       |
+--------+------+---+---------------+---------+---------------+
|15624510|Male  |19 |19000          |0        |[19.0,19000.0] |
|15810944|Male  |35 |20000          |0        |[35.0,20000.0] |
|15668575|Female|26 |43000          |0        |[26.0,43000.0] |
|15603246|Female|27 |57000          |0        |[27.0,57000.0] |
|15804002|Male  |19 |76000          |0        |[19.0,76000.0] |
|15728773|Male  |27 |58000          |0        |[27.0,58000.0] |
|15598044|Female|27 |84000          |0        |[27.0,84000.0] |
|15694829|Female|32 |150000         |1        |[32.0,150000.0]|
|15600575|Male  |25 |33000          |0        |[25.0,33000.0] |
|15727311|Female|35 |65000          |0        |[35.0,65000.0] |
+--------+------+---+---------------+---------+---------------+
only showing top 10 rows



## Normalizing Features

In [143]:
# Rescale every feature to the [0, 1] range (MinMaxScaler defaults).
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the min/max statistics. Consider making the scaler a pipeline stage so
# it is fit on the training split only.
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')

# Remove a stale 'scaledFeatures' column (no-op on the first run) so that
# re-executing this cell does not fail on an already-existing output column.
data = data.drop('scaledFeatures')
scalerModel = scaler.fit(data)
data = scalerModel.transform(data)
data.show(10, False)


+--------+------+---+---------------+---------+---------------+------------------------------------------+
|User ID |Gender|Age|EstimatedSalary|Purchased|features       |scaledFeatures                            |
+--------+------+---+---------------+---------+---------------+------------------------------------------+
|15624510|Male  |19 |19000          |0        |[19.0,19000.0] |[0.023809523809523808,0.02962962962962963]|
|15810944|Male  |35 |20000          |0        |[35.0,20000.0] |[0.40476190476190477,0.037037037037037035]|
|15668575|Female|26 |43000          |0        |[26.0,43000.0] |[0.19047619047619047,0.2074074074074074]  |
|15603246|Female|27 |57000          |0        |[27.0,57000.0] |[0.21428571428571427,0.3111111111111111]  |
|15804002|Male  |19 |76000          |0        |[19.0,76000.0] |[0.023809523809523808,0.45185185185185184]|
|15728773|Male  |27 |58000          |0        |[27.0,58000.0] |[0.21428571428571427,0.31851851851851853] |
|15598044|Female|27 |84000          |

## Indexing the Label and Feature Columns

In [144]:
# Map the raw 'Purchased' values onto label indices for the classifiers.
labelIndexer = StringIndexer(
    inputCol='Purchased',
    outputCol='indexedLabel',
)

# Features with at most 4 distinct values are treated as categorical;
# all others are left as continuous.
featureIndexer = VectorIndexer(
    inputCol='scaledFeatures',
    outputCol='indexedFeatures',
    maxCategories=4,
)

In [145]:
# Hold out roughly 20% of the rows for testing. The fixed seed makes the
# split, and therefore every metric reported below, reproducible across runs.
train_set, test_set = data.randomSplit([0.8, 0.2], seed=42)

# 1. Classification Using Decision Trees

In [146]:
# Decision tree that learns from the indexed label and indexed feature columns.
decision_tree = DecisionTreeClassifier(
    featuresCol='indexedFeatures',
    labelCol='indexedLabel',
)

In [147]:
# Chain label indexing, feature indexing and the tree into a single estimator.
dt_pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        decision_tree,
    ]
)

In [148]:
# Fit all pipeline stages on the training split.
decisionTreeModel = dt_pipeline.fit(train_set)

## Decision Tree Model Evaluation

In [149]:
# Score the held-out test split with the fitted decision-tree pipeline.
dt_preds = decisionTreeModel.transform(test_set)

In [150]:
# Show actual vs. predicted labels side by side for the first 10 test rows.
(
    dt_preds
    .select('Age', 'EstimatedSalary', 'Purchased', 'prediction')
    .show(10, truncate=False)
)

+---+---------------+---------+----------+
|Age|EstimatedSalary|Purchased|prediction|
+---+---------------+---------+----------+
|58 |95000          |1        |1.0       |
|40 |71000          |1        |0.0       |
|48 |131000         |1        |1.0       |
|35 |73000          |0        |0.0       |
|23 |63000          |0        |0.0       |
|37 |74000          |0        |0.0       |
|42 |90000          |1        |1.0       |
|53 |104000         |1        |1.0       |
|48 |119000         |1        |1.0       |
|28 |59000          |0        |0.0       |
+---+---------------+---------+----------+
only showing top 10 rows



In [151]:
# Accuracy of the decision tree on the test split. The evaluator is named
# dt_eval (mirroring rf_eval used for the random forest) so that Python's
# built-in eval() is no longer shadowed for the rest of the session.
dt_eval = MulticlassClassificationEvaluator(
    labelCol='indexedLabel',
    predictionCol='prediction',
    metricName='accuracy',
)
acc = dt_eval.evaluate(dt_preds)
print('Accuracy: ', acc)


Accuracy:  0.9047619047619048


In [152]:
# The fitted tree is the final stage of the decision-tree pipeline; printing
# it gives a one-line summary (depth, node count, classes, features).
treeModel = decisionTreeModel.stages[-1]
print(treeModel)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_881c04f627b6, depth=5, numNodes=35, numClasses=2, numFeatures=2


In [153]:
# Confusion-matrix style counts: number of test rows per (actual, predicted) pair.
dt_preds.groupBy('Purchased', 'prediction').count().show()

+---------+----------+-----+
|Purchased|prediction|count|
+---------+----------+-----+
|        1|       0.0|    8|
|        0|       0.0|   50|
|        1|       1.0|   26|
+---------+----------+-----+



# 2. Classification Using Random Forests

In [154]:
# Random forest of 50 trees. The seed pins the forest's internal random
# sampling so repeated runs train the same model.
random_forest = RandomForestClassifier(
    labelCol='indexedLabel',
    featuresCol='indexedFeatures',
    numTrees=50,
    seed=42,
)

# Translate predicted indices back to the original 'Purchased' values.
# The labels are taken from an indexer fit on train_set, the same data the
# pipeline fits its own StringIndexer on, so the index -> label mapping is
# guaranteed to match the one the classifier was trained with.
labelConverter = IndexToString(
    inputCol='prediction',
    outputCol='predictedLabel',
    labels=labelIndexer.fit(train_set).labels,
)


In [155]:
# Same preprocessing as the decision-tree pipeline, followed by the random
# forest and the conversion of predicted indices back to labels.
pipeline2 = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        random_forest,
        labelConverter,
    ]
)

In [156]:
# Fit all random-forest pipeline stages on the training split.
RandomForestsModel = pipeline2.fit(train_set)

## Evaluation for Random Forests Classifier

In [157]:
# Score the held-out test split with the fitted random-forest pipeline.
rf_preds = RandomForestsModel.transform(test_set)

In [158]:
# Show actual vs. predicted labels side by side for the first 10 test rows.
(
    rf_preds
    .select('Age', 'EstimatedSalary', 'Purchased', 'predictedLabel')
    .show(10, truncate=False)
)

+---+---------------+---------+--------------+
|Age|EstimatedSalary|Purchased|predictedLabel|
+---+---------------+---------+--------------+
|58 |95000          |1        |1             |
|40 |71000          |1        |0             |
|48 |131000         |1        |1             |
|35 |73000          |0        |0             |
|23 |63000          |0        |0             |
|37 |74000          |0        |0             |
|42 |90000          |1        |1             |
|53 |104000         |1        |1             |
|48 |119000         |1        |1             |
|28 |59000          |0        |0             |
+---+---------------+---------+--------------+
only showing top 10 rows



In [159]:
# Accuracy of the random forest on the test split.
rf_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='indexedLabel',
    predictionCol='prediction',
)
rf_acc = rf_eval.evaluate(rf_preds)
print('Accuracy: ', rf_acc)

Accuracy:  0.9047619047619048


In [160]:
# Confusion-matrix style counts: number of test rows per (actual, predicted) pair.
rf_preds.groupBy('Purchased', 'prediction').count().show()

+---------+----------+-----+
|Purchased|prediction|count|
+---------+----------+-----+
|        1|       0.0|    6|
|        0|       0.0|   48|
|        1|       1.0|   28|
|        0|       1.0|    2|
+---------+----------+-----+



In [161]:
# Stage index 2 is the fitted random forest (after the label and feature indexers).
print(RandomForestsModel.stages[2])

RandomForestClassificationModel: uid=RandomForestClassifier_0170f4e3cba8, numTrees=50, numClasses=2, numFeatures=2
