## Loading libraries

In [1]:
!pip install pyspark



## Loading data

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name 'Dataframe'.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Load the purchase records CSV: the first row supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    '/content/iphone_purchase_records.csv',
    header=True,
    inferSchema=True,
)

# Print the resulting column names and inferred types.
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Purchase Iphone: integer (nullable = true)



In [4]:
# Preview the first 20 rows of the loaded data (the defaults, spelled out).
df.show(n=20, truncate=True)

+------+---+------+---------------+
|Gender|Age|Salary|Purchase Iphone|
+------+---+------+---------------+
|  Male| 19| 19000|              0|
|  Male| 35| 20000|              0|
|Female| 26| 43000|              0|
|Female| 27| 57000|              0|
|  Male| 19| 76000|              0|
|  Male| 27| 58000|              0|
|Female| 27| 84000|              0|
|Female| 32|150000|              1|
|  Male| 25| 33000|              0|
|Female| 35| 65000|              0|
|Female| 26| 80000|              0|
|Female| 26| 52000|              0|
|  Male| 20| 86000|              0|
|  Male| 32| 18000|              0|
|  Male| 18| 82000|              0|
|  Male| 29| 80000|              0|
|  Male| 47| 25000|              1|
|  Male| 45| 26000|              1|
|  Male| 46| 28000|              1|
|Female| 48| 29000|              1|
+------+---+------+---------------+
only showing top 20 rows



## Handling Categorical Values

In [5]:
# List the column names of the loaded DataFrame.
df.columns

['Gender', 'Age', 'Salary', 'Purchase Iphone']

In [6]:
# Encode the categorical Gender column as a numeric index column.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Gender", outputCol="Gender_indexed")

# Make the cell safe to re-run: `df` is rebound below with Gender_indexed
# added, and StringIndexer refuses to write to an output column that already
# exists. Dropping a column that is not present is a no-op, so the first run
# is unaffected.
df_without_index = df.drop("Gender_indexed")
df = indexer.fit(df_without_index).transform(df_without_index)
df.show()

+------+---+------+---------------+--------------+
|Gender|Age|Salary|Purchase Iphone|Gender_indexed|
+------+---+------+---------------+--------------+
|  Male| 19| 19000|              0|           1.0|
|  Male| 35| 20000|              0|           1.0|
|Female| 26| 43000|              0|           0.0|
|Female| 27| 57000|              0|           0.0|
|  Male| 19| 76000|              0|           1.0|
|  Male| 27| 58000|              0|           1.0|
|Female| 27| 84000|              0|           0.0|
|Female| 32|150000|              1|           0.0|
|  Male| 25| 33000|              0|           1.0|
|Female| 35| 65000|              0|           0.0|
|Female| 26| 80000|              0|           0.0|
|Female| 26| 52000|              0|           0.0|
|  Male| 20| 86000|              0|           1.0|
|  Male| 32| 18000|              0|           1.0|
|  Male| 18| 82000|              0|           1.0|
|  Male| 29| 80000|              0|           1.0|
|  Male| 47| 25000|            

## Combining all features and separating labels

In [7]:
from pyspark.ml.feature import VectorAssembler

# Pack the three predictor columns into one vector column, added to `df`
# under the name "Independent Features".
featureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=['Age', 'Salary', 'Gender_indexed'],
)
output = featureassembler.transform(df)

In [8]:
# Preview only the assembled feature vectors.
features_only = output.select('Independent Features')
features_only.show()

+--------------------+
|Independent Features|
+--------------------+
|  [19.0,19000.0,1.0]|
|  [35.0,20000.0,1.0]|
|  [26.0,43000.0,0.0]|
|  [27.0,57000.0,0.0]|
|  [19.0,76000.0,1.0]|
|  [27.0,58000.0,1.0]|
|  [27.0,84000.0,0.0]|
| [32.0,150000.0,0.0]|
|  [25.0,33000.0,1.0]|
|  [35.0,65000.0,0.0]|
|  [26.0,80000.0,0.0]|
|  [26.0,52000.0,0.0]|
|  [20.0,86000.0,1.0]|
|  [32.0,18000.0,1.0]|
|  [18.0,82000.0,1.0]|
|  [29.0,80000.0,1.0]|
|  [47.0,25000.0,1.0]|
|  [45.0,26000.0,1.0]|
|  [46.0,28000.0,1.0]|
|  [48.0,29000.0,0.0]|
+--------------------+
only showing top 20 rows



In [9]:
# Preview the original columns alongside the assembled feature vector
# (first 20 rows, the defaults spelled out).
output.show(n=20, truncate=True)

+------+---+------+---------------+--------------+--------------------+
|Gender|Age|Salary|Purchase Iphone|Gender_indexed|Independent Features|
+------+---+------+---------------+--------------+--------------------+
|  Male| 19| 19000|              0|           1.0|  [19.0,19000.0,1.0]|
|  Male| 35| 20000|              0|           1.0|  [35.0,20000.0,1.0]|
|Female| 26| 43000|              0|           0.0|  [26.0,43000.0,0.0]|
|Female| 27| 57000|              0|           0.0|  [27.0,57000.0,0.0]|
|  Male| 19| 76000|              0|           1.0|  [19.0,76000.0,1.0]|
|  Male| 27| 58000|              0|           1.0|  [27.0,58000.0,1.0]|
|Female| 27| 84000|              0|           0.0|  [27.0,84000.0,0.0]|
|Female| 32|150000|              1|           0.0| [32.0,150000.0,0.0]|
|  Male| 25| 33000|              0|           1.0|  [25.0,33000.0,1.0]|
|Female| 35| 65000|              0|           0.0|  [35.0,65000.0,0.0]|
|Female| 26| 80000|              0|           0.0|  [26.0,80000.

In [10]:
# Keep only what the model needs: the feature vector and the label column.
model_columns = ["Independent Features", "Purchase Iphone"]
finalized_data = output.select(*model_columns)

finalized_data.show()

+--------------------+---------------+
|Independent Features|Purchase Iphone|
+--------------------+---------------+
|  [19.0,19000.0,1.0]|              0|
|  [35.0,20000.0,1.0]|              0|
|  [26.0,43000.0,0.0]|              0|
|  [27.0,57000.0,0.0]|              0|
|  [19.0,76000.0,1.0]|              0|
|  [27.0,58000.0,1.0]|              0|
|  [27.0,84000.0,0.0]|              0|
| [32.0,150000.0,0.0]|              1|
|  [25.0,33000.0,1.0]|              0|
|  [35.0,65000.0,0.0]|              0|
|  [26.0,80000.0,0.0]|              0|
|  [26.0,52000.0,0.0]|              0|
|  [20.0,86000.0,1.0]|              0|
|  [32.0,18000.0,1.0]|              0|
|  [18.0,82000.0,1.0]|              0|
|  [29.0,80000.0,1.0]|              0|
|  [47.0,25000.0,1.0]|              1|
|  [45.0,26000.0,1.0]|              1|
|  [46.0,28000.0,1.0]|              1|
|  [48.0,29000.0,0.0]|              1|
+--------------------+---------------+
only showing top 20 rows



## Train Test split and Model creation

In [11]:
from pyspark.ml.classification import LogisticRegression

# 75/25 train/test split. The fixed seed makes the split repeatable, so the
# fitted model and any downstream metrics do not change from one run of the
# notebook to the next.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name; `lr` is the fitted model,
# which is the name the following cells use for scoring.
lr_estimator = LogisticRegression(featuresCol='Independent Features', labelCol='Purchase Iphone')
lr = lr_estimator.fit(train_data)

In [12]:
# Score the held-out test rows with the fitted model `lr`; the result is
# stored in `predictions` for inspection and accuracy checking.
predictions = lr.transform(test_data)

In [13]:
# Show a five-row sample of the predictions. The limit is applied in Spark
# before converting to pandas, so only five rows are collected to the driver
# rather than the whole test DataFrame.
predictions.select('Purchase Iphone', 'Independent Features', 'rawPrediction', 'prediction', 'probability').limit(5).toPandas()

Unnamed: 0,Purchase Iphone,Independent Features,rawPrediction,prediction,probability
0,0,"[18.0, 82000.0, 1.0]","[5.882695050865474, -5.882695050865474]",0.0,"[0.9972204839161163, 0.002779516083883715]"
1,0,"[19.0, 19000.0, 1.0]","[8.458267028417154, -8.458267028417154]",0.0,"[0.9997879056097394, 0.00021209439026059762]"
2,0,"[19.0, 21000.0, 0.0]","[8.67212463011553, -8.67212463011553]",0.0,"[0.9998287345575892, 0.0001712654424107507]"
3,0,"[20.0, 74000.0, 1.0]","[5.683564338550912, -5.683564338550912]",0.0,"[0.996610117367442, 0.0033898826325580167]"
4,0,"[20.0, 82000.0, 0.0]","[5.625374075804308, -5.625374075804308]",0.0,"[0.9964077366188521, 0.003592263381147931]"


## Accuracy checking

In [14]:
# Accuracy = share of test rows whose predicted class equals the true label.
is_correct = predictions['Purchase Iphone'] == predictions.prediction
correct_count = predictions.filter(is_correct).count()
total_count = predictions.count()
accuracy = correct_count / float(total_count)
print("Accuracy : ",accuracy)
Accuracy :  0.7446808510638298

Accuracy :  0.8125
