In [1]:
#import SparkSession
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, named 'log_reg'
spark = (
    SparkSession.builder
    .appName('log_reg')
    .getOrCreate()
)

In [17]:
# Load the CSV: the first row holds the column names, column types are inferred
df = spark.read.csv(
    'Log_Reg_dataset.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
|   Brazil| 32|             0|  Google|               3|     0|
|   Brazil| 32|             0|  Google|               6|     0|
|Indonesia| 27|             0|  Google|               9|     0|
|Indonesia| 32|             0|   Yahoo|               2|     0|
|Indonesia| 31|             1|    Bing|              16|     1|
| Malaysia| 27|             1|  Google|              21|     1|
|Indonesia| 29|             1|   Yahoo|               9|     1|
|Indonesia| 33|             1|   Yahoo| 

In [18]:
# Report the dataset shape as (row count, column count)
dataset_shape = (df.count(), len(df.columns))
print("Shape of dataset", dataset_shape)

Shape of dataset (20000, 6)


In [19]:
# Print each column's name, inferred type and nullability for the loaded data
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [20]:
# Display the list of column names of the loaded DataFrame
df.columns

['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']

In [21]:
# Exploratory Data Analysis: count, mean, stddev, min and max for every column
summary_stats = df.describe()
summary_stats.show()

+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



In [22]:
# Number of rows per country
(
    df.groupBy('Country')
    .count()
    .show()
)

+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+



In [23]:
# Number of rows per search platform
(
    df.groupBy('Platform')
    .count()
    .show()
)

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|    Bing| 4360|
|  Google| 5781|
+--------+-----+



In [24]:
# Number of rows per Status value (the column later used as the model label)
(
    df.groupBy('Status')
    .count()
    .show()
)

+------+-----+
|Status|count|
+------+-----+
|     1|10000|
|     0|10000|
+------+-----+



In [25]:
from pyspark.ml.feature import StringIndexer

In [26]:
# Fit a string indexer on Platform and add the numeric index as Platform_Num
platform_indexer = StringIndexer(
    inputCol="Platform",
    outputCol="Platform_Num",
).fit(df)
df = platform_indexer.transform(df)
df.show()

+---------+---+--------------+--------+----------------+------+------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|
+---------+---+--------------+--------+----------------+------+------------+
|    India| 41|             1|   Yahoo|              21|     1|         0.0|
|   Brazil| 28|             1|   Yahoo|               5|     0|         0.0|
|   Brazil| 40|             0|  Google|               3|     0|         1.0|
|Indonesia| 31|             1|    Bing|              15|     1|         2.0|
| Malaysia| 32|             0|  Google|              15|     1|         1.0|
|   Brazil| 32|             0|  Google|               3|     0|         1.0|
|   Brazil| 32|             0|  Google|               6|     0|         1.0|
|Indonesia| 27|             0|  Google|               9|     0|         1.0|
|Indonesia| 32|             0|   Yahoo|               2|     0|         0.0|
|Indonesia| 31|             1|    Bing|              16|     1|         2.0|

In [27]:
from pyspark.ml.feature import OneHotEncoder

In [28]:
# One-hot encode the numeric platform index into a sparse vector column.
# Since Spark 3.0 OneHotEncoder is an Estimator that must be fitted before it
# can transform; in Spark 2.x it is a plain Transformer without fit(). Fit only
# when the method exists so this cell runs on both API generations.
platform_encoder = OneHotEncoder(inputCol="Platform_Num", outputCol="Platform_Num_Vector")
if hasattr(platform_encoder, "fit"):
    platform_encoder = platform_encoder.fit(df)
df = platform_encoder.transform(df)

In [29]:
# Fit a string indexer on Country and add the numeric index as Country_Num
country_indexer = StringIndexer(
    inputCol="Country",
    outputCol="Country_Num",
).fit(df)
df = country_indexer.transform(df)

In [30]:
# Preview each country next to the numeric index it was assigned
df.select('Country', 'Country_Num').show()

+---------+-----------+
|  Country|Country_Num|
+---------+-----------+
|    India|        1.0|
|   Brazil|        2.0|
|   Brazil|        2.0|
|Indonesia|        0.0|
| Malaysia|        3.0|
|   Brazil|        2.0|
|   Brazil|        2.0|
|Indonesia|        0.0|
|Indonesia|        0.0|
|Indonesia|        0.0|
| Malaysia|        3.0|
|Indonesia|        0.0|
|Indonesia|        0.0|
|Indonesia|        0.0|
|    India|        1.0|
|Indonesia|        0.0|
|Indonesia|        0.0|
|Indonesia|        0.0|
| Malaysia|        3.0|
|Indonesia|        0.0|
+---------+-----------+
only showing top 20 rows



In [31]:
# One-hot encode the numeric country index into a sparse vector column.
# Since Spark 3.0 OneHotEncoder is an Estimator that must be fitted before it
# can transform; in Spark 2.x it is a plain Transformer without fit(). Fit only
# when the method exists so this cell runs on both API generations.
country_encoder = OneHotEncoder(inputCol="Country_Num", outputCol="Country_Vector")
if hasattr(country_encoder, "fit"):
    country_encoder = country_encoder.fit(df)
df = country_encoder.transform(df)

In [32]:


# Preview the country, its numeric index and its one-hot vector (first 3 rows,
# untruncated). The index column is spelled 'Country_Num' exactly as it was
# created, so the lookup does not depend on Spark's case-insensitive column
# resolution and the printed header matches the real column name.
df.select(['Country', 'Country_Num', 'Country_Vector']).show(3, False)



+-------+-----------+--------------+
|Country|country_Num|Country_Vector|
+-------+-----------+--------------+
|India  |1.0        |(3,[1],[1.0]) |
|Brazil |2.0        |(3,[2],[1.0]) |
|Brazil |2.0        |(3,[2],[1.0]) |
+-------+-----------+--------------+
only showing top 3 rows



In [33]:
# Row count per country index, largest group first
(
    df.groupBy('Country_Num')
    .count()
    .orderBy('count', ascending=False)
    .show(5, truncate=False)
)

+-----------+-----+
|Country_Num|count|
+-----------+-----+
|0.0        |12178|
|1.0        |4018 |
|2.0        |2586 |
|3.0        |1218 |
+-----------+-----+



In [34]:


# Row count per one-hot country vector, largest group first
(
    df.groupBy('Country_Vector')
    .count()
    .orderBy('count', ascending=False)
    .show(5, truncate=False)
)



+--------------+-----+
|Country_Vector|count|
+--------------+-----+
|(3,[0],[1.0]) |12178|
|(3,[1],[1.0]) |4018 |
|(3,[2],[1.0]) |2586 |
|(3,[],[])     |1218 |
+--------------+-----+



In [35]:
from pyspark.ml.feature import VectorAssembler

In [36]:
# Preview the first 3 rows with all columns added so far
df.show(n=3)

+-------+---+--------------+--------+----------------+------+------------+-------------------+-----------+--------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|Platform_Num_Vector|Country_Num|Country_Vector|
+-------+---+--------------+--------+----------------+------+------------+-------------------+-----------+--------------+
|  India| 41|             1|   Yahoo|              21|     1|         0.0|      (2,[0],[1.0])|        1.0| (3,[1],[1.0])|
| Brazil| 28|             1|   Yahoo|               5|     0|         0.0|      (2,[0],[1.0])|        2.0| (3,[2],[1.0])|
| Brazil| 40|             0|  Google|               3|     0|         1.0|      (2,[1],[1.0])|        2.0| (3,[2],[1.0])|
+-------+---+--------------+--------+----------------+------+------------+-------------------+-----------+--------------+
only showing top 3 rows



In [38]:
# Combine the encoded categorical vectors and the numeric columns into a
# single 'features' vector column
feature_columns = [
    'Platform_Num_Vector',
    'Country_Vector',
    'Age',
    'Repeat_Visitor',
    'Web_pages_viewed',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = df_assembler.transform(df)

In [39]:


# Print the schema again to confirm the assembled 'features' column is present
df.printSchema()



root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- Platform_Num: double (nullable = false)
 |-- Platform_Num_Vector: vector (nullable = true)
 |-- Country_Num: double (nullable = false)
 |-- Country_Vector: vector (nullable = true)
 |-- features: vector (nullable = true)



In [41]:

# Preview the assembled feature vectors next to the label (10 rows, untruncated)
df.select('features', 'Status').show(10, truncate=False)



+-----------------------------------+------+
|features                           |Status|
+-----------------------------------+------+
|[1.0,0.0,0.0,1.0,0.0,41.0,1.0,21.0]|1     |
|[1.0,0.0,0.0,0.0,1.0,28.0,1.0,5.0] |0     |
|(8,[1,4,5,7],[1.0,1.0,40.0,3.0])   |0     |
|(8,[2,5,6,7],[1.0,31.0,1.0,15.0])  |1     |
|(8,[1,5,7],[1.0,32.0,15.0])        |1     |
|(8,[1,4,5,7],[1.0,1.0,32.0,3.0])   |0     |
|(8,[1,4,5,7],[1.0,1.0,32.0,6.0])   |0     |
|(8,[1,2,5,7],[1.0,1.0,27.0,9.0])   |0     |
|(8,[0,2,5,7],[1.0,1.0,32.0,2.0])   |0     |
|(8,[2,5,6,7],[1.0,31.0,1.0,16.0])  |1     |
+-----------------------------------+------+
only showing top 10 rows



In [42]:
# Keep only the feature vector and the label column for modelling
model_df = df.select('features', 'Status')

In [43]:
from pyspark.ml.classification import LogisticRegression

In [44]:
# Split the data 75% / 25% into training and test sets.
# A fixed seed keeps the split, and therefore every count and metric derived
# from it, repeatable across kernel restarts; without it each run samples
# different rows.
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [45]:
# Number of rows that landed in the training split
training_df.count()

14937

In [46]:
# Number of rows that landed in the test split
test_df.count()

5063

In [47]:
# Fit a logistic regression on the training split, with Status as the label
log_reg = LogisticRegression(
    featuresCol='features',
    labelCol='Status',
).fit(training_df)

In [48]:
# Score the training split and keep the per-row predictions
training_summary = log_reg.evaluate(training_df)
train_results = training_summary.predictions

In [49]:
# Show 10 training rows where actual and predicted label are both 1,
# together with their class probabilities (untruncated)
is_true_positive = (train_results['Status'] == 1) & (train_results['prediction'] == 1)
(
    train_results
    .filter(is_true_positive)
    .select('Status', 'prediction', 'probability')
    .show(10, truncate=False)
)

+------+----------+----------------------------------------+
|Status|prediction|probability                             |
+------+----------+----------------------------------------+
|1     |1.0       |[0.31049070635400494,0.689509293645995] |
|1     |1.0       |[0.17459623190439916,0.8254037680956008]|
|1     |1.0       |[0.17459623190439916,0.8254037680956008]|
|1     |1.0       |[0.09038303290316324,0.9096169670968368]|
|1     |1.0       |[0.09038303290316324,0.9096169670968368]|
|1     |1.0       |[0.09038303290316324,0.9096169670968368]|
|1     |1.0       |[0.09038303290316324,0.9096169670968368]|
|1     |1.0       |[0.09038303290316324,0.9096169670968368]|
|1     |1.0       |[0.04459398342158313,0.955406016578417] |
|1     |1.0       |[0.04459398342158313,0.955406016578417] |
+------+----------+----------------------------------------+
only showing top 10 rows



The probability at index 0 is for class 0, and the probability at index 1 is for class 1.

In [50]:
# Number of training rows where the actual and the predicted label are both 1
correct_preds = (
    train_results
    .filter((train_results['Status'] == 1) & (train_results['prediction'] == 1))
    .count()
)

In [51]:
# Number of actual positive rows (Status == 1) in the training split
training_df.filter(training_df.Status == 1).count()

7493

In [52]:
# Recall (true-positive rate) on the training dataset -- NOT overall accuracy:
# correctly predicted positives divided by all actual positives.
train_actual_positives = training_df.filter(training_df['Status'] == 1).count()
train_recall = float(correct_preds) / train_actual_positives
train_recall

0.9376751634859202

In [53]:
# Score the test split and preview actual vs. predicted labels (10 rows)
test_summary = log_reg.evaluate(test_df)
results = test_summary.predictions
results.select('Status', 'prediction').show(10, truncate=False)

+------+----------+
|Status|prediction|
+------+----------+
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|1     |0.0       |
|1     |0.0       |
|0     |1.0       |
+------+----------+
only showing top 10 rows



In [54]:
# Print the columns and types of the test-set prediction DataFrame
results.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Status: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [55]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [56]:
# Confusion-matrix cell counts for the test-set predictions.
# The spelling 'true_postives' is kept because later cells refer to that name.
is_actual_pos = results['Status'] == 1
is_actual_neg = results['Status'] == 0
is_pred_pos = results['prediction'] == 1
is_pred_neg = results['prediction'] == 0

true_postives = results.filter(is_actual_pos & is_pred_pos).count()
true_negatives = results.filter(is_actual_neg & is_pred_neg).count()
false_positives = results.filter(is_actual_neg & is_pred_pos).count()
false_negatives = results.filter(is_actual_pos & is_pred_neg).count()

In [57]:
# Print the four confusion-matrix counts, then their sum next to the total
# number of test rows so the two can be compared
for cell_count in (true_postives, true_negatives, false_positives, false_negatives):
    print(cell_count)
print(true_postives + true_negatives + false_positives + false_negatives)
print(results.count())

2349
2407
149
158
5063
5063


In [58]:


# Recall: share of actual positives that the model predicted as positive
actual_positives = true_postives + false_negatives
recall = float(true_postives) / actual_positives
print(recall)



0.9369764658954927


In [59]:
# Precision: share of predicted positives that are actually positive
predicted_positives = true_postives + false_positives
precision = float(true_postives) / predicted_positives
print(precision)

0.9403522818254604


In [60]:
# Accuracy on the test set: correctly classified rows divided by all rows.
# The numerator is converted to float BEFORE dividing (as in the recall and
# precision cells) so the ratio cannot be truncated by integer division on
# Python 2; wrapping the finished quotient in float() would be too late.
accuracy = float(true_postives + true_negatives) / results.count()
print(accuracy)

0.9393640134307722
