# Create the Spark Session Object

In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present),
# registered under the application name 'log_reg'.
spark = (
    SparkSession.builder
    .appName('log_reg')
    .getOrCreate()
)

# Read the dataset 

In [2]:
 # Load the CSV: the first row supplies the column names and Spark infers
 # each column's type from the data.
 df = spark.read.csv(
     './Data/Log_Reg_dataset.csv',
     header=True,
     inferSchema=True,
 )

#  Exploratory Data Analysis

In [3]:
# Shape of the dataset as a (row count, column count) tuple
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))


(20000, 6)


In [4]:
# Check the inferred data type of every column.
# Country and Platform are strings and will need encoding before modelling.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [5]:
 # Preview the first 10 rows of the raw data
 df.show(n=10)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
|   Brazil| 32|             0|  Google|               3|     0|
|   Brazil| 32|             0|  Google|               6|     0|
|Indonesia| 27|             0|  Google|               9|     0|
|Indonesia| 32|             0|   Yahoo|               2|     0|
|Indonesia| 31|             1|    Bing|              16|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 10 rows



In [6]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean/stddev are null for the string columns.
df.describe().show()


+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



In [7]:
# Frequency of each value in the categorical columns and in the target
for column in ('Country', 'Platform', 'Status'):
    df.groupBy(column).count().show()

# Per-platform averages of every numeric column
df.groupBy('Platform').mean().show()


+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|    Bing| 4360|
|  Google| 5781|
+--------+-----+

+------+-----+
|Status|count|
+------+-----+
|     1|10000|
|     0|10000|
+------+-----+

+--------+------------------+-------------------+---------------------+------------------+
|Platform|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|       avg(Status)|
+--------+------------------+-------------------+---------------------+------------------+
|   Yahoo|28.569226087838523| 0.5094837204584644|    9.599655137437875|0.5071508266558474|
|    Bing| 28.68394495412844| 0.4720183486238532|    9.114908256880733|0.4559633027522936|
|  Google|28.380038055699707| 0.5149628092025601|    9.804878048780488|0.5210171250648676|
+--------+------------------+-------------------+---------------------+------------------+



# Feature Engineering

In [8]:
# Feature engineering, step 1: turn the categorical columns into numbers so
# that all inputs can later be combined into a single feature vector.
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

# Platform is a string column: map each distinct platform to a numeric index
# stored in Platform_Num.
Platform_indexer = StringIndexer(
    inputCol="Platform",
    outputCol="Platform_Num",
).fit(df)
df = Platform_indexer.transform(df)
df.show(n=3, truncate=False)

+-------+---+--------------+--------+----------------+------+------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|
+-------+---+--------------+--------+----------------+------+------------+
|India  |41 |1             |Yahoo   |21              |1     |0.0         |
|Brazil |28 |1             |Yahoo   |5               |0     |0.0         |
|Brazil |40 |0             |Google  |3               |0     |1.0         |
+-------+---+--------------+--------+----------------+------+------------+
only showing top 3 rows



In [9]:
# One-hot encode the platform index into a sparse vector (Platform_Vector)
from pyspark.ml.feature import OneHotEncoder

# dropLast=False keeps one vector slot per category instead of dropping the last one
Platform_encoder = OneHotEncoder(
    inputCol="Platform_Num",
    outputCol="Platform_Vector",
    dropLast=False,
)
ohe = Platform_encoder.fit(df)
df = ohe.transform(df)

In [10]:
df.show(n=3, truncate=False)
# How to read a sparse vector such as (3,[0],[1.0]):
#   3      -> length of the vector
#   [0]    -> position(s) holding a non-zero value
#   [1.0]  -> the value(s) stored at those positions

+-------+---+--------------+--------+----------------+------+------------+---------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|Platform_Vector|
+-------+---+--------------+--------+----------------+------+------------+---------------+
|India  |41 |1             |Yahoo   |21              |1     |0.0         |(3,[0],[1.0])  |
|Brazil |28 |1             |Yahoo   |5               |0     |0.0         |(3,[0],[1.0])  |
|Brazil |40 |0             |Google  |3               |0     |1.0         |(3,[1],[1.0])  |
+-------+---+--------------+--------+----------------+------+------------+---------------+
only showing top 3 rows



In [11]:
# Map each distinct country to a numeric index stored in Country_Num
country_indexer = StringIndexer(
    inputCol="Country",
    outputCol="Country_Num",
).fit(df)
df = country_indexer.transform(df)

In [12]:
# Compare the frequency table of the original labels with that of the
# assigned indices, most frequent first.
for column in ('Country', 'Country_Num'):
    (
        df.groupBy(column)
        .count()
        .orderBy('count', ascending=False)
        .show(n=5, truncate=False)
    )

+---------+-----+
|Country  |count|
+---------+-----+
|Indonesia|12178|
|India    |4018 |
|Brazil   |2586 |
|Malaysia |1218 |
+---------+-----+

+-----------+-----+
|Country_Num|count|
+-----------+-----+
|0.0        |12178|
|1.0        |4018 |
|2.0        |2586 |
|3.0        |1218 |
+-----------+-----+



In [13]:
# One-hot encode the country index; dropLast=False keeps one slot per country
country_encoder = OneHotEncoder(
    inputCol="Country_Num",
    outputCol="Country_Vector",
    dropLast=False,
)
ohe = country_encoder.fit(df)
df = ohe.transform(df)

# Show the label, its index and the resulting sparse vector side by side
df.select('Country', 'Country_Num', 'Country_Vector').show(n=3, truncate=False)

+-------+-----------+--------------+
|Country|Country_Num|Country_Vector|
+-------+-----------+--------------+
|India  |1.0        |(4,[1],[1.0]) |
|Brazil |2.0        |(4,[2],[1.0]) |
|Brazil |2.0        |(4,[2],[1.0]) |
+-------+-----------+--------------+
only showing top 3 rows



In [14]:
# Frequency of each one-hot country vector, most frequent first
(
    df.groupBy('Country_Vector')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+--------------+-----+
|Country_Vector|count|
+--------------+-----+
|(4,[0],[1.0]) |12178|
|(4,[1],[1.0]) |4018 |
|(4,[2],[1.0]) |2586 |
|(4,[3],[1.0]) |1218 |
+--------------+-----+



In [15]:
# Select the columns used as model inputs and combine them into a single
# vector column named "features".
df_assembler = VectorAssembler(
    inputCols=[
        'Platform_Vector',
        'Country_Vector',
        'Age',
        'Repeat_Visitor',
        'Web_pages_viewed',
    ],
    outputCol="features",
)
df = df_assembler.transform(df)

In [16]:
 # Confirm the index, one-hot vector and assembled "features" columns were added
 df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- Platform_Num: double (nullable = false)
 |-- Platform_Vector: vector (nullable = true)
 |-- Country_Num: double (nullable = false)
 |-- Country_Vector: vector (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
 # Preview the assembled feature vectors next to the target column
 df.select('features', 'Status').show(n=10, truncate=False)

+----------------------------------------+------+
|features                                |Status|
+----------------------------------------+------+
|(10,[0,4,7,8,9],[1.0,1.0,41.0,1.0,21.0])|1     |
|(10,[0,5,7,8,9],[1.0,1.0,28.0,1.0,5.0]) |0     |
|(10,[1,5,7,9],[1.0,1.0,40.0,3.0])       |0     |
|(10,[2,3,7,8,9],[1.0,1.0,31.0,1.0,15.0])|1     |
|(10,[1,6,7,9],[1.0,1.0,32.0,15.0])      |1     |
|(10,[1,5,7,9],[1.0,1.0,32.0,3.0])       |0     |
|(10,[1,5,7,9],[1.0,1.0,32.0,6.0])       |0     |
|(10,[1,3,7,9],[1.0,1.0,27.0,9.0])       |0     |
|(10,[0,3,7,9],[1.0,1.0,32.0,2.0])       |0     |
|(10,[2,3,7,8,9],[1.0,1.0,31.0,1.0,16.0])|1     |
+----------------------------------------+------+
only showing top 10 rows



In [18]:
 # Keep only what the model needs: the feature vector and the label
 model_df = df.select('features', 'Status')

## Split the dataset 

In [19]:
# 75/25 train/test split. The seed is fixed so the split (and therefore the
# trained model and every metric derived from it) is the same on each run
# instead of changing with every execution of the cell.
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)
print(training_df.count())

14895


In [20]:
 # Class balance of the training split
 training_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 7457|
|     0| 7438|
+------+-----+



In [21]:
# Number of rows held out for testing
print(test_df.count())

5105


In [22]:
 # Class balance of the test split
 test_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 2543|
|     0| 2562|
+------+-----+



## Build and train the model 

In [23]:
from pyspark.ml.classification import LogisticRegression

In [25]:
# Fit a logistic regression on the training split; 'Status' is the label and
# the inputs are read from the default "features" column.
log_reg = LogisticRegression(
    labelCol='Status',
).fit(training_df)

In [27]:
# Score the training split and keep the per-row predictions DataFrame
train_summary = log_reg.evaluate(training_df)
train_results = train_summary.predictions

In [28]:
# Training rows that are positive and were predicted positive, shown with
# the predicted class probabilities.
is_true_positive = (train_results['Status'] == 1) & (train_results['prediction'] == 1)
(
    train_results
    .filter(is_true_positive)
    .select('Status', 'prediction', 'probability')
    .show(n=10, truncate=False)
)

+------+----------+----------------------------------------+
|Status|prediction|probability                             |
+------+----------+----------------------------------------+
|1     |1.0       |[0.2540780514504137,0.7459219485495864] |
|1     |1.0       |[0.13802880928765998,0.8619711907123401]|
|1     |1.0       |[0.07000976293178146,0.9299902370682186]|
|1     |1.0       |[0.07000976293178146,0.9299902370682186]|
|1     |1.0       |[0.07000976293178146,0.9299902370682186]|
|1     |1.0       |[0.07000976293178146,0.9299902370682186]|
|1     |1.0       |[0.07000976293178146,0.9299902370682186]|
|1     |1.0       |[0.03418057622497152,0.9658194237750285]|
|1     |1.0       |[0.03418057622497152,0.9658194237750285]|
|1     |1.0       |[0.03418057622497152,0.9658194237750285]|
+------+----------+----------------------------------------+
only showing top 10 rows



## Evaluation 

In [30]:
# Score the held-out test split and inspect the columns the model added
test_summary = log_reg.evaluate(test_df)
results = test_summary.predictions
results.printSchema()


root
 |-- features: vector (nullable = true)
 |-- Status: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [31]:
 # Actual label next to the predicted label for the first 10 test rows
 results.select('Status', 'prediction').show(n=10, truncate=False)

+------+----------+
|Status|prediction|
+------+----------+
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|1     |0.0       |
|1     |1.0       |
|1     |1.0       |
|1     |1.0       |
|1     |1.0       |
|1     |1.0       |
|1     |1.0       |
+------+----------+
only showing top 10 rows



## Confusion matrix 

In [33]:
# Confusion-matrix cells on the test predictions.
# Status is the actual label, prediction is the model output.
actual = results['Status']
predicted = results['prediction']

tp = results.filter((actual == 1) & (predicted == 1)).count()  # true positives
tn = results.filter((actual == 0) & (predicted == 0)).count()  # true negatives
fp = results.filter((actual == 0) & (predicted == 1)).count()  # false positives
fn = results.filter((actual == 1) & (predicted == 0)).count()  # false negatives

In [36]:
# Print the four counts, one per line, in the order tp, tn, fp, fn
for cell_count in (tp, tn, fp, fn):
    print(cell_count)

2377
2400
162
166


In [None]:
# Accuracy = correctly classified test rows / all test rows.
# The confusion-matrix counts are stored in `tp` and `tn`; the previously
# referenced names `true_postives` / `true_negatives` were never defined and
# raised a NameError.
accuracy = float((tp + tn) / results.count())
print(accuracy)
