# Logistic Regression in pyspark

In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('logreg').getOrCreate()

# Load dataset

In [6]:
df = spark.read.csv(r'logreg.csv', header=True, inferSchema=True)

In [7]:
df.show(5)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 5 rows



In [8]:
df.count(),len(df.columns)

(20000, 6)

In [9]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [10]:
df.describe().show()

+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



# Converting categorical variables to features

## Use LabelEncoding -> OneHotEncoding

In [11]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

In [12]:
lbl_enc = StringIndexer(inputCol='Country',outputCol='country_labels').fit(df)

In [13]:
df = lbl_enc.transform(df)

In [15]:
df.show(5)

+---------+---+--------------+--------+----------------+------+--------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|country_labels|
+---------+---+--------------+--------+----------------+------+--------------+
|    India| 41|             1|   Yahoo|              21|     1|           1.0|
|   Brazil| 28|             1|   Yahoo|               5|     0|           2.0|
|   Brazil| 40|             0|  Google|               3|     0|           2.0|
|Indonesia| 31|             1|    Bing|              15|     1|           0.0|
| Malaysia| 32|             0|  Google|              15|     1|           3.0|
+---------+---+--------------+--------+----------------+------+--------------+
only showing top 5 rows



In [20]:
lbl_enc = StringIndexer(inputCol='Platform', outputCol='Platform_labels').fit(df)

In [21]:
df = lbl_enc.transform(df)

In [23]:
ohe = OneHotEncoder(inputCol='country_labels',outputCol='country_vector').fit(df)

In [24]:
df = ohe.transform(df)

In [25]:
ohe = OneHotEncoder(inputCol='Platform_labels',outputCol='platform_vector').fit(df)

In [28]:
df = ohe.transform(df)

In [32]:
vec_asm = VectorAssembler(inputCols=['country_vector', 'Repeat_Visitor', 'Web_pages_viewed', 'Status', 'platform_vector'], outputCol='features')

In [33]:
df = vec_asm.transform(df)

In [38]:
df = df.select('features','Status')

# Splitting Dataset

In [41]:
train_df, test_df = df.randomSplit([0.8,0.2])

In [44]:
train_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 7984|
|     0| 7978|
+------+-----+



# Training Model

In [45]:
from pyspark.ml.classification import LogisticRegression

In [80]:
logreg = LogisticRegression(labelCol='Status').fit(df)

# Prediction and Evaluation

In [81]:
preds = logreg.evaluate(test_df).predictions

In [82]:
preds = preds.select('Status','prediction')

In [83]:
TP = preds.filter((preds['Status'] == 1) & (preds['prediction'] == 1)).count()

In [84]:
TN = preds.filter((preds['Status'] == 0) & (preds['prediction'] == 0)).count()

In [85]:
FP = preds.filter((preds['Status'] == 0)&(preds['prediction'] == 1)).count()

In [86]:
FN = preds.filter((preds['Status'] == 1)&(preds['prediction'] == 0)).count()

In [87]:
acc = (TP+TN)/(TP+TN+FP+FN)

In [93]:
acc

1.0

In [88]:
precision = TP/(TP+FP)

In [89]:
recall = TP/(TP+FN)

In [90]:
precision

1.0

In [91]:
recall

1.0