In [1]:
import os

# Environment defaults for a local PySpark install.
# `setdefault` keeps any value that is already exported in the environment, so
# the notebook also runs on machines where Spark is not installed under
# /Applications/spark; the values below are only used as fallbacks.
SPARK_ENV_DEFAULTS = {
    "SPARK_HOME": "/Applications/spark",
    "PYSPARK_DRIVER_PYTHON": "jupyter",
    "PYSPARK_DRIVER_PYTHON_OPTS": "notebook",
    "PYSPARK_PYTHON": "python",
}
for env_name, env_default in SPARK_ENV_DEFAULTS.items():
    os.environ.setdefault(env_name, env_default)

In [2]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a new session
# registered under the application name 'pyspark-ml'.
session_builder = SparkSession.builder.appName('pyspark-ml')
spark = session_builder.getOrCreate()

# Last expression of the cell: render the session summary as the cell output.
spark

26/01/27 22:39:03 WARN Utils: Your hostname, Kavis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.24 instead (on interface en0)
26/01/27 22:39:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/27 22:39:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructField, 
    StructType, 
    IntegerType, 
    FloatType, 
    StringType,
)
from pyspark.sql import DataFrame
from pyspark.sql.functions import udf

In [4]:
# Load the month-partitioned Parquet dataset.
# Parquet files carry their own schema, so the CSV-oriented `header` and
# `inferSchema` options are intentionally not passed: they have no effect on a
# Parquet read and only suggest otherwise to the reader.
data = spark.read.parquet("../dataset/month_partition_online_shoppers_intention")

data.show()

                                                                                

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+-----------+----------+----------------+-------+------+-----------+-----------------+-------+-------------+-------------------+-----+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates| PageValues|SpecialDay|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|made_purchase|ingestion_timestamp|Month|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+-----------+----------+----------------+-------+------+-----------+-----------------+-------+-------------+-------------------+-----+
|             1|                   39.2|            2|                 120.8|             7|                   80.5|        0.0|       0.01|        0.0|       0.0|               

##### 1. Create TotalDuration by summing all duration columns

In [5]:
# TotalDuration = time spent on administrative + informational + product pages.
duration_cols = [
    "Administrative_Duration",
    "Informational_Duration",
    "ProductRelated_Duration",
]
total_duration = (
    F.col(duration_cols[0]) + F.col(duration_cols[1]) + F.col(duration_cols[2])
)
data = data.withColumn("TotalDuration", total_duration)

# Show the new column next to the three inputs it was built from.
cols = ["TotalDuration", *duration_cols]
data.select(*cols).show()

+------------------+-----------------------+----------------------+-----------------------+
|     TotalDuration|Administrative_Duration|Informational_Duration|ProductRelated_Duration|
+------------------+-----------------------+----------------------+-----------------------+
|             240.5|                   39.2|                 120.8|                   80.5|
|1811.5066669999999|                   89.6|                   0.0|            1721.906667|
|       856.5766667|                  204.2|                   0.0|            652.3766667|
|       710.0666667|                    0.0|                   0.0|            710.0666667|
|       968.6924242|                    0.0|                   0.0|            968.6924242|
|       3812.879013|            1013.056909|                 102.8|            2697.022104|
|436.17619049999996|                   86.6|                   0.0|            349.5761905|
|      1932.4289997|                 545.65|           204.3466667|            1

##### 2. Bucket PageValues into Low / Medium / High.

In [6]:
# Summary statistics (count, mean, stddev, min, max) for PageValues only.
data.describe('PageValues').show()

+-------+-----------------+
|summary|       PageValues|
+-------+-----------------+
|  count|            12330|
|   mean|5.889257862693592|
| stddev|18.56843660780653|
|    min|              0.0|
|    max|      361.7637419|
+-------+-----------------+



In [7]:
# Deciles of PageValues, used to pick the bucket boundaries.
# The 10th through 70th percentiles are all 0.0, i.e. at least 70% of the values
# are zero (zero, not null), which leads to the following buckets:
#   Bucket -> Low    where value == 0.0
#   Bucket -> Medium where value > 0 and value <= 3.07
#   Bucket -> High   where value > 3.07
deciles = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90]
percentile_row = data.select(
    F.percentile("PageValues", deciles).alias("percentileValues")
).first()

# Last expression of the cell: the list of decile values.
percentile_row[0]


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.060078087400001, 18.855502398000006]

In [8]:
# Bucket PageValues into three labels.
# Every branch is an explicit comparison, so a null (or negative) PageValues is
# left as a null bucket instead of falling through a catch-all `otherwise` and
# being labelled "High".
# The label strings are kept exactly as before (including lower-case 'low') so
# existing outputs and any downstream filters keep matching.
MEDIUM_UPPER_BOUND = 3.07  # just above the 80th percentile of PageValues (~3.06)
page_values = F.col('PageValues')

data = data.withColumn(
    "PageValuesBucket",
    F.when(page_values == 0.0, F.lit('low'))
    .when((page_values > 0) & (page_values <= MEDIUM_UPPER_BOUND), F.lit('Medium'))
    .when(page_values > MEDIUM_UPPER_BOUND, F.lit("High")),
)


data.select('PageValues', 'PageValuesBucket').show()


# Counts per bucket
data.groupBy("PageValuesBucket").count().show()

+-----------+----------------+
| PageValues|PageValuesBucket|
+-----------+----------------+
|        0.0|             low|
|204.0079491|            High|
|        0.0|             low|
|72.52283848|            High|
|106.2525169|            High|
|1.798669862|          Medium|
|60.43737784|            High|
|19.71529542|            High|
|        0.0|             low|
|        0.0|             low|
|        0.0|             low|
|        0.0|             low|
| 77.4579855|            High|
|26.13034871|            High|
|        0.0|             low|
|44.85638692|            High|
|        0.0|             low|
|        0.0|             low|
|20.94544525|            High|
|10.72117234|            High|
+-----------+----------------+
only showing top 20 rows

+----------------+-----+
|PageValuesBucket|count|
+----------------+-----+
|            High| 2464|
|             low| 9600|
|          Medium|  266|
+----------------+-----+



##### 3. Encode Month into a numeric month index.

In [9]:
# Preview: parse the month label with the "MMM" pattern, then format the
# resulting timestamp back as a two-digit month number ("MM").
month_as_number = F.from_unixtime(F.unix_timestamp(F.col("Month"), "MMM"), 'MM')
data.select(month_as_number).show()

+---------------------------------------------+
|from_unixtime(unix_timestamp(Month, MMM), MM)|
+---------------------------------------------+
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                           11|
|                                       

In [15]:
# Switch this session's datetime parser policy to LEGACY.
# NOTE(review): presumably needed so the "MMM" pattern used by the Month_Num
# cells accepts labels such as "june"; if so it must run before those cells on a
# fresh kernel — confirm.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [16]:
# Preview the distinct (Month, Month_Num) pairs without modifying `data`.
month_num_expr = F.from_unixtime(F.unix_timestamp(F.col("Month"), "MMM"), 'MM')
(
    data.withColumn('Month_Num', month_num_expr)
    .select('Month', 'Month_Num')
    .distinct()
    .show()
)

+-----+---------+
|Month|Month_Num|
+-----+---------+
|  nov|       11|
|  may|       05|
|  dec|       12|
|  mar|       03|
|  oct|       10|
|  sep|       09|
|  aug|       08|
|  jul|       07|
| june|       06|
|  feb|       02|
+-----+---------+



In [17]:
# Encode Month as a zero-padded month-number string ("01".."12").
# An explicit lookup is used instead of parsing the label with a datetime
# pattern, so this cell does not depend on the session's time-parser policy
# having been changed by another cell, and labels longer than three letters
# (e.g. "june") are handled by matching on their first three characters.
MONTH_NUMBERS = {
    "jan": "01", "feb": "02", "mar": "03", "apr": "04",
    "may": "05", "jun": "06", "jul": "07", "aug": "08",
    "sep": "09", "oct": "10", "nov": "11", "dec": "12",
}
# create_map takes alternating key, value literals.
month_lookup = F.create_map(
    *[F.lit(token) for pair in MONTH_NUMBERS.items() for token in pair]
)
month_key = F.lower(F.substring(F.col("Month"), 1, 3))

# Labels that are not in the table have no mapping (null under Spark's default,
# non-ANSI settings).
data = data.withColumn('Month_Num', month_lookup[month_key])


data.select(F.col("Month"), F.col("Month_Num")).show()

+-----+---------+
|Month|Month_Num|
+-----+---------+
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
|  nov|       11|
+-----+---------+
only showing top 20 rows



##### 4. Normalize BounceRates using min‑max scaling

In [18]:
# Compute max and min of BounceRates in a single Spark job and collect the
# result once (the aggregation was previously executed twice, once per value).
bounce_stats = data.select(F.max('BounceRates'), F.min('BounceRates')).first()
maxVal, minVal = bounce_stats[0], bounce_stats[1]

print(f"maxVal : {maxVal} and minVal : {minVal}")

maxVal : 0.2 and minVal : 0.0


In [19]:
def min_max_sampling(val):
    """Min-max scale a column to [0, 1] using the notebook-level minVal / maxVal.

    Built from native Spark column expressions rather than a Python UDF, so
    null inputs stay null instead of raising inside the UDF, and no
    row-by-row Python execution is needed.

    Args:
        val: a Column, or a column name as a string.

    Returns:
        A float Column holding (val - minVal) / (maxVal - minVal). If
        maxVal == minVal, every non-null input maps to 0.0 instead of
        dividing by zero.
    """
    column = F.col(val) if isinstance(val, str) else val
    value_range = maxVal - minVal
    if value_range == 0:
        # Constant column: there is no spread to scale by.
        scaled = F.when(column.isNotNull(), F.lit(0.0))
    else:
        scaled = (column - F.lit(minVal)) / F.lit(value_range)
    # Keep the float result type that the UDF version declared.
    return scaled.cast('float')


data = data.withColumn('scaledBounceRates', min_max_sampling(F.col('BounceRates')))
cols = ["BounceRates", "scaledBounceRates"]
data.select(*cols).show()

+-----------+-----------------+
|BounceRates|scaledBounceRates|
+-----------+-----------------+
|        0.0|              0.0|
|        0.0|              0.0|
|0.012121212|       0.06060606|
|        0.0|              0.0|
|        0.0|              0.0|
|        0.0|              0.0|
|        0.0|              0.0|
| 6.84932E-4|       0.00342466|
|        0.0|              0.0|
|        0.0|              0.0|
|        0.0|              0.0|
|        0.0|              0.0|
|        0.0|              0.0|
|0.015384615|       0.07692307|
|0.003773585|      0.018867925|
|        0.0|              0.0|
|        0.2|              1.0|
|        0.0|              0.0|
|       0.04|              0.2|
|0.021212121|        0.1060606|
+-----------+-----------------+
only showing top 20 rows



##### 5. Create interaction feature: ProductRelated * PageValues.

In [20]:
def interactionfeature(productrel, pageVal):
    """Interaction feature: the product of two columns, as a float Column.

    Uses native Spark column multiplication rather than a Python UDF, so a
    null in either input gives a null result instead of raising inside the
    UDF, and no row-by-row Python execution is needed.

    Args:
        productrel: a Column, or a column name as a string.
        pageVal: a Column, or a column name as a string.

    Returns:
        A float Column holding productrel * pageVal.
    """
    left = F.col(productrel) if isinstance(productrel, str) else productrel
    right = F.col(pageVal) if isinstance(pageVal, str) else pageVal
    # Keep the float result type that the UDF version declared.
    return (left * right).cast('float')


data = data.withColumn("InteractionFeat", interactionfeature(F.col('ProductRelated'), F.col('PageValues')))

data.select(["InteractionFeat", "ProductRelated", "PageValues"]).show()
    

+---------------+--------------+-----------+
|InteractionFeat|ProductRelated| PageValues|
+---------------+--------------+-----------+
|            0.0|             7|        0.0|
|      11628.453|            57|204.0079491|
|            0.0|            31|        0.0|
|       942.7969|            13|72.52283848|
|      2550.0603|            24|106.2525169|
|      138.49757|            77|1.798669862|
|       846.1233|            14|60.43737784|
|      1360.3553|            69|19.71529542|
|            0.0|            16|        0.0|
|            0.0|            12|        0.0|
|            0.0|            11|        0.0|
|            0.0|             0|        0.0|
|      1936.4496|            25| 77.4579855|
|      679.38904|            26|26.13034871|
|            0.0|            48|        0.0|
|      1211.1224|            27|44.85638692|
|            0.0|             0|        0.0|
|            0.0|            10|        0.0|
|      125.67267|             6|20.94544525|
|      407

##### 6. Index the VisitorType column with StringIndexer

In [21]:
# List the distinct visitor categories before indexing them.
data.select('VisitorType').dropDuplicates().show()

+-----------------+
|      VisitorType|
+-----------------+
|      New_Visitor|
|            Other|
|Returning_Visitor|
+-----------------+



In [22]:
from pyspark.ml.feature import StringIndexer

# Index VisitorType labels as numeric category indices. handleInvalid='error'
# makes the transform fail on labels that were not seen during fit.
strIndexer = StringIndexer(
    inputCol='VisitorType',
    outputCol='VisitorType_indexer',
    handleInvalid='error',
)

# Learn the label -> index mapping from the dataset.
model = strIndexer.fit(data)

# Display only: the indexed column is not written back to `data`.
visitor_indexed = model.transform(data)
visitor_indexed.select('VisitorType', 'VisitorType_indexer').show()

+-----------------+-------------------+
|      VisitorType|VisitorType_indexer|
+-----------------+-------------------+
|      New_Visitor|                1.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
|      New_Visitor|                1.0|
|Returning_Visitor|                0.0|
|      New_Visitor|                1.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
|      New_Visitor|                1.0|
|      New_Visitor|                1.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
|      New_Visitor|                1.0|
|Returning_Visitor|                0.0|
|      New_Visitor|                1.0|
|Returning_Visitor|                0.0|
|Returning_Visitor|                0.0|
+-----------------+-------------------+
only showing top 20 rows



##### 7. One‑hot encode Month.

In [23]:
# Month_Num holds month numbers as strings; convert the column to a float
# type so it can be passed to the encoder in the next step.
month_num_as_float = data['Month_Num'].cast(FloatType())
data = data.withColumn('Month_Num', month_num_as_float)

data.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- made_purchase: boolean (nullable = true)
 |-- ingestion_timestamp: date (nullable = true)
 |-- Month: string (nullable = true)
 |-- TotalDuration: double (nullable = true)
 |-- PageValuesBucket: string (nullable = false)
 |-- Month_Num: float (nullable = true)
 |

In [24]:
# Check which month numbers are actually present after the cast.
data.select('Month_Num').dropDuplicates().show()

+---------+
|Month_Num|
+---------+
|     11.0|
|      5.0|
|     12.0|
|      3.0|
|     10.0|
|      9.0|
|      8.0|
|      7.0|
|      6.0|
|      2.0|
+---------+



In [25]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the month. Month_Num values are fed to the encoder directly
# as category indices (e.g. 11.0 becomes position 11 of the sparse vector).
onehotencoder = OneHotEncoder(inputCol='Month_Num', outputCol='Month_OneHot')
model = onehotencoder.fit(data)

# Display only: the encoded column is not written back to `data`.
month_encoded = model.transform(data)
month_encoded.select('Month_Num', 'Month_OneHot').show()

+---------+---------------+
|Month_Num|   Month_OneHot|
+---------+---------------+
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
|     11.0|(12,[11],[1.0])|
+---------+---------------+
only showing top 20 rows



##### 9. Create a weighted session score (TODO: not implemented yet — the cell below only lists the available columns)

In [26]:
# Column names currently available on `data`.
data.schema.names

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates',
 'ExitRates',
 'PageValues',
 'SpecialDay',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType',
 'VisitorType',
 'Weekend',
 'made_purchase',
 'ingestion_timestamp',
 'Month',
 'TotalDuration',
 'PageValuesBucket',
 'Month_Num',
 'scaledBounceRates',
 'InteractionFeat']

##### 10. Drop Intermediate Feature Columns

In [None]:
# Demonstration only: which features count as "intermediate" has not been
# decided, so this just shows how to drop multiple columns at once. The result
# is displayed but not assigned back, so `data` itself is unchanged.
columns_to_drop = ['Administrative', 'Month_Num']
data.drop(*columns_to_drop)

DataFrame[Administrative_Duration: double, Informational: int, Informational_Duration: double, ProductRelated: int, ProductRelated_Duration: double, BounceRates: double, ExitRates: double, PageValues: double, SpecialDay: double, OperatingSystems: int, Browser: int, Region: int, TrafficType: int, VisitorType: string, Weekend: boolean, made_purchase: boolean, ingestion_timestamp: date, Month: string, TotalDuration: double, PageValuesBucket: string, scaledBounceRates: float, InteractionFeat: float]