In [33]:
import os
os.environ["SPARK_HOME"] = "/Applications/spark"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"
os.environ["PYSPARK_PYTHON"] = "python"

In [34]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("pyspark-ml") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
    .getOrCreate()

26/01/24 23:40:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/24 23:40:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [35]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime


import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructType, 
    StructField, 
    IntegerType, 
    FloatType, 
    StringType, 
    DoubleType, 
    BooleanType
)
from pyspark.sql import DataFrame

#### 1.	Load the CSV file with header and infer schema.

In [36]:
data = spark.read.csv("../dataset/online_shoppers_intention.csv", header = True, inferSchema = True)

data.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)



#### 2.	Load the CSV again using an explicitly defined schema.

In [37]:
data_schema = StructType([
    StructField("Administrative", IntegerType()), 
    StructField("Administrative_Duration", DoubleType()), 
    StructField("Informational", IntegerType()), 
    StructField("Informational_Duration", DoubleType()), 
    StructField("ProductRelated", IntegerType()), 
    StructField("ProductRelated_Duration", DoubleType()), 
    StructField("BounceRates", DoubleType()), 
    StructField("ExitRates", DoubleType()), 
    StructField("PageValues", DoubleType()), 
    StructField("SpecialDay", DoubleType()),
    StructField("Month", StringType()), 
    StructField("OperatingSystems", IntegerType()), 
    StructField("Browser", IntegerType()),
    StructField("Region", IntegerType()), 
    StructField("TrafficType",IntegerType()), 
    StructField("VisitorType", StringType()), 
    StructField("Weekend", BooleanType()),
    StructField("Revenue", BooleanType())
])


data = spark.read.csv("../dataset/online_shoppers_intention.csv", schema = data_schema, header = True)

data.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)



#### 3.	Add a column ingestion_timestamp using current timestamp.

In [38]:
current_date = datetime.now().date()
print(f"Ingestion Date : {current_date}")

Ingestion Date : 2026-01-24


In [39]:
data = data.withColumn("ingestion_timestamp", F.lit(current_date))

data.select('ingestion_timestamp').show()

+-------------------+
|ingestion_timestamp|
+-------------------+
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
|         2026-01-24|
+-------------------+
only showing top 20 rows



#### 4.	Rename column Revenue to made_purchase.

In [40]:
data = data.withColumnRenamed("Revenue", "made_purchase")
data.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- made_purchase: boolean (nullable = true)
 |-- ingestion_timestamp: date (nullable = false)



#### 5. Convert Month to lowercase

In [41]:
data = data.withColumn("Month", F.lower(F.col("Month")))

data.select("Month").show() # all the entries are converted to lowercase characters

+-----+
|Month|
+-----+
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
|  feb|
+-----+
only showing top 20 rows



#### 6.	Drop records where VisitorType is null.

In [42]:
data.select("VisitorType").distinct().show()
# We can see only three different categories in VisitorType column 

+-----------------+
|      VisitorType|
+-----------------+
|      New_Visitor|
|            Other|
|Returning_Visitor|
+-----------------+



In [43]:
# Dropping th records where VisitorType is null
data = data[~F.col("VisitorType").isNull()]
# Currently we don't have any records with null entires     

#### 7.	Count total number of distinct regions.

In [44]:
data.select("Region").distinct().show()

print(f"Total number of distinct regions in the dataframe: {data.select("Region").distinct().count()}")

+------+
|Region|
+------+
|     1|
|     6|
|     3|
|     5|
|     9|
|     4|
|     8|
|     7|
|     2|
+------+

Total number of distinct regions in the dataframe: 9


#### 8. Cache the DataFrame and trigger an action

In [45]:
data.cache() # It is transformation technique, once we perform any action subsequently then all the transformations we have performed are executed till that point.


data.persist()


data.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------------+-------------------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|made_purchase|ingestion_timestamp|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------------+-------------------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|  feb|             

26/01/24 23:40:10 WARN CacheManager: Asked to cache already cached data.


#### Write the dataset as parquet partitioned by Month.

In [46]:
data.write.partitionBy("Month").mode("overwrite").parquet("../dataset/month_partition_online_shoppers_intention")

                                                                                

In [47]:
spark.stop()