In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, trim, lower
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import DoubleType

In [5]:
# Initialize Spark session
# This sets up the entry point to use Spark SQL and ML features
spark = SparkSession.builder \
    .appName("TrafficPrediction") \
    .getOrCreate() # Naming the Spark application

In [6]:
# Load the CSV file into a Spark DataFrame with header and inferSchema to detect data types
df = spark.read.csv("/content/Traffic.csv", header=True, inferSchema=True)
print("Total rows before processing:", df.count())  # Print number of rows before cleaning

# Clean 'Traffic Situation' column by trimming whitespaces and converting to lowercase
# This ensures consistent formatting for label mapping
df = df.withColumn("Traffic Situation", trim(lower(col("Traffic Situation"))))

# Show all distinct values in 'Traffic Situation' to verify consistency
df.select("Traffic Situation").distinct().show(truncate=False)

# Map text labels to numeric values:
# 'low' -> 0, 'moderate' -> 1, 'heavy' -> 2, anything else -> None
df = df.withColumn(
    "Traffic Situation",
    when(col("Traffic Situation") == "low", 0)
    .when(col("Traffic Situation") == "moderate", 1)
    .when(col("Traffic Situation") == "heavy", 2)
    .otherwise(None)
)

# Remove any rows where mapping resulted in null values
df = df.dropna(subset=["Traffic Situation"])
print("Rows after mapping:", df.count())  # Print number of rows after filtering invalid labels

# Convert 'Traffic Situation' column to DoubleType as required for regression model training
df = df.withColumn("Traffic Situation", col("Traffic Situation").cast(DoubleType()))

# Define feature columns to be used for prediction
feature_cols = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total"]

# Assemble multiple feature columns into a single 'features' vector column (required by MLlib)
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Randomly split data into training and testing sets (80% train, 20% test)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
print("Train rows:", train_df.count())  # Number of rows in training data
print("Test rows:", test_df.count())    # Number of rows in testing data

# Initialize Linear Regression model using features and label columns
lr = LinearRegression(featuresCol="features", labelCol="Traffic Situation")
model = lr.fit(train_df)  # Train the model using training data

# Use trained model to predict traffic situation on test dataset
predictions = model.transform(test_df)

# Convert continuous predictions to class labels:
# < 0.5 => 0 (low), 0.5 - 1.5 => 1 (moderate), >= 1.5 => 2 (heavy)
predictions = predictions.withColumn(
    "Predicted Traffic Situation",
    when(col("prediction") < 0.5, 0)
    .when((col("prediction") >= 0.5) & (col("prediction") < 1.5), 1)
    .otherwise(2)
)

# Display actual vs predicted traffic situations along with feature vector
predictions.select("features", "Traffic Situation", "prediction", "Predicted Traffic Situation").show(10, truncate=False)

Total rows before processing: 2976
+-----------------+
|Traffic Situation|
+-----------------+
|low              |
|normal           |
|high             |
|heavy            |
+-----------------+

Rows after mapping: 986
Train rows: 827
Test rows: 159
+---------------------------+-----------------+--------------------+---------------------------+
|features                   |Traffic Situation|prediction          |Predicted Traffic Situation|
+---------------------------+-----------------+--------------------+---------------------------+
|[10.0,1.0,13.0,7.0,31.0]   |0.0              |-0.14655629604544473|0                          |
|[80.0,6.0,12.0,6.0,104.0]  |0.0              |0.7002160159603095  |1                          |
|[18.0,1.0,0.0,10.0,29.0]   |0.0              |-0.27220584657717933|0                          |
|[177.0,56.0,12.0,4.0,249.0]|2.0              |2.158581742012165   |2                          |
|[12.0,1.0,0.0,12.0,25.0]   |0.0              |-0.2979712428880069 |0 

In [7]:
# Stop the Spark session after completion to free up resources
spark.stop()

In [9]:
"""
📌 Concept Notes
🔷 1. PySpark
Distributed computing framework for handling big data.

SparkSession: Entry point to use DataFrame and MLlib.

🔷 2. Data Preprocessing
trim() & lower(): Clean string columns for consistency.

when() & col(): Used for conditional value transformation.

dropna(): Removes rows with missing data.

🔷 3. Label Encoding
String categories converted to numbers:

"low" → 0

"moderate" / "normal" → 1

"heavy" / "high" → 2

Required for ML algorithms which work with numerical data only.

🔷 4. VectorAssembler
Combines multiple feature columns into a single vector.

Required input format for Spark MLlib models.

🔷 5. Linear Regression (MLlib)
Supervised learning algorithm.

Used here to predict numeric traffic levels (0, 1, 2).

featuresCol: Input features vector.

labelCol: Target variable (Traffic Situation).

🔷 6. Prediction Mapping
Model prediction is a continuous value.

Mapped to categories using thresholding:

< 0.5 → 0 (Low traffic)

0.5 to <1.5 → 1 (Moderate traffic)

>= 1.5 → 2 (Heavy traffic)

📊 Dataset Description

Column Name	Description
CarCount	Number of cars observed at a point
BikeCount	Number of bikes observed
BusCount	Number of buses observed
TruckCount	Number of trucks observed
Total	Total vehicles (sum of above)
Traffic Situation	Traffic level (low / normal / high / etc.)
🟨 Traffic Situation Labels

Label	Meaning
0	Low Traffic
1	Moderate / Normal Traffic
2	Heavy / High Traffic

"""

'\n📌 Concept Notes\n🔷 1. PySpark\nDistributed computing framework for handling big data.\n\nSparkSession: Entry point to use DataFrame and MLlib.\n\n🔷 2. Data Preprocessing\ntrim() & lower(): Clean string columns for consistency.\n\nwhen() & col(): Used for conditional value transformation.\n\ndropna(): Removes rows with missing data.\n\n🔷 3. Label Encoding\nString categories converted to numbers:\n\n"low" → 0\n\n"moderate" / "normal" → 1\n\n"heavy" / "high" → 2\n\nRequired for ML algorithms which work with numerical data only.\n\n🔷 4. VectorAssembler\nCombines multiple feature columns into a single vector.\n\nRequired input format for Spark MLlib models.\n\n🔷 5. Linear Regression (MLlib)\nSupervised learning algorithm.\n\nUsed here to predict numeric traffic levels (0, 1, 2).\n\nfeaturesCol: Input features vector.\n\nlabelCol: Target variable (Traffic Situation).\n\n🔷 6. Prediction Mapping\nModel prediction is a continuous value.\n\nMapped to categories using thresholding:\n\n< 0.5 → 

In [None]:
"""
VIVA QUESTIONS
🔹 Basic Questions
1. What is PySpark?
→ PySpark is the Python API for Apache Spark, used for big data processing and machine learning in a distributed environment.

2. What is a SparkSession?
→ SparkSession is the entry point to Spark functionalities, including DataFrame and MLlib operations.

3. What is the purpose of VectorAssembler?
→ It combines multiple feature columns into a single vector column required by MLlib models.

🔹 Data Preprocessing
4. Why did you use trim() and lower() on the 'Traffic Situation' column?
→ To remove extra spaces and convert to lowercase for consistent string matching.

5. Why did we map traffic labels to numbers?
→ ML models in PySpark require numerical input, so categorical labels must be encoded.

6. Why was dropna() used?
→ To remove rows with missing or unmapped traffic labels.

🔹 Model and Training
7. Why did you choose Linear Regression for this task?
→ It was used to predict traffic level as a continuous variable, later mapped to categories.

8. What is the role of labelCol and featuresCol in MLlib?
→ labelCol is the target column; featuresCol contains the input features as a vector.

9. What type of learning is this?
→ Supervised learning (Regression).

🔹 Output Interpretation
10. What does the prediction column represent?
→ It gives the continuous output from the linear regression model.

11. What do 0, 1, and 2 mean in 'Predicted Traffic Situation'?
→

0 → Low Traffic

1 → Moderate/Normal Traffic

2 → Heavy/High Traffic

12. How did you map continuous predictions to categories?
→ Using threshold ranges:

< 0.5 → 0

0.5 to <1.5 → 1

>= 1.5 → 2

🔹 Additional/Conceptual
13. Why not use classification instead of regression?
→ Regression was used here for simplicity. Classification could also be used for better label prediction.

14. Can this model be improved? How?
→ Yes, by using classification algorithms (like Decision Tree, Random Forest), better feature selection, or hyperparameter tuning.

15. What are the advantages of using Spark MLlib?
→ It handles large datasets efficiently using distributed computing, supports pipelines, and is scalable.

"""