<a href="https://colab.research.google.com/github/shrishaameenaa-cmd/Data_Processing_Challenge/blob/main/23BCS160_Real_Time_Data_Streaming_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install pyspark==3.5.1 kafka-python


Collecting kafka-python
  Downloading kafka_python-2.2.15-py2.py3-none-any.whl.metadata (10.0 kB)
Downloading kafka_python-2.2.15-py2.py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.8/309.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-2.2.15


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import random


In [3]:
spark = SparkSession.builder \
    .appName("RealTimeDataSimulation") \
    .getOrCreate()


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, avg

# Initialize Spark
spark = SparkSession.builder.appName("StreamingExample").getOrCreate()

# Sample Data
data = [
    (1, 30.56, 60.2),
    (2, 27.34, 55.7),
    (3, 32.15, 58.4)
]
columns = ["sensor_id", "temperature", "humidity"]

df = spark.createDataFrame(data, columns)

# ❌ Wrong: round() from Python
# df = df.withColumn("rounded_temp", round(df["temperature"], 2))

# ✅ Correct: round() from pyspark.sql.functions
df = df.withColumn("rounded_temp", round(col("temperature"), 2))
df.show()


+---------+-----------+--------+------------+
|sensor_id|temperature|humidity|rounded_temp|
+---------+-----------+--------+------------+
|        1|      30.56|    60.2|       30.56|
|        2|      27.34|    55.7|       27.34|
|        3|      32.15|    58.4|       32.15|
+---------+-----------+--------+------------+



In [6]:
!pip install pyspark




In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, avg, lit
import pandas as pd
import random, time


In [8]:
spark = SparkSession.builder.appName("RealTimeStreamingSim").getOrCreate()


In [10]:
from pyspark.sql.functions import col, round as spark_round, avg


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round as spark_round, avg
import pandas as pd, random, time

spark = SparkSession.builder.appName("RealTimeStreamingSim").getOrCreate()



In [13]:
def generate_sensor_data():
    df = pd.DataFrame({
        "sensor_id": [random.randint(1,5) for _ in range(5)],
        "temperature": [round(random.uniform(25,35),2) for _ in range(5)],
        "humidity": [round(random.uniform(40,70),2) for _ in range(5)],
        "timestamp": [int(time.time()) for _ in range(5)]
    })
    return df


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round as spark_round, avg
import pandas as pd
import random, time

spark = SparkSession.builder.appName("RealTimeStreamingSim").getOrCreate()



In [16]:
def generate_sensor_data():
    data = {
        "sensor_id": [random.randint(1,5) for _ in range(5)],
        "temperature": [round(random.uniform(25,35),2) for _ in range(5)],
        "humidity": [round(random.uniform(40,70),2) for _ in range(5)],
        "timestamp": [int(time.time()) for _ in range(5)]
    }
    return pd.DataFrame(data)


In [1]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round as spark_round, avg
import pandas as pd
import random, time

spark = SparkSession.builder.appName("RealTimeStreamingSim").getOrCreate()


In [2]:

def generate_sensor_data():
    data = {
        "sensor_id": [random.randint(1,5) for _ in range(5)],
        "temperature": [round(random.uniform(25,35),2) for _ in range(5)],
        "humidity": [round(random.uniform(40,70),2) for _ in range(5)],
        "timestamp": [int(time.time()) for _ in range(5)]
    }
    return pd.DataFrame(data)


In [3]:
# Step 3: Simulate streaming batches
for batch in range(3):
    print(f"\n=== Batch {batch+1} ===")

    pandas_df = generate_sensor_data()              # ✅ Just Pandas
    spark_df = spark.createDataFrame(pandas_df)     # ✅ Now to Spark

    # Processing safely inside Spark
    processed_df = (
        spark_df
        .withColumn("rounded_temp", spark_round(col("temperature"), 2))
        .groupBy("sensor_id")
        .agg(
            avg("temperature").alias("avg_temp"),
            avg("humidity").alias("avg_humidity")
        )
    )

    processed_df.show()
    time.sleep(2)



=== Batch 1 ===
+---------+------------------+------------+
|sensor_id|          avg_temp|avg_humidity|
+---------+------------------+------------+
|        1|28.134999999999998|      57.875|
|        4|             27.77|       68.51|
|        5|             28.82|       49.79|
|        2|             27.12|       55.51|
+---------+------------------+------------+


=== Batch 2 ===
+---------+--------+------------+
|sensor_id|avg_temp|avg_humidity|
+---------+--------+------------+
|        3|   28.56|       54.37|
|        2|  28.695|       54.53|
|        5|   29.74|       66.95|
|        1|   29.07|       56.95|
+---------+--------+------------+


=== Batch 3 ===
+---------+--------+------------+
|sensor_id|avg_temp|avg_humidity|
+---------+--------+------------+
|        5|   27.32|       61.53|
|        2|   31.86|      49.785|
|        4|  30.245|       58.25|
+---------+--------+------------+



In [4]:
# Step 1: Import ML dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 2: Create dummy training data (simulate historical sensor data)
train_X = np.random.uniform(low=25, high=35, size=(100, 2))  # temperature, humidity
train_y = (train_X[:,0] > 30).astype(int)  # label 1 if temp > 30 else 0 (high temp)

# Step 3: Scale and train the model
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)

model = LogisticRegression()
model.fit(train_X_scaled, train_y)

print("✅ Model trained for predicting 'High Temperature Risk'")


✅ Model trained for predicting 'High Temperature Risk'


In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Create a training dataset (dummy for demonstration)
train_data = [
    (1, 28.5, 50.0, 0),  # normal
    (2, 30.2, 60.0, 0),  # normal
    (3, 33.1, 45.0, 1),  # high
    (4, 35.0, 65.0, 1),  # high
]
columns = ["sensor_id", "temperature", "humidity", "label"]
train_df = spark.createDataFrame(train_data, columns)

# Features and model
assembler = VectorAssembler(inputCols=["temperature", "humidity"], outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")

pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train_df)

print("✅ Model trained successfully!")


✅ Model trained successfully!


In [6]:
from pyspark.sql.functions import col, round as spark_round, avg
import pandas as pd
import random, time

def generate_sensor_data():
    return pd.DataFrame({
        "sensor_id": [random.randint(1,5) for _ in range(5)],
        "temperature": [round(random.uniform(25,35),2) for _ in range(5)],
        "humidity": [round(random.uniform(40,70),2) for _ in range(5)],
        "timestamp": [int(time.time()) for _ in range(5)]
    })

for batch in range(3):
    print(f"\n=== Batch {batch+1} ===")

    pandas_df = generate_sensor_data()
    spark_df = spark.createDataFrame(pandas_df)

    # Apply trained model
    predictions = model.transform(spark_df)
    predictions.select("sensor_id", "temperature", "humidity", "prediction").show()

    time.sleep(2)



=== Batch 1 ===
+---------+-----------+--------+----------+
|sensor_id|temperature|humidity|prediction|
+---------+-----------+--------+----------+
|        4|      32.43|   57.49|       1.0|
|        3|      31.09|   59.24|       0.0|
|        2|      27.12|   59.86|       0.0|
|        3|       27.4|   54.86|       0.0|
|        5|      30.15|   62.79|       0.0|
+---------+-----------+--------+----------+


=== Batch 2 ===
+---------+-----------+--------+----------+
|sensor_id|temperature|humidity|prediction|
+---------+-----------+--------+----------+
|        5|      33.36|   47.54|       1.0|
|        2|      33.77|   49.33|       1.0|
|        4|      28.88|   57.38|       0.0|
|        3|      33.62|   59.04|       1.0|
|        5|      26.24|    54.1|       0.0|
+---------+-----------+--------+----------+


=== Batch 3 ===
+---------+-----------+--------+----------+
|sensor_id|temperature|humidity|prediction|
+---------+-----------+--------+----------+
|        1|      26.67|