Install the dependency packages

In [1]:
!python -m venv .venv
!.venv\Scripts\Activate.ps1
!pip install google-cloud-pubsub python-dotenv pandas




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\zeond\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Import Function 

In [2]:
import pandas as pd
import json, pathlib, os, csv
from google.cloud import pubsub_v1



Initialization

In [7]:
PROJECT_ID = "iisc-data-engineering-project"
TOPIC_ID = "match_data"
SUBSCRIPTION_ID = "match_data-sub"
matches_FILE = "matches.csv"
deliveries_FILE = "deliveries.csv"
batch_setting = pubsub_v1.types.BatchSettings(
    max_bytes=1024,  # Maximum bytes to batch
    max_messages=1000,  # Maximum number of messages to batch 
    max_latency=0.5,  # Maximum latency in seconds
)
publisher = pubsub_v1.PublisherClient(batch_settings=batch_setting)
topic_path = publisher.topic_path(PROJECT_ID, TOPIC_ID)



Two dataset are used of the following structure. The dataset is from IPL 2008-2024
1. Matches  - Contains Match level data, Each row represents a single IPL match, including venue, teams, toss results, match outcomes, and player awards.
2. Deliveries - Each row represents one delivery (ball) in an IPL match. It links to matches.csv via the match_id column.



In [4]:
matches = pd.read_csv(matches_FILE)
deliveries = pd.read_csv(deliveries_FILE)

print("Matches DataFrame:")
print(matches.head(5))

print("\nDeliveries DataFrame:")
print(deliveries.head(5))

Matches DataFrame:
       id   season        city        date match_type player_of_match  \
0  335982  2007/08   Bangalore  2008-04-18     League     BB McCullum   
1  335983  2007/08  Chandigarh  2008-04-19     League      MEK Hussey   
2  335984  2007/08       Delhi  2008-04-19     League     MF Maharoof   
3  335985  2007/08      Mumbai  2008-04-20     League      MV Boucher   
4  335986  2007/08     Kolkata  2008-04-20     League       DJ Hussey   

                                        venue                        team1  \
0                       M Chinnaswamy Stadium  Royal Challengers Bangalore   
1  Punjab Cricket Association Stadium, Mohali              Kings XI Punjab   
2                            Feroz Shah Kotla             Delhi Daredevils   
3                            Wankhede Stadium               Mumbai Indians   
4                                Eden Gardens        Kolkata Knight Riders   

                         team2                  toss_winner toss_decision

This is a sample toy data for testing Publish and Subsribe Flow in pubsub

In [12]:
sample_match = matches.sample(1, random_state=42).to_dict(orient="records")[0]
print ("Sample Match Record:")
print (sample_match)
message = {
    "event_type": "match",
    "payload": sample_match
}
data = json.dumps(message, default=str).encode("utf-8")
future = publisher.publish(topic_path, data)
print(f"Published message ID: {future.result()}")
print(json.dumps(message, indent=2)[:600])

subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(PROJECT_ID, SUBSCRIPTION_ID)
response = subscriber.pull(
    request={"subscription": subscription_path, "max_messages": 10}
)
for received_message in response.received_messages:
    print(f"Received message: {received_message.message.data.decode('utf-8')}")
    subscriber.acknowledge(
        request={
            "subscription": subscription_path,
            "ack_ids": [received_message.ack_id]
        }
    )

Sample Match Record:
{'id': 980933, 'season': '2016', 'city': 'Delhi', 'date': '2016-04-23', 'match_type': 'League', 'player_of_match': 'SV Samson', 'venue': 'Feroz Shah Kotla', 'team1': 'Delhi Daredevils', 'team2': 'Mumbai Indians', 'toss_winner': 'Mumbai Indians', 'toss_decision': 'field', 'winner': 'Delhi Daredevils', 'result': 'runs', 'result_margin': 10.0, 'target_runs': 165.0, 'target_overs': 20.0, 'super_over': 'N', 'method': nan, 'umpire1': 'S Ravi', 'umpire2': 'C Shamshuddin'}
Published message ID: 16956330235317210
{
  "event_type": "match",
  "payload": {
    "id": 980933,
    "season": "2016",
    "city": "Delhi",
    "date": "2016-04-23",
    "match_type": "League",
    "player_of_match": "SV Samson",
    "venue": "Feroz Shah Kotla",
    "team1": "Delhi Daredevils",
    "team2": "Mumbai Indians",
    "toss_winner": "Mumbai Indians",
    "toss_decision": "field",
    "winner": "Delhi Daredevils",
    "result": "runs",
    "result_margin": 10.0,
    "target_runs": 165.0,
   



Received message: {"eventType": "deliveries", "payload": {"match_id": "1136595", "inning": "1", "batting_team": "Royal Challengers Bangalore", "bowling_team": "Chennai Super Kings", "over": "13", "ball": "3", "batter": "TG Southee", "bowler": "Harbhajan Singh", "non_striker": "C de Grandhomme", "batsman_runs": "0", "extra_runs": "0", "total_runs": "0", "extras_type": "", "is_wicket": "0", "player_dismissed": "NA", "dismissal_kind": "NA", "fielder": "NA"}}
Received message: {"eventType": "deliveries", "payload": {"match_id": "1136594", "inning": "2", "batting_team": "Mumbai Indians", "bowling_team": "Kings XI Punjab", "over": "15", "ball": "4", "batter": "KH Pandya", "bowler": "AJ Tye", "non_striker": "RG Sharma", "batsman_runs": "0", "extra_runs": "1", "total_runs": "1", "extras_type": "wides", "is_wicket": "0", "player_dismissed": "NA", "dismissal_kind": "NA", "fielder": "NA"}}
Received message: {"eventType": "deliveries", "payload": {"match_id": "1136594", "inning": "2", "batting_tea

Push all the Matches.CSV to PubSub broker service

In [None]:
def publish_csv(file_path: str, event_type: str) -> int:
    """
    Reads a CSV and publishes each row as:
    {
      "eventType": "<event_type>",
      "payload": {<csv row as JSON>}
    }
    Returns number of published messages.
    """
    p = pathlib.Path(file_path)
    if not p.exists():
        raise FileNotFoundError(f"CSV not found: {p.resolve()}")

    futures = []
    with p.open("r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in enumerate(reader):
           
            message_obj = {
                "eventType": event_type,
                "payload": row,           # row already dict from DictReader
            }
            data_bytes = json.dumps(message_obj, ensure_ascii=False).encode("utf-8")

            # Optional attributes for filtering/monitoring
            fut = publisher.publish(
                topic_path,
                data=data_bytes,
                source_file=p.name,
                eventType=event_type
            )
            futures.append(fut)

    # Ensure all publishes complete (raise on error)
    for fut in futures:
        fut.result()
    return len(futures)

# Publish both CSVs
count_matches = publish_csv(matches_FILE, "match")
count_deliveries = publish_csv(deliveries_FILE, "deliveries")

print(f"Published {count_matches} 'match' messages and {count_deliveries} 'deliveries' messages.")