<a href="https://colab.research.google.com/github/shubham620/cdti_jaipur_project/blob/main/cdti_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re


In [4]:
# Load CSV file
df = pd.read_csv("demo_input_data_2.csv")

# Display the raw data
print("RAW INPUT DATA:")
df.head()


RAW INPUT DATA:


Unnamed: 0,id,text,source,time,location
0,1,Crowd gathering near City Mall entrance,social,10:05:00 AM,"City Mall, Jaipur"
1,2,Large group of people reported at City Mall,social,10:07:00 AM,"City Mall, Jaipur"
2,3,Police control room received call about crowd ...,emergency,10:10:00 AM,"City Mall, Jaipur"
3,4,Smoke and fire seen near railway station platf...,social,10:15:00 AM,"Railway Station, Jaipur"
4,5,Heavy smoke visible near Jaipur railway station,social,10:16:00 AM,"Railway Station, Jaipur"


In [5]:
def clean_text(text):
    text = text.lower()                     # convert to lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text) # remove special characters
    text = re.sub(r"\s+", " ", text)        # remove extra spaces
    return text.strip()


In [6]:
# Apply text cleaning
df["clean_text"] = df["text"].apply(clean_text)

# Show cleaned text
print("CLEANED DATA:")
df[["id", "text", "clean_text"]]


CLEANED DATA:


Unnamed: 0,id,text,clean_text
0,1,Crowd gathering near City Mall entrance,crowd gathering near city mall entrance
1,2,Large group of people reported at City Mall,large group of people reported at city mall
2,3,Police control room received call about crowd ...,police control room received call about crowd ...
3,4,Smoke and fire seen near railway station platf...,smoke and fire seen near railway station platform
4,5,Heavy smoke visible near Jaipur railway station,heavy smoke visible near jaipur railway station
5,6,Fire brigade dispatched to railway station,fire brigade dispatched to railway station
6,7,Message circulating about bomb blast at bus stand,message circulating about bomb blast at bus stand
7,8,No confirmation of bomb blast at bus stand,no confirmation of bomb blast at bus stand
8,9,Police deny blast rumors at Jaipur bus stand,police deny blast rumors at jaipur bus stand
9,10,Traffic jam reported on NH-21 due to truck bre...,traffic jam reported on nh due to truck breakdown


In [7]:
!pip install sentence-transformers




In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np




In [9]:
# Load a lightweight sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Convert cleaned text into embeddings (numeric meaning)
embeddings = model.encode(df["clean_text"].tolist())

# Check embedding shape
print("Embedding shape:", embeddings.shape)


Embedding shape: (22, 384)


In [11]:
# Store embeddings inside dataframe
df["embedding"] = embeddings.tolist()

df[["id", "clean_text"]].head()


Unnamed: 0,id,clean_text
0,1,crowd gathering near city mall entrance
1,2,large group of people reported at city mall
2,3,police control room received call about crowd ...
3,4,smoke and fire seen near railway station platform
4,5,heavy smoke visible near jaipur railway station


In [12]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity


In [13]:
# Convert list of embeddings to numpy array
X = np.array(df["embedding"].tolist())

# Compute cosine similarity between all messages
similarity_matrix = cosine_similarity(X)


In [14]:
# Convert similarity to distance
distance_matrix = 1 - similarity_matrix

# Fix floating point negatives (important)
distance_matrix[distance_matrix < 0] = 0



In [15]:
# Apply DBSCAN clustering
dbscan = DBSCAN(
    eps=0.3,          # similarity threshold (tunable)
    min_samples=2,    # minimum messages to form an event
    metric="precomputed"
)

df["event_id"] = dbscan.fit_predict(distance_matrix)


In [16]:
# Display messages grouped by event
for event in df["event_id"].unique():
    print(f"\nEVENT {event}:")
    display(df[df["event_id"] == event][["id", "text", "source", "location"]])



EVENT 0:


Unnamed: 0,id,text,source,location
0,1,Crowd gathering near City Mall entrance,social,"City Mall, Jaipur"
1,2,Large group of people reported at City Mall,social,"City Mall, Jaipur"
2,3,Police control room received call about crowd ...,emergency,"City Mall, Jaipur"



EVENT -1:


Unnamed: 0,id,text,source,location
3,4,Smoke and fire seen near railway station platf...,social,"Railway Station, Jaipur"
4,5,Heavy smoke visible near Jaipur railway station,social,"Railway Station, Jaipur"
5,6,Fire brigade dispatched to railway station,official,"Railway Station, Jaipur"
8,9,Police deny blast rumors at Jaipur bus stand,official,"Bus Stand, Jaipur"
12,13,Water level rising near low-lying area after h...,social,"Mansarovar, Jaipur"
13,14,Local residents report waterlogging in Mansarovar,social,"Mansarovar, Jaipur"
14,15,Municipal team deployed to inspect waterlogging,official,"Mansarovar, Jaipur"
17,18,Minor protest reported near university gate,social,"University Road, Jaipur"
18,19,Small group protesting peacefully near campus,social,"University Road, Jaipur"
21,22,Person stabilized and taken to hospital,official,"Metro Station, Jaipur"



EVENT 1:


Unnamed: 0,id,text,source,location
6,7,Message circulating about bomb blast at bus stand,social,"Bus Stand, Jaipur"
7,8,No confirmation of bomb blast at bus stand,news,"Bus Stand, Jaipur"



EVENT 2:


Unnamed: 0,id,text,source,location
9,10,Traffic jam reported on NH-21 due to truck bre...,social,"NH-21, Jaipur"
10,11,Traffic police confirms vehicle breakdown on N...,official,"NH-21, Jaipur"
11,12,Traffic cleared on NH-21,official,"NH-21, Jaipur"



EVENT 3:


Unnamed: 0,id,text,source,location
15,16,Single post claiming city-wide flood,social,Jaipur
16,17,Unverified claim of major flood spreading online,social,Jaipur



EVENT 4:


Unnamed: 0,id,text,source,location
19,20,Medical emergency reported at metro station,social,"Metro Station, Jaipur"
20,21,Ambulance dispatched to metro station,emergency,"Metro Station, Jaipur"


In [17]:
# Remove noise messages (event_id = -1)
events_df = df[df["event_id"] != -1]

# Group messages by event
event_summary = events_df.groupby("event_id").agg(
    message_count=("id", "count"),
    sources=("source", lambda x: list(set(x))),
    locations=("location", lambda x: list(set(x)))
).reset_index()

event_summary



Unnamed: 0,event_id,message_count,sources,locations
0,0,3,"[emergency, social]","[City Mall, Jaipur]"
1,1,2,"[social, news]","[Bus Stand, Jaipur]"
2,2,3,"[social, official]","[NH-21, Jaipur]"
3,3,2,[social],[Jaipur]
4,4,2,"[emergency, social]","[Metro Station, Jaipur]"


In [18]:
def calculate_credibility(message_count, sources):
    sources = set(sources)

    # Strong confirmation
    if message_count >= 3 and ("official" in sources or "emergency" in sources):
        return "High"

    # News + social but no official confirmation → suspicious
    if "news" in sources and "official" not in sources and "emergency" not in sources:
        return "Low"

    # Multiple reports but only public sources
    if message_count >= 2:
        return "Medium"

    return "Low"



In [19]:
event_summary["credibility"] = event_summary.apply(
    lambda row: calculate_credibility(row["message_count"], row["sources"]),
    axis=1
)

event_summary


Unnamed: 0,event_id,message_count,sources,locations,credibility
0,0,3,"[emergency, social]","[City Mall, Jaipur]",High
1,1,2,"[social, news]","[Bus Stand, Jaipur]",Low
2,2,3,"[social, official]","[NH-21, Jaipur]",High
3,3,2,[social],[Jaipur],Medium
4,4,2,"[emergency, social]","[Metro Station, Jaipur]",Medium


In [20]:
def detect_risk_level(texts, locations, message_count):
    text_combined = " ".join(texts).lower()
    locations = " ".join(locations).lower()

    # High-risk keywords & places
    high_risk_keywords = ["fire", "blast", "explosion", "smoke"]
    sensitive_places = ["railway", "bus stand", "station"]

    # Medium-risk keywords & places
    medium_risk_keywords = ["crowd", "gathering", "people"]
    public_places = ["mall", "market"]

    # High risk conditions
    if any(word in text_combined for word in high_risk_keywords):
        return "High"
    if any(place in locations for place in sensitive_places):
        return "High"

    # Medium risk conditions
    if any(word in text_combined for word in medium_risk_keywords):
        return "Medium"
    if any(place in locations for place in public_places):
        return "Medium"

    # Low risk otherwise
    return "Low"


In [21]:
# Collect texts per event
event_texts = (
    events_df
    .groupby("event_id")["clean_text"]
    .apply(list)
    .reset_index(name="texts")
)

# Merge texts into event_summary
event_summary = event_summary.merge(event_texts, on="event_id")


In [22]:
event_summary["risk"] = event_summary.apply(
    lambda row: detect_risk_level(
        row["texts"],
        row["locations"],
        row["message_count"]
    ),
    axis=1
)

event_summary[["event_id", "credibility", "risk"]]


Unnamed: 0,event_id,credibility,risk
0,0,High,Medium
1,1,Low,High
2,2,High,Low
3,3,Medium,Low
4,4,Medium,High


In [23]:
def make_decision(credibility, risk):
    # High risk cases
    if risk == "High":
        if credibility in ["High", "Medium"]:
            return "ALERT"
        else:
            return "MONITOR"

    # Medium risk cases
    if risk == "Medium":
        if credibility == "High":
            return "MONITOR"
        else:
            return "MONITOR"

    # Low risk cases
    return "IGNORE"


In [24]:
event_summary["decision"] = event_summary.apply(
    lambda row: make_decision(row["credibility"], row["risk"]),
    axis=1
)

event_summary[["event_id", "credibility", "risk", "decision"]]



Unnamed: 0,event_id,credibility,risk,decision
0,0,High,Medium,MONITOR
1,1,Low,High,MONITOR
2,2,High,Low,IGNORE
3,3,Medium,Low,IGNORE
4,4,Medium,High,ALERT


In [25]:
final_output = event_summary[[
    "event_id",
    "locations",
    "message_count",
    "sources",
    "credibility",
    "risk",
    "decision"
]]

final_output


Unnamed: 0,event_id,locations,message_count,sources,credibility,risk,decision
0,0,"[City Mall, Jaipur]",3,"[emergency, social]",High,Medium,MONITOR
1,1,"[Bus Stand, Jaipur]",2,"[social, news]",Low,High,MONITOR
2,2,"[NH-21, Jaipur]",3,"[social, official]",High,Low,IGNORE
3,3,[Jaipur],2,[social],Medium,Low,IGNORE
4,4,"[Metro Station, Jaipur]",2,"[emergency, social]",Medium,High,ALERT
