In [3]:
from google.colab import files
uploaded = files.upload()


Saving final_accident_data.csv to final_accident_data.csv


In [4]:
import pandas as pd

# Load the uploaded CSV
df = pd.read_csv("final_accident_data.csv")

# Preview the first few rows
df.head()


Unnamed: 0,District,PS Name,Date Report,Date Accident,Time Accident,Accident type,Death,Grievous,Minor,Pedestrian,...,Collision,Type Road,Road Features,Visibility,Traffic Control,Accussed Vehicle,Victim Vehicle,Recommendation,Unstructured Text,Cleaned Text
0,THIRUVANANTHAPURAM CITY,Vattiyoorkavu,01 Dec 2021,01 Dec 2021,05:30 PM,Minor Injury,0,0,2,0,...,Hit from Back,National Highway,Straight Road,Good,Uncontrolled,Tipper,Motor Cycle,Improve general road safety awareness and enfo...,"On 01 Dec 2021 at 05:30 PM, an accident occurr...",01 dec 2021 05:30 pm accident occurred thiruva...
1,THIRUVANANTHAPURAM CITY,Vanchiyoor,31 Dec 2024,31 Dec 2024,06:30 AM,Fatal,1,0,0,1,...,Hit Pedestrian,National Highway,Straight Road,Good,Uncontrolled,Motor Cycle,Motor Cycle,Improve pedestrian crossings and install bette...,"On 31 Dec 2024 at 06:30 AM, an accident occurr...",31 dec 2024 06:30 accident occurred thiruvanan...
2,THIRUVANANTHAPURAM CITY,Vanchiyoor,24 Dec 2024,24 Dec 2024,08:45 AM,Grevious Injury,0,1,0,0,...,Hit from Back,State Highway,Straight Road,Good,Uncontrolled,Motor Cycle,Scooter,Improve general road safety awareness and enfo...,"On 24 Dec 2024 at 08:45 AM, an accident occurr...",24 dec 2024 08:45 accident occurred thiruvanan...
3,THIRUVANANTHAPURAM CITY,Vanchiyoor,01 Jan 2023,01 Jan 2023,02:15 PM,Grevious Injury,0,1,0,0,...,Hit from Back,Other Road,Curved Road,Good,Uncontrolled,Auto rickshaw,Motor Cycle,Improve general road safety awareness and enfo...,"On 01 Jan 2023 at 02:15 PM, an accident occurr...",01 jan 2023 02:15 pm accident occurred thiruva...
4,THIRUVANANTHAPURAM CITY,Vanchiyoor,17 Jan 2024,17 Jan 2024,05:45 PM,Grevious Injury,0,1,1,0,...,Hit from Side,State Highway,Straight Road,Good,Uncontrolled,Motor Cycle,Car,Improve general road safety awareness and enfo...,"On 17 Jan 2024 at 05:45 PM, an accident occurr...",17 jan 2024 05:45 pm accident occurred thiruva...


Install and Import NLTK, Then Download Tokenizers & Stopwords

In [5]:
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re


Define the Clean Text Function

In [9]:
def clean_text(text):
    if pd.isnull(text):
        return ""

    text = text.lower()

    # Remove punctuation except colon
    text = text.translate(str.maketrans('', '', string.punctuation.replace(':', '')))

    # Simple tokenization using split (to avoid nltk bug)
    tokens = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Keep useful patterns
    tokens = [word for word in tokens if re.match(r'\d+|p\d+|:\d+|[a-zA-Z]+', word)]

    # Add start and end tokens
    cleaned_text = "<start> " + ' '.join(tokens) + " <end>"

    return cleaned_text


In [10]:
df['Cleaned Text'] = df['Unstructured Text'].apply(clean_text)


In [15]:
!pip install spacy
import spacy

# Load small English model
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [31]:
import re
import pandas as pd

def extract_summary(
    text,
    recommendation="",
    actual_district="unknown",
    actual_date="unknown",
    actual_time="unknown"
):
    # === STEP 1: Reformat cleaned text ===
    text = text.replace("<start>", "").replace("<end>", "").strip()

    # Capitalize months
    months = {
        "jan": "Jan", "feb": "Feb", "mar": "Mar", "apr": "Apr",
        "may": "May", "jun": "Jun", "jul": "Jul", "aug": "Aug",
        "sep": "Sep", "oct": "Oct", "nov": "Nov", "dec": "Dec"
    }
    for month, cap_month in months.items():
        text = re.sub(rf"\b{month}\b", cap_month, text)

    # Capitalize am/pm
    text = re.sub(r"\b(am|pm)\b", lambda m: m.group(1).upper(), text)

    # Capitalize known places (add more if needed)
    custom_words = [
        "thiruvananthapuram", "vattiyoorkavu", "moothakunnam", "kollam",
        "ernakulam", "kozhikode", "palakkad", "alappuzha", "idukki",
        "kasaragod", "malappuram", "wayanad", "pathanamthitta", "kannur",
        "kochi", "calicut", "choolaimedu", "ambattur", "poonkunnam"
    ]
    for word in custom_words:
        text = re.sub(rf"\b{word}\b", word.capitalize(), text)

    # === STEP 2: spaCy NER ===
    doc = nlp(text)

    summary = {
        "district": "unknown",
        "date": "unknown",
        "time": "unknown",
        "deaths": "0",
        "grievous": "0",
        "minor": "0",
        "type": "unknown",
        "recommendation": recommendation if pd.notnull(recommendation) else "N/A"
    }

    for ent in doc.ents:
        if ent.label_ == "GPE" and summary["district"] == "unknown":
            summary["district"] = ent.text
        elif ent.label_ == "DATE" and summary["date"] == "unknown":
            summary["date"] = ent.text
        elif ent.label_ == "TIME" and summary["time"] == "unknown":
            summary["time"] = ent.text

    # === STEP 3: Use real values if spaCy fails or is wrong ===
    if summary["district"].lower() in ["unknown", "moothakunnam"]:
        summary["district"] = actual_district

    if summary["date"] == "unknown" and pd.notnull(actual_date):
        summary["date"] = actual_date

    if summary["time"] == "unknown" and pd.notnull(actual_time):
        summary["time"] = actual_time

    # === STEP 4: Regex Extraction ===
    if match := re.search(r"(?:death|fatalit(?:y|ies))\D*(\d+)", text):
        summary["deaths"] = match.group(1)

    if match := re.search(r"grievous injuries?\D*(\d+)", text):
        summary["grievous"] = match.group(1)

    if match := re.search(r"minor injuries?\D*(\d+)", text):
        summary["minor"] = match.group(1)

    if match := re.search(r"accident type\s*([a-z\s]+)", text):
        summary["type"] = match.group(1).strip().replace("grevious", "grievous")

    return summary


In [32]:
df["NER_Extracted_Info"] = df.apply(
    lambda row: extract_summary(row["Cleaned Text"], row["Recommendation"], row["District"]),
    axis=1
)


In [33]:
for i in range(10):
    print(f"\n🔢 Row {i} — NER Extracted Info:")
    print(df.loc[i, "NER_Extracted_Info"])




🔢 Row 0 — NER Extracted Info:
{'district': 'THIRUVANANTHAPURAM CITY', 'date': '01 Dec 2021', 'time': '05:30 PM', 'deaths': '0', 'grievous': '2', 'minor': '0', 'type': 'minor injury', 'recommendation': 'Improve general road safety awareness and enforcement.'}

🔢 Row 1 — NER Extracted Info:
{'district': 'THIRUVANANTHAPURAM CITY', 'date': 'unknown', 'time': 'unknown', 'deaths': '0', 'grievous': '0', 'minor': '0', 'type': 'fatal', 'recommendation': 'Improve pedestrian crossings and install better signage. Increase road safety measures and emergency response times.'}

🔢 Row 2 — NER Extracted Info:
{'district': 'THIRUVANANTHAPURAM CITY', 'date': '24 Dec 2024 08:45', 'time': 'unknown', 'deaths': '1', 'grievous': '0', 'minor': '0', 'type': 'grievous injury', 'recommendation': 'Improve general road safety awareness and enforcement.'}

🔢 Row 3 — NER Extracted Info:
{'district': 'THIRUVANANTHAPURAM CITY', 'date': '01 Jan 2023 02:15 PM', 'time': 'unknown', 'deaths': '1', 'grievous': '0', 'minor':

In [34]:
df.to_csv("ner_summarized_data.csv", index=False)


In [35]:
pd.read_csv("ner_summarized_data.csv").head()


Unnamed: 0,District,PS Name,Date Report,Date Accident,Time Accident,Accident type,Death,Grievous,Minor,Pedestrian,...,Road Features,Visibility,Traffic Control,Accussed Vehicle,Victim Vehicle,Recommendation,Unstructured Text,Cleaned Text,NER_Extracted_Info,NER_Summary
0,THIRUVANANTHAPURAM CITY,Vattiyoorkavu,01 Dec 2021,01 Dec 2021,05:30 PM,Minor Injury,0,0,2,0,...,Straight Road,Good,Uncontrolled,Tipper,Motor Cycle,Improve general road safety awareness and enfo...,"On 01 Dec 2021 at 05:30 PM, an accident occurr...",<start> 01 dec 2021 05:30 pm accident occurred...,"{'district': 'THIRUVANANTHAPURAM CITY', 'date'...",district: Moothakunnam | date: 01 Dec 2021 | t...
1,THIRUVANANTHAPURAM CITY,Vanchiyoor,31 Dec 2024,31 Dec 2024,06:30 AM,Fatal,1,0,0,1,...,Straight Road,Good,Uncontrolled,Motor Cycle,Motor Cycle,Improve pedestrian crossings and install bette...,"On 31 Dec 2024 at 06:30 AM, an accident occurr...",<start> 31 dec 2024 06:30 accident occurred th...,"{'district': 'THIRUVANANTHAPURAM CITY', 'date'...",district: unknown | date: unknown | time: unkn...
2,THIRUVANANTHAPURAM CITY,Vanchiyoor,24 Dec 2024,24 Dec 2024,08:45 AM,Grevious Injury,0,1,0,0,...,Straight Road,Good,Uncontrolled,Motor Cycle,Scooter,Improve general road safety awareness and enfo...,"On 24 Dec 2024 at 08:45 AM, an accident occurr...",<start> 24 dec 2024 08:45 accident occurred th...,"{'district': 'THIRUVANANTHAPURAM CITY', 'date'...",district: unknown | date: 24 Dec 2024 08:45 | ...
3,THIRUVANANTHAPURAM CITY,Vanchiyoor,01 Jan 2023,01 Jan 2023,02:15 PM,Grevious Injury,0,1,0,0,...,Curved Road,Good,Uncontrolled,Auto rickshaw,Motor Cycle,Improve general road safety awareness and enfo...,"On 01 Jan 2023 at 02:15 PM, an accident occurr...",<start> 01 jan 2023 02:15 pm accident occurred...,"{'district': 'THIRUVANANTHAPURAM CITY', 'date'...",district: unknown | date: 01 Jan 2023 02:15 PM...
4,THIRUVANANTHAPURAM CITY,Vanchiyoor,17 Jan 2024,17 Jan 2024,05:45 PM,Grevious Injury,0,1,1,0,...,Straight Road,Good,Uncontrolled,Motor Cycle,Car,Improve general road safety awareness and enfo...,"On 17 Jan 2024 at 05:45 PM, an accident occurr...",<start> 17 jan 2024 05:45 pm accident occurred...,"{'district': 'THIRUVANANTHAPURAM CITY', 'date'...",district: unknown | date: 17 Jan 2024 | time: ...
