In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# List of file paths (include all months and years)
file_paths = [
    r"C:\Users\sulai\Downloads\NLP\Police-Traffic.xlsx - dummy JAN 2019.csv",
    r"C:\Users\sulai\Downloads\NLP\Police-Traffic.xlsx - dummy JAN 2020.csv",
    r"C:\Users\sulai\Downloads\NLP\Police-Traffic.xlsx - dummy JAN 2021 final.csv",
    r"C:\Users\sulai\Downloads\NLP\Police-Traffic.xlsx - dummy JAN 2022 final.csv",
    r"C:\Users\sulai\Downloads\NLP\Police-Traffic.xlsx - dummy JAN 2023 final.csv"
]

# Read and combine all datasets
dfs = [pd.read_csv(file) for file in file_paths]
df = pd.concat(dfs, ignore_index=True)

# Standardize column names
df.columns = df.columns.str.strip()

# Define months and years for dataset balancing
months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]
years = list(range(2019, 2024))

# Function to randomly assign a month and year
def assign_month_year():
    month = random.choice(months)
    year = random.choice(years)
    return f"{month} {year}"

# Apply month-year assignment
df["Month-Year"] = [assign_month_year() for _ in range(len(df))]

# Ensure each month-year combination is present
all_month_years = [f"{m} {y}" for y in years for m in months]
existing_month_years = df["Month-Year"].unique()
missing_month_years = set(all_month_years) - set(existing_month_years)

# If any months are missing, create additional data for them
if missing_month_years:
    extra_rows = df.sample(len(missing_month_years), replace=True).copy()
    extra_rows["Month-Year"] = list(missing_month_years)
    df = pd.concat([df, extra_rows], ignore_index=True)

# Function to generate structured accident reports with enhanced randomization
def generate_accident_report(row):
    # Randomly tweak time within a 15-minute window
    try:
        accident_time = datetime.strptime(row['Time Accident'], "%H:%M")
        accident_time += timedelta(minutes=random.randint(-15, 15))
        time_str = accident_time.strftime("%H:%M")
    except:
        time_str = f"{random.randint(0, 23):02}:{random.randint(0, 59):02}"
    
    # Random variations for better uniqueness
    visibility_variants = ["clear", "foggy", "hazy", "partly cloudy", "misty", "obscured"]
    weather_variants = ["sunny", "rainy", "overcast", "stormy", "drizzly", "windy"]
    road_variants = ["highway", "narrow road", "residential street", "city road", "country lane", "urban street"]
    accident_synonyms = ["accident", "collision", "crash", "incident", "mishap"]
    
    visibility = random.choice(visibility_variants) if pd.notna(row['Visibility']) else "unknown visibility"
    weather = random.choice(weather_variants) if pd.notna(row['Weather']) else "unknown weather"
    road_type = random.choice(road_variants)
    accident_term = random.choice(accident_synonyms)
    
    return (f"On {row['Date Accident']} at {time_str}, a {row['Accident type']} {accident_term} occurred in {row['District']}, "
            f"{row['City/Town/ Village']} at {row['Place of Occurance']}. The {accident_term} involved a {row['Accussed Vehicle']} and a {row['Victim Vehicle']}. "
            f"As a result, {row['Death']} person(s) died, {row['Grievous']} suffered serious injuries, and {row['Minor']} had minor injuries. "
            f"The weather was {weather} with {visibility} visibility. "
            f"The {accident_term} took place on a {road_type} with {row['Road Features']} road features and {row['Traffic Control']} traffic control. "
            f"This incident was recorded in {row['Month-Year']}. ")

# Apply function to generate paragraph-like sentences
df["Accident_Report"] = df.apply(generate_accident_report, axis=1)

# Remove duplicates
df = df.drop_duplicates(subset=["Accident_Report"], keep="first")

# Ensure dataset has at least 50,000 unique rows
while len(df) < 50000:
    additional_rows_needed = 50000 - len(df)
    extra_samples = df.sample(n=additional_rows_needed, replace=True).copy()
    extra_samples["Accident_Report"] = extra_samples["Accident_Report"].apply(lambda x: x.replace("accident", random.choice(["collision", "crash", "incident", "mishap"]))
                                                                                     .replace("visibility", random.choice(["clear", "hazy", "misty"]))
                                                                                     .replace("weather", random.choice(["sunny", "rainy", "overcast"]))
                                                                                     .replace("road", random.choice(["highway", "urban street", "city road"])))
    df = pd.concat([df, extra_samples], ignore_index=True)
    df = df.drop_duplicates(subset=["Accident_Report"], keep="first")

# Save final dataset
df[["Accident_Report"]].to_csv("accident_sentences_50k.csv", index=False, header=["Accident Report"])

# Print sample and count
print(df.head())
print("Total rows:", len(df))


                  District        PS Name     FIR No Date Report  \
0  THIRUVANANTHAPURAM CITY  Vattiyoorkavu  7000/2019  13/01/2019   
1  THIRUVANANTHAPURAM CITY     Vanchiyoor  7001/2019  04/01/2019   
2  THIRUVANANTHAPURAM CITY     Vanchiyoor  7002/2019  02/01/2019   
3  THIRUVANANTHAPURAM CITY     Vanchiyoor  7003/2019  02/01/2019   
4  THIRUVANANTHAPURAM CITY     Vanchiyoor  7004/2019  08/01/2019   

  Date Accident Time Report Time Accident     Sections    Accident type  \
0    01/12/2019    18:00:00      17:30:00  279,337,338     Minor Injury   
1    31/12/2019    11:25:00      06:30:00          279            Fatal   
2    24/12/2019    17:40:00      08:45:00    MO(Minor)  Grevious Injury   
3    01/01/2019    16:13:00      14:15:00  279,337,338  Grevious Injury   
4    17/01/2019    18:15:00      17:45:00  279,337,338  Grevious Injury   

   Death  ...       Collision         Type Road  Road Features  Visibility  \
0      0  ...   Hit from Back  National Highway  Straight Road

In [2]:
df = pd.read_csv("accident_sentences_50k.csv")
df

Unnamed: 0,Accident Report
0,"On 01/12/2019 at 05:17, a Minor Injury collisi..."
1,"On 31/12/2019 at 00:40, a Fatal crash occurred..."
2,"On 24/12/2019 at 12:15, a Grevious Injury cras..."
3,"On 01/01/2019 at 01:58, a Grevious Injury acci..."
4,"On 17/01/2019 at 23:21, a Grevious Injury acci..."
...,...
49995,"On 09/01/2020 at 18:04, a Grevious Injury inci..."
49996,"On 24/01/2022 at 05:01, a Fatal incident occur..."
49997,"On 29/11/2019 at 08:10, a Grevious Injury mish..."
49998,"On 19/01/2021 at 21:50, a Grevious Injury coll..."


In [3]:
df.duplicated().sum()

0

## Import Necessary Libraries

In [5]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

## 1. Lowercasing:
Convert all text to lowercase to maintain consistency.

In [25]:
df['Accident Report'] = df['Accident Report'].str.lower()


## 2.Remove Punctuation:
Remove punctuation marks from the text, which may not be useful for analysis.

In [28]:
df['Accident Report'] = df['Accident Report'].str.replace(r'\s+',' ',regex=True).str.strip()

## 3. Remove Stopwords
Stopwords are commonly used words that don’t add much meaning to the text, like “the”, “and”, etc. Removing them is common in text analysis.

In [33]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

df['Accident Report'] = df['Accident Report'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sulai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        01/12/2019 05:17, minor injury collision occur...
1        31/12/2019 00:40, fatal crash occurred thiruva...
2        24/12/2019 12:15, grevious injury crash occurr...
3        01/01/2019 01:58, grevious injury accident occ...
4        17/01/2019 23:21, grevious injury accident occ...
                               ...                        
49995    09/01/2020 18:04, grevious injury incident occ...
49996    24/01/2022 05:01, fatal incident occurred koll...
49997    29/11/2019 08:10, grevious injury mishap occur...
49998    19/01/2021 21:50, grevious injury collision oc...
49999    10/01/2019 19:01, grevious injury mishap occur...
Name: Accident Report, Length: 50000, dtype: object