In [1]:
import pandas as pd
import numpy as np

1. Reading data & preprocessing

In [2]:
#1. reading the police dataset

file_path = 'supplied-data/police_press_releases.csv'
df = pd.read_csv(file_path, usecols=['date_published', 'content'], parse_dates = ['date_published'])

In [3]:
#1.2 Reading local news articles
from datetime import datetime, timedelta

file_path_ln = 'supplied-data/local_news_articles.csv'
local_news_df = pd.read_csv(file_path_ln, usecols=['publish_date', 'content'], parse_dates = ['publish_date'])

In [4]:
#2. A few rows in the content column of the Local news articles report multiple accidents. Flagging these rows for splitting later on
import re
patterns = [
    r"\b(two|three|\d+)\s+(separate\s+)?(accidents|crashes|collisions|incidents)\b",
    r"\bin a separate\s+(accident|incident|collision|crash)\b",
    r"\bin\s+separate\s+(accidents|crashes|collisions|incidents)\b",
    r"\bin\s+(another|a\s+second|a\s+third)\s+(accident|incident|collision|crash)\b",
    r"\bthe\s+(second|third|another)\s+(accident|incident|collision|crash)\b",
    r"\b(two|three)\s+(traffic\s+)?(incidents|accidents|collisions|crashes)\b",
]
compiled = [re.compile(p, flags=re.IGNORECASE) for p in patterns]

flags = []
for content in local_news_df['content'].astype(str):
    is_multi = any(rx.search(content) for rx in compiled)
    flags.append('1' if is_multi else '')

# Add the flag column
local_news_df['multi_accident_flag'] = flags
local_news_df['multi_accident_flag'].value_counts()


multi_accident_flag
     302
1     19
Name: count, dtype: int64

In [5]:
#3. Splitting multi-accident articles into separate rows
#flag detection
def is_flag_one(x):
    try:
        if isinstance(x, (int, float)):
            return float(x) == 1.0
        s = str(x).strip()
        return s in {'1','1.0','True','true'}
    except Exception:
        return False

flagged_mask = local_news_df['multi_accident_flag'].apply(is_flag_one)

# Exact markers 
markers = [
    r"in a separate accident",
    r"another accident",
    r"the second accident",
    r"hours later",  # will also catch 'two hours later', 'an hour later' as substring
]
pattern = re.compile(r"(?i)" + r"|".join(markers))

rows_out = []

for idx, row in local_news_df.iterrows():
    content = str(row['content']) if pd.notna(row['content']) else ''
    if flagged_mask.iloc[idx]:
        matches = list(pattern.finditer(content))
        if matches:
            # Compute segment boundaries
            starts = [0] + [m.end() for m in matches]
            ends = [matches[0].start()] + [matches[i+1].start() if i+1 < len(matches) else len(content) for i in range(len(matches))]
            segments = []
            # First segment (before first marker): first accident
            first_seg = content[starts[0]:ends[0]].strip()
            if first_seg:
                segments.append(('before_marker', first_seg))
            # Subsequent segments: text AFTER each marker
            for i, m in enumerate(matches):
                seg_text = content[starts[i+1]:ends[i+1]].strip()
                if seg_text:
                    segments.append((m.group(0).lower(), seg_text))
            total = len(segments)
            for part_no, (marker_name, seg_text) in enumerate(segments, start=1):
                new_row = row.copy()
                new_row['content'] = seg_text
                new_row['accident_part_number'] = part_no
                new_row['accident_parts_total'] = total
                rows_out.append(new_row)
        else:
            rows_out.append(row)
    else:
        rows_out.append(row)

out_df = pd.DataFrame(rows_out)

local_news_df = out_df
local_news_df.drop(['multi_accident_flag', 'accident_part_number', 'accident_parts_total'], axis=1, inplace=True)
local_news_df


Unnamed: 0,publish_date,content
0,2024-12-07,A motorist claims his car mirror was shattered...
1,2024-12-09,The PN on Monday slammed the government for di...
2,2024-12-11,A motorcyclist was rushed to hospital in a cri...
3,2024-12-12,A private contractor who placed a skip on St P...
4,2024-12-14,A 29-year-old man and 17-year-old girl were cr...
...,...,...
316,2025-10-12,The Msida flyover will open by the end of the ...
317,2025-10-13,The following are the top stories in Malta's n...
318,2025-10-13,"Traffic, parking and public transport-related ..."
319,2025-10-14,A court has sharply criticised the police and ...


In [6]:
# 4.1. Extracting accident time from content and normalizing to 24h format for each dataset separately
# police dataset has a consistent format for time extraction

def extract_time(text):
    match = re.search(r'\b(\d{1,2})(?::?)(\d{2})\s*hrs\b', text, flags=re.IGNORECASE)
    if match:
        hour = int(match.group(1))
        minute = int(match.group(2))
        # Validate 24-hour time
        if 0 <= hour <= 23 and 0 <= minute <= 59:
            return f"{hour:02d}:{minute:02d}"
    return None
df['time_24h'] = df['content'].apply(extract_time)
print(df['time_24h'])


0      09:30
1      18:30
2      08:00
3      18:00
4      20:45
       ...  
106    09:00
107    10:45
108    08:00
109    22:15
110    07:00
Name: time_24h, Length: 111, dtype: object


In [7]:
#4.2 extracting time in 24h format from local news articles (used Copilot to help with the code)
#Local news articles do not use a consistent formart to report time of accident, unlike the police reports, so the code caters for multiple formats
#some news articles have "Updated at " banners before reporting time of the accident. Stripping these first

#extracting time in 24h format (used Copilot to help with the code)

# Strip leading "Updated …" banners
UPDATED_HDR = re.compile(r"^Updated\s+[A-Za-z]*\s*\d{1,2}[:.]?\d{0,2}\s?(?:am|pm)\b[:,]?\s*", re.IGNORECASE)
def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ''
    return UPDATED_HDR.sub('', s).strip()


# Helper to normalize various time strings to 24-hour HH:MM

def to_24h(t):
    if not isinstance(t, str) or not t.strip():
        return ''
    s = t.strip().lower()
    # Replace dot separator with colon for minutes (e.g., 5.30pm -> 5:30pm)
    s = re.sub(r'(\d{1,2})\.(\d{2})', r'\1:\2', s)
    # Ensure there's no extra spaces
    s = re.sub(r'\s+', ' ', s)

    try:
        # Cases with am/pm and possibly minutes
        if re.search(r'(am|pm)', s):
            # Add :00 if only hour is present (e.g., 5pm -> 5:00pm)
            if re.match(r'^\d{1,2}\s?(am|pm)$', s):
                s = re.sub(r'^(\d{1,2})\s?(am|pm)$', r'\1:00\2', s)
            # Remove spaces before am/pm: '2.15 pm' -> '2:15pm'
            s = re.sub(r'\s?(am|pm)$', r'\1', s)
            dt = datetime.strptime(s, '%I:%M%p')
            return dt.strftime('%H:%M')
        else:
            # 24h format like 10:30 or 7:05
            if re.match(r'^\d{1,2}:\d{2}$', s):
                # Normalize to two-digit hour
                h, m = s.split(':')
                return f"{int(h):02d}:{m}"
    except Exception:
        return ''
    return ''

# Create normalized column
if 'accident_time' not in local_news_df.columns:
    # Fallback: compute from content if previous step wasn't run
    import re
    time_patterns = [
        r"\b(?:at|around|about|reported at|occurred at|happened at|the accident was reported at|the incident was reported at|was reported at|was informed .* at|crash was reported at|reported to the police at|police (?:said|reported) .* at|the police (?:said|were informed).* at)\s*(\d{1,2}[:\.]?\d{0,2}\s?(?:am|pm))\b",
        r"\b(?:at|around|about|reported at|occurred at|happened at|the accident was reported at|the incident was reported at|was reported at|was informed .* at|crash was reported at|reported to the police at|police (?:said|reported) .* at|the police (?:said|were informed).* at)\s*(\d{1,2}:\d{2})\b",
        r"\b(?:at|around|about|reported at|occurred at|happened at)\s*(\d{1,2}\s?(?:am|pm))\b",
    ]
    compiled = [re.compile(p, flags=re.IGNORECASE) for p in time_patterns]
    def extract_time(text: str) -> str:
        if not isinstance(text, str):
            return ''
        text2 = re.sub(r"Updated\s+[A-Za-z]*\s*\d{1,2}[:\.]?\d{0,2}\s?(?:am|pm)\b[:,]?\s*", "", text, flags=re.IGNORECASE)
        text2 = re.sub(r"Updated\s+\d{1,2}[:\.]?\d{0,2}\s?(?:am|pm)\b[:,]?\s*", "", text2, flags=re.IGNORECASE)
        for rx in compiled:
            m = rx.search(text2)
            if m:
                return m.group(1).strip()
        m2 = re.search(r"\b(\d{1,2}[:\.]\d{2}\s?(?:am|pm))\b", text2, flags=re.IGNORECASE)
        if m2:
            return m2.group(1).strip()
        m3 = re.search(r"\b(\d{1,2}:\d{2})\b", text2)
        if m3:
            return m3.group(1).strip()
        return ''
    local_news_df['accident_time'] = local_news_df['content'].apply(extract_time)


local_news_df['time_24h'] = local_news_df['accident_time'].apply(to_24h)

# extracting time that's mentioned as around noon/midnight etc
# Phrase → 24h mapping
PHRASE_MAP = [
    (re.compile(r"\b(?:around|about)\s+(?:noon|midday)\b", re.IGNORECASE), '12:00'),
    (re.compile(r"\b(?:noon|midday)\b", re.IGNORECASE), '12:00'),
    (re.compile(r"\b(?:around|about)\s+midnight\b", re.IGNORECASE), '00:00'),
    (re.compile(r"\bmidnight\b", re.IGNORECASE), '00:00'),
    (re.compile(r"\b(?:just\s+|shortly\s+)?before\s+midnight\b", re.IGNORECASE), '23:30'),
    (re.compile(r"\b(?:just\s+|shortly\s+)?before\s+(?:noon|midday)\b", re.IGNORECASE), '11:30'),
    (re.compile(r"\baround\s+the\s+stroke\s+of\s+midnight\b", re.IGNORECASE), '00:00'),
]

def extract_approx_time(text: str) -> str:
    t = clean_text(text)
    for rx, hhmm in PHRASE_MAP:
        if rx.search(t):
            return hhmm
    return ''

idx_sel_rows = local_news_df.loc[((local_news_df['time_24h']== ' '))].index #identifying rows without time
local_news_df.loc[idx_sel_rows, 'time_24h'] = local_news_df.loc[idx_sel_rows, 'content'].apply(extract_approx_time) #applying this function to rows without time only
local_news_df.drop('accident_time', axis=1, inplace=True)


local_news_df['time_24h']

0      17:00
1           
2      17:00
3      13:00
4      17:30
       ...  
316         
317         
318         
319         
320         
Name: time_24h, Length: 328, dtype: object

In [8]:
# removing rows without time information from local news dataset since these usually report to past accidents or no specific accidents 
blank_time = local_news_df[local_news_df['time_24h'].isna() | (local_news_df['time_24h'].astype(str).str.strip() == '')]
blank_indices = blank_time.index.tolist()
#blank_indices

local_news_df = local_news_df.drop(blank_indices)
local_news_df

Unnamed: 0,publish_date,content,time_24h
0,2024-12-07,A motorist claims his car mirror was shattered...,17:00
2,2024-12-11,A motorcyclist was rushed to hospital in a cri...,17:00
3,2024-12-12,A private contractor who placed a skip on St P...,13:00
4,2024-12-14,A 29-year-old man and 17-year-old girl were cr...,17:30
4,2024-12-14,", police were busy responding to another serio...",19:45
...,...,...,...
307,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00
309,2025-10-08,The families of the two couples killed in a ho...,09:30
311,2025-10-08,A Polish man has died after getting into diffi...,13:00
313,2025-10-09,A motorcyclist was left with serious injuries ...,09:30


In [9]:
#dropping rows in the police dataset that do not have time because they are exceptions and are unlike the rest of the records
blank_time = df[df['time_24h'].isna() | (df['time_24h'].astype(str).str.strip() == '')]
blank_indices = blank_time.index.tolist()
blank_indices

df = df.drop(blank_indices)
df

Unnamed: 0,date_published,content,time_24h
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30
1,2025-06-20,"Yesterday, at around 1830hrs, the Police were ...",18:30
2,2025-05-12,"Today, at around 0800hrs, the Police were info...",08:00
3,2025-07-30,"Yesterday, at around 1800hrs, the Police were ...",18:00
4,2025-04-07,"Yesterday, at around quarter to nine in the ev...",20:45
...,...,...,...
106,2025-02-05,"A 52-year-old man and residing in Ħaż-Żebbuġ, ...",09:00
107,2024-12-19,"Today, at around 1045hrs, the Police were info...",10:45
108,2025-03-16,"Today, at around 0800hrs, the Police were info...",08:00
109,2025-07-18,"Yesterday, at around 2215 hrs, the Police were...",22:15


In [10]:
#5. Concatenating the two dataset into a single dataframe
combined_df = pd.concat([
    df.rename(columns={'date_published': 'publish_date'}),local_news_df], ignore_index=True)

combined_df


Unnamed: 0,publish_date,content,time_24h
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30
1,2025-06-20,"Yesterday, at around 1830hrs, the Police were ...",18:30
2,2025-05-12,"Today, at around 0800hrs, the Police were info...",08:00
3,2025-07-30,"Yesterday, at around 1800hrs, the Police were ...",18:00
4,2025-04-07,"Yesterday, at around quarter to nine in the ev...",20:45
...,...,...,...
285,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00
286,2025-10-08,The families of the two couples killed in a ho...,09:30
287,2025-10-08,A Polish man has died after getting into diffi...,13:00
288,2025-10-09,A motorcyclist was left with serious injuries ...,09:30


2. Correcting date (some rows report accidents that happened a day before the publication date) 

In [11]:
#2.1 adding a column with values same day, previous day, undetermined based on the content of the article and the publish date

same_day_pattern = re.compile(r"\b(today|same day)\b", re.IGNORECASE)
previous_day_pattern = re.compile(r"\b(yesterday|previous day|last night|before midnight)\b", re.IGNORECASE)
weekday_pattern = re.compile(r"\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b", re.IGNORECASE)

accident_day_list = []
for idx, row in combined_df.iterrows():
    content = str(row['content'])
    publish_date = row['publish_date']
    classification = "undetermined"

    if same_day_pattern.search(content):
        classification = "same day"
    elif previous_day_pattern.search(content):
        classification = "previous day"
    else:
        weekday_match = weekday_pattern.search(content)
        if weekday_match and publish_date is not pd.NaT:
            mentioned_day = weekday_match.group(0).lower()
            weekdays = {'monday': 0, 'tuesday': 1, 'wednesday': 2, 'thursday': 3, 'friday': 4, 'saturday': 5, 'sunday': 6}
            mentioned_num = weekdays.get(mentioned_day)
            publish_num = publish_date.weekday()
            if mentioned_num == publish_num:
                classification = "same day"
            elif (publish_num - mentioned_num) % 7 == 1:
                classification = "previous day"
    accident_day_list.append(classification)

combined_df['accident_day'] = accident_day_list
combined_df

Unnamed: 0,publish_date,content,time_24h,accident_day
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,same day
1,2025-06-20,"Yesterday, at around 1830hrs, the Police were ...",18:30,previous day
2,2025-05-12,"Today, at around 0800hrs, the Police were info...",08:00,same day
3,2025-07-30,"Yesterday, at around 1800hrs, the Police were ...",18:00,previous day
4,2025-04-07,"Yesterday, at around quarter to nine in the ev...",20:45,previous day
...,...,...,...,...
285,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00,undetermined
286,2025-10-08,The families of the two couples killed in a ho...,09:30,undetermined
287,2025-10-08,A Polish man has died after getting into diffi...,13:00,same day
288,2025-10-09,A motorcyclist was left with serious injuries ...,09:30,same day


In [12]:
#2.2 correcting the date according to the accident_day column, adding day_of_week column
pub_dates = combined_df['publish_date']

# Compute corrected_date per instruction
corr_dates = []
for i, row in combined_df.iterrows():
    pub = pub_dates.iloc[i]
    day = str(row.get('accident_day', '')).strip().lower()
    if pd.isna(pub):
        corr_dates.append('')
    else:
        if day == 'previous day':
            corr_dates.append((pub - timedelta(days=1)).isoformat())
        else:
            corr_dates.append(pub.isoformat())

combined_df['corrected_date'] = corr_dates
combined_df['corrected_date'] = pd.to_datetime(combined_df['corrected_date'])
combined_df['day_of_week'] = combined_df['corrected_date'].dt.day_name()
combined_df.drop(['accident_day'], axis=1, inplace=True)
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday
1,2025-06-20,"Yesterday, at around 1830hrs, the Police were ...",18:30,2025-06-19,Thursday
2,2025-05-12,"Today, at around 0800hrs, the Police were info...",08:00,2025-05-12,Monday
3,2025-07-30,"Yesterday, at around 1800hrs, the Police were ...",18:00,2025-07-29,Tuesday
4,2025-04-07,"Yesterday, at around quarter to nine in the ev...",20:45,2025-04-06,Sunday
...,...,...,...,...,...
285,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00,2025-10-07,Tuesday
286,2025-10-08,The families of the two couples killed in a ho...,09:30,2025-10-08,Wednesday
287,2025-10-08,A Polish man has died after getting into diffi...,13:00,2025-10-08,Wednesday
288,2025-10-09,A motorcyclist was left with serious injuries ...,09:30,2025-10-09,Thursday


In [13]:
# creating a timestamp column and filtering out duplicates (accidents reported both in the police reports and local news articles
#keepinh only the reports in the police dataset)
combined_df['time_24h_dt'] = pd.to_datetime(combined_df['time_24h'], format = '%H:%M')
combined_df['time_24h_dt'] = combined_df['time_24h_dt'].dt.time
combined_df['timestamp'] = [datetime.combine(a, b) for a, b in zip(combined_df['corrected_date'], combined_df['time_24h_dt'])]
combined_df = combined_df.drop_duplicates(subset=['timestamp'], keep='first') #dropping duplicates, keeping the records from the police dataset
combined_df = combined_df.sort_values(by='timestamp')
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp
108,2024-12-07,A motorist claims his car mirror was shattered...,17:00,2024-12-07,Saturday,17:00:00,2024-12-07 17:00:00
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00
110,2024-12-12,A private contractor who placed a skip on St P...,13:00,2024-12-11,Wednesday,13:00:00,2024-12-11 13:00:00
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00
...,...,...,...,...,...,...,...
285,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00,2025-10-07,Tuesday,08:00:00,2025-10-07 08:00:00
286,2025-10-08,The families of the two couples killed in a ho...,09:30,2025-10-08,Wednesday,09:30:00,2025-10-08 09:30:00
287,2025-10-08,A Polish man has died after getting into diffi...,13:00,2025-10-08,Wednesday,13:00:00,2025-10-08 13:00:00
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00


2. Flagging rows that do not report traffic accidents

In [14]:
# these rows will be filtered out later on

TRAFFIC_WORDS = re.compile(
    r"\b(traffic accident|driven|driver|driving|road accident|collision|crash|hit[- ]?by|run[- ]?over|slammed|lost control|skidded|overturned)\b",
    re.IGNORECASE
)

VEHICLE_WORDS = re.compile(
    r"\b(car|vehicle|van|bus|truck|lorry|motorcycle|motorbike|motor[- ]?cycle|bike|scooter|e-scooter|bicycle|cyclist|pedestrian)\b",
    re.IGNORECASE
)
CAR_BRANDS = re.compile(
    r"\b(peugeot|volkswagen|vw|mercedes|bmw|audi|toyota|kia|ford|hyundai|nissan|mazda|honda|vauxhall|lexmoto|mercedes|bmw|toyota|nissan|skoda|mazda|ford|renault|citroen|kia)\b",
    re.IGNORECASE
)

KEYWORDS = re.compile(r"\b(crowdfunding|data|permit|fundraiser|memory|tribute|court|croatia|sicily|donate|alejandro)\b", re.IGNORECASE) #report past accidents or accidents abroad

def refers_to_traffic_accident(text: str) -> int:
    if not isinstance(text, str):
        return 0
    #t = strip_updated(text)
    t = text.strip()
    has_traffic_event = bool(TRAFFIC_WORDS.search(t))
    has_vehicle = bool(VEHICLE_WORDS.search(t) or CAR_BRANDS.search(t))
    has_keywords = bool(KEYWORDS.search(t))
    return 1 if has_traffic_event and has_vehicle and not has_keywords else 0

combined_df['accident_flag'] = combined_df['content'].apply(refers_to_traffic_accident)
combined_df['accident_flag'].value_counts()


accident_flag
1    169
0     43
Name: count, dtype: int64

3. Extracting injuiry severity, affected party and the type of accident

In [15]:
# detect injury severity
patterns = {
    'no injuries': re.compile(r"\b(no injuries|uninjured|escaped injury)\b", re.IGNORECASE),
    'minor': re.compile(r"\b(minor|minor injuries|slight injuries|light injuries)\b", re.IGNORECASE),
    'serious': re.compile(r"\b(serious|seriously injured)\b", re.IGNORECASE),
    'grievous': re.compile(r"\b(grievous|grievously injured)\b", re.IGNORECASE),
    'critical': re.compile(r"\b(critical|critically injured|critical condition)\b", re.IGNORECASE),
    'death': re.compile(r"\b(died|death|dead|fatal|succumbed|killed)\b", re.IGNORECASE)
}
severity_list = []
for content in combined_df['content'].astype(str):
    severity = 'unknown'
    for category, pattern in patterns.items():
        if pattern.search(content):
            severity = category
            break
    severity_list.append(severity)

combined_df['injury_severity'] = severity_list


# Build regex for death-related terms (case-insensitive), to improve detection of fatal accidents
terms = [
    r"killed",
    r"succumbed",
    r"fatal",
    r"fatally",
    r"died",
    r"dead",
    r"lost\s+(?:his|her)\s+life"
]

death_pattern = re.compile(r"\b(" + r"|".join(terms) + r")\b", re.I)

# Flag function
def flag_death_terms(text):
    return 1 if death_pattern.search(str(text)) else 0

combined_df['flag_death_terms'] = combined_df['content'].apply(flag_death_terms)
idx_death_flag = combined_df.loc[(combined_df['flag_death_terms']  == 1) & (combined_df['injury_severity'] != 'death')].index
combined_df.loc[idx_death_flag, 'injury_severity'] = 'death'
combined_df.drop(['flag_death_terms'], axis=1, inplace=True)

#using the flag column to improve detection of fatalities

combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity
108,2024-12-07,A motorist claims his car mirror was shattered...,17:00,2024-12-07,Saturday,17:00:00,2024-12-07 17:00:00,1,unknown
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous
110,2024-12-12,A private contractor who placed a skip on St P...,13:00,2024-12-11,Wednesday,13:00:00,2024-12-11 13:00:00,0,serious
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious
...,...,...,...,...,...,...,...,...,...
285,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00,2025-10-07,Tuesday,08:00:00,2025-10-07 08:00:00,0,death
286,2025-10-08,The families of the two couples killed in a ho...,09:30,2025-10-08,Wednesday,09:30:00,2025-10-08 09:30:00,0,death
287,2025-10-08,A Polish man has died after getting into diffi...,13:00,2025-10-08,Wednesday,13:00:00,2025-10-08 13:00:00,0,death
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious


In [16]:
#detecting the affected party of the accident (injuired, hospitalised etc)
#detecting pedestrians, cyclists (escooter is also included here) and motorcyclists. The default cathegory will be "driver"
#limitation: this method picks only one affected party, whereas some rows report multiple affected parties

# terms
PED_TERMS = r"\b(pedestrian|walker|passer[- ]?by|passerby|foot\s*traveller|on foot)\b"
CYCLIST_TERMS = r"\b(cyclist|bicyclist|bicycle|push\s?bike|pedal\s?cycle|e\-?bike|ebike|e-scooter|escooter)\b" #escooter is also included here
MOTORCYCLE_TERMS = r"\b(motorcyclist|motorcycle|motor\s?bike|motorbike|bike\b(?!\s?ride)|riding|biker)\b"

# phrases indicating being hurt
HURT_TERMS = r"\b(hurt|minor|treated|injur(?:ed|ies)|serious(?:ly)?|grievously|critically|run over|hit|struck|died|dead|death|fatally|deadly|knocked down|overturned|lost|hospitalis(?:ed|ed))\b"

# traffic context keywords
TRAFFIC_TERMS = r"\b(road|street|triq|lane|bypass|tunnel|roundabout|junction|seafront|coast road|regional road|wall|traffic|collision|accident|crash|bollard)\b"
IGNORE = re.compile(r"\b(rota)\b", re.IGNORECASE) #report past accidents or accidents abroad

PED_RX = re.compile(PED_TERMS, re.IGNORECASE)
CYCLIST_RX = re.compile(CYCLIST_TERMS, re.IGNORECASE)
MOTO_RX = re.compile(MOTORCYCLE_TERMS, re.IGNORECASE)

HURT_RX = re.compile(HURT_TERMS, re.IGNORECASE)
TRAFFIC_RX = re.compile(TRAFFIC_TERMS, re.IGNORECASE)

def mark_motorcyclist(text: str) -> str:
    #t = strip_updated(text)
    t = text.strip()
    if MOTO_RX.search(t) and HURT_RX.search(t) and TRAFFIC_RX.search(t) and not CYCLIST_RX.search(t):
        return 'motorcyclist'
    return ''

def mark_cyclist(text: str) -> str:
    #t = strip_updated(text)
    t = text.strip()
    # Must mention a cyclist and harm in traffic context, but NOT a motorcyclist
    if CYCLIST_RX.search(t) and HURT_RX.search(t) and TRAFFIC_RX.search(t) and not IGNORE.search(t) and not MOTO_RX.search(t):
        return 'cyclist'
    # Catch constructions like "was cycling / on a bicycle ... and was hit"
    cycling_phrase = re.search(
        r"\b(cycling|on\s+a\s+bicycle|on\s+his\s+bicycle|on\s+her\s+bicycle|on\s+a\s+push\s?bike)\b",
        t, re.IGNORECASE
    )
    if cycling_phrase and HURT_RX.search(t) and TRAFFIC_RX.search(t) and not MOTO_RX.search(t):
        return 'cyclist'
    return ''

def mark_pedestrian(text: str) -> str:
    #t = strip_updated(text)
    t = text.strip()
    if PED_RX.search(t) and HURT_RX.search(t) and TRAFFIC_RX.search(t):
        return 'pedestrian'
    # also catch constructions like "a woman was run over" without the word pedestrian
    if re.search(r"\b(run over|knocked down|hit|struck)\b", t, re.IGNORECASE) and \
       re.search(r"\b(man|woman|boy|girl|elderly|child|teenager|youth|person)\b", t, re.IGNORECASE) and \
       TRAFFIC_RX.search(t):
        # ensure it's not inside vehicle context (driver etc.) by simple heuristic
        if not re.search(r"\b(driver|motorist|cyclist|motorcyclist)\b", t, re.IGNORECASE):
            return 'pedestrian'
    return ''

# Apply
combined_df['pedestrian'] = combined_df['content'].apply(mark_pedestrian)
combined_df['motorcyclist'] = combined_df['content'].apply(mark_motorcyclist)
combined_df['cyclist'] = combined_df['content'].apply(mark_cyclist)

idx_pedestrian = combined_df.loc[(combined_df['pedestrian'] == 'pedestrian') & (combined_df['motorcyclist'] != 'motorcyclist') & (combined_df['cyclist'] != 'cyclist')].index
#idx_pedestrian
combined_df.loc[idx_pedestrian, 'affected_party'] = 'pedestrian'

idx_motorcyclist = combined_df.loc[(combined_df['motorcyclist'] == 'motorcyclist')].index
combined_df.loc[idx_motorcyclist, 'affected_party'] = 'motorcyclist'

idx_cyclist = combined_df.loc[(combined_df['cyclist'] == 'cyclist') & (combined_df['motorcyclist'] != 'motorcyclist')].index
combined_df.loc[idx_cyclist, 'affected_party'] = 'cyclist'

idx_driver = combined_df.loc[(combined_df['pedestrian'] != 'pedestrian') & (combined_df['motorcyclist'] != 'motorcyclist') & (combined_df['cyclist'] != 'cyclist') & (combined_df['accident_flag'] == 1)].index
combined_df.loc[idx_driver, 'affected_party'] = 'driver'

combined_df.drop(['pedestrian', 'motorcyclist', 'cyclist'], axis=1, inplace=True)

combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party
108,2024-12-07,A motorist claims his car mirror was shattered...,17:00,2024-12-07,Saturday,17:00:00,2024-12-07 17:00:00,1,unknown,driver
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian
110,2024-12-12,A private contractor who placed a skip on St P...,13:00,2024-12-11,Wednesday,13:00:00,2024-12-11 13:00:00,0,serious,motorcyclist
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian
...,...,...,...,...,...,...,...,...,...,...
285,2025-10-07,A crowdfunding campaign for a nurse left in a ...,08:00,2025-10-07,Tuesday,08:00:00,2025-10-07 08:00:00,0,death,pedestrian
286,2025-10-08,The families of the two couples killed in a ho...,09:30,2025-10-08,Wednesday,09:30:00,2025-10-08 09:30:00,0,death,
287,2025-10-08,A Polish man has died after getting into diffi...,13:00,2025-10-08,Wednesday,13:00:00,2025-10-08 13:00:00,0,death,
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist


In [17]:
#dropping rows that report no accidents or have missing time value
idx_drop_rows = combined_df.loc[(combined_df['accident_flag']== 0) | (combined_df['injury_severity']== 'unknown') | (combined_df['affected_party'].isna())].index #identifying rows to be dropped
combined_df = combined_df.drop(idx_drop_rows)
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver
112,2024-12-14,", police were busy responding to another serio...",19:45,2024-12-14,Saturday,19:45:00,2024-12-14 19:45:00,1,serious,pedestrian
...,...,...,...,...,...,...,...,...,...,...
84,2025-10-02,"This morning, at around 0530 hrs, the Police w...",05:30,2025-10-02,Thursday,05:30:00,2025-10-02 05:30:00,1,grievous,motorcyclist
26,2025-10-05,"Late yesterday evening, at around 11:00 p.m. (...",23:00,2025-10-04,Saturday,23:00:00,2025-10-04 23:00:00,1,minor,driver
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist


In [18]:
#flagging vehicle types involved in the accident. Creating separate columns for car, motorbike and larger vehicles (truck, bus, lorry, pickup, van)
#this way, we know which vehicle types were involved into collisions and running over pedestrians/cyclists


# Work with lower-cased content for simple token checks
text = combined_df['content'].astype(str).str.lower()

# Cars: generic words and common makes
car_tokens = [
    ' car', ' cars', ' auto', ' sedan', ' hatchback','vehicle',
    ' coupe', ' convertible', ' saloon', 'suzuki', 'vauxhall', 'audi',
    ' toyota', 'aqua',' mazda', ' ford ', ' bmw', 'bmw 525','honda', 'mitsubishi', 'alfa romeo','chevrolet','aveo'
    ' seat', ' opel', ' jeep', ' subaru', ' lexus', ' jaguar', 'land rover',
    'chrysler','dodge','fiat','golf','audi','dacia', 'maserati','levante',
    ' mercedes', ' volkswagen', ' vw ', ' renault', 'captur', ' peugeot',
    ' skoda', 'volvo', ' nissan', ' smart ', ' jeep', ' kia', 
    ' hyundai', ' seat', ' citroen', ' isuzu', ' fiat', 'nissan', 'citroen', 'smart', 'fiesta', 'civic'
]

# Motorbikes: generic words and makes
bike_tokens = [
    ' motorbike', ' motorbikes', ' motorcycle', ' motorcycles', 'motorcyclist',
    ' bike', ' bikes',' yamaha', 'vespa','daelim',' kawasaki', 'kawazaki', ' aprilia', ' benelli', ' piaggio',
    ' kymco', ' harley', ' ducati', ' triumph', 'lexmoto'
]

# Larger vehicles: vans, trucks, buses/coaches (+ fleet models)
large_tokens = [
    ' bus', ' buses', ' coach', ' coaches', ' van', ' vans',
    ' minivan', ' minivans', ' minibus', ' minibuses', ' truck', 'double-decker',
    ' trucks', ' lorry', ' lorries', ' trailer', ' trailers','optare',
    ' pickup', ' pick-up']
    
# Flagging the vehicles 
car_flag    = text.apply(lambda s: int(any(tok in s for tok in car_tokens)))
bike_flag   = text.apply(lambda s: int(any(tok in s for tok in bike_tokens)))
large_flag  = text.apply(lambda s: int(any(tok in s for tok in large_tokens)))

combined_df['car']             = car_flag
combined_df['motorbike']       = bike_flag
combined_df['larger_vehicle']  = large_flag
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party,car,motorbike,larger_vehicle
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian,1,0,0
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist,1,1,0
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian,1,0,0
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver,1,0,0
112,2024-12-14,", police were busy responding to another serio...",19:45,2024-12-14,Saturday,19:45:00,2024-12-14 19:45:00,1,serious,pedestrian,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2025-10-02,"This morning, at around 0530 hrs, the Police w...",05:30,2025-10-02,Thursday,05:30:00,2025-10-02 05:30:00,1,grievous,motorcyclist,1,1,0
26,2025-10-05,"Late yesterday evening, at around 11:00 p.m. (...",23:00,2025-10-04,Saturday,23:00:00,2025-10-04 23:00:00,1,minor,driver,1,0,0
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian,1,0,0
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist,1,1,0


In [19]:
#type of accident detection (Copilot coding help, modified to fit the context better)

from typing import List

# ---------------------------------------------------------------------
# 1) Lexicons and regex patterns
## ---------------------------------------------------------------------

VEHICLES = r"""
car|van|truck|pickup|vehicle|bus|jeep|coach|double-decker|minivan|school\s*van|vauxhall|lexmoto|
mercedes|bmw|toyota|nissan|peugeot|skoda|mazda|ford|renault|captur|citroen|kia|optare|
suzuki|isuzu|honda|kymco|aprilia|yamaha|benelli|dacia|smart|passo|mitsubishi|alfa|romeo|chevrolet|aveo|
chrysler|dodge|fiat|golf|audi|volkswagen|vw|dacia|hyundai|honda|maserati|levante|
sprinter|master|fiesta|civic|swift|fit|starlet|funcargo|aygo|hilux|
volvo|jaguar|optare|otokar|dac|vitz|demio|cx-?3|c180|xf|b\s*max|
gpd125-?a|tweet|vespa|piaggio
"""

# Fixed obstacles (roadside infrastructure and static objects)
OBSTACLES = r"""
wall|tree|tunnel|light\s*post|lamp\s*post|barrier|crash\s*barrier|bollard|
building|house|signpost|pillar|electricity\s*pole|skip|stationary\s*skip|
rubble\s*wall|bridge|guardrail|canopy|
"""

# Person terms likely to indicate pedestrians (on foot)
PERSON_WORDS = r"""
pedestrian|man|woman|boy|girl|child|toddler|elderly|teen(?:ager)?|
people|aged|passer\s*by|passers\s*by|\d{1,3}\s*-?\s*year\s*-?\s*old
"""

# Rider/driver terms (exclude these from pedestrian struck logic)
RIDER_WORDS = r"""
cyclist|bicycl(?:e|ist)|motorcyclist|rider|driver
"""

# Accident/impact verbs and phrases
V_CRASH = r"crash(?:ed|es|ing)?|smashed\s+into|ramm(?:ed|ing)|struck|slammed|clip(?:ped|s)"
V_RUN_OVER = r"run(?:\s*-)over|ran\s*over|hit|run\s+over"
V_LOST_CONTROL = r"lost\s+control|went\s+out\s+of\s+control|careen(?:ed|ing)|skidd(?:ed|ing)|went\s+off\s+the\s+road|fell\s+off\s+(?:the\s+)?(?:bike|motorcycle|motorbike)"
V_OVERTURN = r"overturn(?:ed|s|ing)|rolled\s+over|flip(?:ped|s|ping)|ended\s+up\s+on\s+its\s+side"
V_COLL = r"collid(?:ed|es|ing)|collision(s)?|crash(?:ed|es|ing)?"

# Compile regexes
P_VEHICLE = re.compile(VEHICLES, re.I | re.X)
P_OBSTACLE = re.compile(OBSTACLES, re.I | re.X)
P_PERSON = re.compile(PERSON_WORDS, re.I | re.X)
P_RIDER = re.compile(RIDER_WORDS, re.I | re.X)
P_CRASH = re.compile(V_CRASH, re.I)
P_COLLISION = re.compile(V_COLL, re.I)
P_RUN_OVER = re.compile(V_RUN_OVER, re.I)
P_LOST_CONTROL = re.compile(V_LOST_CONTROL, re.I)
P_OVERTURN = re.compile(V_OVERTURN, re.I)
P_GENERIC_ACCIDENT = re.compile(r"\b(accident|incident|injur(?:y|ies))\b", re.I)

# ---------------------------------------------------------------------
# 2) Sentence splitter (simple, rule-based)
# ---------------------------------------------------------------------
def split_sentences(t: str) -> List[str]:
    if not isinstance(t, str):
        return []
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+|\n+", t) if s.strip()]

# ---------------------------------------------------------------------
# 3) Per-sentence classification
#    Returns granular flags for the sentence.
# ---------------------------------------------------------------------
def classify_sentence(s: str):
    s_low = s.lower()
    has_vehicle = bool(P_VEHICLE.search(s_low))
    has_obstacle = bool(P_OBSTACLE.search(s_low))
    has_person = bool(P_PERSON.search(s_low))
    has_rider = bool(P_RIDER.search(s_low))
    has_crash = bool(P_CRASH.search(s_low))
    has_collision = bool(P_COLLISION.search(s_low))
    has_run_over = bool(P_RUN_OVER.search(s_low))
    has_lost_control = bool(P_LOST_CONTROL.search(s_low))
    has_overturn = bool(P_OVERTURN.search(s_low))

    # Pedestrian struck/run-over: person + (hit/struck/run-over) + vehicle, excluding rider contexts
    #correcting running over pedestrian flag
    
    running_over_pedestrian = 1 if (has_person and has_run_over and not has_rider) else 0

    # Collision with fixed object: crash verb + obstacle
    crushed_into_obstacle = 1 if (has_crash and has_obstacle) else 0

    # Overturned: rollover phrases
    overturned = 1 if has_overturn else 0

    # Lost control: phrases directly
    lost_control = 1 if has_lost_control else 0

    # Vehicle–vehicle collision: collision verb + >= 2 mentions of vehicles in the sentence
    vehicle_mentions = len(re.findall(VEHICLES, s_low, re.I | re.X))
    collision_vv = 1 if ((has_collision or has_crash or has_run_over) and vehicle_mentions >= 2 and not running_over_pedestrian and not crushed_into_obstacle) else 0
   
    # Generic collision: collision verb + some vehicle/rider context, excluding pedestrian/obstacle cases
    collision_generic = 1 if (has_collision and has_rider and not (running_over_pedestrian or crushed_into_obstacle)) else 0
    #collision_generic = 1 if (has_collision and (has_vehicle or has_rider) and not running_over_pedestrian) else 0

    return {
        'collision_vv': collision_vv,
        'collision_generic': collision_generic,
        'running_over_pedestrian': running_over_pedestrian,
        'lost_control': lost_control,
        'crushed_into_obstacle': crushed_into_obstacle,
        'overturned': overturned
    }

# ---------------------------------------------------------------------
# 4) Per-article aggregation + precedence rules
#    Combines sentence flags and applies differentiation rules.
# ---------------------------------------------------------------------
def classify_article(text: str):
    sentences = split_sentences(text)
    flags = {
        'collision': 0,
        'running_over_pedestrian': 0,
        'lost_control': 0,
        'crushed_into_obstacle': 0,
        'overturned': 0,
        'other': 0
    }

    any_accident_mention = False
    saw_vv_collision = False

    for s in sentences:
        if P_GENERIC_ACCIDENT.search(s):
            any_accident_mention = True
        sf = classify_sentence(s)

        # accumulate non-collision flags
        flags['running_over_pedestrian'] |= sf['running_over_pedestrian']
        flags['lost_control'] |= sf['lost_control']
        flags['crushed_into_obstacle'] |= sf['crushed_into_obstacle']
        flags['overturned'] |= sf['overturned']

        # collision handling (prefer vehicle–vehicle over generic)
        if sf['collision_vv']:
            flags['collision'] = 1
            saw_vv_collision = True
        elif sf['collision_generic']:
            flags['collision'] |= 1

    # If obstacle crash detected and collision flagged only by generic sentences, drop collision unless vv also seen
    if flags['crushed_into_obstacle'] and flags['collision'] == 1:
        flags['crushed_into_obstacle'] = 0

    # "Other" only if there is accident context but no specific type matched
    if (flags['collision'] + flags['running_over_pedestrian'] + flags['lost_control'] +
        flags['crushed_into_obstacle'] + flags['overturned'] == 0) and any_accident_mention:
        flags['other'] = 1

    return flags

# ---------------------------------------------------------------------
# 5) Apply detection into separate columns (this is done because some accidents are hybrid - lost control and overturned etc.
# these columns can be passed to the ML models as parameters separately from the primary accident column or together with it)
# ---------------------------------------------------------------------
cols = ['collision','running_over_pedestrian','lost_control','crushed_into_obstacle','overturned','other']
new_flags = combined_df['content'].apply(classify_article)
for c in cols:
    combined_df[c] = new_flags.apply(lambda d: d[c])

#correcting running over pedestrian flag
idx_pdst = combined_df.loc[(combined_df['affected_party']  != 'pedestrian') & (combined_df['running_over_pedestrian'] == 1)].index
#combined_df.loc[idx_pdst, 'running_over_pedestrian'] = 0
combined_df.loc[idx_pdst, 'affected_party'] = 'pedestrian'

# ---------------------------------------------------------------------
# 6) Create mutually exclusive primary_accident_type (hierarchy)
#    Order can be adjusted to your preference.
# ---------------------------------------------------------------------
# Ensure numeric
for c in cols:
    combined_df[c] = pd.to_numeric(combined_df[c], errors='coerce').fillna(0).astype(int)

# Hierarchy / precedence for single-label assignment
precedence = [
    ('running_over_pedestrian', 'running over pedestrian'),
    ('overturned', 'overturned'),
    ('crushed_into_obstacle', 'crushed into an obstacle'),
    ('collision', 'collision'),
    ('lost_control', 'lost control of the vehicle'),
    ('other', 'other')
]

labels = []
for _, row in combined_df.iterrows():
    label = 'none'
    for col, name in precedence:
        if int(row[col]) == 1:
            label = name
            break
    labels.append(label)

combined_df['primary_accident_type'] = labels

code_map = {
    'running over pedestrian': 'PED',
    'overturned': 'ROLL',
    'crushed into an obstacle': 'FXOBJ',
    'collision': 'COLL',
    'lost control of the vehicle': 'LOC',
    'other': 'OTH',
    'none': 'NONE'
}
#pd.set_option('display.max_rows', None)
combined_df['primary_accident_type_code'] = combined_df['primary_accident_type'].map(code_map)
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party,...,motorbike,larger_vehicle,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian,...,0,0,0,1,0,0,0,0,running over pedestrian,PED
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist,...,1,0,0,0,1,1,0,0,crushed into an obstacle,FXOBJ
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian,...,0,0,1,1,0,0,0,0,running over pedestrian,PED
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver,...,0,0,1,0,0,0,0,0,collision,COLL
112,2024-12-14,", police were busy responding to another serio...",19:45,2024-12-14,Saturday,19:45:00,2024-12-14 19:45:00,1,serious,pedestrian,...,0,1,0,1,0,0,0,0,running over pedestrian,PED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2025-10-02,"This morning, at around 0530 hrs, the Police w...",05:30,2025-10-02,Thursday,05:30:00,2025-10-02 05:30:00,1,grievous,motorcyclist,...,1,0,1,0,0,0,0,0,collision,COLL
26,2025-10-05,"Late yesterday evening, at around 11:00 p.m. (...",23:00,2025-10-04,Saturday,23:00:00,2025-10-04 23:00:00,1,minor,driver,...,0,0,1,0,0,0,0,0,collision,COLL
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian,...,0,0,0,1,0,0,0,0,running over pedestrian,PED
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist,...,1,0,1,0,0,0,0,0,collision,COLL


In [20]:
combined_df['primary_accident_type_code'].value_counts()

primary_accident_type_code
COLL     60
PED      39
FXOBJ    32
OTH      10
ROLL      7
LOC       7
Name: count, dtype: int64

In [21]:
#correcting car flags PT 1: suzuki motorcycle is flagged as a car sometimes, so for the rows that report collisions between a motorcycle and a fixed object, 
#if the row reports presence of car and motorcycle together and the accident type is other, it is most likely a collison was not detected properly
idx_other = combined_df.loc[(combined_df['car']  == 1) & (combined_df['motorbike'] == 1) & (combined_df['primary_accident_type_code'] == 'OTH')].index
combined_df.loc[idx_other, 'collision'] = 1
combined_df.loc[idx_other, 'primary_accident_type'] = 'collision'
combined_df.loc[idx_other, 'primary_accident_type_code'] = 'COLL'

idx_oth = combined_df.loc[combined_df['primary_accident_type_code'] == 'OTH'].index
combined_df.loc[idx_oth, 'collision'] = 1
combined_df.loc[idx_other, 'primary_accident_type'] = 'collision'
combined_df.loc[idx_other, 'primary_accident_type_code'] = 'COLL'

#Honda notorcycle is picked as a car, correcting this by removing a car in accidents that involve only a Honda Motorcycle
idx_Honda = combined_df.loc[(combined_df['car']  == 1) & (combined_df['affected_party'] == 'motorcyclist') & (combined_df['primary_accident_type_code'] == 'FXOBJ')].index
combined_df.loc[idx_Honda, 'car'] = 0


In [22]:
#printing out the accident types and examining them
#combined_df.to_csv('accident_type.csv', index=False)

4. Extracting age and location

In [23]:
import json
#extracting age of all parties involved
def extract_driver_ages(text):
    ages_all = []

    # 1. Standard forms: YY-year-old / YY year old / YY years old
    ages_all += re.findall(
        r'\b(\d{1,3})\s*[-]?\s*(?:year|years)\s*[-]?\s*old\b',
        text,
        flags=re.IGNORECASE
    )

    # 2. "aged YY"
    ages_all += re.findall(
        r'\baged\s+(\d{1,3})\b',
        text,
        flags=re.IGNORECASE
    )

 
    return ages_all

combined_df['age'] = combined_df['content'].apply(extract_driver_ages)


#update: driver age detection specifically for collisions and running over pedestrians

# Keep original shape/order for sanity checks
n_in = len(combined_df)
original_index = combined_df.index

# ---------- Regex setup ----------
# Sentence-aware split: prevents cross-sentence matches (e.g., "driven by" in one sentence, age in another)
SENT_SPLIT = re.compile(r'(?<=[\.!?])\s+')

# Ages AFTER "driven by" within the same sentence:
#   1) "driven by ... 47-year-old"
#   2) "driven by ... aged 47"
PAT_YEAR_OLD = re.compile(
    r"\bdriven\s+by\b[^.?!]*?\b(\d{1,3})\s*[-\s]?year\s*[-\s]?old\b",
    re.IGNORECASE
)
PAT_AGED = re.compile(
    r"\bdriven\s+by\b[^.?!]*?\baged\s*(\d{1,3})\b",
    re.IGNORECASE
)

def ages_after_driven_by(text: str) -> list[int]:
    """
    Return ALL matched ages (duplicates kept, order preserved) that appear
    AFTER 'driven by' in the same sentence of the provided text.
    """
    if pd.isna(text):
        return []
    ages: list[int] = []
    for sent in [s.strip() for s in SENT_SPLIT.split(str(text)) if s.strip()]:
        # Pattern 1: driven by ... NN-year-old
        for m in PAT_YEAR_OLD.finditer(sent):
            try:
                n = int(m.group(1))
                if 0 < n < 120:
                    ages.append(n)  # KEEP duplicates
            except Exception:
                # Ignore conversion errors safely
                pass
        # Pattern 2: driven by ... aged NN
        for m in PAT_AGED.finditer(sent):
            try:
                n = int(m.group(1))
                if 0 < n < 120:
                    ages.append(n)  # KEEP duplicates
            except Exception:
                pass
    return ages

# ---------- Apply to ALL rows; build columns aligned to df ----------
ages_list_json: list[str] = []
min_vals: list[float] = []
max_vals: list[float] = []

for _, row in combined_df.iterrows():
    lst = ages_after_driven_by(row.get('content', ''))
    # Store audit list as JSON (easier for downstream parsing)
    ages_list_json.append(json.dumps(lst, ensure_ascii=False))
    # Compute min/max from raw list (NaN if none)
    if lst:
        min_vals.append(min(lst))
        max_vals.append(max(lst))
    else:
        min_vals.append(pd.NA)
        max_vals.append(pd.NA)

# Assign back without altering row count or order
combined_df['ages_after_driven_by'] = ages_list_json


#combing two age detection techniques
# 1. Creating min_age and max_age columns from all ages
age_df = combined_df["age"].apply(pd.Series)
age_df = age_df.fillna(0).astype(int)
age_df[age_df > 100] = 0
age_df = age_df.replace(0, np.nan)

combined_df["min_age"] = age_df.min(axis='columns')
combined_df["max_age"] = age_df.max(axis='columns')

#repeating the same min and max age detection for ages_after_driven_by
source_col = "ages_after_driven_by"
min_col = "driver_age_min"
max_col = "driver_age_max"

# Helper: extract integers via regex from any string-like cell
def extract_numbers(cell):
    if pd.isna(cell):
        return []
    s = str(cell).strip()
    if not s:
        return []
    # Find all integer substrings (handles negatives too, if ever present)
    nums = re.findall(r"-?\d+", s)
    return [int(n) for n in nums]

# Parse, then compute per-row min/max
nums_series = combined_df["ages_after_driven_by"].apply(extract_numbers)
combined_df["driver_age_min"] = nums_series.apply(lambda arr: min(arr) if arr else pd.NA)
combined_df["driver_age_max"] = nums_series.apply(lambda arr: max(arr) if arr else pd.NA)

idx_empty = combined_df.loc[(combined_df["driver_age_min"].isna()) | (combined_df["driver_age_min"].astype(str).str.strip() == "")].index
combined_df.loc[idx_empty, "driver_age_min"] = combined_df.loc[idx_empty, "min_age"]
combined_df.loc[idx_empty, "driver_age_max"] = combined_df.loc[idx_empty, "max_age"]
combined_df.drop(['min_age', 'max_age'], axis=1, inplace=True)

combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party,...,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code,age,ages_after_driven_by,driver_age_min,driver_age_max
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian,...,0,0,0,0,running over pedestrian,PED,"[60, 48]",[48],48,48
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist,...,1,1,0,0,crushed into an obstacle,FXOBJ,"[54, 54]",[],54.0,54.0
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian,...,0,0,0,0,running over pedestrian,PED,"[17, 52]",[52],52,52
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver,...,0,0,0,0,collision,COLL,"[50, 29]",[50],50,50
112,2024-12-14,", police were busy responding to another serio...",19:45,2024-12-14,Saturday,19:45:00,2024-12-14 19:45:00,1,serious,pedestrian,...,0,0,0,0,running over pedestrian,PED,"[50, 29]",[50],50,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2025-10-02,"This morning, at around 0530 hrs, the Police w...",05:30,2025-10-02,Thursday,05:30:00,2025-10-02 05:30:00,1,grievous,motorcyclist,...,0,0,0,0,collision,COLL,"[53, 47]","[53, 47]",47,53
26,2025-10-05,"Late yesterday evening, at around 11:00 p.m. (...",23:00,2025-10-04,Saturday,23:00:00,2025-10-04 23:00:00,1,minor,driver,...,0,0,0,0,collision,COLL,"[49, 19, 50, 14, 14, 43, 14, 43, 69]","[49, 19, 50]",19,50
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian,...,0,0,0,0,running over pedestrian,PED,"[84, 84]",[84],84,84
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist,...,0,0,0,0,collision,COLL,"[67, 61]","[67, 61]",61,67


In [24]:
#combined_df.to_csv('age_detection_new.csv', index=False)

In [25]:
#correcting collisions which were misclassified as crushing into fixed objects. In the case of the latter, min_age and max_age are the same
idx_crush_flag = combined_df.loc[(combined_df['primary_accident_type_code'] == 'FXOBJ') & (combined_df['driver_age_min'] != combined_df['driver_age_max'])].index
combined_df.loc[idx_crush_flag, 'primary_accident_type_code'] = 'COLL'
combined_df.loc[idx_crush_flag, 'car'] = 1

idx_car_flag = combined_df.loc[(combined_df['affected_party']  == 'motorcyclist') & (combined_df['primary_accident_type_code'] == 'FXOBJ') & (combined_df['driver_age_min'] != combined_df['driver_age_min'])].index
combined_df.loc[idx_car_flag, 'car'] = 0

In [26]:
import re
import unicodedata

# -----------------------
# NORMALISATION
# -----------------------
def clean_text(text):
    if not isinstance(text, str):
        return text

    text = unicodedata.normalize("NFC", text)

    fixes = {
        "â€™": "’",
        "Å¼": "ż",
        "Ä¦": "Ħ",
        "Å»": "Ż",
        "Ä¡": "ħ",
        "iÄ‹": "Ċ",
        "ÄŠ": "Š",
        "Ä§": "ħ",
        "Ä¨": "Ħ",
        "Dâ€™": "D’",
        "Ä¤": "ġ",
        "GÄ§": "Għ",
        "gÄ§": "għ",
        "Taâ€™": "Ta’"
    }

    for k, v in fixes.items():
        text = text.replace(k, v)

    return text

# -----------------------
# TEMPORARY STREET PREFIX NORMALIZATION
# -----------------------
def normalize_prefixes(text):
    # Capitalize common lowercase prefixes so regex can match them
    return re.sub(r'\b(ix|il|ta’)','\\1'.capitalize(), text, flags=re.IGNORECASE)
def extract_accident_clause(text):
    patterns = [
        r"(?:happened|occurred|took place|was involved).*",
        r"(?:involved in a collision).*"
    ]
    for p in patterns:
        m = re.search(p, text, flags=re.IGNORECASE)
        if m:
            return m.group(0)
    return text
def trim_street(street):
    words = street.split()
    cleaned = []

    for w in words:
        if w.lower() in STOP_WORDS:
            break
        cleaned.append(w)

    return " ".join(cleaned)
def is_residence_context(text, loc_start):
    window = text[max(0, loc_start - 50):loc_start].lower()
    return " from " in window or " resident of " in window
# -----------------------
# STREET KEYWORDS
# -----------------------
street_keywords = [
    "Triq", "Dawret", "Vjal", "Bypass", "bypass", "Road", "Street",
    "Square", "Avenue", "Drive", "Lane",
    "Sqaq", "Pjazza", "Misraħ", "Roundabout", "Circle", "By-Pass",
    "Seafront", "Rue"
]

street_kw = "|".join(map(re.escape, street_keywords))

# -----------------------
# LOCALITIES
# -----------------------
LOCALITIES = {
    "Valletta", "Gudja", "Qormi", "Mellieħa", "Swieqi", "Għaxaq", "Rabat",
    "Marsascala", "Żurrieq", "Mqabba", "San Ġwann", "Nadur", "Naxxar",
    "Birżebbuġa", "Għajnsielem", "Bugibba", "Msida", "Gżira", "Floriana",
    "Marsa", "Tarxien", "Xewkija", "Sannat", "Attard", "Luqa",
    "Żabbar", "Senglea", "Cospicua", "Birkirkara", "St Paul’s Bay",
    "il-Marsa", "Mtarfa", "St Julian’s", "Sliema", "Paola", "San Gwann", "Żejtun",
    "Mġarr", "Siġġiewi", "Tax-Xbiex", "Pembroke", "Zabbar", "Xagħra", "Għargħur", "Ta’ Xbiex",
    "Fgura", "Iklin", "Żebbuġ", "Victoria", "Ħamrun", "Santa Venera", "Mellieha", "Balzan",
    "Marsaxlokk", "Qala", "Għarb"
}

# -----------------------
# STREET REGEX
# -----------------------
street_pattern = re.compile(
    rf"""
    \b
    (?:                                        # Either:
        (?:{street_kw})                        # 1️⃣ keyword first
        (?:\s+(?:il-|l-|ta’)?[A-ZĠŻĦĊŠa-zġżħċ][A-Za-zĠŻĦĊŠġżħċ’\-]+)+
    |
        [A-Z][a-zġżħċ’\-]+(?:\s+[A-Z][a-zġżħċ’\-]+)*  # 2️⃣ keyword last
        \s+(?:{street_kw})
    )
    \b
    """,
    re.VERBOSE | re.UNICODE
)

# -----------------------
# STREET → LOCALITY FALLBACK
# -----------------------
street_to_locality = {
    "Triq Ċ-Šern": "Bugibba",
    "Rue D’Argens": "Gżira",
    "December 13 Road": "Marsa",
    "Pinto Wharf": "Floriana",
    "Xatt l-Għassara tal-Għeneb": "il-Marsa",
    "Triq ix-Xagħra": "Xagħra",
    "Birkirkara bypass": "Birkirkara",
    "Triq ix-Xatt Ta’ Xbiex": "Ta’ Xbiex",
    "St Paul’s Roundabout": "St Paul’s Bay",
    "Cospicua Seafront": "Cospicua",
    "Ta' Liesse": "Valletta",
    "Floriana Park and Ride": "Naxxar"
}
street_to_locality = {clean_text(k): clean_text(v) for k, v in street_to_locality.items()}
STOP_WORDS = {
    "at", "around", "on", "by", "near", "when", "while", "after", "before"
}
# -----------------------
# MAIN FUNCTION
# -----------------------
def extract_location_final(text):
    if not isinstance(text, str):
        return None

    # 1️⃣ Clean text
    text = clean_text(text)

    # 2️⃣ Extract street using clean text
    street_match = street_pattern.search(text)
    if street_match:
        start, end = street_match.span()
        street_raw = text[start:end].strip()
        street = trim_street(street_raw)
    else:
        street = None
    limit_match = re.search(
        r'\bin\s+the\s+limits?\s+of\s+(' + "|".join(map(re.escape, LOCALITIES)) + r')\b',
        text,
        re.IGNORECASE
    )
    limit_locality = clean_text(limit_match.group(1)) if limit_match else None

    if limit_locality and street:
        return f"{street}, {limit_locality}"
    elif limit_locality:
        return limit_locality

    # 3️⃣ Collect all locality mentions
    loc_matches = [(m.start(), m.group(0)) for m in re.finditer(
        r"\b(" + "|".join(map(re.escape, LOCALITIES)) + r")\b", text
    )]

    # 4️⃣ If street exists, try locality strategies
    if street:
        # 4a️⃣ Comma immediately after street
        after = text[end:end + 40]
        comma_match = re.match(r"\s*,\s*([A-ZĠŻĦĊŠ][A-Za-zĠŻĦĊŠġżħċ’\-]+)", after)
        if comma_match:
            loc = clean_text(comma_match.group(1))
            if loc in LOCALITIES:
                return f"{street}, {loc}"
        street_clean = street.strip()
        for s, loc in street_to_locality.items():
            if street_clean in s:
                return f"{s}, {loc}"
        nearest = None
        nearest_dist = None
        for pos, loc in loc_matches:
            if pos < start and not is_residence_context(text, pos):
                dist = start - pos
                if nearest is None or dist < nearest_dist:
                    nearest = loc
                    nearest_dist = dist
        if nearest:
            return f"{street}, {nearest}"

        # fallback to nearest in whole text
        nearest = min(
            [lm for lm in loc_matches if not is_residence_context(text, lm[0])],
            key=lambda x: abs(end - x[0]),
            default=(None, None)
        )
        if nearest[1]:
            return f"{street}, {nearest[1]}"
        return street

    # 6️⃣ Locality-only fallback
    if loc_matches:
        return loc_matches[0][1]

    return None

# -----------------------
# APPLY TO DATAFRAME
# -----------------------
combined_df["location"] = combined_df["content"].apply(extract_location_final)
print(combined_df["location"])

80     Triq il-Kappella tax-Xagħra, Naxxar
97               Dawret San Pawl, Mellieħa
88                 Triq il-Marfa, Mellieha
20                  Dawret il-Gudja, Gudja
112                 Dawret il-Gudja, Gudja
                      ...                 
84                     Triq L-Imġarr, Qala
26             Triq Dawret il-Gudja, Gudja
50     Triq il-Kappella Ta’ Xagħra, Naxxar
0           Triq il-Belt Valletta, Żurrieq
289             Triq l-Imġarr, Għajnsielem
Name: location, Length: 155, dtype: object


In [27]:
def split_location(loc):
    if not isinstance(loc, str):
        return pd.Series([None, None])

    # Split by comma
    parts = [p.strip() for p in loc.split(',')]

    # Case 1: street, locality, (optional) Gozo
    if len(parts) >= 2:
        return pd.Series([parts[0], parts[1]])

    # Case 2: only street
    return pd.Series([parts[0], None])

# Apply to dataframe
combined_df[['street', 'locality']] = combined_df['location'].apply(split_location)
print(combined_df['locality']) 
print(combined_df['street'])

80          Naxxar
97        Mellieħa
88        Mellieha
20           Gudja
112          Gudja
          ...     
84            Qala
26           Gudja
50          Naxxar
0          Żurrieq
289    Għajnsielem
Name: locality, Length: 155, dtype: object
80     Triq il-Kappella tax-Xagħra
97                 Dawret San Pawl
88                   Triq il-Marfa
20                 Dawret il-Gudja
112                Dawret il-Gudja
                  ...             
84                   Triq L-Imġarr
26            Triq Dawret il-Gudja
50     Triq il-Kappella Ta’ Xagħra
0            Triq il-Belt Valletta
289                  Triq l-Imġarr
Name: street, Length: 155, dtype: object


5. Flagging public holidays and eves of public holidays to adjust day of the week accordingly

In [28]:
#flagging public holidays

# Malta public holidays for 2024 and 2025
holidays = {
    # 2024 
    datetime(2024,1,1): "New Year's Day",
    datetime(2024,2,10): "Feast of St. Paul's Shipwreck",
    datetime(2024,3,19): "Feast of St. Joseph",
    datetime(2024,3,29): "Good Friday",
    datetime(2024,3,31): "Freedom Day",
    datetime(2024,5,1): "Workers' Day",
    datetime(2024,6,7): "Sette Giugno",
    datetime(2024,6,29): "Feast of St. Peter and St. Paul",
    datetime(2024,8,15): "Assumption (Santa Marija)",
    datetime(2024,9,8): "Victory Day",
    datetime(2024,9,21): "Independence Day",
    datetime(2024,12,8): "Immaculate Conception",
    datetime(2024,12,13): "Republic Day",
    datetime(2024,12,25): "Christmas Day",
    # 2025 
    datetime(2025,1,1): "New Year's Day",
    datetime(2025,2,10): "Feast of St. Paul's Shipwreck",
    datetime(2025,3,19): "Feast of St. Joseph",
    datetime(2025,3,31): "Freedom Day",
    datetime(2025,4,18): "Good Friday",
    datetime(2025,5,1): "Workers' Day",
    datetime(2025,6,7): "Sette Giugno",
    datetime(2025,6,29): "Feast of St. Peter and St. Paul",
    datetime(2025,8,15): "Assumption (Santa Marija)",
    datetime(2025,9,8): "Victory Day",
    datetime(2025,9,21): "Independence Day",
    datetime(2025,12,8): "Immaculate Conception",
    datetime(2025,12,13): "Republic Day",
    datetime(2025,12,25): "Christmas Day",
}

holiday_dates = set(holidays.keys())
# Eves: the day before each holiday
holiday_eves = {d - timedelta(days=1): holidays[d] for d in holiday_dates}

# Prepare flagging functions

def flag_holiday(date_val):
    return date_val.normalize() in holiday_dates

def holiday_name(date_val):
    return holidays.get(date_val.normalize())

def flag_eve(date_val):
    return date_val.normalize() in holiday_eves

def eve_name(date_val):
    return holiday_eves.get(date_val.normalize())

# Create columns
combined_df['is_ph'] = combined_df['corrected_date'].apply(flag_holiday)
combined_df['is_eve_ph'] = combined_df['corrected_date'].apply(flag_eve)

In [29]:
#marking day of week in numbers from 0 to 6 (Monday to Sunday). Weekdays that fall on public holidays are marked 6, like Sundays. Eve of public holiday is marked 4 like Friday

map_week = {'Monday':0,'Tuesday':1,'Wednesday':2,'Thursday':3,'Friday':4,'Saturday':5,'Sunday':6}

def to_num(day):
    return map_week.get(str(day).strip(), pd.NA)

combined_df['day_of_week_num'] = combined_df['day_of_week'].apply(to_num)

# helpers
def is_true(val):
    if isinstance(val,bool):
        return val
    return str(val).strip().lower() in ['true']

is_weekend = combined_df['day_of_week'].astype(str).str.strip().isin(['Saturday','Sunday'])

# Apply eve_ph first (Friday=4) only if not weekend
mask_eve = combined_df['is_eve_ph'].apply(is_true) & (~is_weekend)
combined_df.loc[mask_eve, 'day_of_week_num'] = 4

# Apply ph (Sunday=6) only if not weekend
mask_ph = combined_df['is_ph'].apply(is_true) & (~is_weekend)
combined_df.loc[mask_ph, 'day_of_week_num'] = 6

combined_df.drop(['is_ph', 'is_eve_ph'], axis=1, inplace=True)

In [30]:
# Define rush hour ranges
morning_start = datetime.strptime("07:30", "%H:%M").time()
morning_end = datetime.strptime("10:00", "%H:%M").time()
afternoon_start = datetime.strptime("16:00", "%H:%M").time()
afternoon_end = datetime.strptime("19:00", "%H:%M").time()

# Function to check if time falls in rush hour
def is_rush_hour(row):
    time_val = row['time_24h_dt']
    day_num = row['day_of_week_num']
    if time_val and 0 <= day_num <= 4:  # Weekdays only
        if morning_start <= time_val <= morning_end or afternoon_start <= time_val <= afternoon_end:
            return 1
    return 0

# Apply function to create rush_hour column
combined_df['rush_hour'] = combined_df.apply(is_rush_hour, axis=1)
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party,...,primary_accident_type_code,age,ages_after_driven_by,driver_age_min,driver_age_max,location,street,locality,day_of_week_num,rush_hour
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian,...,PED,"[60, 48]",[48],48,48,"Triq il-Kappella tax-Xagħra, Naxxar",Triq il-Kappella tax-Xagħra,Naxxar,2,0
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist,...,FXOBJ,"[54, 54]",[],54.0,54.0,"Dawret San Pawl, Mellieħa",Dawret San Pawl,Mellieħa,2,1
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian,...,PED,"[17, 52]",[52],52,52,"Triq il-Marfa, Mellieha",Triq il-Marfa,Mellieha,6,0
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver,...,COLL,"[50, 29]",[50],50,50,"Dawret il-Gudja, Gudja",Dawret il-Gudja,Gudja,6,0
112,2024-12-14,", police were busy responding to another serio...",19:45,2024-12-14,Saturday,19:45:00,2024-12-14 19:45:00,1,serious,pedestrian,...,PED,"[50, 29]",[50],50,50,"Dawret il-Gudja, Gudja",Dawret il-Gudja,Gudja,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2025-10-02,"This morning, at around 0530 hrs, the Police w...",05:30,2025-10-02,Thursday,05:30:00,2025-10-02 05:30:00,1,grievous,motorcyclist,...,COLL,"[53, 47]","[53, 47]",47,53,"Triq L-Imġarr, Qala",Triq L-Imġarr,Qala,3,0
26,2025-10-05,"Late yesterday evening, at around 11:00 p.m. (...",23:00,2025-10-04,Saturday,23:00:00,2025-10-04 23:00:00,1,minor,driver,...,COLL,"[49, 19, 50, 14, 14, 43, 14, 43, 69]","[49, 19, 50]",19,50,"Triq Dawret il-Gudja, Gudja",Triq Dawret il-Gudja,Gudja,5,0
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian,...,PED,"[84, 84]",[84],84,84,"Triq il-Kappella Ta’ Xagħra, Naxxar",Triq il-Kappella Ta’ Xagħra,Naxxar,0,1
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist,...,COLL,"[67, 61]","[67, 61]",61,67,"Triq il-Belt Valletta, Żurrieq",Triq il-Belt Valletta,Żurrieq,3,1


6. Extracting weather, roadworks and speed limits

In [31]:
#adding weather data
import weather

def extract_weather(row):
    dt = row["corrected_date"]
    time_24h = row["time_24h"]
    hour = int(time_24h.split(":")[0]) if time_24h else 0

    rained_today = weather.rain_on_date(dt)
    rained_last_3h = weather.rain_before(dt, hour, 3)

    daily = weather.get_daily_weather_data(dt)
    hourly = weather.get_hourly_weather_data(dt - timedelta(hours=3), dt)

    # Check if data is empty before extracting values
    temp_avg = daily["tavg"].values[0] if not daily.empty and "tavg" in daily else None
    temp_min = daily["tmin"].values[0] if not daily.empty and "tmin" in daily else None
    temp_max = daily["tmax"].values[0] if not daily.empty and "tmax" in daily else None
    precip_mm_day = daily["prcp"].values[0] if not daily.empty and "prcp" in daily else None

    return {
        "rain_today": rained_today,
        "rain_last_3h": rained_last_3h,
        "temp_avg": temp_avg,
        "temp_min": temp_min,
        "temp_max": temp_max,
        "precip_mm_day": precip_mm_day,
        "hourly_records": len(hourly) if hourly is not None else 0
    }
combined_df["weather"] = combined_df.apply(extract_weather, axis=1)
print(combined_df["weather"])



80     {'rain_today': False, 'rain_last_3h': False, '...
97     {'rain_today': False, 'rain_last_3h': False, '...
88     {'rain_today': <NA>, 'rain_last_3h': False, 't...
20     {'rain_today': <NA>, 'rain_last_3h': False, 't...
112    {'rain_today': False, 'rain_last_3h': False, '...
                             ...                        
84     {'rain_today': False, 'rain_last_3h': False, '...
26     {'rain_today': False, 'rain_last_3h': False, '...
50     {'rain_today': True, 'rain_last_3h': False, 't...
0      {'rain_today': False, 'rain_last_3h': False, '...
289    {'rain_today': False, 'rain_last_3h': False, '...
Name: weather, Length: 155, dtype: object


In [32]:
weather_df = combined_df["weather"].apply(pd.Series)
combined_df['rain_today'] = weather_df['rain_today']
combined_df['rain_last_3h'] = weather_df['rain_last_3h']
combined_df['temp_avg'] = weather_df['temp_avg']
combined_df['temp_min'] = weather_df['temp_min']
combined_df['temp_max'] = weather_df['temp_max']
combined_df['precip_mm_day'] = weather_df['precip_mm_day']
combined_df


Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party,...,locality,day_of_week_num,rush_hour,weather,rain_today,rain_last_3h,temp_avg,temp_min,temp_max,precip_mm_day
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian,...,Naxxar,2,0,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,14.1,8.9,19.6,0.0
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist,...,Mellieħa,2,1,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,14.1,8.9,19.6,0.0
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian,...,Mellieha,6,0,"{'rain_today': <NA>, 'rain_last_3h': False, 't...",,False,18.8,16.9,21.0,
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver,...,Gudja,6,0,"{'rain_today': <NA>, 'rain_last_3h': False, 't...",,False,18.8,16.9,21.0,
112,2024-12-14,", police were busy responding to another serio...",19:45,2024-12-14,Saturday,19:45:00,2024-12-14 19:45:00,1,serious,pedestrian,...,Gudja,5,0,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,18.2,15.0,21.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2025-10-02,"This morning, at around 0530 hrs, the Police w...",05:30,2025-10-02,Thursday,05:30:00,2025-10-02 05:30:00,1,grievous,motorcyclist,...,Qala,3,0,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,22.4,20.0,25.5,0.0
26,2025-10-05,"Late yesterday evening, at around 11:00 p.m. (...",23:00,2025-10-04,Saturday,23:00:00,2025-10-04 23:00:00,1,minor,driver,...,Gudja,5,0,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,20.3,17.0,24.0,0.0
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian,...,Naxxar,0,1,"{'rain_today': True, 'rain_last_3h': False, 't...",True,False,22.4,20.0,24.6,0.3
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist,...,Żurrieq,3,1,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,20.4,16.0,25.0,0.0


In [33]:
import roadworks

def check_roadworks_near_location(row):
    if pd.isna(row['street']) or pd.isna(row['locality']):
        return False
    return roadworks.had_roadworks(row['locality'], row['street'], row['corrected_date'])

combined_df['roadworks_nearby'] = combined_df.apply(check_roadworks_near_location, axis=1)
combined_df['roadworks_nearby'].value_counts()

roadworks_nearby
False    144
True      11
Name: count, dtype: int64

In [34]:
#checking speed limits with speed cameras data
import cameras
def check_speed_camera_in_street(row):
	if pd.isna(row['street']) or pd.isna(row['corrected_date']):
		return None
	return cameras.had_speed_camera(row['street'], row['corrected_date'].year)

combined_df['street_had_speed_camera'] = combined_df.apply(check_speed_camera_in_street, axis=1)
combined_df['street_had_speed_camera'].value_counts()

street_had_speed_camera
False    134
True      20
Name: count, dtype: int64

7. Adding more training data for collisions

In [35]:
#collisions involve two or more vehicles. In the case of cars colliding with motorcycles, only the injured party (motorcyclist) was reported.
# The other party (the driver) is presumed to have sustained no injuries. Same applies to collisions between cars

idx_collision = combined_df.loc[(combined_df['primary_accident_type_code']  == 'COLL')].index
collisions = combined_df.loc[idx_collision,:]
collisions['injury_severity'] = 'minor'
collisions['affected_party'] = 'driver'

#concatenating these accidents with the combined_df
combined_df = pd.concat([combined_df,collisions], axis = 0)
combined_df = combined_df.sort_values(by='timestamp')
combined_df

Unnamed: 0,publish_date,content,time_24h,corrected_date,day_of_week,time_24h_dt,timestamp,accident_flag,injury_severity,affected_party,...,rush_hour,weather,rain_today,rain_last_3h,temp_avg,temp_min,temp_max,precip_mm_day,roadworks_nearby,street_had_speed_camera
80,2024-12-11,"A 60-year-old woman residing in Swieqi, at aro...",11:15,2024-12-11,Wednesday,11:15:00,2024-12-11 11:15:00,1,grievous,pedestrian,...,0,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,14.1,8.9,19.6,0.0,False,False
97,2024-12-11,"Update:Unfortunately, the 54-year-old Englishm...",17:00,2024-12-11,Wednesday,17:00:00,2024-12-11 17:00:00,1,death,motorcyclist,...,1,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,14.1,8.9,19.6,0.0,False,True
88,2024-12-14,"A 17-year-old girl, residing in Qormi, yesterd...",17:30,2024-12-13,Friday,17:30:00,2024-12-13 17:30:00,1,serious,pedestrian,...,0,"{'rain_today': <NA>, 'rain_last_3h': False, 't...",,False,18.8,16.9,21.0,,True,False
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,minor,driver,...,0,"{'rain_today': <NA>, 'rain_last_3h': False, 't...",,False,18.8,16.9,21.0,,True,False
20,2024-12-14,"Yesterday, at around (1945hrs), the police wer...",19:45,2024-12-13,Friday,19:45:00,2024-12-13 19:45:00,1,serious,driver,...,0,"{'rain_today': <NA>, 'rain_last_3h': False, 't...",,False,18.8,16.9,21.0,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,2025-10-06,"An 84-year-old woman, a resident of Naxxar, wa...",09:30,2025-10-06,Monday,09:30:00,2025-10-06 09:30:00,1,grievous,pedestrian,...,1,"{'rain_today': True, 'rain_last_3h': False, 't...",True,False,22.4,20.0,24.6,0.3,False,False
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,serious,motorcyclist,...,1,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,20.4,16.0,25.0,0.0,False,False
0,2025-10-09,"Today, at around 0930hrs, the Police were info...",09:30,2025-10-09,Thursday,09:30:00,2025-10-09 09:30:00,1,minor,driver,...,1,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,20.4,16.0,25.0,0.0,False,False
289,2025-10-10,The Mayor of Għajnsielem on Friday called on t...,09:00,2025-10-10,Friday,09:00:00,2025-10-10 09:00:00,1,minor,motorcyclist,...,1,"{'rain_today': False, 'rain_last_3h': False, '...",False,False,20.8,16.0,25.2,0.0,False,False


In [36]:
#dropping intermediate columns
combined_df.drop(['day_of_week', 'time_24h_dt', 'timestamp', 'accident_flag', 'collision', 'running_over_pedestrian','lost_control','crushed_into_obstacle','overturned','other','primary_accident_type', 'weather'], axis=1, inplace=True)

In [37]:
#writing the output to a csv file
output_file = "combined_accidents.csv"
combined_df.to_csv(output_file, index=False)