In [1]:
import pandas as pd
# Load the accident records from a CSV in the working directory, then preview
# the first rows to confirm the file was parsed as expected.
df=pd.read_csv("UK_Accident.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,...,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location,Year
0,0,200501BS00001,525680.0,178240,-0.19117,51.489096,1,2,1,1,...,Zebra crossing,Daylight: Street light present,Raining without high winds,Wet/Damp,,,1,Yes,E01002849,2005
1,1,200501BS00002,524170.0,181650,-0.211708,51.520075,1,3,1,1,...,Pedestrian phase at traffic signal junction,Darkness: Street lights present and lit,Fine without high winds,Dry,,,1,Yes,E01002909,2005
2,2,200501BS00003,524520.0,182240,-0.206458,51.525301,1,3,2,1,...,No physical crossing within 50 meters,Darkness: Street lights present and lit,Fine without high winds,Dry,,,1,Yes,E01002857,2005
3,3,200501BS00004,526900.0,177530,-0.173862,51.482442,1,3,1,1,...,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,E01002840,2005
4,4,200501BS00005,528060.0,179040,-0.156618,51.495752,1,3,1,1,...,No physical crossing within 50 meters,Darkness: Street lighting unknown,Fine without high winds,Wet/Damp,,,1,Yes,E01002863,2005


In [2]:
# (rows, columns) of the freshly loaded frame.
df.shape


(1048575, 33)

In [3]:
# Per-column dtype and non-null count; used to spot columns with missing data.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 33 columns):
 #   Column                                       Non-Null Count    Dtype  
---  ------                                       --------------    -----  
 0   Unnamed: 0                                   1048575 non-null  int64  
 1   Accident_Index                               1048575 non-null  object 
 2   Location_Easting_OSGR                        1048474 non-null  float64
 3   Location_Northing_OSGR                       1048575 non-null  int64  
 4   Longitude                                    1048474 non-null  float64
 5   Latitude                                     1048575 non-null  float64
 6   Police_Force                                 1048575 non-null  int64  
 7   Accident_Severity                            1048575 non-null  int64  
 8   Number_of_Vehicles                           1048575 non-null  int64  
 9   Number_of_Casualties                         1

In [4]:
# Number of missing values in each column before any cleaning.
df.isnull().sum()


Unnamed: 0                                           0
Accident_Index                                       0
Location_Easting_OSGR                              101
Location_Northing_OSGR                               0
Longitude                                          101
Latitude                                             0
Police_Force                                         0
Accident_Severity                                    0
Number_of_Vehicles                                   0
Number_of_Casualties                                 0
Date                                                 0
Day_of_Week                                          0
Time                                               104
Local_Authority_(District)                           0
Local_Authority_(Highway)                            0
1st_Road_Class                                       0
1st_Road_Number                                      0
Road_Type                                            0
Speed_limi

In [5]:
# Remove columns that are not carried forward into the cleaned dataset.
#
# The result is reassigned instead of dropped in place, and labels that are
# already gone are ignored, so this cell can be executed more than once:
# a repeated in-place drop of the same labels raises KeyError.
df = df.drop(
    columns=[
        'Junction_Control',
        'Special_Conditions_at_Site',
        'Carriageway_Hazards',
        'Unnamed: 0',  # useless index column
    ],
    errors='ignore',
)


In [6]:
# Rows without a usable position are discarded outright.
df = df.dropna(subset=['Longitude', 'Location_Easting_OSGR'])

# For the remaining sparse text columns, keep the row and mark the gap with
# an explicit "Unknown" placeholder instead.
for placeholder_column in (
    'Time',
    'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities',
    'LSOA_of_Accident_Location',
):
    df[placeholder_column] = df[placeholder_column].fillna("Unknown")



In [7]:
# Re-count missing values per column to verify the cleaning steps above.
df.isnull().sum()


Accident_Index                                 0
Location_Easting_OSGR                          0
Location_Northing_OSGR                         0
Longitude                                      0
Latitude                                       0
Police_Force                                   0
Accident_Severity                              0
Number_of_Vehicles                             0
Number_of_Casualties                           0
Date                                           0
Day_of_Week                                    0
Time                                           0
Local_Authority_(District)                     0
Local_Authority_(Highway)                      0
1st_Road_Class                                 0
1st_Road_Number                                0
Road_Type                                      0
Speed_limit                                    0
2nd_Road_Class                                 0
2nd_Road_Number                                0
Pedestrian_Crossing-

In [8]:
# Normalise and parse the accident date, then derive calendar features.
#
# The parse is guarded so this cell is safe to re-run: once 'Date' is already
# datetime64, casting it back to text yields ISO 'YYYY-MM-DD' strings, which do
# not match '%d-%m-%Y' and would therefore all be coerced to NaT.
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    date_text = (
        df['Date']
        .astype(str)
        # Clean hidden unicode characters and spaces: keep digits and separators only.
        .str.replace(r'[^\d\-\/]', '', regex=True)
        # The filter above keeps '/', but the format below accepts only '-';
        # unify the separator so dd/mm/YYYY values are parsed, not turned into NaT.
        .str.replace('/', '-', regex=False)
    )
    # Convert dd-mm-YYYY; anything that still does not match becomes NaT.
    df['Date'] = pd.to_datetime(date_text, format='%d-%m-%Y', errors='coerce')

# Calendar features derived from the parsed date.
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.day_name()



In [9]:
def extract_hour(t):
    """Return the hour of day (0-23) from an ``HH:MM``-style time value.

    Parameters
    ----------
    t : object
        Usually a string such as ``"17:42"``. The placeholder ``"Unknown"``
        and non-string values (``None``, ``NaN``) are accepted.

    Returns
    -------
    int
        The hour as an integer in ``0..23``, or ``-1`` when the value is the
        ``"Unknown"`` placeholder, is not a string, has a non-numeric hour
        part, or has an hour outside the valid range.
    """
    if t == "Unknown":
        return -1
    try:
        hour = int(t.split(':')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str (e.g. NaN, None, bytes).
        # ValueError: the text before ':' is not an integer.
        return -1
    # Reject impossible hours so they are not later mistaken for a real time.
    return hour if 0 <= hour <= 23 else -1

# Element-wise conversion of each recorded time into its hour (-1 = unknown).
df['Hour'] = df['Time'].map(extract_hour)


In [10]:
def time_bucket(h):
    """Translate an hour value into a coarse part-of-day label.

    ``-1`` is the sentinel for a missing time and yields ``"Unknown"``.
    Hours in ``[5, 12)`` are ``"Morning"``, ``[12, 17)`` ``"Afternoon"``,
    ``[17, 21)`` ``"Evening"``; every other value is ``"Night"``.
    """
    if h == -1:
        return "Unknown"

    # Half-open hour ranges, checked in order; anything unmatched is night.
    day_parts = (
        (5, 12, "Morning"),
        (12, 17, "Afternoon"),
        (17, 21, "Evening"),
    )
    for start, stop, label in day_parts:
        if start <= h < stop:
            return label
    return "Night"

# Label every row with its part of day based on the extracted hour.
df['Time_of_Day'] = df['Hour'].map(time_bucket)


In [11]:
# Spot-check the date/time features that were just derived.
derived_columns = ['Date', 'Month', 'Day', 'Weekday', 'Hour', 'Time_of_Day']
df[derived_columns].head()


Unnamed: 0,Date,Month,Day,Weekday,Hour,Time_of_Day
0,2005-01-04,1,4,Tuesday,17,Evening
1,2005-01-05,1,5,Wednesday,17,Evening
2,2005-01-06,1,6,Thursday,0,Night
3,2005-01-07,1,7,Friday,10,Morning
4,2005-01-10,1,10,Monday,21,Night


In [12]:
# Any non-zero count here means some dates failed to parse and became NaT.
date_columns = ['Date', 'Month', 'Day', 'Weekday']
df[date_columns].isnull().sum()





Date       0
Month      0
Day        0
Weekday    0
dtype: int64

In [13]:
# Write the cleaned frame next to the notebook; index=False omits the row index
# so no extra unnamed column is written to the file.
df.to_csv("cleaned_UK_Accident.csv" ,index=False)

In [14]:
import os, csv
# Second export of the same cleaned frame, this time with every field quoted
# and an explicit UTF-8 encoding.
# NOTE(review): the destination is a hard-coded absolute Windows path; consider
# a configurable output directory so the notebook runs on other machines.
os.makedirs(r"C:\temp", exist_ok=True)  # create the target folder if it is missing
df.to_csv(r"C:\temp\cleaned_UK_Accident.csv",
          index=False, encoding="utf-8",
          quoting=csv.QUOTE_ALL)  # force quotes so commas don't break rows
