In [5]:
# Read the processed CSV into `df`; every later cell derives from this frame.
source_csv = "../data/processed/df_young_fair_2024.csv"
df = pd.read_csv(source_csv)

# Structural overview: dimensions first, then per-column dtype and non-null counts.
print("Shape:", df.shape)
df.info()

Shape: (13376, 76)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13376 entries, 0 to 13375
Data columns (total 76 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   collision_index                                   13376 non-null  object 
 1   collision_year_x                                  13376 non-null  int64  
 2   collision_ref_no_x                                13376 non-null  object 
 3   vehicle_reference                                 13376 non-null  int64  
 4   vehicle_type                                      13376 non-null  int64  
 5   towing_and_articulation                           13376 non-null  int64  
 6   vehicle_manoeuvre_historic                        13376 non-null  int64  
 7   vehicle_manoeuvre                                 13376 non-null  int64  
 8   vehicle_direction_from                            13376 non-null  int64  
 9 

In [6]:
def show_matching_columns(heading, keywords, ignore_case=False):
    """Print `heading`, then the list of `df` columns containing any keyword.

    Parameters
    ----------
    heading : str
        Line printed before the list of matches.
    keywords : list of str
        Substrings to look for in each column name.
    ignore_case : bool, default False
        When True the column name is lower-cased before matching; the
        original (un-lowered) name is what gets listed.
    """
    hits = []
    for name in df.columns:
        haystack = name.lower() if ignore_case else name
        if any(key in haystack for key in keywords):
            hits.append(name)
    print(heading)
    print(hits)


# "vehicle_reference" is already matched by "ref"; it is kept for explicitness.
show_matching_columns("\nPotential ID columns:", ["index", "ref", "vehicle_reference"])

show_matching_columns("\nSeverity-related columns:", ["severity"])

show_matching_columns("\nLocation columns:", ["location", "longitude", "latitude"])

show_matching_columns("\nIMD / deprivation columns:", ["imd"], ignore_case=True)

show_matching_columns("\nLSOA columns:", ["lsoa"], ignore_case=True)


Potential ID columns:
['collision_index', 'collision_ref_no_x', 'vehicle_reference', 'collision_ref_no_y']

Severity-related columns:
['collision_severity', 'enhanced_severity_collision', 'collision_adjusted_severity_serious', 'collision_adjusted_severity_slight']

Location columns:
['vehicle_location_restricted_lane_historic', 'vehicle_location_restricted_lane', 'junction_location', 'location_easting_osgr', 'location_northing_osgr', 'longitude', 'latitude', 'lsoa_of_accident_location']

IMD / deprivation columns:
['driver_imd_decile']

LSOA columns:
['lsoa_of_driver', 'lsoa_of_accident_location']


In [7]:
# Columns removed before modelling, grouped by the reason for removing them.
# NOTE(review): 'vehicle_manoeuvre_historic' appears in df.info() but is not
# dropped alongside the other historic column - confirm that is intentional.
drop_groups = {
    # Identifiers
    "id": [
        'collision_index',
        'collision_ref_no_x',
        'vehicle_reference',
        'collision_ref_no_y',
    ],
    # Severity fields, dropped as leakage
    "severity_leakage": [
        'collision_severity',
        'enhanced_severity_collision',
        'collision_adjusted_severity_serious',
        'collision_adjusted_severity_slight',
    ],
    # Raw coordinates
    "raw_geo": [
        'location_easting_osgr',
        'location_northing_osgr',
        'longitude',
        'latitude',
    ],
    # Historic duplicate of vehicle_location_restricted_lane
    "historic_duplicate": [
        'vehicle_location_restricted_lane_historic',
    ],
    # LSOA codes
    "lsoa": [
        'lsoa_of_driver',
        'lsoa_of_accident_location',
    ],
}

# Flatten the groups (dict order is preserved) into the single list passed to drop().
cols_to_drop = [column for group in drop_groups.values() for column in group]

df_model = df.drop(columns=cols_to_drop)

print("New shape:", df_model.shape)

New shape: (13376, 61)


In [8]:
# Column roles used to split df_model.
target_col = 'high_risk'
protected_cols = ['sex_of_driver', 'age_band_of_driver']

# Target vector
y = df_model[target_col]

# Protected attributes are held aside for fairness evaluation only
A = df_model[protected_cols]

# Feature matrix: everything except the target and the protected attributes
X = df_model.drop(columns=[target_col, *protected_cols])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Protected attributes shape:", A.shape)

X shape: (13376, 58)
y shape: (13376,)
Protected attributes shape: (13376, 2)


In [9]:
# Tally how many feature columns there are of each dtype.
dtype_per_column = X.dtypes
dtype_per_column.value_counts()

int64     52
object     6
Name: count, dtype: int64

In [10]:
# Names of the feature columns stored with object dtype.
object_features = X.select_dtypes(include=['object'])
cat_cols = object_features.columns
cat_cols

Index(['generic_make_model', 'date', 'time', 'local_authority_ons_district',
       'local_authority_highway', 'local_authority_highway_current'],
      dtype='object')

In [12]:
# Object columns removed from the feature matrix as high-cardinality.
# NOTE(review): this only changes X. X is rebuilt from df_model further down,
# where these columns are still present, so the same drop is repeated there.
high_card_cols = [
    'generic_make_model',
    'local_authority_ons_district',
    'local_authority_highway',
    'local_authority_highway_current',
]

X = X.drop(high_card_cols, axis="columns")

print("After dropping high-cardinality columns:", X.shape)

After dropping high-cardinality columns: (13376, 54)


In [13]:
# Eyeball the first 20 non-null time strings to see which format they use.
non_null_times = df_model['time'].dropna()
non_null_times.iloc[:20]

0     21:13
1     17:45
2     18:20
3     08:25
4     22:16
5     08:14
6     10:30
7     08:45
8     19:00
9     07:51
10    10:32
11    11:30
12    07:37
13    16:04
14    16:25
15    17:55
16    18:32
17    22:57
18    17:50
19    12:30
Name: time, dtype: object

In [14]:
# Count the distinct string lengths among non-null time values;
# a single length suggests one consistent format.
time_string_lengths = df_model['time'].dropna().str.len()
time_string_lengths.value_counts()

time
5    13376
Name: count, dtype: int64

In [15]:
# List the feature columns that still have object dtype.
still_object = X.select_dtypes(include=['object'])
still_object.columns

Index(['date', 'time'], dtype='object')

In [17]:
# Parse the HH:MM strings and keep only the hour component.
hour_of_day = pd.to_datetime(df_model['time'], format='%H:%M').dt.hour

# Append `hour` as a new column and remove the raw `time` column in one step.
df_model = df_model.assign(hour=hour_of_day).drop(columns=['time'])

print("Added hour column. New shape:", df_model.shape)

Added hour column. New shape: (13376, 61)


In [18]:
# Peek at the first five raw date strings to identify their format.
df_model['date'].iloc[:5]

0    31/03/2024
1    05/12/2024
2    04/07/2024
3    26/06/2024
4    13/02/2024
Name: date, dtype: object

In [19]:
# Count the distinct string lengths of the date values;
# a single length suggests one consistent format.
date_string_lengths = df_model['date'].str.len()
date_string_lengths.value_counts()

date
10    13376
Name: count, dtype: int64

In [20]:
# Parse the date strings with an explicit day-first format. The sampled values
# are dd/mm/yyyy (e.g. 31/03/2024) and every value is 10 characters long.
# An explicit format is strict: a value that does not match raises instead of
# being parsed by inference, which `dayfirst=True` alone does not guarantee.
# It also mirrors the explicit format used when parsing `time`.
parsed_dates = pd.to_datetime(df_model['date'], format='%d/%m/%Y')

# Append `month` (1-12) and drop the raw `date` column; the parsed datetimes
# stay in a local Series, so no temporary column is added to df_model.
df_model = df_model.assign(month=parsed_dates.dt.month).drop(columns=['date'])

print("After extracting month:", df_model.shape)

After extracting month: (13376, 61)


In [21]:
# df_model changed structurally above, so y / A / X are rebuilt from it here.
target_col = 'high_risk'
protected_cols = ['sex_of_driver', 'age_band_of_driver']

# Target
y = df_model[target_col]

# Protected attributes (kept for fairness analysis only, never used as features)
A = df_model[protected_cols]

# Features: all remaining columns
X = df_model.drop(columns=[target_col, *protected_cols])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Protected shape:", A.shape)

X shape: (13376, 58)
y shape: (13376,)
Protected shape: (13376, 2)


In [22]:
# Object-dtype columns present in the rebuilt feature matrix.
object_part = X.select_dtypes(include=['object'])
object_part.columns

Index(['generic_make_model', 'local_authority_ons_district',
       'local_authority_highway', 'local_authority_highway_current'],
      dtype='object')

In [23]:
# Print the number of distinct values for each object-dtype feature.
distinct_counts = X.select_dtypes(include=['object']).nunique()
for col, n_distinct in distinct_counts.items():
    print(col, n_distinct)

generic_make_model 552
local_authority_ons_district 162
local_authority_highway 162
local_authority_highway_current 160


In [24]:
# Baseline choice: drop the four high-cardinality object columns for now.
# This keeps the fairness analysis clean and avoids exploding the one-hot
# dimensionality.
high_card_cols = [
    "generic_make_model",
    "local_authority_ons_district",
    "local_authority_highway",
    "local_authority_highway_current",
]

# Remove them from both frames so X and df_model stay consistent.
X = X.drop(high_card_cols, axis="columns")
df_model = df_model.drop(high_card_cols, axis="columns")

print("X shape after dropping high-card cols:", X.shape)
print("df_model shape:", df_model.shape)

X shape after dropping high-card cols: (13376, 54)
df_model shape: (13376, 57)


In [25]:
# A notebook cell only renders its last bare expression, so the dtype tally
# must be printed explicitly or it is silently discarded.
print(X.dtypes.value_counts())

# Expect an empty Index here: no object-dtype features should remain.
X.select_dtypes(include=["object"]).columns

Index([], dtype='object')

In [26]:
# Write df_model to disk as CSV, omitting the index column.
output_csv = "../data/processed/df_young_model_ready_2024.csv"
df_model.to_csv(output_csv, index=False)