In [218]:
import pandas as pd
import numpy as np

# Load datasets with needed columns
india = pd.read_csv('trauma_india_brain_injury.csv', usecols=['age','sex','sbp_1','hr_1','rr_1','gcs_t_1','doa','toa','dodd','todd','moi','died'])
jordan = pd.read_csv('traumatic_brain_injury.csv', usecols=['Gender','age of diagnosis','ER-HR','ER-RR','ER-systolic BP','GCS in ER','type of trauma','length of stay in the hospital (in days)','outcome'])
florida = pd.read_csv('updated_dataset (1).csv', usecols=['Age', 'Gender', 'SBP', 'HR', 'RR', 'MOI', 'GCS', 'Hospital LOS Days', 'Death'])
california = pd.read_csv('traumatic_brain_injury_usa.csv', usecols=['Age', 'Sex', 'SBP', 'HR', 'RR', 'MOI', 'Trauma GCS', 'LOS', 'Mortality'])


In [219]:
india.columns

Index(['age', 'sex', 'doa', 'toa', 'moi', 'sbp_1', 'rr_1', 'hr_1', 'gcs_t_1',
       'died', 'dodd', 'todd'],
      dtype='object')

In [220]:
jordan.columns

Index(['Gender', 'age of diagnosis', 'type of trauma', 'GCS in ER', 'ER-HR',
       'ER-RR', 'ER-systolic BP', 'length of stay in the hospital (in days)',
       'outcome'],
      dtype='object')

In [221]:
florida.columns

Index(['Age', 'Gender', 'SBP', 'HR', 'RR', 'MOI', 'GCS', 'Hospital LOS Days',
       'Death'],
      dtype='object')

In [222]:
california.columns

Index(['Age', 'Sex', 'MOI', 'Mortality', 'SBP', 'HR', 'RR', 'Trauma GCS',
       'LOS'],
      dtype='object')

In [223]:
def map_moi(text):
    if pd.isnull(text):
        return "Other / Unknown"

    text = str(text).strip().lower()

    # Normalize structured RTA-type values (India, Jordan)
    rta_keywords = ['road traffic injury', 'rta', 'mvc', 'vehicle', 'auto', 'car', 'driver', 'passenger', 'collision']
    if any(kw in text for kw in rta_keywords):
        if 'motorcyclist' in text or 'motorcycle' in text:
            return "Motorcycle Crash"
        if 'bicyclist' in text or 'bike' in text:
            return "Bicycle Crash"
        if 'pedestrian' in text:
            return "Pedestrian Struck"
        return "Motor Vehicle Collision"

    # Specific mappings
    if 'motorcycle' in text or 'motorcyclist' in text or 'motocycle' in text:
        return "Motorcycle Crash"
    if 'bicycle' in text or 'bicyclist' in text or 'bike' in text:
        return "Bicycle Crash"
    if 'pedestrian' in text or 'peds vs auto' in text or 'scooter' in text:
        return "Pedestrian Struck"
    if 'fall' in text or 'fell' in text or 'found down' in text or 'stairs' in text or 'roof' in text:
        return "Fall"
    if 'assault' in text or 'struck' in text or 'stab' in text or 'hit' in text or 'pole' in text or 'collision with horse' in text:
        return "Assault / Blunt Force"
    if 'gsw' in text or 'gunshot' in text:
        return "Gunshot Wound (GSW)"
    if 'fire' in text or 'burn' in text or 'blast' in text or 'explosion' in text:
        return "Burn / Fire / Blast"
    if 'train' in text or 'railway' in text:
        return "Other / Unknown"
    if 'alligator' in text or 'animal' in text or 'boat' in text or 'golf cart' in text or 'atv' in text:
        return "Animal / Environmental"

    # Final fallback
    return "Other / Unknown"


# Apply MOI mapping
india['MOI'] = india['moi'].apply(map_moi)
jordan['MOI'] = jordan['type of trauma'].apply(map_moi)
florida['MOI'] = florida['MOI'].apply(map_moi)
california['MOI'] = california['MOI'].apply(map_moi)


In [224]:
def hr_category(age, hr):
    if pd.isnull(hr) or pd.isnull(age): return np.nan
    if age < 1: return 'tachycardia' if hr > 160 else 'normal'
    elif age < 3: return 'tachycardia' if hr > 150 else 'normal'
    elif age < 6: return 'tachycardia' if hr > 140 else 'normal'
    elif age < 12: return 'tachycardia' if hr > 120 else 'normal'
    else: return 'tachycardia' if hr > 100 else 'normal'

def rr_category(age, rr):
    if pd.isnull(rr) or pd.isnull(age): return np.nan
    if age < 1: return 'tachypnea' if rr > 60 else 'normal'
    elif age < 3: return 'tachypnea' if rr > 40 else 'normal'
    elif age < 6: return 'tachypnea' if rr > 34 else 'normal'
    elif age < 12: return 'tachypnea' if rr > 30 else 'normal'
    else: return 'tachypnea' if rr > 20 else 'normal'

def sbp_category(age, sbp):
    if pd.isnull(sbp) or pd.isnull(age): return np.nan
    if age < 1: return 'low' if sbp < 70 else 'normal'
    elif age < 10: return 'low' if sbp < (70 + 2*age) else 'normal'
    else: return 'low' if sbp < 90 else 'normal'


In [225]:
standard_cols = ['age', 'sex', 'sbp', 'hr', 'rr', 'gcs', 'los', 'MOI']


In [226]:
# INDIA
india['sex'] = india['sex'].map({'Male':1,'Female':0})
india['los'] = (pd.to_datetime(india['dodd'] + ' ' + india['todd'], errors='coerce') - 
                pd.to_datetime(india['doa'] + ' ' + india['toa'], errors='coerce')).dt.total_seconds() / 3600 / 24
india['event'] = (india['died'] == 'Yes').astype(int)
india_clean = india.rename(columns={'sbp_1':'sbp', 'hr_1':'hr', 'rr_1':'rr', 'gcs_t_1':'gcs'})[standard_cols + ['event']]

# JORDAN
jordan['sex'] = jordan['Gender'].map({'Male':1,'Female':0})
jordan['age'] = jordan['age of diagnosis']
jordan['sbp'] = jordan['ER-systolic BP']
jordan['hr'] = jordan['ER-HR']
jordan['rr'] = jordan['ER-RR']
jordan['gcs'] = jordan['GCS in ER']
jordan['los'] = jordan['length of stay in the hospital (in days)']
jordan['event'] = (jordan['outcome'] == 'died').astype(int)
jordan_clean = jordan[standard_cols + ['event']]

# FLORIDA
florida = florida.rename(columns={
    'Age': 'age',
    'Gender': 'sex',
    'SBP': 'sbp',
    'HR': 'hr',
    'RR': 'rr',
    'GCS': 'gcs',
    'MOI': 'MOI',
    'Hospital LOS Days': 'los'
})

florida['sex'] = florida['sex'].replace({'Male': 1, 'Female': 0, '1': 1, '0': 0})
florida['event'] = florida['Death'].map({'Yes': 1, 'No': 0, '1': 1, '0': 0, 1: 1, 0: 0}).fillna(0).astype(int)


florida_clean = florida[standard_cols + ['event']].copy()

# CALIFORNIA
california = california.rename(columns={
    'Age': 'age',
    'Sex': 'sex',
    'SBP': 'sbp',
    'HR': 'hr',
    'RR': 'rr',
    'Trauma GCS': 'gcs',
    'MOI': 'MOI',
    'LOS': 'los'
})

california['sex'] = california['sex'].replace({'Male': 1, 'Female': 0, '1': 1, '0': 0,
                                               'M': 1, 'F': 0, 'm': 1, 'f': 0})
california['event'] = california['Mortality'].map({'Yes': 1, 'No': 0, '1': 1, '0': 0, 1: 1, 0: 0}).fillna(0).astype(int)
california_clean = california[standard_cols + ['event']].copy()

# Step 1: Ensure all numerics are truly numeric (age, sbp, hr, rr, gcs, los)
for df in [india_clean, jordan_clean, florida_clean, california_clean]:
    for col in ['age', 'sex', 'sbp', 'hr', 'rr', 'gcs', 'los']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=42)
numerical_cols = ['age', 'sex', 'sbp', 'hr', 'rr', 'gcs', 'los']

for df in [india_clean, jordan_clean, florida_clean, california_clean]:
    available = [col for col in numerical_cols if col in df.columns and df[col].notna().any()]
    missing = [col for col in numerical_cols if col not in available]

    # Impute only available
    imputed = pd.DataFrame(imputer.fit_transform(df[available]), columns=available, index=df.index)
    
    # Fill missing columns with 0 or np.nan
    for col in missing:
        imputed[col] = 0  # or use np.nan

    # Reorder to match numerical_cols
    imputed = imputed[numerical_cols]
    
    # Assign back
    df[numerical_cols] = imputed


for df in [india_clean, jordan_clean, florida_clean, california_clean]:
    df['hr_cat'] = [hr_category(a, h) for a, h in zip(df['age'], df['hr'])]
    df['rr_cat'] = [rr_category(a, r) for a, r in zip(df['age'], df['rr'])]
    df['sbp_cat'] = [sbp_category(a, s) for a, s in zip(df['age'], df['sbp'])]

# For Jordan only — preserve categorical vitals and drop raw ones
jordan_clean['hr_cat'] = jordan['ER-HR']
jordan_clean['rr_cat'] = jordan['ER-RR']
jordan_clean['sbp_cat'] = jordan['ER-systolic BP']

# Drop incorrectly imputed raw vital signs (which were 0-filled)
jordan_clean.drop(columns=['hr', 'rr', 'sbp'], inplace=True, errors='ignore')


  india['los'] = (pd.to_datetime(india['dodd'] + ' ' + india['todd'], errors='coerce') -
  pd.to_datetime(india['doa'] + ' ' + india['toa'], errors='coerce')).dt.total_seconds() / 3600 / 24
  california['sex'] = california['sex'].replace({'Male': 1, 'Female': 0, '1': 1, '0': 0,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numerical_cols] = imputed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

In [227]:
india_clean

Unnamed: 0,age,sex,sbp,hr,rr,gcs,los,MOI,event,hr_cat,rr_cat,sbp_cat
0,40.0,1.0,120.000000,70.0,22.000000,15.0,6.295162,Fall,0,normal,tachypnea,normal
1,27.0,1.0,130.000000,84.0,15.000000,5.0,10.207837,Motor Vehicle Collision,0,normal,normal,normal
2,45.0,1.0,110.000000,80.0,19.294120,15.0,5.958333,Motor Vehicle Collision,0,normal,normal,normal
3,50.0,1.0,122.139506,88.0,24.000000,4.0,2.402778,Other / Unknown,1,normal,tachypnea,normal
4,50.0,0.0,130.000000,80.0,22.000000,3.0,2.527778,Other / Unknown,0,normal,tachypnea,normal
...,...,...,...,...,...,...,...,...,...,...,...,...
7973,8.0,0.0,102.000000,100.0,20.062082,11.0,1.208333,Fall,0,normal,normal,normal
7974,45.0,1.0,130.000000,72.0,14.000000,11.0,8.861111,Motor Vehicle Collision,0,normal,normal,normal
7975,11.0,1.0,100.000000,88.0,19.786547,15.0,2.506944,Fall,0,normal,normal,normal
7976,45.0,0.0,110.000000,90.0,24.000000,6.0,9.008322,Motor Vehicle Collision,1,normal,tachypnea,normal


In [228]:
jordan_clean

Unnamed: 0,age,sex,gcs,los,MOI,event,hr_cat,rr_cat,sbp_cat
0,6.0,0.0,15.0,5.0,Motor Vehicle Collision,0,tachycardia,tachypnea,normal
1,7.0,1.0,15.0,7.0,Motor Vehicle Collision,0,normal,normal,normal
2,13.0,1.0,3.0,7.0,Fall,0,normal,normal,normal
3,9.0,1.0,13.0,28.0,Motor Vehicle Collision,0,normal,normal,normal
4,1.5,1.0,15.0,3.0,Fall,0,normal,normal,normal
...,...,...,...,...,...,...,...,...,...
107,0.8,1.0,0.0,4.0,Fall,0,tachycardia,tachypnea,low
108,0.3,1.0,0.0,2.0,Fall,0,tachycardia,tachypnea,normal
109,0.8,1.0,0.0,6.0,Fall,0,tachycardia,tachypnea,low
110,0.4,1.0,0.0,6.0,Motor Vehicle Collision,0,tachycardia,tachypnea,low


In [229]:
print(florida['sex'].unique())
print(california['sex'].unique())


[0 1]
[1 0]


In [230]:
florida_clean

Unnamed: 0,age,sex,sbp,hr,rr,gcs,los,MOI,event,hr_cat,rr_cat,sbp_cat
0,23.0,0.0,107.0,133.0,14.0,3.0,107.0,Motor Vehicle Collision,0,tachycardia,normal,normal
1,53.0,1.0,167.0,105.0,16.0,15.0,25.0,Fall,1,tachycardia,normal,normal
2,34.0,0.0,100.0,117.0,30.0,6.0,54.0,Pedestrian Struck,0,tachycardia,tachypnea,normal
3,35.0,1.0,97.0,102.0,19.0,7.0,26.0,Motor Vehicle Collision,1,tachycardia,normal,normal
4,71.0,1.0,95.0,130.0,16.0,3.0,41.0,Gunshot Wound (GSW),0,tachycardia,normal,normal
...,...,...,...,...,...,...,...,...,...,...,...,...
258,77.0,0.0,185.0,103.0,16.0,15.0,16.0,Fall,0,tachycardia,normal,normal
259,70.0,1.0,162.0,99.0,16.0,15.0,21.0,Fall,0,normal,normal,normal
260,40.0,1.0,135.0,120.0,24.0,3.0,56.0,Pedestrian Struck,0,tachycardia,tachypnea,normal
261,49.0,1.0,179.0,112.0,33.0,3.0,34.0,Motor Vehicle Collision,0,tachycardia,tachypnea,normal


In [231]:
california_clean

Unnamed: 0,age,sex,sbp,hr,rr,gcs,los,MOI,event,hr_cat,rr_cat,sbp_cat
0,60.0,1.0,153.0,160.0,22.0,14.0,5.0,Gunshot Wound (GSW),1,tachycardia,tachypnea,normal
1,38.0,1.0,77.0,110.0,0.0,3.0,12.0,Motor Vehicle Collision,1,tachycardia,normal,low
2,61.0,1.0,148.0,82.0,18.0,14.0,2.0,Motor Vehicle Collision,1,normal,normal,normal
3,62.0,1.0,150.0,95.0,20.0,14.0,2.0,Motor Vehicle Collision,1,normal,normal,normal
4,22.0,1.0,200.0,82.0,0.0,3.0,7.0,Motor Vehicle Collision,1,normal,normal,normal
...,...,...,...,...,...,...,...,...,...,...,...,...
578,68.0,1.0,150.0,81.0,16.0,13.0,45.0,Motor Vehicle Collision,0,normal,normal,normal
579,46.0,1.0,102.0,86.0,20.0,14.0,19.0,Motor Vehicle Collision,0,normal,normal,normal
580,52.0,1.0,168.0,99.0,0.0,3.0,24.0,Motor Vehicle Collision,0,normal,normal,normal
581,47.0,0.0,178.0,114.0,28.0,14.0,36.0,Motor Vehicle Collision,0,tachycardia,tachypnea,normal


In [232]:
for name, df in zip(["India", "Jordan", "Florida", "California"], [india, jordan, florida, california]):
    print(f"\n{name} MOI Value Counts:")
    print(df['MOI'].value_counts(normalize=True).round(3) * 100)



India MOI Value Counts:
MOI
Fall                       39.4
Motor Vehicle Collision    38.0
Other / Unknown             8.5
Motorcycle Crash            6.2
Assault / Blunt Force       4.7
Pedestrian Struck           2.6
Bicycle Crash               0.4
Burn / Fire / Blast         0.1
Name: proportion, dtype: float64

Jordan MOI Value Counts:
MOI
Motor Vehicle Collision    60.7
Fall                       29.5
Other / Unknown             4.5
Burn / Fire / Blast         2.7
Assault / Blunt Force       2.7
Name: proportion, dtype: float64

Florida MOI Value Counts:
MOI
Motor Vehicle Collision    21.7
Fall                       18.6
Pedestrian Struck          17.5
Motorcycle Crash           15.2
Other / Unknown            12.2
Gunshot Wound (GSW)         7.2
Bicycle Crash               3.0
Assault / Blunt Force       2.7
Animal / Environmental      1.5
Burn / Fire / Blast         0.4
Name: proportion, dtype: float64

California MOI Value Counts:
MOI
Fall                       42.7
Motor Veh

In [233]:
for name, df, raw_col in [
    ("India", india, 'moi'),
    ("Jordan", jordan, 'type of trauma'),
    ("Florida", florida, 'MOI'),
    ("California", california, 'MOI')
]:
    print(f"\n🔍 {name} – Raw values mapped to 'Other / Unknown':")
    raw_vals = df[df['MOI'] == 'Other / Unknown'][raw_col]
    print(raw_vals.value_counts())



🔍 India – Raw values mapped to 'Other / Unknown':
moi
Railway injury    419
Other             225
Name: count, dtype: int64

🔍 Jordan – Raw values mapped to 'Other / Unknown':
Series([], Name: count, dtype: int64)

🔍 Florida – Raw values mapped to 'Other / Unknown':
MOI
Other / Unknown    32
Name: count, dtype: int64

🔍 California – Raw values mapped to 'Other / Unknown':
MOI
Other / Unknown    16
Name: count, dtype: int64


In [234]:
india_clean.to_csv('india_clean.csv', index=False)
jordan_clean.to_csv('jordan_clean.csv', index=False)
florida_clean.to_csv('florida_clean.csv', index=False)
california_clean.to_csv('california_clean.csv', index=False)
