In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from category_encoders import CatBoostEncoder
from sklearn.model_selection import KFold

# Raw extract to read and the processed file this cell writes.
input_path = Path("data/raw/wwlLancMsc_data.csv")
output_path = Path("data/processed/merge_Sophie_col_80_100.csv")

df = pd.read_csv(input_path)

# Only the final 20 columns of the raw frame are handled in this cell.
last_20_cols = list(df.columns[-20:])
df_my = df.loc[:, last_20_cols].copy()

# Attach the target (when the raw file has it) for the target-based encoding
# further down; `target` stays None otherwise.
target_col = "spell_episode_los"
target = None
if target_col in df.columns:
    df_my[target_col] = df[target_col]
    target = df_my[target_col]

# PROCESSING 

# 1-2. chronic_condition_obesity_flag / chronic_condition_respiratory_flag
# Literal "NA" entries become missing values, then each column is stored as
# nullable Int64 so the missing entries survive the integer cast.
for flag_col in (
    "chronic_condition_obesity_flag",
    "chronic_condition_respiratory_flag",
):
    if flag_col not in df_my.columns:
        continue
    df_my[flag_col] = df_my[flag_col].replace(["NA"], np.nan).astype("Int64")

# 3. frailty_score
if "frailty_score" in df_my.columns:
    # Work on the text form of each value and keep the first run of digits;
    # entries containing no digit at all end up as NaN.
    frailty_text = df_my["frailty_score"].astype(str)
    frailty_digits = frailty_text.str.extract(r"(\d+)", expand=False)
    df_my["frailty_score"] = frailty_digits.astype(float)

# 4-5. Arrival_Date / arrival_date_time
# Both columns are used to derive season_of_the_admission and are dropped at
# the end of this section.
if {"arrival_date_time", "Arrival_Date"}.issubset(df_my.columns):

    # Parsed arrival date, reused below as the reference calendar day.
    df_my["Arrival_Date"] = pd.to_datetime(df_my["Arrival_Date"], errors="coerce")
    arrival_known = df_my["Arrival_Date"].notna()
    arrival_day = df_my["Arrival_Date"].dt.date

    # Re-read the timestamp column from disk as plain text, restricted to
    # df_my's rows, and trim surrounding whitespace.
    raw_column = pd.read_csv(
        input_path,
        usecols=["arrival_date_time"],
        dtype=str,
    )
    raw = raw_column.loc[df_my.index, "arrival_date_time"].astype(str).str.strip()

    # Parse the same text under both the day-first and month-first reading.
    adt_dayfirst = pd.to_datetime(raw, dayfirst=True, errors="coerce")
    adt_monthfirst = pd.to_datetime(raw, dayfirst=False, errors="coerce")

    # Start with everything unresolved and fill in three passes.
    final = pd.Series(pd.NaT, index=df_my.index, dtype='datetime64[ns]')

    # Pass 1: the day-first parse lands on the same calendar day as Arrival_Date.
    same_day_dayfirst = adt_dayfirst.dt.date == arrival_day
    mask_match_day = arrival_known & adt_dayfirst.notna() & same_day_dayfirst
    final.loc[mask_match_day] = adt_dayfirst.loc[mask_match_day]

    # Pass 2: still unresolved, but the month-first parse matches Arrival_Date.
    same_day_monthfirst = adt_monthfirst.dt.date == arrival_day
    mask_match_month = (
        arrival_known
        & adt_monthfirst.notna()
        & final.isna()
        & same_day_monthfirst
    )
    final.loc[mask_match_month] = adt_monthfirst.loc[mask_match_month]

    # Pass 3: whatever is left takes the day-first parse, falling back to the
    # month-first parse where day-first produced nothing.
    remaining = final.isna()
    fallback = adt_dayfirst.loc[remaining].combine_first(adt_monthfirst.loc[remaining])
    final.loc[remaining] = fallback

    df_my["arrival_date_time"] = final

    # Rows where both values exist yet fall on different calendar days.
    # Nothing in this cell reads mismatch_df afterwards.
    days_differ = arrival_day != df_my["arrival_date_time"].dt.date
    mismatch_df = df_my.loc[
        arrival_known & df_my["arrival_date_time"].notna() & days_differ
    ]

    def month_to_season(month):
        """Return the season code for a month number, or None if unrecognised.

        Codes: 1 = Winter (Dec-Feb), 2 = Spring (Mar-May),
        3 = Summer (Jun-Aug), 4 = Autumn (Sep-Nov).
        """
        months_by_season = ((12, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11))
        for season_code, months in enumerate(months_by_season, start=1):
            if month in months:
                return season_code
        return None

    # Season code per row; rows without a usable month end up as -1.
    admission_month = df_my["arrival_date_time"].dt.month
    df_my["season_of_the_admission"] = admission_month.apply(month_to_season)
    df_my["season_of_the_admission"] = (
        df_my["season_of_the_admission"]
        .replace(["NA"], np.nan)
        .replace([np.nan], -1)
    )

    df_my = df_my.drop(columns=["Arrival_Date", "arrival_date_time"])


# 6. attendancetype
# Missing values get an explicit "Unknown" category before label encoding.
if "attendancetype" in df_my.columns:
    attendance_filled = df_my["attendancetype"].fillna("Unknown")
    le_attendance = LabelEncoder()
    df_my["attendancetype"] = le_attendance.fit_transform(attendance_filled)

# 7-8. initial_assessment_date_time and sex_description.y - Delete
for unused_col in ("initial_assessment_date_time", "sex_description.y"):
    if unused_col in df_my.columns:
        df_my = df_my.drop(columns=[unused_col])

# 9. arrival_mode_description
# Both the string "NA" and the value 0 are treated as missing here.
if "arrival_mode_description" in df_my.columns:
    arrival_mode_filled = (
        df_my["arrival_mode_description"]
        .replace(["NA", 0], np.nan)
        .fillna("Unknown")
    )
    le_arrival_mode = LabelEncoder()
    df_my["arrival_mode_description"] = le_arrival_mode.fit_transform(arrival_mode_filled)

# 10. place_of_incident
if "place_of_incident" in df_my.columns:
    place_filled = df_my["place_of_incident"].fillna("Unknown")
    le_place = LabelEncoder()
    df_my["place_of_incident"] = le_place.fit_transform(place_filled)

# 11. source_of_ref_description
if "source_of_ref_description" in df_my.columns:
    source_filled = df_my["source_of_ref_description"].fillna("Unknown")
    le_source = LabelEncoder()
    df_my["source_of_ref_description"] = le_source.fit_transform(source_filled)

# 12. presenting_complaint - CatBoost encoding
# With a target available, the column is replaced by three numeric features:
# an out-of-fold target encoding, a frequency count and a rarity flag.
if "presenting_complaint" in df_my.columns:
    df_my["presenting_complaint"] = df_my["presenting_complaint"].fillna("Unknown")

    if target is None:
        print("  Skipping CatBoost encoding (no target variable)")
        # Keep as categorical for CatBoost - no encoding needed
    else:
        # Each row is encoded by an encoder fitted on the other four folds,
        # so a row's own target value never feeds its encoded value.
        complaint_frame = df_my[["presenting_complaint"]]
        k_folds = KFold(n_splits=5, shuffle=True, random_state=98)
        oof_values = pd.Series(index=df_my.index, dtype=float)

        for fit_rows, holdout_rows in k_folds.split(df_my):
            encoder = CatBoostEncoder(cols=["presenting_complaint"], a=20.0)
            encoder.fit(complaint_frame.iloc[fit_rows], target.iloc[fit_rows])
            holdout_encoded = encoder.transform(complaint_frame.iloc[holdout_rows])
            oof_values.iloc[holdout_rows] = holdout_encoded["presenting_complaint"]

        df_my["presenting_complaint_encoded"] = oof_values

        # Frequency of each category, counted on the raw (unfilled) column.
        # NOTE(review): value_counts() skips missing values, so rows filled
        # with "Unknown" above get no count unless the raw data contains the
        # literal "Unknown"; they end up with count -1 and is_rare 0.
        # Confirm this is intended.
        raw_counts = df["presenting_complaint"].value_counts()
        freq = df_my["presenting_complaint"].map(raw_counts)
        df_my["presenting_complaint_count"] = (
            freq.replace(["NA"], np.nan).replace([np.nan], -1)
        )

        # Rare flag: categories seen fewer than 10 times.
        df_my["presenting_complaint_is_rare"] = (freq < 10).astype(int)

        # The text column is no longer needed once the features exist.
        df_my = df_my.drop(columns=["presenting_complaint"])

# 13. acuity_code
# Numeric acuity code with -1 standing in for missing or unparseable values.
if "acuity_code" in df_my.columns:
    # Diagnostic output: values and summary before cleaning.
    print(df_my["acuity_code"].unique())
    print(df_my["acuity_code"].describe())
    # Coerce to numeric first: "NA" and any other non-numeric entry become
    # NaN here. Filling afterwards guarantees no NaN is left behind, whereas
    # filling before the coercion let coercion failures reappear as NaN.
    acuity_numeric = pd.to_numeric(df_my["acuity_code"], errors="coerce")
    df_my["acuity_code"] = acuity_numeric.fillna(-1)
    # Diagnostic output: values after cleaning.
    print(df_my["acuity_code"].unique())

# 14. inj_or_ail - label encode (missing values become "Unknown")
if "inj_or_ail" in df_my.columns:
    df_my["inj_or_ail"] = df_my["inj_or_ail"].fillna("Unknown")
    # Use a dedicated encoder name: reusing `le_source` here would overwrite
    # the fitted source_of_ref_description encoder and make its label
    # mapping unrecoverable.
    le_inj_or_ail = LabelEncoder()
    df_my["inj_or_ail"] = le_inj_or_ail.fit_transform(df_my["inj_or_ail"])

# 15. attend_dis_description - Delete
if "attend_dis_description" in df_my.columns:
    df_my = df_my.drop(columns=["attend_dis_description"])

# 16. ae_unplanned_attendance
# Missing values are counted as 0 before the integer cast.
if "ae_unplanned_attendance" in df_my.columns:
    df_my["ae_unplanned_attendance"] = df_my["ae_unplanned_attendance"].fillna(0).astype(int)

# 17. location - label encode (missing values become "Unknown")
if "location" in df_my.columns:
    df_my["location"] = df_my["location"].fillna("Unknown")
    # Use a dedicated encoder name: reusing `le_source` here would overwrite
    # the fitted source_of_ref_description encoder.
    le_location = LabelEncoder()
    df_my["location"] = le_location.fit_transform(df_my["location"])

# 18. ID - delete
# Guarded on its own: previously this drop sat inside the `location` branch,
# so it raised KeyError when ID was absent and was skipped entirely when
# `location` was absent.
if "ID" in df_my.columns:
    df_my = df_my.drop(columns=["ID"])

# 19. Deprivation Decile
# Non-numeric entries become missing; nullable Int64 keeps them missing.
if "Deprivation Decile" in df_my.columns:
    decile_numeric = pd.to_numeric(df_my["Deprivation Decile"], errors="coerce")
    df_my["Deprivation Decile"] = decile_numeric.astype("Int64")

# 20. NEWS2
if "NEWS2" in df_my.columns:
    news2_as_loaded = df_my["NEWS2"]
    # The indicator reflects values that were already missing on load.
    # NOTE(review): entries equal to "NA", "NaN", "" or 0 are turned into -1
    # below but are NOT flagged here, because the flag is computed before
    # that replacement. Confirm whether 0 should count as missing and
    # whether the flag should cover it.
    df_my["NEWS2_missing"] = news2_as_loaded.isna().astype("int64")
    news2_cleaned = news2_as_loaded.replace(["NA", "NaN", "", 0], np.nan)
    df_my["NEWS2"] = news2_cleaned.fillna(-1).astype("int64")


# Remove the target again if it was only attached for the encoding step
# (i.e. it was not one of the last 20 columns to begin with).
if target_col in df_my.columns and target_col not in last_20_cols:
    df_my = df_my.drop(columns=[target_col])

# Create the destination folder if needed so a fresh checkout does not fail
# on the write, then save without the index column.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_my.to_csv(output_path, index=False)

  df = pd.read_csv(input_path)


[nan  3.  4.  2.  1.  5.]
count    11017.000000
mean         2.744849
std          0.623447
min          1.000000
25%          2.000000
50%          3.000000
75%          3.000000
max          5.000000
Name: acuity_code, dtype: float64
[-1.  3.  4.  2.  1.  5.]

Removing target column 'spell_episode_los' before saving...
Processing complete! Output shape: (41846, 18)

Processed columns: ['chronic_condition_obesity_flag', 'chronic_condition_respiratory_flag', 'frailty_score', 'attendancetype', 'arrival_mode_description', 'place_of_incident', 'source_of_ref_description', 'acuity_code', 'inj_or_ail', 'NEWS2', 'ae_unplanned_attendance', 'location', 'Deprivation Decile', 'season_of_the_admission', 'presenting_complaint_encoded', 'presenting_complaint_count', 'presenting_complaint_is_rare', 'NEWS2_missing']

Data types:
chronic_condition_obesity_flag          Int64
chronic_condition_respiratory_flag      Int64
frailty_score                         float64
attendancetype                      