In [2186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier

In [2187]:
seed = 42

# Data preparation

In [2188]:
def clean_df(df):
    df["Tumor Size"] = df["Tumor Size"].fillna(value=df["Tumor Size"].mean())

    df = df.drop(
        ["Race", "Marital Status"],
        axis=1,
        errors="ignore",
    )

    return df

def prep_features(df: pd.DataFrame):
    df = df.drop(["ID", "Status"], axis=1, errors='ignore')

    # handle stages: we want gradual data
    df["T Stage"] = df["T Stage"].map({"T1": 0, "T2": 1, "T3": 2, "T4": 3})
    df["N Stage"] = df["N Stage"].map({"N1": 0, "N2": 1, "N3": 2})

    # dummy encoding
    dummy_cols = (df.select_dtypes(include="object").nunique() < 10).index.tolist()
    dummies = pd.get_dummies(df[dummy_cols], prefix=dummy_cols, drop_first=True)
    df = pd.concat([df, dummies], axis=1)

    df = df.select_dtypes(exclude='object')

    return df

In [2189]:
df = pd.read_csv("train_data.csv")
df = clean_df(df)

df_train = prep_features(df)

In [2190]:
df_train.head()

Unnamed: 0,Age,T Stage,N Stage,Tumor Size,Regional Node Examined,Reginol Node Positive,Reginol Node Negative,Blood Pressure,Diastolic Pressure,Cholesterol,...,T_N_Stage_T2_N3,T_N_Stage_T3_N1,T_N_Stage_T3_N2,T_N_Stage_T3_N3,T_N_Stage_T4_N1,T_N_Stage_T4_N2,T_N_Stage_T4_N3,Hormone_Status_Negative_Positive,Hormone_Status_Positive_Negative,Hormone_Status_Positive_Positive
0,62.300654,0,0,720.03894,61.95214,-3.00035,20.741135,138.774174,106.220079,174.569257,...,False,False,False,False,False,False,False,False,False,True
1,37.268422,1,0,3998.797606,3.807271,2.312096,0.917586,167.47013,79.398602,188.95156,...,False,False,False,False,False,False,False,False,False,True
2,55.864953,0,0,10.712932,54.044711,6.499322,4.648477,179.729892,86.773977,194.401287,...,False,False,False,False,False,False,False,False,False,True
3,60.586799,0,0,15.964767,11.860835,11.537015,6.59267,147.564722,119.535324,169.136234,...,False,False,False,False,False,False,False,False,False,True
4,48.197741,1,0,21.246372,4.571804,-6.975745,6.521427,129.900615,86.920775,226.722842,...,False,False,False,False,False,False,False,False,False,True


# Exploratory data analysis

In [2191]:
# any missing values?

df_train.isna().sum().sort_values(ascending=False).sum()

0

In [2192]:
# any values without any correlation?

status = df["Status"] == "Dead"
corrs = df_train.corrwith(status).sort_values(ascending=False)

to_drop = (corrs < 0.003) & (corrs > -0.003)
cols_dropped = df_train.columns[to_drop].tolist()

df_train = df_train.drop(cols_dropped, axis=1)

print(cols_dropped)
f"removed {(to_drop).sum()} features with bad correlation"

['Grade_3', 'A Stage_Regional', 'Estrogen Status_Positive']


'removed 3 features with bad correlation'

In [2193]:
aligned_status = df.loc[df_train.index, "Status"]

# Models

In [2194]:
X_train, X_val, y_train, y_val = train_test_split(
    df_train, aligned_status, test_size=0.2, random_state=seed
)

In [2195]:
def evaluate(clf):
    scorer = make_scorer(precision_score, pos_label="Dead")

    scores = cross_val_score(clf, X_train, y_train, cv=3, scoring=scorer, n_jobs=-1)
    cv = np.mean(scores) - np.std(scores)

    clf.fit(X_train, y_train)
    pr = precision_score(y_val, clf.predict(X_val), pos_label="Dead")

    return cv, pr

In [2196]:
rf = RandomForestClassifier(n_estimators=350, random_state=seed)

evaluate(rf)

(0.43278818472020153, 0.75)

In [2197]:
clf = rf

# Submission

In [2198]:
test_df = pd.read_csv("test_data.csv")
test_df = clean_df(test_df)
features = prep_features(test_df)
features = features.drop(cols_dropped, axis=1)

In [2199]:
# subtask 1
conditions = [
    test_df["GFR"] >= 90,
    (60 <= test_df["GFR"]) & (test_df["GFR"] < 90)
]

choices = ["Normal", "Mildly Decreased"]
subtask1 = np.select(conditions, choices)

# subtask 2
serum = df["Serum Creatinine"]
serum_test = test_df["Serum Creatinine"]

q1 = np.quantile(serum, 0.25)
q2 = np.quantile(serum, 0.50)
q3 = np.quantile(serum, 0.75)

conditions = [
    serum_test <= q1,
    (q1 < serum_test) & (serum_test <= q2),
    (q2 < serum_test) & (serum_test <= q3),
    (serum_test > q3),
]

choices = ["Very Low", "Low", "High", "Very High"]
subtask2 = np.select(conditions, choices)

# subtask 3
bmi_mean = df["BMI"].median()

subtask3 = (test_df["BMI"] > bmi_mean).astype(int)

# subtask 4
stage_counts = df["T Stage"].value_counts()
subtask4 = test_df["T Stage"].map(stage_counts)

In [2200]:
subtask5 = clf.predict(features)

In [2201]:
ids = test_df["ID"]

def build_subtask_df(subtask_id, answers):
    return pd.DataFrame(
        {"subtaskID": subtask_id, "datapointID": ids, "answer": answers}
    )

subtask_definitions = [
    (1, subtask1),
    (2, subtask2),
    (3, subtask3),
    (4, subtask4),
    (5, subtask5),
]

submission = pd.concat(
    [build_subtask_df(sid, subtask) for sid, subtask in subtask_definitions],
    ignore_index=True,
)

submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,3220,Normal
1,1,3221,Normal
2,1,3222,Normal
3,1,3223,Mildly Decreased
4,1,3224,Mildly Decreased


In [2202]:
submission.to_csv("submission.csv", index=False)