In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer


In [2]:
df = pd.read_csv('colorectal_cancer_prediction.csv')

In [3]:
pd.set_option('display.max_columns', None)


In [4]:
df.columns

Index(['Patient_ID', 'Age', 'Gender', 'Race', 'Region', 'Urban_or_Rural',
       'Socioeconomic_Status', 'Family_History', 'Previous_Cancer_History',
       'Stage_at_Diagnosis', 'Tumor_Aggressiveness', 'Colonoscopy_Access',
       'Screening_Regularity', 'Diet_Type', 'BMI', 'Physical_Activity_Level',
       'Smoking_Status', 'Alcohol_Consumption', 'Red_Meat_Consumption',
       'Fiber_Consumption', 'Insurance_Coverage', 'Time_to_Diagnosis',
       'Treatment_Access', 'Chemotherapy_Received', 'Radiotherapy_Received',
       'Surgery_Received', 'Follow_Up_Adherence', 'Survival_Status',
       'Recurrence', 'Time_to_Recurrence'],
      dtype='object')

In [5]:
df.describe

<bound method NDFrame.describe of        Patient_ID  Age  Gender   Race         Region Urban_or_Rural  \
0               1   71    Male  Other         Europe          Urban   
1               2   34  Female  Black  North America          Urban   
2               3   80  Female  White  North America          Urban   
3               4   40    Male  Black  North America          Rural   
4               5   43  Female  White         Europe          Urban   
...           ...  ...     ...    ...            ...            ...   
89940       89941   72    Male  White  North America          Urban   
89941       89942   59  Female  Asian  North America          Urban   
89942       89943   77  Female  White         Africa          Rural   
89943       89944   37    Male  Other  North America          Rural   
89944       89945   69    Male  White         Europe          Urban   

      Socioeconomic_Status Family_History Previous_Cancer_History  \
0                   Middle            Yes   

In [6]:
yes = df[df['Recurrence'] == 'Yes']
no = df[df['Recurrence'] == 'No']

In [7]:
display(yes['Recurrence'].value_counts(), no['Recurrence'].value_counts())

Recurrence
Yes    26970
Name: count, dtype: int64

Recurrence
No    62975
Name: count, dtype: int64

In [None]:
df_no_downsampled = no.sample(n=len(yes), random_state=42)

df_balanced = pd.concat([yes, df_no_downsampled], axis=0)

df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

df = df_balanced.copy()

In [9]:
categorical = ['Gender', 'Race','Urban_or_Rural','Colonoscopy_Access', 'Fiber_Consumption', 'Tumor_Aggressiveness', 'Diet_Type', 'Smoking_Status', 'Alcohol_Consumption', 'Red_Meat_Consumption', 'Physical_Activity_Level', 'Socioeconomic_Status', 'Family_History', 'Previous_Cancer_History', 'Stage_at_Diagnosis', 'Insurance_Coverage', 'Time_to_Diagnosis', 'Treatment_Access', 'Chemotherapy_Received', 'Radiotherapy_Received', 'Surgery_Received', 'Follow_Up_Adherence']
numerical = ['Age']

In [10]:
df= df.drop(['Patient_ID', 'Time_to_Recurrence'], axis=1)
df['Recurrence'] = df['Recurrence'].apply(lambda x: 1 if x == 'Yes' else 0)


In [11]:
df.head(2)

Unnamed: 0,Age,Gender,Race,Region,Urban_or_Rural,Socioeconomic_Status,Family_History,Previous_Cancer_History,Stage_at_Diagnosis,Tumor_Aggressiveness,Colonoscopy_Access,Screening_Regularity,Diet_Type,BMI,Physical_Activity_Level,Smoking_Status,Alcohol_Consumption,Red_Meat_Consumption,Fiber_Consumption,Insurance_Coverage,Time_to_Diagnosis,Treatment_Access,Chemotherapy_Received,Radiotherapy_Received,Surgery_Received,Follow_Up_Adherence,Survival_Status,Recurrence
0,49,Male,Black,Europe,Rural,Low,No,No,III,Medium,No,Regular,Western,19.9,Medium,Never,Low,Low,Low,Yes,Delayed,Good,No,Yes,Yes,Good,Deceased,1
1,76,Female,White,Europe,Urban,Low,No,No,II,High,No,Never,Balanced,25.3,High,Current,Low,Low,Medium,Yes,Timely,Limited,No,Yes,No,Good,Deceased,0


In [12]:
yes = df[df['Recurrence'] == 1]
no = df[df['Recurrence'] == 0]
display(yes['Recurrence'].value_counts(), no['Recurrence'].value_counts())

Recurrence
1    26970
Name: count, dtype: int64

Recurrence
0    26970
Name: count, dtype: int64

In [13]:
preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical)
    ]
)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('classifier', LogisticRegression(
            class_weight='balanced',
            random_state=42
        ))
    ]
)

In [15]:
X = df.drop('Recurrence', axis=1)
y = df['Recurrence']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [17]:
model.fit(X_train,y_train)

In [18]:
y_pred = model.predict(X_test)


In [19]:
y_proba = model.predict_proba(X_test)[:, 1]


In [20]:

from sklearn.metrics import roc_auc_score


print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.51      0.51      8091
           1       0.51      0.51      0.51      8091

    accuracy                           0.51     16182
   macro avg       0.51      0.51      0.51     16182
weighted avg       0.51      0.51      0.51     16182


ROC AUC Score: 0.5078158729032707


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200, 500],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}


model = Pipeline([
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('classifier', RandomForestClassifier(
        n_estimators=100, 
        max_depth=10, 
        class_weight='balanced', 
        random_state=42
    ))
])



grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validated ROC AUC:", grid_search.best_score_)

Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best cross-validated ROC AUC: 0.5045589134773254


In [22]:
y_train.value_counts()

Recurrence
0    18879
1    18879
Name: count, dtype: int64

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

preprocessor = ColumnTransformer([
    ('numerical', StandardScaler(), numerical),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical)
])

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

X = df.drop('Recurrence', axis=1)
y = df['Recurrence']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train_preprocessed = model_pipeline.fit_transform(X_train)
X_test_preprocessed = model_pipeline.transform(X_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

clf = RandomForestClassifier(random_state=42, class_weight='balanced')
clf.fit(X_train_resampled, y_train_resampled)

y_pred = clf.predict(X_test_preprocessed)

In [None]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))

Classification Report:
              precision    recall  f1-score   support

          No       0.70      0.51      0.59     18893
         Yes       0.30      0.50      0.38      8091

    accuracy                           0.50     26984
   macro avg       0.50      0.50      0.48     26984
weighted avg       0.58      0.50      0.52     26984


ROC AUC Score: 0.5014193305555698


Well there s n point in evaluating fairness for a model that makes predictions based on luck