In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import ttest_ind

In [2]:
# Load dataset
data = pd.read_csv("./dataset/colorectal_cancer_prediction.csv")
data.head(4)

Unnamed: 0,Patient_ID,Age,Gender,Race,Region,Urban_or_Rural,Socioeconomic_Status,Family_History,Previous_Cancer_History,Stage_at_Diagnosis,...,Insurance_Coverage,Time_to_Diagnosis,Treatment_Access,Chemotherapy_Received,Radiotherapy_Received,Surgery_Received,Follow_Up_Adherence,Survival_Status,Recurrence,Time_to_Recurrence
0,1,71,Male,Other,Europe,Urban,Middle,Yes,No,III,...,Yes,Delayed,Good,Yes,No,No,Good,Survived,No,16
1,2,34,Female,Black,North America,Urban,Middle,No,No,I,...,No,Timely,Good,No,Yes,Yes,Poor,Deceased,No,28
2,3,80,Female,White,North America,Urban,Middle,No,No,III,...,Yes,Timely,Limited,No,Yes,Yes,Good,Survived,No,26
3,4,40,Male,Black,North America,Rural,Low,No,No,I,...,Yes,Delayed,Limited,Yes,No,Yes,Poor,Deceased,No,44


In [3]:
# Drop unnecessary columns (e.g., Patient_ID if it exists)
data.drop(columns=['Patient_ID','Recurrence','Time_to_Recurrence'], inplace=True, errors='ignore')

In [4]:
# Data Preprocessing
print("Initial Data Overview:")
print(data.info())
print("\nMissing Values:")
print(data.isnull().sum())

# Handling missing values
data = data.dropna()

Initial Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89945 entries, 0 to 89944
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      89945 non-null  int64  
 1   Gender                   89945 non-null  object 
 2   Race                     89945 non-null  object 
 3   Region                   89945 non-null  object 
 4   Urban_or_Rural           89945 non-null  object 
 5   Socioeconomic_Status     89945 non-null  object 
 6   Family_History           89945 non-null  object 
 7   Previous_Cancer_History  89945 non-null  object 
 8   Stage_at_Diagnosis       89945 non-null  object 
 9   Tumor_Aggressiveness     89945 non-null  object 
 10  Colonoscopy_Access       89945 non-null  object 
 11  Screening_Regularity     89945 non-null  object 
 12  Diet_Type                89945 non-null  object 
 13  BMI                      89945 non-null  float64
 14 

In [5]:
# Encoding categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Feature scaling
scaler = StandardScaler()
features = data.drop(columns=['Survival_Status'])
X = scaler.fit_transform(features)
y = data['Survival_Status']

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# # Exploratory Data Analysis (EDA)
# plt.figure(figsize=(10, 6))
# sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
# plt.title("Feature Correlation Heatmap")
# plt.show()
# data.corr()

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Model Training & Evaluation
models = {
    "RandomForest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "LogisticRegression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "NaiveBayes": GaussianNB()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_train, y_pred_train)
    print(f"{name} Train Accuracy: {acc:.4f}")
    y_pred_test = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred_test)
    results[name] = acc
    print(f"{name} Test Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred_test,zero_division=1))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_test))
    # Error Checking
    mse = mean_squared_error(y_test, y_pred_test)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred_test)
    print(f"{name} MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")
    print("-" * 50)
    print("-" * 50)

RandomForest Train Accuracy: 1.0000
RandomForest Test Accuracy: 0.7515
              precision    recall  f1-score   support

           0       0.67      0.00      0.00      4471
           1       0.75      1.00      0.86     13518

    accuracy                           0.75     17989
   macro avg       0.71      0.50      0.43     17989
weighted avg       0.73      0.75      0.65     17989

Confusion Matrix:
[[    2  4469]
 [    1 13517]]
RandomForest MSE: 0.2485, RMSE: 0.4985, MAE: 0.2485
--------------------------------------------------
--------------------------------------------------
XGBoost Train Accuracy: 0.7632
XGBoost Test Accuracy: 0.7476
              precision    recall  f1-score   support

           0       0.23      0.01      0.01      4471
           1       0.75      0.99      0.86     13518

    accuracy                           0.75     17989
   macro avg       0.49      0.50      0.43     17989
weighted avg       0.62      0.75      0.65     17989

Confusion M

In [19]:
# Best Model Selection
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"Best Model: {best_model_name} with accuracy {results[best_model_name]:.4f}")

Best Model: RandomForest with accuracy 0.7515


In [20]:
# Hyperparameter Tuning for Best Model
if best_model_name == "RandomForest":
    param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
elif best_model_name == "XGBoost":
    param_grid = {'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [50, 100, 200]}
elif best_model_name == "KNN":
    param_grid = {'n_neighbors': [3, 5, 7]}
elif best_model_name == "DecisionTree":
    param_grid = {'max_depth': [None, 10, 20]}
else:
    param_grid = {}

if param_grid:
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print(f"Best Hyperparameters for {best_model_name}: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

Best Hyperparameters for RandomForest: {'max_depth': 10, 'n_estimators': 50}


In [21]:
# Overfitting Check
train_score = cross_val_score(best_model, X_train, y_train, cv=5).mean()
test_score = accuracy_score(y_test, best_model.predict(X_test))
print(f"Cross-validated Train Score: {train_score:.4f}, Test Score: {test_score:.4f}")
if train_score - test_score > 0.05:
    print("Warning: Model might be overfitting!")

Cross-validated Train Score: 0.7480, Test Score: 0.7515


In [22]:
# Hypothesis Testing
model_1_preds = models["RandomForest"].predict(X_test)
model_2_preds = models["XGBoost"].predict(X_test)

stat, p_value = ttest_ind(model_1_preds, model_2_preds)
print(f"Hypothesis Test: t-statistic={stat:.4f}, p-value={p_value:.4f}")
if p_value < 0.05:
    print("Significant difference between RandomForest and XGBoost.")
else:
    print("No significant difference between RandomForest and XGBoost.")

Hypothesis Test: t-statistic=11.0511, p-value=0.0000
Significant difference between RandomForest and XGBoost.


## Save the Trained Model

In [23]:
import joblib

# Save the best model
joblib.dump(best_model, 'best_model.pkl')

# Save the scaler and label encoders
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']

In [24]:
# Load the model and preprocessing tools
best_model = joblib.load('best_model.pkl')
scaler = joblib.load('scaler.pkl')
label_encoders = joblib.load('label_encoders.pkl')

In [25]:
# Function to Preprocess New Data

def preprocess_input(input_data, label_encoders, scaler):
    # Convert input data to DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Encode categorical variables
    for column, le in label_encoders.items():
        if column in input_df.columns:
            input_df[column] = le.transform(input_df[column])
    
    # Scale the features
    scaled_input = scaler.transform(input_df)
    
    return scaled_input

In [26]:
# Function to Make Predictions

def predict_new_data(input_data, model, label_encoders, scaler):
    # Preprocess the input data
    processed_input = preprocess_input(input_data, label_encoders, scaler)
    
    # Make prediction
    prediction = model.predict(processed_input)
    
    return prediction

## New User Input

In [28]:
def main():
    # Example input data (replace with actual user input)
    input_data = {
        'Age': 45,
        'Gender': 'Male',  # User-friendly input
        'Race': 'White',
        'Region': 'North America',
        'Urban_or_Rural': 'Urban',
        'Socioeconomic_Status': 'Middle',
        'Family_History': 'Yes',
        'Previous_Cancer_History': 'No',
        'Stage_at_Diagnosis': 'II',
        'Tumor_Aggressiveness': 'Medium',
        'Colonoscopy_Access': 'Yes',
        'Screening_Regularity': 'Regular',
        'Diet_Type': 'Western',
        'BMI': 28.5,
        'Physical_Activity_Level': 'Medium',
        'Smoking_Status': 'Former',
        'Alcohol_Consumption': 'Low',
        'Red_Meat_Consumption': 'Medium',
        'Fiber_Consumption': 'High',
        'Insurance_Coverage': 'Yes',
        'Time_to_Diagnosis': 'Timely',
        'Treatment_Access': 'Good',
        'Chemotherapy_Received': 'No',
        'Radiotherapy_Received': 'Yes',
        'Surgery_Received': 'Yes',
        'Follow_Up_Adherence': 'Good',
    }
    
    # Make prediction
    prediction = predict_new_data(input_data, best_model, label_encoders, scaler)
    
    # Output the prediction
    if(prediction[0]) : 
        print('Survived') 
    else : 
        print('Deceased')


main()

Survived
