In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta

# ------------------- Module 1: Risk Assessment Model -------------------
class RiskStratificationModel:
    """Logistic-regression model that scores a patient's biopsy risk.

    The model is trained on a CSV whose target column is ``Biopsy``; every
    other column (minus ``drop_cols``) is used as a numeric feature, with
    missing values replaced by the training-set column mean.
    """

    # File the previous implementation always loaded, regardless of the
    # argument.  Kept as a fallback so callers that passed a placeholder
    # path keep working.
    DEFAULT_DATA_PATH = "cervical_cancer.csv"

    def __init__(self, file_path):
        """Load the dataset and set up the (untrained) model.

        Args:
            file_path: Location of the CSV dataset, passed to
                ``pd.read_csv``.  If it does not exist, a warning is issued
                and ``DEFAULT_DATA_PATH`` is loaded instead.

        Raises:
            FileNotFoundError: If neither ``file_path`` nor the fallback
                file can be found.
        """
        import warnings  # local import: only needed for the fallback notice

        try:
            self.df = pd.read_csv(file_path)
        except FileNotFoundError:
            warnings.warn(
                f"Dataset {file_path!r} not found; falling back to "
                f"{self.DEFAULT_DATA_PATH!r}.",
                stacklevel=2,
            )
            self.df = pd.read_csv(self.DEFAULT_DATA_PATH)

        self.model = LogisticRegression(max_iter=1000)
        self.imputer = SimpleImputer(strategy='mean')
        # Populated by preprocess(); None means the model is not trained yet.
        self.feature_columns = None
        # Columns removed from the data before training.
        self.drop_cols = ['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis']
        self.X_train = self.X_test = self.y_train = self.y_test = None

    def preprocess(self):
        """Clean the data, split it, and impute missing feature values.

        Returns:
            Tuple ``(X_train_imputed, X_test_imputed, y_train, y_test)``.
        """
        # '?' marks a missing value in the raw file; anything that still is
        # not numeric afterwards is coerced to NaN as well.
        self.df.replace('?', np.nan, inplace=True)
        self.df = self.df.apply(pd.to_numeric, errors='coerce')
        self.df.drop(columns=[col for col in self.drop_cols if col in self.df.columns], inplace=True)
        # Rows without a label cannot be used for supervised training.
        self.df.dropna(subset=['Biopsy'], inplace=True)

        X = self.df.drop('Biopsy', axis=1)
        y = self.df['Biopsy']
        self.feature_columns = X.columns.tolist()

        # Split before imputation to avoid data leakage
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        X_train_imputed = self.imputer.fit_transform(self.X_train)
        X_test_imputed = self.imputer.transform(self.X_test)

        return X_train_imputed, X_test_imputed, self.y_train, self.y_test

    def train(self):
        """Fit the model and print train/test accuracy plus a test report."""
        X_train_imputed, X_test_imputed, y_train, y_test = self.preprocess()
        self.model.fit(X_train_imputed, y_train)

        # Accuracy and classification reports
        train_preds = self.model.predict(X_train_imputed)
        test_preds = self.model.predict(X_test_imputed)

        train_acc = accuracy_score(y_train, train_preds)
        test_acc = accuracy_score(y_test, test_preds)

        print("\n--- Model Evaluation ---")
        print(f"✅ Training Accuracy: {train_acc:.2f}")
        print(f"✅ Test Accuracy: {test_acc:.2f}")
        print("\n🔍 Classification Report (Test Set):\n")
        print(classification_report(y_test, test_preds))

    def predict_risk(self, patient_data):
        """Return the predicted probability of a positive biopsy.

        Args:
            patient_data: Sequence of values ordered as returned by
                ``get_input_template()`` (features first, then the dropped
                columns, whose values are ignored).

        Returns:
            Probability of the positive class, as a float.

        Raises:
            RuntimeError: If the model has not been trained yet.
            ValueError: If ``patient_data`` has the wrong number of values.
        """
        if self.feature_columns is None:
            raise RuntimeError("Model is not trained. Call train() before predict_risk().")

        template = self.feature_columns + self.drop_cols
        if len(patient_data) != len(template):
            raise ValueError("Incorrect input length. Use get_input_template() to check required fields.")

        input_df = pd.DataFrame([patient_data], columns=template)
        # Mirror the training-time cleaning: drop unused columns, then
        # turn '?' and other non-numeric entries into NaN.
        input_df = input_df.drop(columns=[col for col in self.drop_cols if col in input_df.columns])
        input_df = input_df.replace('?', np.nan)
        input_df = input_df.apply(pd.to_numeric, errors='coerce')

        input_imputed = self.imputer.transform(input_df)
        risk_score = self.model.predict_proba(input_imputed)[0][1]  # Probability of Biopsy (positive class)
        return risk_score

    def get_input_template(self):
        """Return the column order expected by ``predict_risk``."""
        return self.feature_columns + self.drop_cols


# ------------------- Module 2: Screening Guideline Engine -------------------
class GuidelineEngine:
    """Maps a risk score to a screening interval and next screening date."""

    def get_next_screening(self, age, last_screening_date, risk_score):
        """Return the recommended next screening date based on risk.

        Args:
            age: Patient age. Accepted for interface compatibility; the
                current rules do not use it.
            last_screening_date: Date of the last screening as a
                ``"YYYY-MM-DD"`` string, a ``datetime``/``date`` object, or
                a missing value (``None``/NaN/NaT) for "never screened".
            risk_score: Risk probability; > 0.7 is treated as high risk,
                > 0.4 as medium, anything else as low.

        Returns:
            ``datetime.date`` of the next recommended screening.

        Raises:
            ValueError: If a string date is not in ``YYYY-MM-DD`` format.
        """
        if pd.isna(last_screening_date):
            # Assume never screened: pretend the last screening was about
            # ten years ago so the result is already overdue.
            last_screening_date = datetime.today() - timedelta(days=365 * 10)

        if isinstance(last_screening_date, str):
            last_screening_date = datetime.strptime(last_screening_date, "%Y-%m-%d")

        # Normalise datetimes to plain dates; a plain date is used as-is.
        if isinstance(last_screening_date, datetime):
            last_screening_date = last_screening_date.date()

        # Logic: High-risk → 1 year, Medium → 3 years, Low → 5 years
        if risk_score > 0.7:
            interval_years = 1
        elif risk_score > 0.4:
            interval_years = 3
        else:
            interval_years = 5

        # Add whole calendar years rather than multiples of 365 days, which
        # would land a day early whenever the interval spans a leap day.
        target_year = last_screening_date.year + interval_years
        try:
            return last_screening_date.replace(year=target_year)
        except ValueError:
            # Feb 29 has no counterpart in a non-leap target year.
            return last_screening_date.replace(year=target_year, day=28)


# ------------------- Module 3: Notification/Reminder System -------------------
class ReminderSystem:
    """Collects screening reminder messages and prints them on request."""

    def __init__(self):
        # Messages in the order they were generated.
        self.reminders = []

    def generate_reminder(self, patient_id, patient_name, next_date):
        """Queue a message for one patient.

        A date that is today or earlier produces a "due" notice that
        includes the patient ID; a future date produces a "scheduled"
        notice.
        """
        is_due = next_date <= datetime.today().date()
        message = (
            f"🔔 {patient_name} (ID: {patient_id}) is due for screening on {next_date}."
            if is_due
            else f"✅ {patient_name} is scheduled for screening on {next_date}."
        )
        self.reminders.append(message)

    def show_reminders(self):
        """Print a header followed by every queued message, one per line."""
        print("\n--- Screening Reminders ---")
        for message in self.reminders:
            print(message)


# ------------------- Main Integration -------------------
class CancerScreeningSystem:
    """Ties the risk model, guideline engine and reminder system together."""

    def __init__(self, data_path):
        """Build all three components and train the risk model immediately."""
        self.risk_model = RiskStratificationModel(data_path)
        self.risk_model.train()
        self.guideline_engine = GuidelineEngine()
        self.reminder_system = ReminderSystem()

    def evaluate_patient(self, patient_id, patient_name, patient_data, age, last_screening_date):
        """Score a patient, schedule the next screening and queue a reminder.

        Returns:
            Tuple ``(risk_score, next_date)``.
        """
        score = self.risk_model.predict_risk(patient_data)
        due_date = self.guideline_engine.get_next_screening(
            age, last_screening_date, score
        )
        self.reminder_system.generate_reminder(patient_id, patient_name, due_date)
        return score, due_date

    def show_all_reminders(self):
        """Print every reminder queued so far."""
        self.reminder_system.show_reminders()


# ------------------- Example Usage -------------------
# Build the system; this loads the dataset and trains the risk model.
# Pass the dataset file that is actually loaded for training.
system = CancerScreeningSystem("cervical_cancer.csv")

# Template for input
template = system.risk_model.get_input_template()
print("\nInput template columns:", template)

# Define test patient
patient_id = 101
patient_name = "Alice"
age = 30
last_screening_date = "2020-07-01"

# Example patient input (same order as template); the two trailing NaNs
# fill the dropped "time since diagnosis" columns.
patient_input = [
    18, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.nan, np.nan
]

# Evaluate and show results
risk, next_screening = system.evaluate_patient(
    patient_id, patient_name, patient_input, age, last_screening_date
)
print(f"\nPatient Risk Score: {risk:.2f}")
print(f"Next Recommended Screening Date: {next_screening}")

# Display reminders
system.show_all_reminders()



--- Model Evaluation ---
✅ Training Accuracy: 0.97
✅ Test Accuracy: 0.95

🔍 Classification Report (Test Set):

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       161
           1       0.64      0.64      0.64        11

    accuracy                           0.95       172
   macro avg       0.81      0.81      0.81       172
weighted avg       0.95      0.95      0.95       172


Input template columns: ['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'S