In [1]:
import sys
!{sys.executable} -m pip install pandas numpy scikit-learn matplotlib seaborn scipy joblib xgboost lightgbm

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.7-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scipy
  Using cached scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV # pyright: ignore[reportMissingModuleSource]
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

warnings.filterwarnings("ignore")

# Turn the raw ADNI table into a feature frame and a label series.
def preprocess_data(data):
    """
    Build model inputs from a raw record table.

    Steps, in order:
      1. Drop identifier / bookkeeping columns and the baseline diagnosis 'DX.bl'.
      2. Fill missing AGE, PTEDUCAT and MMSE values with the column mean.
      3. Integer-encode the categorical columns with a LabelEncoder.
      4. Discard rows whose 'imputed_genotype' is missing.
      5. Split off 'Dx Codes for Submission' as the target.

    The imputer and encoder are fitted on the frame passed in, on every call.

    Parameters:
    data (DataFrame): raw records containing every column referenced above.

    Returns:
    tuple: (X, y) where X is the feature DataFrame and y the label Series.
    """
    dropped_columns = [
        'directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality',
        'Visit', 'Acq.Date', 'EXAMDATE', 'DX.bl',
    ]
    frame = data.drop(columns=dropped_columns)

    # Mean-impute the numeric columns (statistics come from this very frame).
    numeric_columns = ["AGE", "PTEDUCAT", "MMSE"]
    mean_imputer = SimpleImputer(strategy="mean")
    frame[numeric_columns] = mean_imputer.fit_transform(frame[numeric_columns])

    # One encoder object is reused; it is refitted for each column in turn.
    label_encoder = LabelEncoder()
    for column in ("PTGENDER", "PTETHCAT", "PTRACCAT", "APOE Genotype"):
        frame[column] = label_encoder.fit_transform(frame[column])

    # Row filtering happens after imputation/encoding, as in the original flow.
    frame = frame.dropna(subset=['imputed_genotype'])

    target_column = "Dx Codes for Submission"
    y = frame[target_column]
    X = frame.drop(columns=target_column)

    return X, y

In [None]:
# Read the raw table, then derive features and labels from a copy so that
# `data` itself stays untouched for later cells.
# NOTE(review): the file name ends in .xls but is parsed with read_csv —
# confirm the on-disk format and that the file sits next to the notebook.
DATA_FILE = "ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.xls"
data = pd.read_csv(DATA_FILE)
X, y = preprocess_data(data.copy())

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.2,
    random_state=40,
)

FileNotFoundError: [Errno 2] No such file or directory: 'ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv'

In [None]:


# Stage 1: Gaussian Naive Bayes with a small randomized search
def train_stage1_classifier(X_train, y_train):
    """
    Tune and fit the stage-1 Gaussian Naive Bayes model.

    Two of the four log-spaced 'var_smoothing' candidates (1e-10 .. 1e-7) are
    sampled and scored with 3-fold cross-validation; the search is seeded.

    Parameters:
    X_train: training feature matrix.
    y_train: training labels.

    Returns:
    The best estimator found by the search.
    """
    smoothing_candidates = np.logspace(-10, -7, 4)
    search = RandomizedSearchCV(
        estimator=GaussianNB(),
        param_distributions={'var_smoothing': smoothing_candidates},
        n_iter=2,
        cv=3,
        random_state=31,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_


# Stage 2 classifier training (SVM on non-AD rows, or grid-searched KNN)
def train_stage2_classifier(X_train, y_train, classifier_type, random_state=None):
    """
    Fit and return one of the two stage-2 models.

    Parameters:
    X_train: training feature matrix. For 'SVM' it is indexed with a boolean
        mask, so it must support that (e.g. a NumPy array).
    y_train: training labels. Assumed to be a pandas Series or NumPy array so
        that ``y_train != 'AD'`` yields an element-wise boolean mask.
    classifier_type (str): 'SVM' or 'KNN'.
    random_state: optional seed forwarded to the SVC, whose probability
        estimates rely on internal randomised cross-validation. The default
        (None) leaves the SVC unseeded, exactly as before.

    Returns:
    'SVM' -> a fitted Pipeline (StandardScaler -> PCA -> SVC) trained only on
             rows whose label is not 'AD'.
    'KNN' -> a fitted GridSearchCV wrapping a StandardScaler -> KNN pipeline,
             trained on all rows.

    Raises:
    ValueError: if classifier_type is neither 'SVM' nor 'KNN'. (Previously this
        fell through to ``return model`` and raised UnboundLocalError.)
    """
    if classifier_type == 'SVM':
        # Kept function-local, as in the original cell, so this block does not
        # depend on an edit to the notebook's import cell.
        from sklearn.decomposition import PCA

        model = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.95)),  # retain 95% variance
            ('svm', SVC(C=10, gamma='scale', kernel='rbf', class_weight='balanced',
                        probability=True, random_state=random_state))
        ])

        # Stage 2 SVM only separates the non-AD classes, so AD rows are excluded.
        non_ad_mask = (y_train != 'AD')
        model.fit(X_train[non_ad_mask], y_train[non_ad_mask])
        return model

    if classifier_type == 'KNN':
        # Scaling lives inside the pipeline so it is refitted within each CV fold.
        knn_pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('knn', KNeighborsClassifier())
        ])

        knn_param_grid = {
            'knn__n_neighbors': [3, 5, 7, 9],
            'knn__weights': ['uniform', 'distance'],
            'knn__metric': ['euclidean', 'manhattan']
        }

        model = GridSearchCV(
            knn_pipe,
            knn_param_grid,
            cv=5,
            n_jobs=-1
        )
        model.fit(X_train, y_train)
        return model

    raise ValueError(
        f"Unknown classifier_type {classifier_type!r}; expected 'SVM' or 'KNN'"
    )

In [None]:
# Fit the three models used by the multistage pipeline on the training split.
print("Training Stage 1 Classifier (Naive Bayes)...")
model_stage1 = train_stage1_classifier(X_train, y_train)

# The SVM variant is trained on the non-AD rows only.
print("\nTraining Stage 2 Classifier (SVM - MCI/NC only)...")
model_stage2_svm = train_stage2_classifier(X_train, y_train, classifier_type='SVM')

# The KNN variant is trained on every row and serves as the fallback.
print("\nTraining Stage 2 Classifier (KNN)...")
model_stage2_knn = train_stage2_classifier(X_train, y_train, classifier_type='KNN')

In [None]:
# Multistage classifier with configurable confidence thresholds
def multistage_classifier(input_object, model_stage1, model_stage2_svm, model_stage2_knn,
                          ad_accept_threshold=0.7, ad_reject_threshold=0.5,
                          svm_confidence_threshold=0.8):
    """
    Classify a single sample by cascading three fitted models.

    Decision flow:
      1. Stage 1 gives P(AD). If it is >= ad_accept_threshold, return 'AD'.
      2. If P(AD) <= ad_reject_threshold, consult the SVM and return its
         prediction when its highest class probability is
         >= svm_confidence_threshold.
      3. Otherwise (P(AD) between the two thresholds, or an unsure SVM)
         return the KNN prediction.

    Parameters:
    input_object: one sample as a flat sequence of feature values.
    model_stage1: fitted model exposing ``classes_`` and ``predict_proba``.
    model_stage2_svm: fitted model exposing ``predict`` and ``predict_proba``.
    model_stage2_knn: fitted model exposing ``predict``.
    ad_accept_threshold (float): minimum P(AD) to return 'AD' directly.
    ad_reject_threshold (float): maximum P(AD) for the SVM to be consulted.
    svm_confidence_threshold (float): minimum top SVM probability to accept
        the SVM prediction.

    Returns:
    The predicted label for the sample.

    Raises:
    ValueError: if 'AD' is not among ``model_stage1.classes_``.
    """
    batch = [input_object]  # the estimators expect a 2-D, one-row input

    stage1_classes = list(model_stage1.classes_)
    if 'AD' not in stage1_classes:
        raise ValueError(
            f"Stage 1 model was not trained with an 'AD' class; classes are {stage1_classes}"
        )
    ad_prob = model_stage1.predict_proba(batch)[0][stage1_classes.index('AD')]

    # Stage 1: confident AD detection.
    if ad_prob >= ad_accept_threshold:
        return 'AD'

    # Stage 2: SVM, consulted only when stage 1 is reasonably sure it is not AD.
    if ad_prob <= ad_reject_threshold:
        svm_proba = model_stage2_svm.predict_proba(batch)[0]
        # predict() is only called once the confidence gate has passed.
        if max(svm_proba) >= svm_confidence_threshold:
            return model_stage2_svm.predict(batch)[0]

    # Stage 3: fallback to KNN for every uncertain case.
    return model_stage2_knn.predict(batch)[0]

In [None]:
# Seen data: X_train / y_train is the split the models were fitted on.

# Evaluate on the training split (a fit check, not a generalization estimate)
y_pred = [multistage_classifier(list(sample), model_stage1, model_stage2_svm, model_stage2_knn) for sample in X_train]

# Print the per-class report, overall accuracy and confusion matrix
print("\nClassification Report:\n")
print(classification_report(y_train, y_pred))
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred))

In [None]:
# Unseen data: X_test / y_test is the 20% hold-out that was never used for fitting.

# Evaluate on the held-out test split
y_pred = [multistage_classifier(list(sample), model_stage1, model_stage2_svm, model_stage2_knn) for sample in X_test]

# Print the same three summaries as for the training split so the two
# evaluations can be compared side by side.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
def predict_on_data(data):
    """
    Preprocess raw records with ``preprocess_data`` and classify every
    remaining row with ``multistage_classifier``.

    The trained models are not parameters of this function: it reads the
    notebook-level names ``model_stage1``, ``model_stage2_svm`` and
    ``model_stage2_knn``, so the training cell must have been run first.

    NOTE(review): ``preprocess_data`` refits its imputer and label encoders on
    whatever frame it receives, so the encodings produced here may not match
    the ones the models were trained on. It also drops rows with a missing
    'imputed_genotype', so the output can be shorter than the input.

    Parameters:
    data (DataFrame or array-like): raw records. Anything that is not a
        DataFrame is wrapped in one using the expected raw column order; the
        frame must include the 'Dx Codes for Submission' column because
        ``preprocess_data`` splits it off.

    Returns:
    list: one entry per row that survives preprocessing — either the predicted
        label or, if classifying that row failed, the string "Error: <message>".

    Raises:
    ValueError: if array-like input cannot be shaped into the expected columns,
        or if preprocessing fails (the original exception is chained).
    """
    expected_columns = [
        'directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality',
        'Visit', 'Acq.Date', 'EXAMDATE', 'Dx Codes for Submission',
        'AGE', 'PTEDUCAT', 'MMSE', 'PTGENDER', 'PTETHCAT',
        'PTRACCAT', 'APOE Genotype', 'imputed_genotype', 'DX.bl'
    ]

    # If input is not already a DataFrame, try to rebuild one with the raw layout.
    if not isinstance(data, pd.DataFrame):
        try:
            data = pd.DataFrame(data, columns=expected_columns)
        except ValueError as exc:
            raise ValueError("Input data format doesn't match expected structure") from exc

    # preprocess_data works on a new frame, so the caller's frame is not modified.
    try:
        X_preprocessed, _ = preprocess_data(data)
    except Exception as e:
        raise ValueError(f"Preprocessing failed: {str(e)}") from e

    # Best effort per row: a failing sample is reported in place rather than
    # aborting the whole batch.
    predictions = []
    for sample in X_preprocessed.values:
        try:
            pred = multistage_classifier(list(sample), model_stage1, model_stage2_svm, model_stage2_knn)
            predictions.append(pred)
        except Exception as e:
            predictions.append(f"Error: {str(e)}")

    return predictions

In [None]:
# Preview the first ten rows of the raw table loaded by pd.read_csv above.
data.head(10)

In [None]:
# Run the full predict path on the first ten raw rows as a smoke test.
sample_rows = data.head(10)
predictions = predict_on_data(sample_rows)
print(predictions)