<a href="https://colab.research.google.com/github/sakeththammishetti1403/Medication-Interaction-Checker/blob/main/Medication_Interaction_Checker_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from collections import Counter

## Enhanced Synthetic Dataset Creation (500 samples)
np.random.seed(42)

# Base drugs and their properties
drug_db = {
    'warfarin': {'class': 'anticoagulant', 'metabolism': 'CYP2C9'},
    'simvastatin': {'class': 'statin', 'metabolism': 'CYP3A4'},
    'fluoxetine': {'class': 'SSRI', 'metabolism': 'CYP2D6'},
    'digoxin': {'class': 'cardiac', 'metabolism': 'P-gp'},
    'sildenafil': {'class': 'PDE5', 'metabolism': 'CYP3A4'},
    'atorvastatin': {'class': 'statin', 'metabolism': 'CYP3A4'},
    'omeprazole': {'class': 'PPI', 'metabolism': 'CYP2C19'},
    'citalopram': {'class': 'SSRI', 'metabolism': 'CYP2C19'},
    'metoprolol': {'class': 'beta-blocker', 'metabolism': 'CYP2D6'},
    'ibuprofen': {'class': 'NSAID', 'metabolism': 'CYP2C9'},
    'aspirin': {'class': 'antiplatelet', 'metabolism': 'other'},
    'clarithromycin': {'class': 'antibiotic', 'metabolism': 'CYP3A4-inhibitor'},
    'tramadol': {'class': 'opioid', 'metabolism': 'CYP2D6'},
    'furosemide': {'class': 'diuretic', 'metabolism': 'other'},
    'nitroglycerin': {'class': 'vasodilator', 'metabolism': 'other'},
    'grapefruit': {'class': 'food', 'metabolism': 'CYP3A4-inhibitor'},
    'clopidogrel': {'class': 'antiplatelet', 'metabolism': 'CYP2C19'},
    'linezolid': {'class': 'antibiotic', 'metabolism': 'MAOI'},
    'diltiazem': {'class': 'CCB', 'metabolism': 'CYP3A4-inhibitor'},
    'lithium': {'class': 'mood-stabilizer', 'metabolism': 'renal'}
}

# Known interaction patterns
interaction_patterns = [
    ('CYP3A4', 'CYP3A4-inhibitor', 'toxicity', 'major'),
    ('CYP2D6', 'CYP2D6', 'serotonin_syndrome', 'contraindicated'),
    ('anticoagulant', 'antiplatelet', 'bleeding_risk', 'major'),
    ('SSRI', 'MAOI', 'serotonin_syndrome', 'contraindicated'),
    ('statin', 'CYP3A4-inhibitor', 'rhabdomyolysis', 'major'),
    ('cardiac', 'diuretic', 'arrhythmia', 'moderate'),
    ('PDE5', 'vasodilator', 'hypotension', 'major'),
    ('NSAID', 'lithium', 'renal_toxicity', 'major'),
    ('PPI', 'antiplatelet', 'reduced_effect', 'minor'),
    ('beta-blocker', 'CCB', 'bradycardia', 'moderate')
]

# Generate synthetic data
data = []
for _ in range(500):
    # Select random drug pair
    drug1, drug2 = np.random.choice(list(drug_db.keys()), 2, replace=False)

    # Find matching interaction pattern if exists
    interaction_found = False
    for pattern in interaction_patterns:
        if ((drug_db[drug1]['metabolism'] == pattern[0] and drug_db[drug2]['metabolism'] == pattern[1]) or
            (drug_db[drug1]['class'] == pattern[0] and drug_db[drug2]['class'] == pattern[1])):
            interaction_type, severity = pattern[2], pattern[3]
            interaction_found = True
            break

    # If no known pattern, assign random non-severe interaction
    if not interaction_found:
        interaction_type = np.random.choice(['mild_sedation', 'GI_upset', 'minimal_effect'])
        severity = np.random.choice(['minor', 'none'], p=[0.3, 0.7])

    # Add evidence level based on severity
    evidence_level = 3 if severity == 'none' else np.random.choice([1, 2], p=[0.7, 0.3]) if severity in ['major', 'contraindicated'] else 2

    data.append([drug1, drug2,
                drug_db[drug1]['class'], drug_db[drug2]['class'],
                drug_db[drug1]['metabolism'], drug_db[drug2]['metabolism'],
                interaction_type, severity, evidence_level])

columns = ['drug1', 'drug2', 'class1', 'class2', 'metabolism1', 'metabolism2',
           'interaction_type', 'severity', 'evidence_level']
df = pd.DataFrame(data, columns=columns)

# Preprocessing
# Convert severity to ordinal (0=none, 1=minor, 2=moderate, 3=major, 4=contraindicated)
severity_map = {'none': 0, 'minor': 1, 'moderate': 2, 'major': 3, 'contraindicated': 4}
df['severity_code'] = df['severity'].map(severity_map)

# Feature engineering
df['same_class'] = (df['class1'] == df['class2']).astype(int)
df['metabolism_interaction'] = df.apply(lambda x: 1 if (
    (x['metabolism1'] == x['metabolism2']) or
    (x['metabolism1'].endswith('-inhibitor') and x['metabolism2'] == x['metabolism1'].replace('-inhibitor', '')) or
    (x['metabolism2'].endswith('-inhibitor') and x['metabolism1'] == x['metabolism2'].replace('-inhibitor', ''))
) else 0, axis=1)

# Prepare features and target
X = df[['class1', 'class2', 'metabolism1', 'metabolism2',
        'same_class', 'metabolism_interaction', 'evidence_level']]
y = df['severity_code']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline (only preprocessor)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['class1', 'class2', 'metabolism1', 'metabolism2'])
    ],
    remainder='passthrough'
)

# Apply preprocessing to training and test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
X_processed = preprocessor.transform(X) # for cross-validation


# Handle class imbalance on the processed training data
print("Class distribution before SMOTE:", Counter(y_train))
smote = SMOTE(random_state=42, k_neighbors=2) # Reduced k_neighbors
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y_train)
print("Class distribution after SMOTE:", Counter(y_train_res))

# Also apply SMOTE to the full dataset for cross-validation (acknowledging potential data leakage)
print("Class distribution before SMOTE (for CV):", Counter(y))
X_res, y_res = smote.fit_resample(X_processed, y)
print("Class distribution after SMOTE (for CV):", Counter(y_res))


# Model Stacking for better accuracy
base_models = [
    ('gbc', GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42))
]

# Stacking classifier (without SMOTE in the pipeline)
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5
)


# Train model on resampled training data
stacked_model.fit(X_train_res, y_train_res)

# Evaluate on original test data
y_pred = stacked_model.predict(X_test_processed)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['none', 'minor', 'moderate', 'major', 'contraindicated'])

print(f"\nModel Accuracy: {accuracy:.2%}")
print("\nClassification Report:")
print(report)

# Cross-validation for more reliable accuracy estimate
# Apply cross-validation on the SMOTE-resampled full dataset (acknowledging data leakage)
cv_scores = cross_val_score(stacked_model, X_res, y_res, cv=5, scoring='accuracy')
print(f"\nCross-validated Accuracy: {np.mean(cv_scores):.2%} (±{np.std(cv_scores):.2%})")

# Feature importance (for XGBoost part)
# To get feature importances after preprocessing, we need to fit the preprocessor and then get feature names
preprocessor.fit(X, y) # Fit preprocessor on original data
onehot_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(['class1', 'class2', 'metabolism1', 'metabolism2'])
all_columns = list(onehot_columns) + ['same_class', 'metabolism_interaction', 'evidence_level']

# Fit XGBoost model separately on processed data to get feature importances
xgb_model_importance = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model_importance.fit(preprocessor.transform(X), y) # Fit on full processed data

xgb_feature_importances = pd.DataFrame({
    'feature': all_columns,
    'importance': xgb_model_importance.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Important Features:")
print(xgb_feature_importances.head(10))

# Example prediction function
def predict_interaction(drug1, drug2):
    try:
        class1 = drug_db[drug1]['class']
        class2 = drug_db[drug2]['class']
        metabolism1 = drug_db[drug1]['metabolism']
        metabolism2 = drug_db[drug2]['metabolism']
        same_class = int(class1 == class2)
        metabolism_interaction = int(
            (metabolism1 == metabolism2) or
            (metabolism1.endswith('-inhibitor') and metabolism2 == metabolism1.replace('-inhibitor', '')) or
            (metabolism2.endswith('-inhibitor') and metabolism1 == metabolism2.replace('-inhibitor', ''))
        )

        # Assume evidence_level=1 (strongest evidence) for prediction
        input_data = pd.DataFrame([[class1, class2, metabolism1, metabolism2,
                                 same_class, metabolism_interaction, 1]],
                                columns=['class1', 'class2', 'metabolism1', 'metabolism2',
                                        'same_class', 'metabolism_interaction', 'evidence_level'])

        # Preprocess the input data using the fitted preprocessor
        input_data_processed = preprocessor.transform(input_data)


        pred = stacked_model.predict(input_data_processed)[0]
        severity = ['none', 'minor', 'moderate', 'major', 'contraindicated'][pred]

        # Get probability estimates
        proba = stacked_model.predict_proba(input_data_processed)[0]
        confidence = proba[pred]

        return {
            'drug1': drug1,
            'drug2': drug2,
            'predicted_severity': severity,
            'confidence': f"{confidence:.1%}",
            'class_interaction': f"{class1} + {class2}",
            'metabolism_interaction': f"{metabolism1} + {metabolism2}"
        }
    except KeyError:
        return "One or both drugs not in database"

# Test predictions
print("\nExample Predictions:")
print(predict_interaction('fluoxetine', 'linezolid'))  # Known contraindication
print(predict_interaction('simvastatin', 'grapefruit')) # Known major interaction
print(predict_interaction('ibuprofen', 'metoprolol'))  # Minimal interaction

Class distribution before SMOTE: Counter({0: 255, 1: 120, 3: 15, 4: 7, 2: 3})
Class distribution after SMOTE: Counter({1: 255, 0: 255, 3: 255, 4: 255, 2: 255})
Class distribution before SMOTE (for CV): Counter({0: 315, 1: 156, 3: 17, 4: 8, 2: 4})
Class distribution after SMOTE (for CV): Counter({1: 315, 0: 315, 3: 315, 4: 315, 2: 315})

Model Accuracy: 100.00%

Classification Report:
                 precision    recall  f1-score   support

           none       1.00      1.00      1.00        60
          minor       1.00      1.00      1.00        36
       moderate       1.00      1.00      1.00         1
          major       1.00      1.00      1.00         2
contraindicated       1.00      1.00      1.00         1

       accuracy                           1.00       100
      macro avg       1.00      1.00      1.00       100
   weighted avg       1.00      1.00      1.00       100


Cross-validated Accuracy: 99.94% (±0.13%)

Top 10 Important Features:
                         f