### Importing Libraries

### Load Dataset


### Data Preparation

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler


# Load data
df = pd.read_csv("synthetic.csv")  
print(df.head())

# Drop rows where AIDS_diagnosed is missing
df = df.dropna(subset=["AIDS_diagnosed"])

# Replace 'None' with np.nan in Concurrent_diagnosed
df['Concurrent_diagnosed'] = df['Concurrent_diagnosed'].replace('None', np.nan)
df['Concurrent_diagnosed'] = df['Concurrent_diagnosed'].fillna('No Other Disease')

# Fill missing or None values (mode for categorical, median for numeric)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].replace("None", np.nan, inplace=True)
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

# Convert boolean-like columns to 0/1
bool_columns = ['HIV_diagnosed', 'AIDS_diagnosed', 'Linked_to_Care_3mo', 'Death_Status']
for col in bool_columns:
    if df[col].dtype == 'object':
        df[col] = df[col].map({'No': 0, 'Yes': 1, 'Alive': 0, 'Deceased': 1})
    df[col] = df[col].astype(int)

# Outlier detection (optional)
iso = IsolationForest(contamination=0.01, random_state=42)
outliers = iso.fit_predict(df.select_dtypes(include=np.number))
df = df[outliers == 1]

# Split features and target
X = df.drop('AIDS_diagnosed', axis=1)
y = df['AIDS_diagnosed']

# Encode categorical variables
cat_columns = X.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_columns:
    X[col] = le.fit_transform(X[col])


# === Handle class imbalance using SMOTE ===
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_bal)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_bal, test_size=0.2, random_state=42)

print("✅ Preprocessing complete.")


   Year        Borough       Gender  Age      Race  HIV_diagnosed  \
0  2020          Bronx         Male   44  Hispanic           True   
1  2012          Bronx  Transgender   64     Other           True   
2  2010          Bronx         Male   54     White           True   
3  2013  Staten Island       Female   32     White           True   
4  2022          Bronx         Male   70     Asian           True   

                   Concurrent_diagnosed  AIDS_diagnosed Death_Status  \
0                           Hepatitis B           False        Alive   
1                             Gonorrhea            True     Deceased   
2  Gonorrhea, Tuberculosis, Hepatitis C           False        Alive   
3                Gonorrhea, Hepatitis C           False        Alive   
4              Hepatitis C, Hepatitis B           False        Alive   

  Poverty_Level Transmission_Category Education_Level  Linked_to_Care_3mo  \
0        Medium                   MSM         Primary                True  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace("None", np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

✅ Preprocessing complete.


In [5]:
# === Split data into Branch A (70/30) and Branch B (80/20) ===
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_scaled, y_bal, test_size=0.3, random_state=42)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_scaled, y_bal, test_size=0.2, random_state=42)

# Scale data to be non-negative
scaler = MinMaxScaler()
X_train_A_scaled = scaler.fit_transform(X_train_A)
X_test_A_scaled = scaler.transform(X_test_A)

# Now apply Chi-Square feature selection
selector_A = SelectKBest(score_func=chi2, k=10)
X_train_A_selected = selector_A.fit_transform(X_train_A_scaled, y_train_A)
X_test_A_selected = selector_A.transform(X_test_A_scaled)

# Repeat for Branch B
X_train_B_scaled = scaler.fit_transform(X_train_B)
X_test_B_scaled = scaler.transform(X_test_B)

selector_B = SelectKBest(score_func=chi2, k=10)
X_train_B_selected = selector_B.fit_transform(X_train_B_scaled, y_train_B)
X_test_B_selected = selector_B.transform(X_test_B_scaled)

# === Define model configurations ===
param_grids = {
    'SVM': {
        'model': SVC(probability=True),
        'params': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {'n_estimators': [50, 100], 'max_depth': [5, 10, 20]}
    },
   # 'XGBoost': {
    #    'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
   #     'params': {'n_estimators': [50, 100], 'max_depth': [3, 6], 'learning_rate': [0.01, 0.1, 0.2]}
}


# === Training and evaluation function ===
def train_and_evaluate(X_train, X_test, y_train, y_test, branch_name, feature_selection):
    results = {}
    print(f"\nRunning training for {branch_name} with {feature_selection} feature selection...")
    for name, cfg in param_grids.items():
        print(f"\nTraining {name} on {branch_name}...")
        grid = GridSearchCV(cfg['model'], cfg['params'], cv=5, scoring='f1', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)
        y_proba = best_model.predict_proba(X_test)[:, 1]
        report = classification_report(y_test, y_pred)
        print(report)
        print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
        results[name] = {
            'best_params': grid.best_params_,
            'report': classification_report(y_test, y_pred, output_dict=True),
            'conf_matrix': confusion_matrix(y_test, y_pred).tolist(),
            'roc_auc': roc_auc_score(y_test, y_proba)
        }
    return results

# === Run training for both branches ===
results_A = train_and_evaluate(X_train_A_selected, X_test_A_selected, y_train_A, y_test_A, "Branch A", "Chi-Square")
results_B = train_and_evaluate(X_train_B_selected, X_test_B_selected, y_train_B, y_test_B, "Branch B", "Chi-Square")


Running training for Branch A with Chi-Square feature selection...

Training SVM on Branch A...
              precision    recall  f1-score   support

           0       0.65      0.70      0.67      6224
           1       0.67      0.62      0.65      6231

    accuracy                           0.66     12455
   macro avg       0.66      0.66      0.66     12455
weighted avg       0.66      0.66      0.66     12455

ROC AUC Score: 0.7110838027294493

Training RandomForest on Branch A...
              precision    recall  f1-score   support

           0       0.65      0.68      0.67      6224
           1       0.67      0.64      0.65      6231

    accuracy                           0.66     12455
   macro avg       0.66      0.66      0.66     12455
weighted avg       0.66      0.66      0.66     12455

ROC AUC Score: 0.7160136093931206

Running training for Branch B with Chi-Square feature selection...

Training SVM on Branch B...
              precision    recall  f1-score   

### Splitting Data


### Modelling Data

In [None]:
# Run models for Branch A with and without feature selection
results_A_selected = train_and_evaluate(X_train_A_selected, X_test_A_selected, y_train_A, y_test_A, "Branch A", "with")
results_A_full = train_and_evaluate(X_train_A, X_test_A, y_train_A, y_test_A, "Branch A", "without")

# Run models for Branch B with and without feature selection
results_B_selected = train_and_evaluate(X_train_B_selected, X_test_B_selected, y_train_B, y_test_B, "Branch B", "with")
results_B_full = train_and_evaluate(X_train_B, X_test_B, y_train_B, y_test_B, "Branch B", "without")

### Final Evaluation 

In [None]:
# Store results
comparison_results = {
    'Branch A with Feature Selection': results_A_selected,
    'Branch A without Feature Selection': results_A_full,
    'Branch B with Feature Selection': results_B_selected,
    'Branch B without Feature Selection': results_B_full
}


### Knowledge Presentation

In [None]:
# Print all results
for branch_name, result in comparison_results.items():
    print(f"Results for {branch_name}:")
    for model_name, model_result in result.items():
        print(f"Model: {model_name}")
        print(f"Best Parameters: {model_result['best_params']}")
        print(f"Classification Report: {model_result['report']}")
        print(f"Confusion Matrix: {model_result['conf_matrix']}")
        print(f"ROC AUC: {model_result['roc_auc']}")
        print('-' * 50)