In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.utils import resample
import xgboost as xgb
import pickle
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np


# Load the raw donor/recipient match table from the working directory.
try:
    df = pd.read_csv("match_data.csv")
except FileNotFoundError as err:
    # Raise instead of print + exit(): inside a notebook exit() shuts the
    # kernel down (discarding all state), and in a plain script it hides the
    # traceback. Re-raising keeps the same message and stops the cell cleanly.
    raise FileNotFoundError(
        "Error: 'match_data.csv' not found. Please make sure the file is in the correct directory."
    ) from err

# Remove leading/trailing whitespace from the header names so the column
# lookups further down match exactly.
df.columns = df.columns.str.strip()


# Feature columns, grouped by the preprocessing they receive later on.
numeric_cols = [
    "age_diff",
    "distance_km",
    "urgency",
    "hospital_transportation",
]
categorical_cols = [
    "bloodgroup_donor",
    "bloodgroup_recipient",
    "organ",
    "organ_tissue_type_donor",
]

# Impute missing values: 0 for numeric features, the literal "Unknown"
# for categorical ones.
for cols, fill_value in ((numeric_cols, 0), (categorical_cols, "Unknown")):
    df[cols] = df[cols].fillna(fill_value)

# Separate the features from the target; the two id columns are dropped
# so they are not used as features.
non_feature_cols = ["success", "donerid", "reciverid"]
X = df.drop(columns=non_feature_cols)
y = df["success"].astype(int)



# Single-class fallback: if the label column contains only 0s or only 1s,
# fabricate rows of the missing class so that both labels are present for
# the stratified split and the binary classifier further down.
# NOTE(review): these rows are synthetic, not observed outcomes — any metric
# computed on data that went through this branch should be read with that
# in mind.
labels_present = y.unique()
if 0 not in labels_present or 1 not in labels_present:
    print("Warning: The dataset is missing at least one class (0 or 1). Creating pseudo-negative data.")

    # Snapshot of the original rows; both branches sample from this, never
    # from rows that were themselves synthesised.
    df_with_target = df.copy()

    # Seeded generator so the synthetic rows (and therefore the trained
    # model) are reproducible from run to run, in line with the
    # random_state=42 used by every other stochastic step in this script.
    rng = np.random.default_rng(42)

    if 0 not in labels_present:
        # Take 20% of the rows and perturb them into "failed" matches.
        df_negative = df_with_target.sample(frac=0.2, random_state=42).copy()

        # Add an extra 500-1999 km to the donor/recipient distance.
        df_negative['distance_km'] = df_negative['distance_km'] + rng.integers(500, 2000, size=len(df_negative))

        # Re-draw both blood groups independently and uniformly at random.
        # NOTE(review): nothing here forces the drawn pair to be
        # incompatible, so some "negative" rows may still be valid pairs.
        blood_groups = ["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"]
        df_negative['bloodgroup_donor'] = rng.choice(blood_groups, size=len(df_negative))
        df_negative['bloodgroup_recipient'] = rng.choice(blood_groups, size=len(df_negative))

        df_negative['success'] = 0
        df = pd.concat([df, df_negative], ignore_index=True)

    if 1 not in labels_present:
        # Relabel an unmodified 20% sample as successful matches.
        df_positive = df_with_target.sample(frac=0.2, random_state=42).copy()
        df_positive['success'] = 1
        df = pd.concat([df, df_positive], ignore_index=True)

    # Rebuild the feature matrix and target from the augmented frame.
    X = df.drop(columns=["success", "donerid", "reciverid"])
    y = df["success"].astype(int)


# NOTE(review): both transformers below are fitted on the full dataset,
# i.e. before the train/test split, so test rows contribute to the learned
# categories and min/max ranges.

# One-hot encode the categorical features as a dense array.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat = ohe.fit_transform(X[categorical_cols])
encoded_cols = ohe.get_feature_names_out(categorical_cols)
# Keep X's row labels (index=X.index) instead of a fresh 0..n-1 index:
# y carries df's index, and the balancing step joins features and labels
# with pd.concat(axis=1), which aligns on index labels, not on position.
X_cat_df = pd.DataFrame(X_cat, columns=encoded_cols, index=X.index)

# Rescale the numeric features with MinMaxScaler.
scaler = MinMaxScaler()
X_num = scaler.fit_transform(X[numeric_cols])
X_num_df = pd.DataFrame(X_num, columns=numeric_cols, index=X.index)

# Final design matrix: scaled numeric columns first, then the one-hot columns.
X_final = pd.concat([X_num_df, X_cat_df], axis=1)


# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X_final, y, **split_options)

# Report the label counts of each partition before any re-balancing.
for caption, labels in (
    ("Training class distribution before balancing:\n", y_train),
    ("Testing class distribution:\n", y_test),
):
    print(caption, labels.value_counts())


# Balance the training partition by oversampling the smaller class (with
# replacement) until it matches the larger one. Only the training rows are
# touched; the test partition keeps its original distribution.
train_df = pd.concat([X_train, y_train], axis=1)

df_success = train_df[train_df["success"] == 1]
df_failure = train_df[train_df["success"] == 0]

# Decide majority/minority from the actual row counts rather than assuming
# class 1 is always the larger one; otherwise a dataset dominated by class 0
# would have that class shrunk (with replacement) to the size of class 1.
# On a tie class 1 is treated as the majority, as before.
if len(df_success) >= len(df_failure):
    df_majority, df_minority = df_success, df_failure
else:
    df_majority, df_minority = df_failure, df_success

if not df_minority.empty and not df_majority.empty:
    df_minority_upsampled = resample(
        df_minority,
        replace=True,                 # sample with replacement
        n_samples=len(df_majority),   # grow to the majority class size
        random_state=42,
    )

    train_df_balanced = pd.concat([df_majority, df_minority_upsampled])

    # Split back into features/target and discard the duplicated index
    # labels produced by sampling with replacement.
    X_train_balanced = train_df_balanced.drop(columns=['success']).reset_index(drop=True)
    y_train_balanced = train_df_balanced['success'].reset_index(drop=True)

    X_train = X_train_balanced
    y_train = y_train_balanced
else:
    print("\nWarning: The training set contains only one class. Upsampling will be skipped. This may cause issues with model training and evaluation.")

print("Training class distribution after balancing:\n", y_train.value_counts())


# Fit a gradient-boosted binary classifier on the training partition.
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows: column 1 of predict_proba (used for AUC) and
# the predicted labels from predict (used for accuracy).
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("\nModel Evaluation on Test Data:")
for label, metric, scores in (
    ("Accuracy:", accuracy_score, y_pred),
    ("AUC:", roc_auc_score, y_proba),
):
    print(label, metric(y_test, scores))


# Persist the model together with the fitted preprocessing objects and the
# feature column order in a single pickle file.
with open("donor_match_model.pkl", "wb") as model_file:
    bundle = dict(
        model=model,
        ohe=ohe,
        scaler=scaler,
        feature_columns=list(X_final.columns),
    )
    pickle.dump(bundle, model_file)

print("\nModel and preprocessing objects saved to donor_match_model.pkl")


Training class distribution before balancing:
 success
1    1131
0     226
Name: count, dtype: int64
Testing class distribution:
 success
1    283
0     57
Name: count, dtype: int64
Training class distribution after balancing:
 success
1    1131
0    1131
Name: count, dtype: int64

Model Evaluation on Test Data:
Accuracy: 0.95
AUC: 0.9743971235509268

Model and preprocessing objects saved to donor_match_model.pkl
