# Patient Discontinuation Prediction Model

In [None]:
# Import necessary libraries
from itertools import combinations                   # For feature interactions

import matplotlib.pyplot as plt  # Plotting library
import numpy as np     # Numerical operations library
import optuna          # Optimization library for hyperparameter tuning
import pandas as pd    # Data manipulation library
import xgboost as xgb  # XGBoost library for gradient boosting
from catboost import CatBoostClassifier, Pool        # CatBoost classifier and dataset container
from sklearn.metrics import f1_score, accuracy_score # Additional evaluation metrics
from sklearn.metrics import roc_auc_score            # Metric for evaluation
from sklearn.model_selection import StratifiedKFold  # Cross-validation
from sklearn.model_selection import cross_val_score  # Scored cross-validation helper
from sklearn.model_selection import train_test_split  # For data splitting
from sklearn.preprocessing import StandardScaler     # For data scaling
from xgboost import XGBClassifier                    # XGBoost classifier

# Load the prepared training table and the prepared scoring table from CSV.
# The first path feeds `train_data`, the second feeds `test_data`.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/prepared_combined.csv", "/content/prepared_test.csv")
)



In [None]:
# Carve the raw tables into model inputs and the training target by column position.
# Training: features are every column from position 8 onward; the target is the
# single column at position 4, kept as a one-column DataFrame.
train_data_x, train_data_y = train_data.iloc[:, 8:], train_data.iloc[:, 4:5]

# Scoring data: features are every column from position 6 onward (no target here).
test_data_x = test_data.iloc[:, 6:]

In [None]:
def _add_pairwise_interactions(frame):
    """Return ``frame`` extended with the product of every pair of its columns.

    For each unordered pair of columns (in ``itertools.combinations`` order) a
    new column named ``"<left>_<right>_interaction"`` is appended after the
    original columns. All products are computed first and attached with a
    single ``pd.concat`` so the frame is not grown one column at a time.

    Parameters:
        frame: DataFrame whose columns should be multiplied pairwise.

    Returns:
        A tuple ``(extended_frame, interaction_names)`` where
        ``interaction_names`` lists the appended column names in order.
    """
    products = {}
    for (left_pos, left_name), (right_pos, right_name) in combinations(enumerate(frame.columns), 2):
        name = f"{left_name}_{right_name}_interaction"
        products[name] = (frame.iloc[:, left_pos] * frame.iloc[:, right_pos]).rename(name)
    extended = pd.concat([frame, *products.values()], axis=1)
    return extended, list(products)


# Training features: drop the 'sath' and 'id_t' columns, then add interactions.
df, _ = _add_pairwise_interactions(train_data_x.drop(columns=['sath', 'id_t']))

# Testing features: identical treatment so both frames share one column layout.
# `interactions` ends up holding the interaction column names of the test frame.
df2, interactions = _add_pairwise_interactions(test_data_x.drop(columns=['sath', 'id_t']))


In [None]:
# Copy 'est_age' from the raw tables into the train and test feature frames.
df = df.assign(est_age=train_data['est_age'])
df2 = df2.assign(est_age=test_data['est_age'])

# Standardize every feature: the scaler learns its statistics from the training
# frame only and is then applied unchanged to both frames.
# NOTE(review): the scaler is fit on all training rows, i.e. before the later
# hold-out / cross-validation splits are made.
scaler = StandardScaler()
scaler.fit(df)
X_train_scaled = scaler.transform(df)
X_test_scaled = scaler.transform(df2)

# Wrap the scaled arrays back into DataFrames that carry the original column names.
df = pd.DataFrame(data=X_train_scaled, columns=df.columns)
df2 = pd.DataFrame(data=X_test_scaled, columns=df2.columns)


In [None]:
# Attach 'race_cd' and 'sex_cd' after scaling, so both keep their raw values.
# 'sex_cd' is one-hot encoded (first level dropped); 'race_cd' is left as-is.
#
# The set of 'sex_cd' levels is learned from the training data and reused for
# the test data. Encoding the two frames independently could produce different
# dummy columns when a level is missing from (or only present in) one of them,
# and later cells pick test columns by position.
sex_cd_dtype = train_data['sex_cd'].astype('category').dtype

# Training frame
df['race_cd'] = train_data['race_cd']
df['sex_cd'] = train_data['sex_cd'].astype(sex_cd_dtype)
df = pd.get_dummies(df, columns=['sex_cd'], drop_first=True)

# Testing frame: levels not seen in training become missing and encode as all zeros.
df2['race_cd'] = test_data['race_cd']
df2['sex_cd'] = test_data['sex_cd'].astype(sex_cd_dtype)
df2 = pd.get_dummies(df2, columns=['sex_cd'], drop_first=True)



In [None]:
# Hold out 20% of the training rows to evaluate the baseline model.
X_train, X_test, y_train, y_test = train_test_split(df, train_data_y, test_size=0.2, random_state=42)

# Wrap both splits in CatBoost Pools. They get their own names so the raw
# `train_data` / `test_data` DataFrames stay available: a later cell still
# reads the 'id' column from `test_data`.
train_pool = Pool(data=X_train, label=y_train)
eval_pool = Pool(data=X_test, label=y_test)

# NOTE(review): `parameters` is assigned in a later cell of this notebook, so
# that cell has to be executed before this one.
catboost_model = CatBoostClassifier(**parameters)

# Train on the training pool while tracking the hold-out pool.
catboost_model.fit(train_pool, eval_set=eval_pool)

# Score the hold-out rows with the probability of the class at index 1.
y_pred = catboost_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
print(f"AUC on the test set: {auc}")

# Per-feature importance, in the column order of `df`.
feature_importance = catboost_model.get_feature_importance()



In [None]:
# Rank the columns by the baseline model's feature importance and keep the
# positions of the most important ones (highest importance first).
feature_importance = catboost_model.get_feature_importance()
num_features_to_select = 325  # Change this to your desired number of features
top_feature_indices = np.flip(np.argsort(feature_importance))[:num_features_to_select]

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost candidate and return its validation AUC.

    The hyperparameters are sampled from ``trial``. Options with a single
    choice are still routed through ``suggest_categorical`` so that they are
    recorded in ``study.best_params``.

    The function reads the module-level ``X_train``, ``y_train``, ``X_val`` and
    ``y_val`` splits, which must exist before the study is run.

    Parameters:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC on the validation split (higher is better).
    """
    model = CatBoostClassifier(
        iterations=trial.suggest_int('iterations', 32, 1024),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.3),
        depth=trial.suggest_int('depth', 1, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 0.01, 10),
        grow_policy=trial.suggest_categorical('grow_policy', ['Depthwise']),
        bootstrap_type=trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli']),
        od_type=trial.suggest_categorical('od_type', ['Iter']),
        eval_metric=trial.suggest_categorical('eval_metric', ['AUC']),  # Use AUC as the evaluation metric
        loss_function=trial.suggest_categorical('loss_function', ['Logloss']),
        random_state=trial.suggest_categorical('random_state', [42]),
        verbose=trial.suggest_categorical('verbose', [0])
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=False
    )

    # AUC is a ranking metric, so it must be fed continuous scores. Hard class
    # labels from predict() would collapse the ROC curve to a single threshold.
    val_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)



import optuna

# Build the train/validation split that `objective` reads, using only the
# selected feature columns and holding out 20% of the rows.
X_train, X_val, y_train, y_val = train_test_split(
    df.iloc[:, top_feature_indices],
    train_data_y,
    test_size=0.2,
    random_state=42,
)

# Search for the hyperparameters that maximize the objective over 50 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Hyperparameters of the best trial found by the study.
best_hyperparams = study.best_params







In [None]:
# Hard-coded hyperparameter sets. Assigning `best_hyperparams` here replaces
# whatever value the Optuna study stored under that name.
best_hyperparams = dict(
    iterations=796,
    learning_rate=0.20846308782767234,
    depth=8,
    l2_leaf_reg=0.6003745928817406,
    grow_policy='Depthwise',
    bootstrap_type='Bernoulli',
    od_type='Iter',
    eval_metric='AUC',
    loss_function='Logloss',
    random_state=42,
    verbose=0,
)

# The configuration that the model-building cells of this notebook pass to
# CatBoostClassifier.
parameters = dict(
    iterations=681,
    learning_rate=0.2775361994919723,
    depth=8,
    l2_leaf_reg=6.961173375158855,
    grow_policy='Depthwise',
    bootstrap_type='Bayesian',
    od_type='Iter',
    eval_metric='AUC',
    loss_function='Logloss',
    random_state=42,
    verbose=0,
)

In [None]:
# Hold out 20% of the rows, using only the selected feature columns.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, top_feature_indices], train_data_y, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier with the chosen hyperparameters
model = CatBoostClassifier(**parameters)

# 5-fold cross-validation on the full training set. The scorer is 'roc_auc' so
# that the values really are the AUC scores that the variable name and the
# printed labels announce.
cv_scores_auc = cross_val_score(model, df.iloc[:, top_feature_indices], train_data_y, cv=5, scoring='roc_auc')

# Print the per-fold AUC scores and their mean
print("Cross-Validation AUC Scores:", cv_scores_auc)
print("Mean AUC:", cv_scores_auc.mean())

# Fit the model to the training split
model.fit(X_train, y_train)

# Predict class labels on the same training split
y_train_pred = model.predict(X_train)

# Micro-averaged F1 on the training split (a fit check, not a hold-out estimate)
train_score_micro = f1_score(y_train, y_train_pred, average='micro')
print("Training F1 Score (Micro Average):", train_score_micro)


In [None]:
# Feature matrix restricted to the selected columns, and the target column.
X = df.iloc[:, top_feature_indices]
y = train_data_y

# Initialize the CatBoostClassifier with the chosen hyperparameters
model = CatBoostClassifier(**parameters)

# Stratified 5-fold splitter; shuffling is seeded so folds are reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics
f1_scores_micro = []
auc_scores = []
accuracy_scores = []

# Perform cross-validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on this fold's training rows
    model.fit(X_train, y_train)

    # Hard class labels feed F1 and accuracy
    y_test_pred = model.predict(X_test)

    # AUC needs continuous scores, so use the probability of the class at
    # index 1 rather than the hard labels
    y_test_proba = model.predict_proba(X_test)[:, 1]

    f1_micro = f1_score(y_test, y_test_pred, average='micro')
    f1_scores_micro.append(f1_micro)

    auc = roc_auc_score(y_test, y_test_proba)
    auc_scores.append(auc)

    accuracy = accuracy_score(y_test, y_test_pred)
    accuracy_scores.append(accuracy)

# Print the evaluation metrics for each fold
for fold, (f1_micro, auc, accuracy) in enumerate(zip(f1_scores_micro, auc_scores, accuracy_scores), 1):
    print(f"Fold {fold}:")
    print(f"  F1 Score (Micro Average) = {f1_micro:.4f}")
    print(f"  AUC = {auc:.4f}")
    print(f"  Accuracy = {accuracy:.4f}")
    print()

# Calculate and print the mean metrics across all folds
mean_f1_micro = sum(f1_scores_micro) / len(f1_scores_micro)
mean_auc = sum(auc_scores) / len(auc_scores)
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)

print(f"Mean F1 Score (Micro Average) = {mean_f1_micro:.4f}")
print(f"Mean AUC = {mean_auc:.4f}")
print(f"Mean Accuracy = {mean_accuracy:.4f}")


In [None]:
# Final model: same hyperparameters, refit on every training row using only
# the selected feature columns.
model = CatBoostClassifier(**parameters)
model.fit(X=df.iloc[:, top_feature_indices], y=train_data_y)

# Class probabilities for the scoring data, taken from the same column
# positions of the test feature frame.
y_prob = model.predict_proba(df2.iloc[:, top_feature_indices])

In [None]:
# Start the submission table from the test set's 'id' column, exposed as 'ID'.
final_df = test_data[['id']].rename(columns={'id': 'ID'})

In [None]:
# SCORE is the second column (index 1) of the predict_proba output in `y_prob`.
final_df['SCORE'] = y_prob[:, 1]

In [None]:
# RANK 1 is the highest SCORE. method='first' breaks ties by row order, so
# every row gets a distinct whole-number rank. The default 'average' method
# assigns fractional ranks to ties, which astype(int) would silently truncate.
final_df['RANK'] = final_df['SCORE'].rank(method='first', ascending=False).astype(int)

In [None]:
# `submission_df` is another name for the same frame as `final_df`.
submission_df = final_df

# Visual sanity check: plot each row's score against its rank.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(submission_df['RANK'], submission_df['SCORE'], s=10)
ax.set_title('Scatter Plot of Rank vs. Score')
ax.set_xlabel('Rank')
ax.set_ylabel('Score')
ax.grid(True)
plt.show()


In [None]:
# Write the submission table to CSV, leaving out the DataFrame index.
submission_df.to_csv(
    path_or_buf="2023CaseCompetition_Sathish_Prasad_20231005.csv",
    index=False,
)