In [3]:
# Preprocessing and hold-out validation (70/30 stratified split)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load the training and testing datasets from CSV files in the working directory
df_train = pd.read_csv('panic_disorder_dataset_training.csv')
df_test = pd.read_csv('panic_disorder_dataset_testing.csv')

# Drop the 'Participant ID' column from both frames so it is never used as a feature
df_train = df_train.drop(columns=['Participant ID'])
df_test = df_test.drop(columns=['Participant ID'])

# Features and target
X = df_train.drop(columns=['Panic Disorder Diagnosis'])
y = df_train['Panic Disorder Diagnosis']

# Split into training and validation sets (70/30, stratified on the target)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# The imputation and label-encoding steps assign columns on X_train / X_val
# in place. Take explicit copies so both splits own their data: this avoids
# pandas' SettingWithCopyWarning and guarantees X itself is never written to.
X_train, X_val = X_train.copy(), X_val.copy()

# Partition the feature columns by dtype: object columns are treated as
# categorical, 64-bit integer/float columns as numerical
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# One imputer per column group: most frequent value for categorical, mean for numerical
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='mean')

# Fit each imputer on the training split only, then reuse the fitted
# statistics to fill the validation split (categorical group first)
for group_imputer, group_columns in (
    (imputer_cat, categorical_features),
    (imputer_num, numerical_features),
):
    X_train[group_columns] = group_imputer.fit_transform(X_train[group_columns])
    X_val[group_columns] = group_imputer.transform(X_val[group_columns])

# Integer-encode every categorical column with its own LabelEncoder, learned
# from the training split; the encoders are kept by column name for later reuse
label_encoders = {feature: LabelEncoder() for feature in categorical_features}
for feature, encoder in label_encoders.items():
    X_train[feature] = encoder.fit_transform(X_train[feature])
    X_val[feature] = encoder.transform(X_val[feature])

# Preprocessing step shared by the pipelines: standardise the numerical
# columns and pass every remaining column through unchanged
numeric_scaling = ('num', StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[numeric_scaling], remainder='passthrough')

# Baseline candidates, keyed by the name used when reporting their scores
models = {}
models['DecisionTree'] = DecisionTreeClassifier(random_state=42)
models['XGBoost'] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit each baseline model inside its own pipeline and report hold-out metrics
for model_name, model in models.items():
    # Preprocessing step followed by the estimator under evaluation
    pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)

    # Pipeline.score on each split; the error is its complement
    train_score = pipeline.score(X_train, y_train)
    val_score = pipeline.score(X_val, y_val)
    train_error, val_error = 1 - train_score, 1 - val_score

    # Five-line report; the trailing "\n" leaves a blank line between models
    print(
        f"Model: {model_name}",
        f"Training set accuracy: {train_score:.4f}",
        f"Validation set accuracy: {val_score:.4f}",
        f"Training set error: {train_error:.4f}",
        f"Validation set error: {val_error:.4f}\n",
        sep="\n",
    )
    



Model: DecisionTree
Training set accuracy: 1.0000
Validation set accuracy: 0.9889
Training set error: 0.0000
Validation set error: 0.0111

Model: XGBoost
Training set accuracy: 0.9983
Validation set accuracy: 0.9900
Training set error: 0.0017
Validation set error: 0.0100



In [4]:
# Decision tree constrained through its hyperparameters (depth cap and a
# minimum sample count per split) instead of being grown without limits
dt_model_pruned = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=1,
    random_state=42,
)

# Same preprocessing step as the baseline pipelines, followed by the constrained tree
pipeline_dt_pruned = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', dt_model_pruned),
])
pipeline_dt_pruned.fit(X_train, y_train)

# Accuracy on both splits; each error is the complement of its accuracy
dt_pruned_val_score = pipeline_dt_pruned.score(X_val, y_val)
dt_pruned_train_score = pipeline_dt_pruned.score(X_train, y_train)
dt_pruned_train_error, dt_pruned_val_error = 1 - dt_pruned_train_score, 1 - dt_pruned_val_score

print(
    f"Decision Tree Pruned Validation Accuracy: {dt_pruned_val_score:.4f}",
    f"Decision Tree Pruned Training Accuracy: {dt_pruned_train_score:.4f}",
    f"Decision Tree Pruned Training Error: {dt_pruned_train_error:.4f}",
    f"Decision Tree Pruned Validation Error: {dt_pruned_val_error:.4f}",
    sep="\n",
)



Decision Tree Pruned Validation Accuracy: 0.9905
Decision Tree Pruned Training Accuracy: 0.9913
Decision Tree Pruned Training Error: 0.0087
Decision Tree Pruned Validation Error: 0.0095


In [5]:
# Hyperparameters for the tuned XGBoost model (hard-coded here; the search
# that produced them is not part of this cell)
best_params = {
    'n_estimators': 353,
    'max_depth': 4,
    'learning_rate': 0.07639730394446925,
    'subsample': 0.9769972567915909,
    'colsample_bytree': 0.8641799601859603,
    'gamma': 0.05958068943077993,
    'min_child_weight': 4
}

# Use the best parameters to create a model
xgb_model_tuned = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **best_params)

# Create the pipeline with the best model
pipeline_xgb_tuned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model_tuned)
])

# Fit the pipeline on the training split (X_train is the 70% partition, not
# the full training file; X_val stays held out for the scores below)
pipeline_xgb_tuned.fit(X_train, y_train)

# Accuracy on both splits; each error is the complement of its accuracy
xgb_tuned_val_score = pipeline_xgb_tuned.score(X_val, y_val)
xgb_tuned_train_score = pipeline_xgb_tuned.score(X_train, y_train)
xgb_tuned_train_error = 1 - xgb_tuned_train_score
xgb_tuned_val_error = 1 - xgb_tuned_val_score

# These metrics belong to the tuned XGBoost pipeline, so label them as such
# (the labels previously read "Decision Tree Pruned", a copy-paste slip)
print(f"Tuned XGBoost Validation Accuracy: {xgb_tuned_val_score:.4f}")
print(f"Tuned XGBoost Training Accuracy: {xgb_tuned_train_score:.4f}")
print(f"Tuned XGBoost Training Error: {xgb_tuned_train_error:.4f}")
print(f"Tuned XGBoost Validation Error: {xgb_tuned_val_error:.4f}")


Decision Tree Pruned Validation Accuracy: 0.9908
Decision Tree Pruned Training Accuracy: 0.9929
Decision Tree Pruned Training Error: 0.0071
Decision Tree Pruned Validation Error: 0.0092


In [6]:
# Separate the test frame into target and features
y_test = df_test['Panic Disorder Diagnosis']
X_test = df_test.drop(columns=['Panic Disorder Diagnosis'])

# Fill missing test values using the imputers already fitted on the training
# split (transform only, nothing is re-fitted on test data)
for fitted_imputer, group_columns in (
    (imputer_cat, categorical_features),
    (imputer_num, numerical_features),
):
    X_test[group_columns] = fitted_imputer.transform(X_test[group_columns])

# Encode categorical test columns with the per-column encoders fitted earlier
for col in categorical_features:
    X_test[col] = label_encoders[col].transform(X_test[col])

# Pruned Decision Tree: predict, then accuracy and its complement
dt_pruned_test_predictions = pipeline_dt_pruned.predict(X_test)
dt_pruned_test_accuracy = accuracy_score(y_test, dt_pruned_test_predictions)
dt_pruned_test_error = 1 - dt_pruned_test_accuracy

print(
    f"Pruned Decision Tree Test Accuracy: {dt_pruned_test_accuracy:.4f}",
    f"Pruned Decision Tree Test Error: {dt_pruned_test_error:.4f}",
    sep="\n",
)

# Tuned XGBoost: predict, then accuracy and its complement
xgb_tuned_test_predictions = pipeline_xgb_tuned.predict(X_test)
xgb_tuned_test_accuracy = accuracy_score(y_test, xgb_tuned_test_predictions)
xgb_tuned_test_error = 1 - xgb_tuned_test_accuracy

print(
    f"Tuned XGBoost Test Accuracy: {xgb_tuned_test_accuracy:.4f}",
    f"Tuned XGBoost Test Error: {xgb_tuned_test_error:.4f}",
    sep="\n",
)


Pruned Decision Tree Test Accuracy: 0.9907
Pruned Decision Tree Test Error: 0.0093
Tuned XGBoost Test Accuracy: 0.9911
Tuned XGBoost Test Error: 0.0089


In [7]:
import pickle


# Persist each fitted pipeline together with the preprocessing objects that
# must be applied to raw data before it: the label encoders and both imputers.
# Every file holds one tuple: (pipeline, label_encoders, imputer_cat, imputer_num)
for bundle_path, fitted_pipeline in (
    ('xgb_model.pkl', pipeline_xgb_tuned),
    ('dtc_model.pkl', pipeline_dt_pruned),
):
    with open(bundle_path, 'wb') as file:
        pickle.dump((fitted_pipeline, label_encoders, imputer_cat, imputer_num), file)


In [None]:
import pickle

# Save the pruned Decision Tree pipeline on its own (no encoders/imputers).
# The dumped name must be `pipeline_dt_pruned`; the previous
# `trpipeline_dt_pruned` was an undefined name and raised NameError.
with open('dt_pruned_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_dt_pruned, file)

# Save the tuned XGBoost pipeline on its own (no encoders/imputers)
with open('xgb_tuned_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline_xgb_tuned, file)