In [1]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
   ---------------------------------------- 0.0/107.8 kB ? eta -:--:--
   --- ------------------------------------ 10.2/107.8 kB ? eta -:--:--
   ----------- --------------------------- 30.7/107.8 kB 259.2 kB/s eta 0:00:01
   ---------------------- ---------------- 61.4/107.8 kB 465.5 kB/s eta 0:00:01
   -------------------------------------- 107.8/107.8 kB 621.4 kB/s eta 0:00:00
Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV

# ---------------------------------------------------------------------------
# Settings shared by the steps below
# ---------------------------------------------------------------------------
ID_COLUMN = 'Participant ID'
TARGET_COLUMN = 'Panic Disorder Diagnosis'
RANDOM_STATE = 42

# ---------------------------------------------------------------------------
# Data loading: read the training and testing CSV files, then discard the
# participant identifier column from both frames.
# ---------------------------------------------------------------------------
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('panic_disorder_dataset_training.csv',
                     'panic_disorder_dataset_testing.csv')
)
df_train = df_train.drop(columns=ID_COLUMN)
df_test = df_test.drop(columns=ID_COLUMN)

# Target column vs. everything else.
y = df_train[TARGET_COLUMN]
X = df_train.drop(columns=TARGET_COLUMN)

# 70/30 train/validation split, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Column groups by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# ---------------------------------------------------------------------------
# Missing values: most frequent value for categorical columns, mean for
# numerical columns. Each imputer is fitted on the training split only and
# then applied to the validation split.
# ---------------------------------------------------------------------------
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='mean')

for imputer, columns in (
    (imputer_cat, categorical_features),
    (imputer_num, numerical_features),
):
    X_train[columns] = imputer.fit_transform(X_train[columns])
    X_val[columns] = imputer.transform(X_val[columns])

# ---------------------------------------------------------------------------
# Integer-encode every categorical column. One LabelEncoder per column is
# kept in `label_encoders` (keyed by column name); each is fitted on the
# training split and reused for the validation split.
# ---------------------------------------------------------------------------
label_encoders = {col: LabelEncoder() for col in categorical_features}
for col, le in label_encoders.items():
    X_train[col] = le.fit_transform(X_train[col])
    X_val[col] = le.transform(X_val[col])

# ---------------------------------------------------------------------------
# Model: standardise the numerical columns, pass every other column through
# unchanged, then fit a decision tree.
# ---------------------------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features)],
    remainder='passthrough',
)

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ]
)

# Integer (low, high) ranges explored for the tree hyperparameters.
param_space = {
    'classifier__max_depth': (3, 10),
    'classifier__min_samples_split': (2, 20),
    'classifier__min_samples_leaf': (1, 10),
}

# ---------------------------------------------------------------------------
# Bayesian hyperparameter search: 60 iterations, 5-fold cross-validation,
# scored by accuracy, using all available cores.
# ---------------------------------------------------------------------------
bayes_search = BayesSearchCV(
    pipeline,
    param_space,
    n_iter=60,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
bayes_search.fit(X_train, y_train)

# Report the winning configuration.
print("Best hyperparameters:", bayes_search.best_params_)

# Best estimator found by the search, scored on the held-out validation split.
best_model = bayes_search.best_estimator_
val_score = best_model.score(X_val, y_val)
print(f"Validation Accuracy: {val_score:.4f}")




Best hyperparameters: OrderedDict([('classifier__max_depth', 10), ('classifier__min_samples_leaf', 1), ('classifier__min_samples_split', 20)])
Validation Accuracy: 0.9905


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV

# Repeat the Bayesian hyperparameter search with a larger budget
# (n_iter=100 here versus n_iter=60 in the previous cell).
#
# Only the search budget differs from the previous cell, so the objects that
# cell already built are reused instead of being rebuilt from scratch:
#   - X_train / y_train, X_val / y_val : the imputed, label-encoded 70/30 split
#   - pipeline                        : scaler + DecisionTreeClassifier
#   - param_space                     : the hyperparameter ranges to explore
# The split and the search both use fixed random seeds, so reusing these
# objects is expected to give the same result as reloading and re-preprocessing
# the data.
#
# NOTE: this cell is not standalone; run the previous cell first.
bayes_search = BayesSearchCV(
    pipeline,
    param_space,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
bayes_search.fit(X_train, y_train)

# Best hyperparameters
print("Best hyperparameters:", bayes_search.best_params_)

# Best model found by the search
best_model = bayes_search.best_estimator_

# Evaluate the best model on the validation set
val_score = best_model.score(X_val, y_val)
print(f"Validation Accuracy: {val_score:.4f}")




Best hyperparameters: OrderedDict([('classifier__max_depth', 10), ('classifier__min_samples_leaf', 1), ('classifier__min_samples_split', 20)])
Validation Accuracy: 0.9905
