In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import pickle 

# Read the training and testing splits; the 'Participant ID' column is discarded on load
df_train = pd.read_csv('panic_disorder_dataset_training.csv').drop(columns=['Participant ID'])
df_test = pd.read_csv('panic_disorder_dataset_testing.csv').drop(columns=['Participant ID'])

# Columns whose gaps are filled by model-based imputation,
# ordered from the fewest to the most missing values
columns_to_impute = [
    'Psychiatric History',
    'Medical History',
    'Substance Use',
]

# Function to impute a column using Random Forest
def impute_column(df, target_column, features, random_state=None):
    """Fill the missing values of one column with Random Forest predictions.

    Rows where ``target_column`` is present are used to fit a model on the
    one-hot-encoded ``features``; rows where it is missing receive the model's
    predictions. ``df`` is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_column`` and every column in ``features``.
    target_column : str
        Column whose null entries are to be filled.
    features : list of str
        Predictor columns used to fit the model.
    random_state : int or None, optional
        Forwarded to the Random Forest so the imputation can be made
        repeatable. ``None`` (the default) leaves the forest unseeded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``target_column`` filled.

    Raises
    ------
    ValueError
        If every value of ``target_column`` is missing, leaving nothing to
        train on.
    """
    missing_mask = df[target_column].isnull()

    if not missing_mask.any():
        # Nothing to impute. Returning early also avoids asking the model to
        # predict on a zero-row frame.
        return df
    if missing_mask.all():
        raise ValueError(
            f"Cannot impute '{target_column}': the column has no observed values to train on."
        )

    # Encode every row in one pass so that known and missing rows share the same
    # dummy columns and the same dropped baseline level. Encoding the two subsets
    # separately with drop_first=True can drop a different level in each subset,
    # which silently mislabels categories in the rows being imputed.
    encoded = pd.get_dummies(df[features], drop_first=True)
    X_known = encoded.loc[~missing_mask]
    X_missing = encoded.loc[missing_mask]
    y_known = df.loc[~missing_mask, target_column]

    # Numeric targets are regressed; anything else (object, string, category)
    # is treated as a class label.
    if pd.api.types.is_numeric_dtype(df[target_column]):
        model = RandomForestRegressor(random_state=random_state)
    else:
        model = RandomForestClassifier(random_state=random_state)

    model.fit(X_known, y_known)
    df.loc[missing_mask, target_column] = model.predict(X_missing)
    return df

# Seed shared by every stochastic step so a fresh Restart & Run All reproduces the same numbers
SEED = 42
# Label column; kept out of the imputation predictors and split off as y below
TARGET_COLUMN = 'Panic Disorder Diagnosis'

# impute_column builds its Random Forests without an explicit seed, in which case
# scikit-learn draws from NumPy's global RNG; seeding it makes the imputed values repeatable.
np.random.seed(SEED)

# Predictors used for imputation: every column except the ones being imputed and the label.
# Leaving the label in would leak the diagnosis into the imputed feature values.
excluded_from_imputation = set(columns_to_impute) | {'Participant ID', TARGET_COLUMN}

features_train = [col for col in df_train.columns if col not in excluded_from_imputation]
for column in columns_to_impute:
    df_train = impute_column(df_train, column, features_train)

features_test = [col for col in df_test.columns if col not in excluded_from_imputation]
for column in columns_to_impute:
    df_test = impute_column(df_test, column, features_test)

# Verify no missing values
print(df_test[columns_to_impute].isnull().sum())
print(df_train[columns_to_impute].isnull().sum())

# Split data into training and validation sets (70/30, stratified on the label).
# NOTE(review): imputation above is fitted on the full training frame, so rows that end up in
# the validation split also took part in fitting the imputation models.
X = df_train.drop(columns=[TARGET_COLUMN])
y = df_train[TARGET_COLUMN]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y
)

# Define categorical and numerical features
categorical_features = ['Gender', 'Family History', 'Personal History', 'Current Stressors', 'Symptoms',
                        'Severity', 'Impact on Life', 'Demographics', 'Medical History', 'Psychiatric History',
                        'Substance Use', 'Coping Mechanisms', 'Social Support', 'Lifestyle Factors']
numerical_features = ['Age']

# Preprocessing: scale the numeric column, one-hot encode the categorical ones (categories not
# seen during fit are ignored at transform time), and pass any other column through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# SVM classifier chained behind the preprocessing step
pipeline_svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=SEED))
])

# Fit preprocessing and classifier together on the training split only
pipeline_svm.fit(X_train, y_train)

# Accuracy on the data the model was fitted on and on the held-out validation split
train_accuracy = pipeline_svm.score(X_train, y_train)
val_accuracy = pipeline_svm.score(X_val, y_val)

# Error rate is the complement of accuracy
train_error = 1 - train_accuracy
val_error = 1 - val_accuracy

print(f'Training set accuracy: {train_accuracy:.4f}')
print(f'Validation set accuracy: {val_accuracy:.4f}')
print(f"Training set error: {train_error:.4f}")
print(f"Validation set error: {val_error:.4f}\n")


Psychiatric History    0
Medical History        0
Substance Use          0
dtype: int64
Psychiatric History    0
Medical History        0
Substance Use          0
dtype: int64
Training set accuracy: 0.9890
Validation set accuracy: 0.9900
Training set error: 0.0110
Validation set error: 0.0100



In [6]:

# Keep a handle on the fitted one-hot encoder for inspection. It is not written
# to disk on its own: only the pipeline object is pickled below.
onehot_encoders = preprocessor.named_transformers_['cat']

# Persist the fitted SVM pipeline to svm_model.pkl
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_svm, model_file)