# Ablation Study

In [1]:
# Colab-only step: mount Google Drive at /content/drive so the CSV read
# further down (under /content/drive/Shareddrives/...) is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Ignore all warnings
# NOTE(review): this filter is global and unconditional, so every warning
# raised by any library for the rest of the session is hidden — consider
# narrowing it to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

# Globally setting print options
# `None` removes pandas' column/row display limits, so every displayed frame
# or series is rendered in full; keep displays to .head()/.sample() sized
# slices to avoid very large cell outputs.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [21]:
# Location of the preprocessed dataset on the mounted shared drive.
preprocessed_csv_path = '/content/drive/Shareddrives/DATA240 - Data Mining/Modeling and Evaluation/preprocessed_data.csv'

# Load it into a DataFrame and preview the first rows.
data = pd.read_csv(preprocessed_csv_path)
data.head()

Unnamed: 0,Little_interest_in_doing_things,Feeling_down_or_depressed,Trouble_sleeping_or_sleeping_too_much,Feeling_tired_or_having_little_energy,Poor_appetite_or_overeating,Feeling_bad_about_yourself,Trouble_concentrating_on_things,Moving_or_speaking_slowly_or_too_fast,Thought_you_would_be_better_off_dead,No_of_hours_you_sleep,How_often_do_you_snore,How_often_do_you_snort/stop_breathing,trouble_sleeping,Sleep_Apnea,Blood_pressure,Taking_Medication_for_HBP,current_drinker,past_drinker,current_smoker,past_smoker,Gender,Age,Education_Level,Pregnant,PIR,Race_Non-Hispanic Black,Race_Non-Hispanic White,Race_Other Hispanic,Marital_Status_Married/Living with Partner,Marital_Status_Never Married,depression_category
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,2,85,3,0.0,1.99,0,1,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0,0,2,44,3,0.0,4.65,0,1,0,1,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0,1,70,4,0.0,5.0,0,1,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0,0,1,73,2,0.0,2.15,0,1,0,1,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0,1,21,2,0.0,0.46,0,1,0,0,1,0


In [22]:
data.shape

(24108, 31)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24108 entries, 0 to 24107
Data columns (total 31 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Little_interest_in_doing_things             24108 non-null  float64
 1   Feeling_down_or_depressed                   24108 non-null  float64
 2   Trouble_sleeping_or_sleeping_too_much       24108 non-null  float64
 3   Feeling_tired_or_having_little_energy       24108 non-null  float64
 4   Poor_appetite_or_overeating                 24108 non-null  float64
 5   Feeling_bad_about_yourself                  24108 non-null  float64
 6   Trouble_concentrating_on_things             24108 non-null  float64
 7   Moving_or_speaking_slowly_or_too_fast       24108 non-null  float64
 8   Thought_you_would_be_better_off_dead        24108 non-null  float64
 9   No_of_hours_you_sleep                       24108 non-null  float64
 10  How_often_

In [24]:
# Check for duplicates

data.duplicated().sum()

27

In [25]:
# Remove duplicate rows
# Rebinds `data` to the de-duplicated frame, so the raw frame loaded from the
# CSV is no longer reachable under this name after this cell has run.
data = data.drop_duplicates()

In [26]:
data.duplicated().sum()

0

In [27]:
data.isnull().sum()

Little_interest_in_doing_things               0
Feeling_down_or_depressed                     0
Trouble_sleeping_or_sleeping_too_much         0
Feeling_tired_or_having_little_energy         0
Poor_appetite_or_overeating                   0
Feeling_bad_about_yourself                    0
Trouble_concentrating_on_things               0
Moving_or_speaking_slowly_or_too_fast         0
Thought_you_would_be_better_off_dead          0
No_of_hours_you_sleep                         0
How_often_do_you_snore                        0
How_often_do_you_snort/stop_breathing         0
trouble_sleeping                              0
Sleep_Apnea                                   0
Blood_pressure                                0
Taking_Medication_for_HBP                     0
current_drinker                               0
past_drinker                                  0
current_smoker                                0
past_smoker                                   0
Gender                                  

### Split data into X and y

In [28]:
# Name of the label column; every other column is used as a feature.
target_col = 'depression_category'

X = data.drop(columns=[target_col])
y = data[target_col]

In [29]:
X.shape, y.shape

((24081, 30), (24081,))

### Handling class imbalance - SMOTE Oversampling

In [30]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, SMOTENC

# oversampling techniques

def oversample_data(X, y, method='smote', sampling_strategy='auto', random_state=42,
                    categorical_features=None):
    """Oversample ``(X, y)`` with one of the imbalanced-learn SMOTE-family samplers.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix. Must expose ``.shape`` when ``method='smotenc'`` and
        ``categorical_features`` is not supplied.
    y : array-like
        Target labels aligned with ``X``.
    method : {'smote', 'borderline', 'adasyn', 'smotenc'}, default 'smote'
        Which sampler to build: ``SMOTE``, ``BorderlineSMOTE``, ``ADASYN`` or
        ``SMOTENC`` respectively.
    sampling_strategy : default 'auto'
        Forwarded unchanged to the sampler.
    random_state : default 42
        Forwarded unchanged to the sampler.
    categorical_features : optional
        Only used when ``method='smotenc'``; forwarded to ``SMOTENC`` as its
        ``categorical_features`` argument. When ``None`` the previous
        hard-coded mask is used: every column is flagged categorical except
        the column at index 1. It is ignored by the other methods.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by the sampler's
        ``fit_resample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Reject unknown methods before touching the data or building a sampler.
    if method not in ('smote', 'borderline', 'adasyn', 'smotenc'):
        raise ValueError("Invalid oversampling method")

    sampler_kwargs = dict(sampling_strategy=sampling_strategy, random_state=random_state)

    if method == 'smotenc':
        if categorical_features is None:
            # Legacy default, kept for backward compatibility.
            # NOTE(review): this marks only column 1 as continuous; for data
            # with continuous columns elsewhere (e.g. Age, PIR) pass an
            # explicit `categorical_features` instead.
            categorical_features = [True] * X.shape[1]
            categorical_features[1] = False
        oversampler = SMOTENC(categorical_features=categorical_features, **sampler_kwargs)
    else:
        sampler_classes = {
            'smote': SMOTE,
            'borderline': BorderlineSMOTE,
            'adasyn': ADASYN,
        }
        oversampler = sampler_classes[method](**sampler_kwargs)

    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [31]:
# Balance the classes with plain SMOTE (method='smote'; the 'smotenc' branch
# of oversample_data is not exercised by this call).
# NOTE(review): the full dataset is oversampled here and only split into
# train/validation/test afterwards, so the held-out sets contain synthetic
# rows generated from the same pool as the training rows. Accuracies measured
# on them are likely optimistic — consider splitting first and oversampling
# the training portion only.
sampling_strategy = 'auto'
X_os,y_os = oversample_data(X, y, method='smote', sampling_strategy=sampling_strategy)

In [13]:
X_os.shape, y_os.shape

((55884, 30), (55884,))

### Train test split

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the oversampled data as the final test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_os,
    y_os,
    test_size=0.2,
    random_state=2,
)

In [33]:
# Carve a validation set (20% of the remaining rows) out of the training portion.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=2,
)

In [34]:
# Report the feature/target shapes of each split, one line per split.
for split_label, split_features, split_target in (
    ('train data(X_train,y_train) shape: ', X_train, y_train),
    ('validation data(X_valid,y_valid) shape: ', X_valid, y_valid),
    ('test data(X_test,y_test) shape: ', X_test, y_test),
):
    print(split_label, split_features.shape, split_target.shape)

train data(X_train,y_train) shape:  (35765, 30) (35765,)
validation data(X_valid,y_valid) shape:  (8942, 30) (8942,)
test data(X_test,y_test) shape:  (11177, 30) (11177,)


### Define Baseline Model (Decision Tree)

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline: a decision tree trained on all features (only the seed is set).
baseline_dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the validation split and report it.
baseline_accuracy = baseline_dtc.score(X_valid, y_valid)
print("Baseline Accuracy:", baseline_accuracy)

Baseline Accuracy: 0.9685752628047417


In [36]:
# Leave-one-feature-out ablation: retrain the same decision tree with one
# column removed at a time and score it on the validation split.
# Columns are taken from X_train itself so the loop can never ask to drop a
# column that the training frame does not have.
feature_names = X_train.columns
ablation_records = []
for feature in feature_names:
    X_train_reduced = X_train.drop(columns=[feature])
    X_valid_reduced = X_valid.drop(columns=[feature])

    model_reduced = DecisionTreeClassifier(random_state=42)
    model_reduced.fit(X_train_reduced, y_train)
    reduced_accuracy = model_reduced.score(X_valid_reduced, y_valid)

    # The original message ended with a stray ')' after the number.
    print(f"Accuracy without the feature '{feature}': {reduced_accuracy}")

    ablation_records.append({
        'feature': feature,
        'accuracy_without': reduced_accuracy,
        # Positive = removing the feature hurts; negative = accuracy improved.
        'accuracy_drop': baseline_accuracy - reduced_accuracy,
    })

# One row per removed feature, most harmful removal first, so the ranking can
# be read directly instead of being inferred from the printed lines.
ablation_results = (
    pd.DataFrame(ablation_records)
    .sort_values('accuracy_drop', ascending=False)
    .reset_index(drop=True)
)
ablation_results

Accuracy without the feature 'Little_interest_in_doing_things': 0.9575039141131738)
Accuracy without the feature 'Feeling_down_or_depressed': 0.9585104003578617)
Accuracy without the feature 'Trouble_sleeping_or_sleeping_too_much': 0.9511295012301498)
Accuracy without the feature 'Feeling_tired_or_having_little_energy': 0.9520241556698725)
Accuracy without the feature 'Poor_appetite_or_overeating': 0.9516886602549766)
Accuracy without the feature 'Feeling_bad_about_yourself': 0.9618653545068218)
Accuracy without the feature 'Trouble_concentrating_on_things': 0.9561619324535898)
Accuracy without the feature 'Moving_or_speaking_slowly_or_too_fast': 0.9627600089465443)
Accuracy without the feature 'Thought_you_would_be_better_off_dead': 0.936703198389622)
Accuracy without the feature 'No_of_hours_you_sleep': 0.969358085439499)
Accuracy without the feature 'How_often_do_you_snore': 0.9695817490494296)
Accuracy without the feature 'How_often_do_you_snort/stop_breathing': 0.9714828897338403)

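The ablation study results show varying degrees of importance across different features in predicting depression. Relative to the baseline validation accuracy of 0.9686, removing "Thought you would be better off dead" causes by far the largest drop (to 0.9367, about 3.2 percentage points), followed by "Trouble sleeping or sleeping too much," "Poor appetite or overeating," and "Feeling tired or having little energy" (each about 1.7 points), indicating their strong predictive value for depression.

Conversely, removing "No of hours you sleep," "How often do you snore," or "How often do you snort/stop breathing" does not reduce accuracy at all (it rises marginally, to 0.9694–0.9715), and features like "Race_Non-Hispanic White" and "Feeling bad about yourself" have comparatively little impact on the accuracy when removed, suggesting they might be less critical in the model's ability to predict depression. Because these figures come from a single train/validation split, differences of a few tenths of a percentage point should be interpreted with caution. This insight can guide feature selection and prioritization in refining the model for better performance and interpretability.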



