In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Reading the Dataset in the enviroment 
df = pd.read_csv(r"C:\Users\Raheb\Desktop\resume files\MentalHealthDC.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   State                       445132 non-null  object 
 1   Date                        445126 non-null  object 
 2   Mental_Health_Days          445132 non-null  float64
 3   Sleep_Hours_per_day         445132 non-null  float64
 4   Deppresive_Dissorder        445132 non-null  object 
 5   Marrital_Status             445124 non-null  object 
 6   Home_Ownership              445123 non-null  object 
 7   Employed_Status             445132 non-null  object 
 8   CHILDREN                    445132 non-null  int64  
 9   100_Cigarettes_Lifetime     445132 non-null  object 
 10  Ecigs_Vapes                 445132 non-null  object 
 11  Covid_Status_Ever           395897 non-null  object 
 12  Satisfaction_Level          254488 non-null  object 
 13  Emotional_Supp

In [4]:
# Change all object types to category
df = df.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   State                       445132 non-null  category
 1   Date                        445126 non-null  category
 2   Mental_Health_Days          445132 non-null  float64 
 3   Sleep_Hours_per_day         445132 non-null  float64 
 4   Deppresive_Dissorder        445132 non-null  category
 5   Marrital_Status             445124 non-null  category
 6   Home_Ownership              445123 non-null  category
 7   Employed_Status             445132 non-null  category
 8   CHILDREN                    445132 non-null  int64   
 9   100_Cigarettes_Lifetime     445132 non-null  category
 10  Ecigs_Vapes                 445132 non-null  category
 11  Covid_Status_Ever           395897 non-null  category
 12  Satisfaction_Level          254488 non-null  category
 13 

In [5]:
df.isna().sum()

State                              0
Date                               6
Mental_Health_Days                 0
Sleep_Hours_per_day                0
Deppresive_Dissorder               0
Marrital_Status                    8
Home_Ownership                     9
Employed_Status                    0
CHILDREN                           0
100_Cigarettes_Lifetime            0
Ecigs_Vapes                        0
Covid_Status_Ever              49235
Satisfaction_Level            190644
Emotional_Support_Recieved    190991
Social_Isolation              191342
Stress_30days                 193921
Physical_Health                    0
Physically_Active_30days           0
Sex_at_Birth                       0
Age_Distrubution                   0
Height(inches)                     0
Weight(kilograms)                  0
Education                          0
Income                             0
Smoking                            0
Drunk_30days                       0
Drinks_per_Day                     4
R

In [6]:
# Imputing missing values
# Imputing numerical columns with mean
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
imputer_num = SimpleImputer(strategy='mean')
df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])

# Imputing categorical columns with mode
categorical_cols = df.select_dtypes(include=['category']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

In [7]:
df.isna().sum()

State                         0
Date                          0
Mental_Health_Days            0
Sleep_Hours_per_day           0
Deppresive_Dissorder          0
Marrital_Status               0
Home_Ownership                0
Employed_Status               0
CHILDREN                      0
100_Cigarettes_Lifetime       0
Ecigs_Vapes                   0
Covid_Status_Ever             0
Satisfaction_Level            0
Emotional_Support_Recieved    0
Social_Isolation              0
Stress_30days                 0
Physical_Health               0
Physically_Active_30days      0
Sex_at_Birth                  0
Age_Distrubution              0
Height(inches)                0
Weight(kilograms)             0
Education                     0
Income                        0
Smoking                       0
Drunk_30days                  0
Drinks_per_Day                0
Race/Ethnicity                0
Height(meters)                0
Body_Mass_Index               0
BMI_scale                     0
dtype: i

In [8]:
# Standardizing and encoding integers and categorical variables
# Standardize Numerical Variables
scaler_standard = StandardScaler()
df[numerical_cols] = scaler_standard.fit_transform(df[numerical_cols])

# Encode Categorical Variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [9]:
df.shape

(445132, 558)

In [10]:
# Splitting for Modeling
# Defining target variable and features
X = df.drop(columns=['Deppresive_Dissorder_Yes'])  # Updating with the correct target variable if needed
y = df['Deppresive_Dissorder_Yes']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Saving the preprocessed data to new CSV files
X_train.to_csv('C:/Users/Raheb/Desktop/resume files/X_train.csv', index=False)
X_test.to_csv('C:/Users/Raheb/Desktop/resume files/X_test.csv', index=False)
y_train.to_csv('C:/Users/Raheb/Desktop/resume files/y_train.csv', index=False)
y_test.to_csv('C:/Users/Raheb/Desktop/resume files/y_test.csv', index=False)

In [11]:
# Loading the preprocessed data
X_train = pd.read_csv('C:/Users/Raheb/Desktop/resume files/X_train.csv')
X_test = pd.read_csv('C:/Users/Raheb/Desktop/resume files/X_test.csv')
y_train = pd.read_csv('C:/Users/Raheb/Desktop/resume files/y_train.csv').values.ravel()  # Converting to 1D array
y_test = pd.read_csv('C:/Users/Raheb/Desktop/resume files/y_test.csv').values.ravel()  # Converting to 1D array

In [12]:
# Initializing models: Logistic Regression:
log_reg = LogisticRegression(max_iter=1000)

# Training and evaluating Logistic Regression
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

Logistic Regression:
Accuracy: 0.8300059907143927
Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.95      0.90    105953
        True       0.67      0.35      0.46     27587

    accuracy                           0.83    133540
   macro avg       0.76      0.65      0.68    133540
weighted avg       0.81      0.83      0.81    133540

Confusion Matrix:
[[101147   4806]
 [ 17895   9692]]


In [13]:
# Random Forest:
rand_forest = RandomForestClassifier()

# Training and evaluating Random Forest
rand_forest.fit(X_train, y_train)
y_pred_rand_forest = rand_forest.predict(X_test)
print("\nRandom Forest:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rand_forest)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rand_forest))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rand_forest))


Random Forest:
Accuracy: 0.8293470121311967
Classification Report:
              precision    recall  f1-score   support

       False       0.84      0.97      0.90    105953
        True       0.71      0.30      0.42     27587

    accuracy                           0.83    133540
   macro avg       0.77      0.63      0.66    133540
weighted avg       0.81      0.83      0.80    133540

Confusion Matrix:
[[102535   3418]
 [ 19371   8216]]


In [14]:
# Gradient Boosting
grad_boost = GradientBoostingClassifier()

# Training and evaluating Gradient Boosting Classifier
grad_boost.fit(X_train, y_train)
y_pred_grad_boost = grad_boost.predict(X_test)
print("\nGradient Boosting Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_grad_boost)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_grad_boost))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_grad_boost))


Gradient Boosting Classifier:
Accuracy: 0.832522090759323
Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.95      0.90    105953
        True       0.67      0.38      0.48     27587

    accuracy                           0.83    133540
   macro avg       0.76      0.67      0.69    133540
weighted avg       0.82      0.83      0.81    133540

Confusion Matrix:
[[100678   5275]
 [ 17090  10497]]


In [None]:
#Support Vector Machine
svm_model = SVC(probability=True)

# Training and evaluating Support Vector Machine (SVM)   
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\nSupport Vector Machine (SVM):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))