In [295]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import pickle

In [296]:
# Step 1: Load the dataset
file_path = 'Student_Mental_Health.csv'
data = pd.read_csv(file_path)

In [297]:
# Step 2: Preprocess the Data
# Fill NaN values for numeric columns with the mean
data['What is your CGPA?'] = data['What is your CGPA?'].fillna(data['What is your CGPA?'].mean())


In [298]:
# Encode categorical columns using one-hot encoding
categorical_columns = ['Choose your gender', 'What is your course?', 'Your current year of Study', 'Marital status', 
                       'Do you have Anxiety?', 'Do you have Panic attack?', 'Did you seek any specialist for a treatment?']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)


In [299]:
# Step 3: Define the target variable and features
target_column = 'Do you have Depression?'
X = data_encoded.drop(columns=[target_column, 'Timestamp'])  # Remove the 'Timestamp' column
y = pd.get_dummies(data_encoded[target_column], drop_first=True)  # Encode the target variable ('Yes' as 1)


In [300]:
# Step 4: Handle any remaining NaN values in X
imputer = SimpleImputer(strategy='mean')

# Before imputation, check the number of columns
print("Number of columns before imputation:", X.shape[1])

# Apply imputation
X_imputed = imputer.fit_transform(X)

# Check the shape after imputation
print("Shape of X after imputation:", X_imputed.shape)

# If the number of columns doesn't match, we need to handle this:
if X_imputed.shape[1] != X.shape[1]:
    print("Column mismatch detected. Adjusting the DataFrame accordingly.")
    # Here, we drop the columns from the original X that were all NaN
    X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns[:X_imputed.shape[1]])
else:
    # If no column mismatch, convert back to DataFrame normally
    X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Print the shape of the DataFrame after adjustment
print("Shape of the adjusted DataFrame:", X_imputed_df.shape)

Number of columns before imputation: 13
Shape of X after imputation: (1000, 13)
Shape of the adjusted DataFrame: (1000, 13)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [301]:

column_names = X_imputed_df.columns.tolist()
with open('column_names.pkl', 'wb') as file:
    pickle.dump(column_names, file)

In [302]:
# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed_df, y, test_size=0.3, random_state=42)

# Step 6: Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [303]:
# Step 7: Train Multiple Models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier()
}

results = {}

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    results[model_name] = {
        "Model": model,
        "Accuracy": accuracy,
        "Classification Report": report
    }


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  y = column_or_1d(y, warn=True)
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  model.fit(X_train_scaled, y_train)
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "spars

In [304]:
# Step 8: Evaluate the Models
for model_name, result in results.items():
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {result['Accuracy']:.2f}")
    print("Classification Report:")
    print(result['Classification Report'])


Logistic Regression Results:
Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

       False       0.75      0.74      0.75       112
        True       0.85      0.86      0.85       188

    accuracy                           0.81       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.81      0.81      0.81       300


Random Forest Results:
Accuracy: 0.77
Classification Report:
              precision    recall  f1-score   support

       False       0.68      0.71      0.70       112
        True       0.83      0.80      0.81       188

    accuracy                           0.77       300
   macro avg       0.75      0.76      0.76       300
weighted avg       0.77      0.77      0.77       300


Support Vector Machine Results:
Accuracy: 0.80
Classification Report:
              precision    recall  f1-score   support

       False       0.76      0.69      0.72       112
        True       0.82      0.87   

In [305]:
# Step 10: Select the Best Model
best_model_name = max(results, key=lambda x: results[x]["Accuracy"])
best_model = results[best_model_name]["Model"]

print(f"\nBest Model: {best_model_name} with Accuracy: {results[best_model_name]['Accuracy']:.2f}")

# Step 11: Save the Best Model using Pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# Also, save the scaler since you'll need it to preprocess data before making predictions with the model
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)


Best Model: Logistic Regression with Accuracy: 0.81
