In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, chi2_contingency
from statsmodels.stats.weightstats import ztest

In [None]:


# Read the survey CSV and discard the identifier column, which this analysis
# treats as irrelevant.
df = pd.read_csv("/content/Student_Mental_Stress_and_Coping_Mechanisms.csv").drop(
    columns=['Student ID']
)

# Replace the values of each categorical column with integer codes.
# A single encoder is refit for every column, so once the loop finishes it
# only remembers the classes of the last column processed.
encoder = LabelEncoder()
categorical_cols = [
    'Gender',
    'Counseling Attendance',
    'Stress Coping Mechanisms',
    'Family Mental Health History',
    'Medical Condition',
]
for column_name in categorical_cols:
    df[column_name] = encoder.fit_transform(df[column_name])

# Separate the target column from the remaining feature columns
y = df['Mental Stress Level']
X = df.drop(columns=[y.name])


In [None]:

# Binarize the target: 1 when the stress level is strictly above the median
# (High), 0 otherwise (Low).
y_binary = y.gt(y.median()).astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted transformation is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Models
# Candidate classifiers, keyed by the label used in the results report.
# The forest is seeded (same seed as the train/test split) because its
# bootstrap sampling and feature sub-sampling are random; without a seed the
# reported metrics change on every run of the notebook.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

# Evaluate Models
# Fit each model on the standardized training data and score its predictions
# on the held-out test set. `results` maps model label -> {metric name: value}.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-score': f1_score(y_test, y_pred)
    }

# Feature Importance from Random Forest
# Reuse the forest that was already fitted and evaluated in the model loop
# instead of training a second, independently randomized one: the importances
# then describe the same model whose metrics are reported, and the redundant
# fit is avoided.
rf_model = models['Random Forest']
# Label each importance with its feature name and keep the three largest
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns).nlargest(3)

# Apply PCA: fit a two-component PCA on the standardized training features and
# keep the projected training data in X_pca.
# NOTE(review): X_pca is not referenced again in the visible part of this
# notebook — confirm whether a later cell uses it.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_scaled)

# One-sample t-test of the raw (non-binarized) stress levels against a
# hypothesized population mean of 5
t_stat, p_value_ttest = ttest_1samp(y, popmean=5)

# Two-sample z-test comparing weekly study hours between the rows whose encoded
# counseling attendance is 1 and those where it is 0
study_hours = df['Study Hours Per Week']
attendance = df['Counseling Attendance']
group1 = study_hours[attendance == 1]
group2 = study_hours[attendance == 0]
z_stat, p_value_ztest = ztest(x1=group1, x2=group2)

# Chi-square test of independence between gender and counseling attendance,
# computed from their cross-tabulated counts; the degrees of freedom and the
# expected frequencies are returned as well but not used
contingency_table = pd.crosstab(index=df['Gender'], columns=attendance)
chi2_stat, p_value_chi, _dof, _expected = chi2_contingency(contingency_table)

# Print Results
# Significance threshold shared by all three hypothesis tests below.
ALPHA = 0.05

print("Model Performance:")
# Iterate with `model_name` (not `model`) so this loop does not overwrite the
# estimator variable left by the training loop with a plain string.
for model_name, metrics in results.items():
    print(f"{model_name}: {metrics}")

print("\nTop 3 Important Features:")
print(feature_importance)

# Every test is reported the same way: its p-value, then the verdict at ALPHA.
for test_label, p_value in (
    ("T-test", p_value_ttest),
    ("Z-test", p_value_ztest),
    ("Chi-square", p_value_chi),
):
    print(f"\n{test_label} p-value:", p_value)
    print("Interpretation:", "Significant" if p_value < ALPHA else "Not Significant")


Model Performance:
Logistic Regression: {'Accuracy': 0.48026315789473684, 'Precision': 0.43283582089552236, 'Recall': 0.4142857142857143, 'F1-score': 0.4233576642335766}
Random Forest: {'Accuracy': 0.5328947368421053, 'Precision': 0.49206349206349204, 'Recall': 0.44285714285714284, 'F1-score': 0.46616541353383456}
SVM: {'Accuracy': 0.5263157894736842, 'Precision': 0.48214285714285715, 'Recall': 0.38571428571428573, 'F1-score': 0.42857142857142855}

Top 3 Important Features:
Study Hours Per Week                  0.126360
Physical Exercise (Hours per week)    0.082319
Social Media Usage (Hours per day)    0.076869
dtype: float64

T-test p-value: 0.0005939960520346076
Interpretation: Significant

Z-test p-value: 0.2395012959973023
Interpretation: Not Significant

Chi-square p-value: 0.25916012793906545
Interpretation: Not Significant
