In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, zscore
from statsmodels.stats.weightstats import ztest
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("/content/Student_Mental_Stress_and_Coping_Mechanisms.csv")
df.drop(columns=["Student ID"], inplace=True)
le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])
df['Stress Binary'] = (df['Mental Stress Level'] >= 6).astype(int)
X = df.drop(columns=['Mental Stress Level', 'Stress Binary'])
y_binary = df['Stress Binary']
y_regression = df['Mental Stress Level']
X_train, X_test, y_train_bin, y_test_bin = train_test_split(X, y_binary, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train_bin)
y_pred_log = log_reg.predict(X_test_scaled)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train_bin)
y_pred_rf = rf.predict(X_test)
svm = SVC()
svm.fit(X_train_scaled, y_train_bin)
y_pred_svm = svm.predict(X_test_scaled)
def evaluate(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred)
    }
results = {
    'Logistic Regression': evaluate(y_test_bin, y_pred_log),
    'Random Forest': evaluate(y_test_bin, y_pred_rf),
    'SVM': evaluate(y_test_bin, y_pred_svm)
}
print("\nModel Performance:")
print(pd.DataFrame(results).T.round(3))
importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(3)
print("\nTop 3 Important Features Influencing Stress:")
print(top_features)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print("\nExplained Variance Ratio by PCA components:", pca.explained_variance_ratio_)
t_stat, p_val = ttest_1samp(df['Mental Stress Level'], popmean=5)
print("\nT-Test Results:")
print(f"T-statistic = {t_stat:.3f}, P-value = {p_val:.3f}")
if p_val < 0.05:
    print("→ Reject null hypothesis: Stress levels are significantly different from 5.")
else:
    print("→ Fail to reject null: No significant difference from 5.")
group_yes = df[df['Counseling Attendance'] == 1]['Study Hours Per Week']
group_no = df[df['Counseling Attendance'] == 0]['Study Hours Per Week']
z_stat, p_val_z = ztest(group_yes, group_no)
print("\nZ-Test Results:")
print(f"Z-statistic = {z_stat:.3f}, P-value = {p_val_z:.3f}")
if p_val_z < 0.05:
    print("→ Significant difference in study hours between groups.")
else:
    print("→ No significant difference in study hours.")
contingency = pd.crosstab(df['Gender'], df['Counseling Attendance'])
chi2, p_chi, _, _ = chi2_contingency(contingency)
print("\nChi-square Test Results (Gender vs Counseling Attendance):")
print(f"Chi2 = {chi2:.3f}, P-value = {p_chi:.3f}")
if p_chi < 0.05:
    print("→ Significant relationship between gender and counseling attendance.")
else:
    print("→ No significant relationship between gender and counseling attendance.")


Model Performance:
                     Accuracy  Precision  Recall  F1 Score
Logistic Regression     0.531      0.533   0.425     0.473
Random Forest           0.566      0.581   0.442     0.503
SVM                     0.535      0.538   0.442     0.485

Top 3 Important Features Influencing Stress:
Study Hours Per Week                  0.119949
Physical Exercise (Hours per week)    0.083990
Age                                   0.077764
dtype: float64

Explained Variance Ratio by PCA components: [0.83434985 0.03728499]

T-Test Results:
T-statistic = 3.449, P-value = 0.001
→ Reject null hypothesis: Stress levels are significantly different from 5.

Z-Test Results:
Z-statistic = 1.176, P-value = 0.240
→ No significant difference in study hours.

Chi-square Test Results (Gender vs Counseling Attendance):
Chi2 = 8.910, P-value = 0.259
→ No significant relationship between gender and counseling attendance.
