In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# remove warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the stress-level survey CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/student-stress-factors-a-comprehensive-analysis/StressLevelDataset.csv"
myd = pd.read_csv(DATA_PATH)

# Preview the first five rows to sanity-check the load.
myd.head()

In [None]:
# Count the missing entries in each column.
missing_per_column = myd.isna().sum()
print(missing_per_column)

In [None]:
# Bar charts (value counts) for every column, laid out on a 3-wide grid.
# The number of rows is derived from the column count so that plt.subplot
# never receives an index beyond the grid when the frame has more than
# 21 columns.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(myd.columns) // n_grid_cols))  # ceiling division

# 20 inches for 7 rows; scale the height with the number of rows.
plt.figure(figsize=(12, 20 * n_grid_rows / 7))

for position, column in enumerate(myd.columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.countplot(x=column, data=myd, palette='Blues_d')
    plt.xticks(rotation=45)
    plt.title(column)

plt.tight_layout()
plt.show()

In [None]:
# Horizontal box plots of every column, drawn on one shared axis.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=myd, orient='h', color='silver', ax=ax)
ax.set_title('Boxplot of all columns')
plt.show()

In [None]:
# Relative frequency (in percent) of each mental_health_history value;
# the entry labelled 1 is the one reported below.
mental_health_history = myd['mental_health_history'].value_counts(normalize=True).mul(100)
print('Percentage of students with mental health history:', mental_health_history.loc[1])

In [None]:
# Annotated heat map of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(myd.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Correlation of every feature with stress_level, largest value first.
correlation = myd.corr()
correlation_stress = (
    correlation['stress_level']
    .sort_values(ascending=False)
    .drop('stress_level')  # remove the target's correlation with itself
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_stress.to_frame(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation of Features with Stress Level')
plt.show()

In [None]:
# Mean of every column within each stress level.
average_stress = myd.groupby('stress_level').agg('mean')
average_stress

In [None]:
# Mean of the stress_level column.
average_stress_level = myd['stress_level'].mean()
print('Average Stress Level:', average_stress_level)

# Pie chart showing how the rows are distributed across stress levels.
stress_level_counts = myd['stress_level'].value_counts()
ax = stress_level_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff', '#99ff99'],
)
ax.set_title('Percentage Breakdown of Stress Levels')
ax.set_ylabel('')  # hide the default y-axis label
plt.show()

In [None]:
# Separate the target column (stress_level) from the feature columns.
y = myd['stress_level']
X = myd.drop(columns='stress_level')

In [None]:
# Standardize every feature; the fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Scree plot: explained-variance ratio of each principal component of the
# standardized features.
pca = PCA().fit(X_scaled)
component_numbers = np.arange(1, pca.explained_variance_ratio_.size + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_numbers, pca.explained_variance_ratio_, marker='o')
ax.set_title('Scree Plot')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Explained Variance')
plt.show()

In [None]:
# Project the standardized features onto the first two principal components
# so the data can be drawn in two dimensions.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the two component scores, coloured by stress level.
pc1_scores, pc2_scores = X_pca[:, 0], X_pca[:, 1]
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=pc1_scores, y=pc2_scores, hue=y, palette='crest', ax=ax)  # alternative palette: cividis
ax.set_title('PCA Representation')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

In [None]:
# Fraction of the total variance captured by each retained component.
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# Combined fraction captured by all retained components together.
total_variance_explained = explained_variance_ratio.sum()
print("Total Variance Explained:", total_variance_explained)

In [None]:
# Component weights of the fitted PCA: one row per component, one column
# per original feature.
loadings = pca.components_

# Label the weights and transpose so each feature becomes a row.
loadings_df = pd.DataFrame(loadings, index=['PC1', 'PC2'], columns=X.columns).T

print("Loadings for all variables in relation to Principal Components:")
print(loadings_df)

In [None]:
# Correlation loadings: component weights scaled by the square root of each
# component's explained variance.
component_scale = np.sqrt(pca.explained_variance_)
correlation_loadings = pca.components_.T * component_scale
correlation_loadings_df = pd.DataFrame(
    correlation_loadings,
    index=X.columns,
    columns=['PC1', 'PC2'],
)

# Loadings whose absolute value does not exceed this threshold are blanked out.
cutoff = 0.5

# Keep only the entries above the cutoff; features with no such entry on
# either component are dropped, and the remaining gaps are shown as ''.
exceeds_cutoff = correlation_loadings_df.abs() > cutoff
significant_correlation_loadings_df = (
    correlation_loadings_df[exceeds_cutoff]
    .dropna(how='all')
    .fillna('')
)

print("Significant Correlation Loadings for all variables in relation to Principal Components:")
print(significant_correlation_loadings_df)

In [None]:
# Elbow method: fit KMeans on the PCA scores for 1..max_clusters clusters and
# record the within-cluster sum of squares (inertia) of each fit.
max_clusters = 10
cluster_counts = range(1, max_clusters + 1)

wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(X_pca)
    wcss.append(kmeans.inertia_)

# Plot against the same range that drove the loop, so changing max_clusters
# cannot leave the x-axis and wcss with different lengths.
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette method: fit KMeans on the PCA scores for each candidate cluster
# count and record the mean silhouette score of the resulting labelling.
max_clusters = 10
# The silhouette score is only defined for two or more clusters.
candidate_cluster_counts = list(range(2, max_clusters + 1))

silhouette_scores = []
for n_clusters in candidate_cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    cluster_labels = kmeans.fit_predict(X_pca)

    silhouette_avg = silhouette_score(X_pca, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot the score obtained for each candidate cluster count.
plt.figure(figsize=(10, 6))
plt.plot(candidate_cluster_counts, silhouette_scores, marker='o', linestyle='-')
plt.title('Silhouette Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Look the best count up in the candidate list instead of adding a hard-coded
# offset to the argmax position.
optimal_num_clusters = candidate_cluster_counts[int(np.argmax(silhouette_scores))]
print("Optimal number of clusters:", optimal_num_clusters)

In [None]:
# Fit a 4-cluster KMeans model on the two PCA scores and keep the label
# assigned to each row.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
cluster_labels = kmeans.fit_predict(X_pca)

# Copy of the original frame with the cluster label attached as a new column.
clustered_data = myd.assign(cluster=cluster_labels)

# Scatter of the PCA scores, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='crest', s=50, alpha=0.5)
ax.set_title('KMeans Clustering')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

In [None]:
# Mean of every column within each cluster.
cluster_means = clustered_data.groupby('cluster').mean()
cluster_means

In [None]:
# Number of rows assigned to each cluster.
cluster_column = clustered_data['cluster']
cluster_sizes = cluster_column.value_counts()
cluster_sizes

In [None]:
# Decision Tree Classifier
# Split the raw features first and stratify on the target, so the train and
# test sets keep the same class proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Splitting an array that was already scaled on the full dataset would let
# test-set statistics leak into the preprocessing.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))