# Data Loading and Preprocessing

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides scikit-learn warnings (e.g. convergence or
# undefined-metric warnings) that may be relevant to the model cells below.
warnings.filterwarnings("ignore")

In [None]:
# Read the train / validation / test splits from the working directory.
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

# Stack train and validation rows (train first, fresh 0..n-1 index) so both
# splits go through the same preprocessing steps.
data = pd.concat((train_data, val_data), ignore_index=True)

# Preview the first rows of the combined frame.
data.head()


In [None]:
# Number of rows per distinct Dominant_Emotion value in the combined frame.
data['Dominant_Emotion'].value_counts()

In [None]:
# Descriptive statistics from DataFrame.describe() for the combined train+validation frame.
print("Summary statistics of the dataset:")
display(data.describe())

In [None]:
# Per-column count of missing values in the combined frame.
nan_counts = data.isna().sum()
print("Count of NaN values in each column:", nan_counts, sep="\n")

In [None]:
# Platform frequencies while the column still holds its raw values
# (the next cell replaces them with LabelEncoder integer codes).
data['Platform'].value_counts()

In [None]:
# One LabelEncoder per categorical column, kept in a dict keyed by column
# name so the fitted encoders can be looked up later.
label_encoders = {column: LabelEncoder() for column in ['Gender', 'Platform']}

# Fit each encoder on its column and overwrite the column with integer codes.
for column, encoder in label_encoders.items():
    data[column] = encoder.fit_transform(data[column])




In [None]:
# Platform frequencies again, now that the column holds LabelEncoder integer codes.
data['Platform'].value_counts()

In [None]:
# First ten rows of the combined frame after categorical encoding.
data.head(10)

In [None]:
# Column names of the combined frame (used below to pick the numeric features).
data.columns

In [None]:
# Rescale the numeric features to the [0, 1] range with a MinMaxScaler
# fitted on the combined train+validation frame.
numeric_columns = [
    'Age',
    'Daily_Usage_Time (minutes)',
    'Posts_Per_Day',
    'Likes_Received_Per_Day',
    'Comments_Received_Per_Day',
    'Messages_Sent_Per_Day',
]
scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])


In [None]:
# Undo the earlier concat: the first len(train_data) rows are the training
# split and the remaining rows are the validation split.
n_train_rows = len(train_data)
train_data, val_data = data.iloc[:n_train_rows], data.iloc[n_train_rows:]

# Features are every column except the identifier and the target.
non_feature_columns = ['User_ID', 'Dominant_Emotion']
X_train, y_train = train_data.drop(columns=non_feature_columns), train_data['Dominant_Emotion']
X_val, y_val = val_data.drop(columns=non_feature_columns), val_data['Dominant_Emotion']


# Visualizations

In [None]:
# Pairwise correlation between the training features, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(X_train.corr(), annot=True, cmap='Blues', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Scaled daily usage time on the x axis against the emotion label on the
# y axis, with one colour per emotion.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Daily_Usage_Time (minutes)',
    y='Dominant_Emotion',
    hue='Dominant_Emotion',
    ax=ax,
)
ax.set_title('Daily Usage Time vs. Dominant Emotion')
plt.show()


In [None]:
# One 30-bin histogram per usage column of the combined frame.
usage_columns = [
    'Daily_Usage_Time (minutes)',
    'Posts_Per_Day',
    'Likes_Received_Per_Day',
    'Comments_Received_Per_Day',
    'Messages_Sent_Per_Day',
]
data[usage_columns].hist(bins=30, figsize=(15, 10))
plt.suptitle('Distribution of Social Media Usage and Mental Health Indicators')
plt.show()


In [None]:
# Class counts of the target while it still holds the raw labels
# (it is label-encoded in the next cell).
data['Dominant_Emotion'].value_counts()

In [None]:
# Learn the emotion -> integer id mapping from the combined frame, then
# replace the target column with those ids. The fitted encoder is kept so
# the same mapping can be applied to y_train / y_val and reversed for display.
label_encoder_emotion = LabelEncoder().fit(data['Dominant_Emotion'])
data['Dominant_Emotion'] = label_encoder_emotion.transform(data['Dominant_Emotion'])

In [None]:
# Same class counts after encoding: the index now shows integer class ids.
data['Dominant_Emotion'].value_counts()

In [None]:
# y_train / y_val were sliced off before the target column was encoded, so
# map them to integer ids with the same fitted encoder.
y_train, y_val = (
    label_encoder_emotion.transform(raw_labels) for raw_labels in (y_train, y_val)
)


In [None]:
# Correlation of every column with the encoded target, strongest positive first.
correlation_matrix = data.corr()
emotion_correlations = correlation_matrix['Dominant_Emotion'].sort_values(ascending=False)
print(emotion_correlations)


In [None]:
# Encoded validation labels (output of label_encoder_emotion.transform).
y_val

# Models In Question

## Linear Regression

In [None]:
# Baseline: ordinary least squares on the encoded emotion id, using every
# feature column in X_train.
lin_reg_gender = LinearRegression().fit(X_train, y_train)

# Continuous predictions for both splits (rounded to class ids when scored).
y_pred_train, y_pred_val = (
    lin_reg_gender.predict(features) for features in (X_train, X_val)
)

In [None]:
# Round the regression outputs to the nearest integer so they can be scored
# as class ids with the classification metrics.
rounded_train = np.round(y_pred_train)
rounded_val = np.round(y_pred_val)
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train = accuracy_score(y_train, rounded_train)
precision_train, recall_train, f1_train = (
    metric(y_train, rounded_train, average='weighted') for metric in weighted_metrics
)

accuracy_val = accuracy_score(y_val, rounded_val)
precision_val, recall_val, f1_val = (
    metric(y_val, rounded_val, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics:", (accuracy_train, precision_train, recall_train, f1_train)),
    ("Validation set metrics:", (accuracy_val, precision_val, recall_val, f1_val)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## Multivariate Linear Regression

In [None]:
## Multivariate regression
# Ordinary least squares on all feature columns, treating the encoded
# emotion id as a numeric target.
lin_reg_multi = LinearRegression()
lin_reg_multi.fit(X_train, y_train)

# Make predictions (continuous values; rounded to class ids wherever they are scored)
y_pred_train_multi = lin_reg_multi.predict(X_train)
y_pred_val_multi = lin_reg_multi.predict(X_val)

# Confusion matrix built from THIS model's rounded validation predictions,
# so the cell does not depend on variables created later in the notebook.
# `labels=` pins the matrix to one row/column per known emotion class; rounded
# outputs outside the valid id range are left out instead of changing its shape.
class_ids = np.arange(len(label_encoder_emotion.classes_))
y_pred_val_multi_labels = np.round(y_pred_val_multi).astype(int)
cm_multi = confusion_matrix(y_val, y_pred_val_multi_labels, labels=class_ids)
cm_multi_df = pd.DataFrame(cm_multi, index = label_encoder_emotion.classes_, columns = label_encoder_emotion.classes_)

plot = sns.heatmap(cm_multi_df, annot=True, cmap='Blues')

In [None]:
# Round the regression outputs to the nearest integer so they can be scored
# as class ids with the classification metrics.
rounded_train_multi = np.round(y_pred_train_multi)
rounded_val_multi = np.round(y_pred_val_multi)
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train = accuracy_score(y_train, rounded_train_multi)
precision_train, recall_train, f1_train = (
    metric(y_train, rounded_train_multi, average='weighted') for metric in weighted_metrics
)

accuracy_val = accuracy_score(y_val, rounded_val_multi)
precision_val, recall_val, f1_val = (
    metric(y_val, rounded_val_multi, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics:", (accuracy_train, precision_train, recall_train, f1_train)),
    ("Validation set metrics:", (accuracy_val, precision_val, recall_val, f1_val)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## Polynomial Regression

In [None]:
# Polynomial regression: degree-3 feature expansion feeding ordinary least
# squares, wrapped in one pipeline so the expansion is applied at predict time too.
degree = 3
poly = PolynomialFeatures(degree=degree)
lin_reg_poly = make_pipeline(poly, LinearRegression()).fit(X_train, y_train)

# Continuous predictions for both splits.
y_pred_train_poly, y_pred_val_poly = (
    lin_reg_poly.predict(features) for features in (X_train, X_val)
)

In [None]:
# Score the rounded polynomial-regression outputs as class ids.
rounded_train_poly = np.round(y_pred_train_poly)
rounded_val_poly = np.round(y_pred_val_poly)
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_poly = accuracy_score(y_train, rounded_train_poly)
precision_train_poly, recall_train_poly, f1_train_poly = (
    metric(y_train, rounded_train_poly, average='weighted') for metric in weighted_metrics
)

accuracy_val_poly = accuracy_score(y_val, rounded_val_poly)
precision_val_poly, recall_val_poly, f1_val_poly = (
    metric(y_val, rounded_val_poly, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (Polynomial Regression):",
     (accuracy_train_poly, precision_train_poly, recall_train_poly, f1_train_poly)),
    ("Validation set metrics (Polynomial Regression):",
     (accuracy_val_poly, precision_val_poly, recall_val_poly, f1_val_poly)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## Logistic Regression

In [None]:
# Logistic regression classifier, allowed up to 1000 solver iterations.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_log, y_pred_val_log = (
    log_reg.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_log = confusion_matrix(y_val, y_pred_val_log)
cm_log_df = pd.DataFrame(cm_log, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_log_df, annot=True, cmap='Blues')

In [None]:
# Accuracy plus weighted precision / recall / F1 for logistic regression.
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_log = accuracy_score(y_train, y_pred_train_log)
precision_train_log, recall_train_log, f1_train_log = (
    metric(y_train, y_pred_train_log, average='weighted') for metric in weighted_metrics
)

accuracy_val_log = accuracy_score(y_val, y_pred_val_log)
precision_val_log, recall_val_log, f1_val_log = (
    metric(y_val, y_pred_val_log, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (Logistic Regression):",
     (accuracy_train_log, precision_train_log, recall_train_log, f1_train_log)),
    ("Validation set metrics (Logistic Regression):",
     (accuracy_val_log, precision_val_log, recall_val_log, f1_val_log)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## KNN (Euclidean Distance)

In [None]:
# 5-nearest-neighbours classifier using Euclidean distance.
knn_euclidean = KNeighborsClassifier(n_neighbors=5, metric='euclidean').fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_knn_euclidean, y_pred_val_knn_euclidean = (
    knn_euclidean.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_knn_euclidean = confusion_matrix(y_val, y_pred_val_knn_euclidean)
cm_knn_euclidean_df = pd.DataFrame(cm_knn_euclidean, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_knn_euclidean_df, annot=True, cmap='Blues')


In [None]:
# Accuracy plus weighted precision / recall / F1 for Euclidean KNN.
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_knn_euclidean = accuracy_score(y_train, y_pred_train_knn_euclidean)
precision_train_knn_euclidean, recall_train_knn_euclidean, f1_train_knn_euclidean = (
    metric(y_train, y_pred_train_knn_euclidean, average='weighted') for metric in weighted_metrics
)

accuracy_val_knn_euclidean = accuracy_score(y_val, y_pred_val_knn_euclidean)
precision_val_knn_euclidean, recall_val_knn_euclidean, f1_val_knn_euclidean = (
    metric(y_val, y_pred_val_knn_euclidean, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (KNN with Euclidean Distance):",
     (accuracy_train_knn_euclidean, precision_train_knn_euclidean,
      recall_train_knn_euclidean, f1_train_knn_euclidean)),
    ("Validation set metrics (KNN with Euclidean Distance):",
     (accuracy_val_knn_euclidean, precision_val_knn_euclidean,
      recall_val_knn_euclidean, f1_val_knn_euclidean)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## KNN (Manhattan Distance)

In [None]:
# 5-nearest-neighbours classifier using Manhattan (L1) distance.
knn_manhattan = KNeighborsClassifier(n_neighbors=5, metric='manhattan').fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_knn_manhattan, y_pred_val_knn_manhattan = (
    knn_manhattan.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_knn_manhattan = confusion_matrix(y_val, y_pred_val_knn_manhattan)
cm_knn_manhattan_df = pd.DataFrame(cm_knn_manhattan, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_knn_manhattan_df, annot=True, cmap='Blues')

In [None]:
# Accuracy plus weighted precision / recall / F1 for Manhattan KNN.
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_knn_manhattan = accuracy_score(y_train, y_pred_train_knn_manhattan)
precision_train_knn_manhattan, recall_train_knn_manhattan, f1_train_knn_manhattan = (
    metric(y_train, y_pred_train_knn_manhattan, average='weighted') for metric in weighted_metrics
)

accuracy_val_knn_manhattan = accuracy_score(y_val, y_pred_val_knn_manhattan)
precision_val_knn_manhattan, recall_val_knn_manhattan, f1_val_knn_manhattan = (
    metric(y_val, y_pred_val_knn_manhattan, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (KNN with Manhattan Distance):",
     (accuracy_train_knn_manhattan, precision_train_knn_manhattan,
      recall_train_knn_manhattan, f1_train_knn_manhattan)),
    ("Validation set metrics (KNN with Manhattan Distance):",
     (accuracy_val_knn_manhattan, precision_val_knn_manhattan,
      recall_val_knn_manhattan, f1_val_knn_manhattan)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## KNN (Cosine Distance)

In [None]:
# 5-nearest-neighbours classifier using cosine distance.
knn_cosine = KNeighborsClassifier(n_neighbors=5, metric='cosine').fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_knn_cosine, y_pred_val_knn_cosine = (
    knn_cosine.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_knn_cosine = confusion_matrix(y_val, y_pred_val_knn_cosine)
cm_knn_cosine_df = pd.DataFrame(cm_knn_cosine, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_knn_cosine_df, annot=True, cmap='Blues')


In [None]:
# Accuracy plus weighted precision / recall / F1 for cosine KNN.
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_knn_cosine = accuracy_score(y_train, y_pred_train_knn_cosine)
precision_train_knn_cosine, recall_train_knn_cosine, f1_train_knn_cosine = (
    metric(y_train, y_pred_train_knn_cosine, average='weighted') for metric in weighted_metrics
)

accuracy_val_knn_cosine = accuracy_score(y_val, y_pred_val_knn_cosine)
precision_val_knn_cosine, recall_val_knn_cosine, f1_val_knn_cosine = (
    metric(y_val, y_pred_val_knn_cosine, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (KNN with Cosine Distance):",
     (accuracy_train_knn_cosine, precision_train_knn_cosine,
      recall_train_knn_cosine, f1_train_knn_cosine)),
    ("Validation set metrics (KNN with Cosine Distance):",
     (accuracy_val_knn_cosine, precision_val_knn_cosine,
      recall_val_knn_cosine, f1_val_knn_cosine)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## Decision Tree

In [None]:
# Decision Trees
# random_state is fixed (same seed as the Random Forest and MLP cells) so the
# fitted tree, and therefore every metric below, is reproducible on re-run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Make predictions
y_pred_train_dt = dt.predict(X_train)
y_pred_val_dt = dt.predict(X_val)

# display the confusion matrix with the proper labels
cm_dt = confusion_matrix(y_val, y_pred_val_dt)
cm_dt_df = pd.DataFrame(cm_dt, index = label_encoder_emotion.classes_, columns = label_encoder_emotion.classes_)

plot = sns.heatmap(cm_dt_df, annot=True, cmap='Blues')

In [None]:
# Accuracy plus weighted precision / recall / F1 for the decision tree.
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_dt = accuracy_score(y_train, y_pred_train_dt)
precision_train_dt, recall_train_dt, f1_train_dt = (
    metric(y_train, y_pred_train_dt, average='weighted') for metric in weighted_metrics
)

accuracy_val_dt = accuracy_score(y_val, y_pred_val_dt)
precision_val_dt, recall_val_dt, f1_val_dt = (
    metric(y_val, y_pred_val_dt, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (Decision Trees):",
     (accuracy_train_dt, precision_train_dt, recall_train_dt, f1_train_dt)),
    ("Validation set metrics (Decision Trees):",
     (accuracy_val_dt, precision_val_dt, recall_val_dt, f1_val_dt)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## Random Forest

In [None]:
# Random forest of 100 trees with a fixed seed for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_rf, y_pred_val_rf = (
    rf.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_rf = confusion_matrix(y_val, y_pred_val_rf)
cm_rf_df = pd.DataFrame(cm_rf, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_rf_df, annot=True, cmap='Blues')


In [None]:
# Accuracy plus weighted precision / recall / F1 for the random forest.
weighted_metrics = (precision_score, recall_score, f1_score)

accuracy_train_rf = accuracy_score(y_train, y_pred_train_rf)
precision_train_rf, recall_train_rf, f1_train_rf = (
    metric(y_train, y_pred_train_rf, average='weighted') for metric in weighted_metrics
)

accuracy_val_rf = accuracy_score(y_val, y_pred_val_rf)
precision_val_rf, recall_val_rf, f1_val_rf = (
    metric(y_val, y_pred_val_rf, average='weighted') for metric in weighted_metrics
)

# Print both reports with four decimals per metric.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
for heading, values in (
    ("Training set metrics (Random Forest):",
     (accuracy_train_rf, precision_train_rf, recall_train_rf, f1_train_rf)),
    ("Validation set metrics (Random Forest):",
     (accuracy_val_rf, precision_val_rf, recall_val_rf, f1_val_rf)),
):
    print(heading)
    for metric_label, value in zip(metric_labels, values):
        print(f'{metric_label}: {value:.4f}')

## Support Vector Machine

In [None]:
# Support vector classifier (gamma='auto') behind a StandardScaler, bundled
# in one pipeline so the standardisation is applied at predict time too.
svm = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_svm, y_pred_val_svm = (
    svm.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_svm = confusion_matrix(y_val, y_pred_val_svm)
cm_svm_df = pd.DataFrame(cm_svm, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_svm_df, annot=True, cmap='Blues')

In [None]:
# Compute evaluation metrics for the SVM pipeline
# (accuracy plus support-weighted precision / recall / F1 on both splits).
accuracy_train_svm = accuracy_score(y_train, y_pred_train_svm)
precision_train_svm = precision_score(y_train, y_pred_train_svm, average='weighted')
recall_train_svm = recall_score(y_train, y_pred_train_svm, average='weighted')
f1_train_svm = f1_score(y_train, y_pred_train_svm, average='weighted')

accuracy_val_svm = accuracy_score(y_val, y_pred_val_svm)
precision_val_svm = precision_score(y_val, y_pred_val_svm, average='weighted')
recall_val_svm = recall_score(y_val, y_pred_val_svm, average='weighted')
f1_val_svm = f1_score(y_val, y_pred_val_svm, average='weighted')

# The headings name the model these numbers belong to (SVM, not Random Forest).
print("Training set metrics (Support Vector Machine):")
print(f'Accuracy: {accuracy_train_svm:.4f}')
print(f'Precision: {precision_train_svm:.4f}')
print(f'Recall: {recall_train_svm:.4f}')
print(f'F1 Score: {f1_train_svm:.4f}')

print("Validation set metrics (Support Vector Machine):")
print(f'Accuracy: {accuracy_val_svm:.4f}')
print(f'Precision: {precision_val_svm:.4f}')
print(f'Recall: {recall_val_svm:.4f}')
print(f'F1 Score: {f1_val_svm:.4f}')

## Multi-Layer Perceptron Neural Network

In [None]:
# Multi-layer perceptron: four hidden layers (128, 64, 32, 32), L-BFGS
# solver, L2 penalty alpha=1e-5, fixed seed for reproducibility.
mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(128, 64, 32, 32),
    random_state=42,
).fit(X_train, y_train)

# Predicted class ids for both splits.
y_pred_train_mlp, y_pred_val_mlp = (
    mlp.predict(features) for features in (X_train, X_val)
)

# Validation confusion matrix, labelled with the original emotion names.
emotion_names = label_encoder_emotion.classes_
cm_mlp = confusion_matrix(y_val, y_pred_val_mlp)
cm_mlp_df = pd.DataFrame(cm_mlp, index=emotion_names, columns=emotion_names)

plot = sns.heatmap(cm_mlp_df, annot=True, cmap='Blues')

In [None]:
# Compute evaluation metrics for the MLP classifier
# (accuracy plus support-weighted precision / recall / F1 on both splits).
accuracy_train_mlp = accuracy_score(y_train, y_pred_train_mlp)
precision_train_mlp = precision_score(y_train, y_pred_train_mlp, average='weighted')
recall_train_mlp = recall_score(y_train, y_pred_train_mlp, average='weighted')
f1_train_mlp = f1_score(y_train, y_pred_train_mlp, average='weighted')

accuracy_val_mlp = accuracy_score(y_val, y_pred_val_mlp)
precision_val_mlp = precision_score(y_val, y_pred_val_mlp, average='weighted')
recall_val_mlp = recall_score(y_val, y_pred_val_mlp, average='weighted')
f1_val_mlp = f1_score(y_val, y_pred_val_mlp, average='weighted')

# The headings name the model these numbers belong to (MLP, not Random Forest).
print("Training set metrics (Multi-Layer Perceptron):")
print(f'Accuracy: {accuracy_train_mlp:.4f}')
print(f'Precision: {precision_train_mlp:.4f}')
print(f'Recall: {recall_train_mlp:.4f}')
print(f'F1 Score: {f1_train_mlp:.4f}')

print("Validation set metrics (Multi-Layer Perceptron):")
print(f'Accuracy: {accuracy_val_mlp:.4f}')
print(f'Precision: {precision_val_mlp:.4f}')
print(f'Recall: {recall_val_mlp:.4f}')
print(f'F1 Score: {f1_val_mlp:.4f}')

## Neural Network with Keras

In [None]:
# Disabled Keras experiment: every line below is commented out, so this cell
# runs nothing. It sketches a dense softmax classifier with 6 output units
# trained on one-hot labels.
# NOTE(review): either delete this cell or move it to its own notebook.

# import keras
# from keras.models import Sequential
# from keras.layers import Dense

# model = Sequential([
#     Dense(128, activation="relu"),
#     Dense(64, activation="relu"),
#     Dense(32, activation="relu"),
#     Dense(32, activation="relu"),
#     Dense(6, activation="softmax")
# ])

# model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])

# y_train_categorical = keras.utils.to_categorical(y_train, num_classes=6)
# y_val_categorical = keras.utils.to_categorical(y_val, num_classes=6)

# model.fit(X_train, y_train_categorical, epochs=100, batch_size=32, validation_data=(X_val, y_val_categorical))

# y_pred_train_nn = model.predict(X_train)
# y_pred_val_nn = model.predict(X_val)

# Export Models

In [None]:
import joblib


# Prediction of Dominant Emotion

In [None]:
# Inspect the raw test split before preprocessing it to make predictions.
test_data

In [None]:
#Encode categorical variables
# Reuse the encoders that were fitted on the train+validation data instead of
# fitting new ones on the test split: a fresh fit can assign different integer
# codes to the same category, which would silently feed the models wrong
# inputs. transform() raises ValueError if the test split contains a category
# the encoder never saw, which makes such a mismatch visible.
for column in ['Gender', 'Platform']:
    test_data[column] = label_encoders[column].transform(test_data[column])


In [None]:
#Normalize numeric features
# Apply the scaler that was already fitted on the train+validation data.
# Refitting on the test split would use the test set's own min/max, so the
# same raw value would scale differently than it did during training.
# Test values outside the fitted range simply land slightly outside [0, 1].
numeric_columns = [
    'Age',
    'Daily_Usage_Time (minutes)',
    'Posts_Per_Day',
    'Likes_Received_Per_Day',
    'Comments_Received_Per_Day',
    'Messages_Sent_Per_Day',
]
test_data[numeric_columns] = scaler.transform(test_data[numeric_columns])


In [None]:
# Test features: every column except the identifier and the target.
X_test1 = test_data.drop(columns=['User_ID', 'Dominant_Emotion'])
# Test target, still holding the raw values read from test.csv at this point.
y_test1 = test_data['Dominant_Emotion']

In [None]:
# Run each fitted model on the test features. The names on the left line up
# one-to-one with the models on the right, in the same order.
(
    y_pred_test_linear_reg_gender,
    y_pred_test_multi,
    y_pred_test_poly,
    y_pred_test_log,
    y_pred_test_knn_euclidean,
    y_pred_test_knn_manhattan,
    y_pred_test_knn_cosine,
    y_pred_test_dt,
    y_pred_test_rf,
) = (
    model.predict(X_test1)
    for model in (
        lin_reg_gender,
        lin_reg_multi,
        lin_reg_poly,
        log_reg,
        knn_euclidean,
        knn_manhattan,
        knn_cosine,
        dt,
        rf,
    )
)


In [None]:
# Encode the test labels with the encoder fitted on the training labels, so
# the integer ids are guaranteed to match the ids the models predict. A newly
# fitted encoder only agrees when the test split contains exactly the same set
# of emotions. `label_encoder` is kept as an alias for the fitted encoder.
label_encoder = label_encoder_emotion
y_test1 = label_encoder.transform(y_test1)

In [None]:
# Result columns for the comparison table, filled with one entry per algorithm.
algo_names, accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], [], []

# (display name, test-set predictions). Regression outputs are rounded to the
# nearest integer class id; the logistic-regression labels are also passed
# through np.round, which leaves integer labels unchanged.
test_predictions = (
    ("Linear Regression (Gender)", np.round(y_pred_test_linear_reg_gender)),
    ("Multivariate Linear Regression", np.round(y_pred_test_multi)),
    ("Polynomial Regression", np.round(y_pred_test_poly)),
    ("Logistic Regression", np.round(y_pred_test_log)),
    ("KNN (Euclidean Distance)", y_pred_test_knn_euclidean),
    ("KNN (Manhattan Distance)", y_pred_test_knn_manhattan),
    ("KNN (Cosine Distance)", y_pred_test_knn_cosine),
    ("Decision Trees", y_pred_test_dt),
    ("Random Forest", y_pred_test_rf),
)

weighted_targets = (
    (precision_scores, precision_score),
    (recall_scores, recall_score),
    (f1_scores, f1_score),
)

for algo_name, predicted in test_predictions:
    algo_names.append(algo_name)
    accuracy_scores.append(accuracy_score(y_test1, predicted))
    for score_list, metric in weighted_targets:
        score_list.append(metric(y_test1, predicted, average='weighted'))


In [None]:
# Results of all algorithms on the test data, one row per algorithm
df = pd.DataFrame({
    'Algorithm': algo_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-score': f1_scores
})
# A bare `df` in the middle of a cell is not rendered (only a cell's final
# expression is), so show the table explicitly.
display(df)

# Same numbers as an annotated heatmap, indexed by algorithm name.
plot = sns.heatmap(df.set_index('Algorithm'), annot=True, cmap='Blues')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

# One-vs-rest ROC curves need a continuous score per class. Hard label
# predictions from nine different models cannot be indexed by class, so the
# curves are drawn from the class probabilities of a single classifier
# (the Random Forest fitted above).
roc_model = rf
y_scores = roc_model.predict_proba(X_test1)  # one column per entry of roc_model.classes_

y_test = y_test1
# Binarize the output as we have multiclass data; columns follow roc_model.classes_
y_test_bin = label_binarize(y_test, classes=roc_model.classes_)
if y_test_bin.shape[1] == 1:
    # label_binarize returns a single column for a two-class problem;
    # expand it to one column per class so the indexing below stays uniform.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

n_classes = len(roc_model.classes_)
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    if y_test_bin[:, i].sum() == 0:
        # Class absent from the test labels: its ROC curve is undefined, skip it.
        continue
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves
plt.figure(figsize=(10, 8))
colors = cycle(['blue', 'red', 'green', 'purple', 'orange', 'yellow', 'cyan', 'magenta', 'brown'])
# Emotion names for the legend, decoded from the model's integer class ids.
class_names = label_encoder_emotion.inverse_transform(roc_model.classes_)

for i, color in zip(fpr, colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (AUC = {1:0.2f})'
             ''.format(class_names[i], roc_auc[i]))

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multi-Class')
plt.legend(loc="lower right")
plt.show()


In [None]:
import pandas as pd

def get_user_data():
    """Prompt on stdin for one user's profile and return it as a dict.

    Age is parsed with int() and the five usage figures with float(), so a
    non-numeric answer raises ValueError. Gender and Dominant Emotion are
    stripped and lower-cased; Platform is only stripped.

    Returns:
        dict with the keys 'Age', 'Gender', 'Platform',
        'Daily_Usage_Time (minutes)', 'Posts_Per_Day',
        'Likes_Received_Per_Day', 'Comments_Received_Per_Day',
        'Messages_Sent_Per_Day' and 'Dominant_Emotion', in that order.
    """
    print("Please enter the following information:")

    # Dict values are evaluated top to bottom, so the prompts appear in
    # exactly this order.
    return {
        'Age': int(input("Age: ")),
        'Gender': input("Gender (Male/Female/Other): ").strip().lower(),
        'Platform': input("Platform (e.g., Facebook, Instagram): ").strip(),
        'Daily_Usage_Time (minutes)': float(input("Daily Usage Time (minutes): ")),
        'Posts_Per_Day': float(input("Posts Per Day: ")),
        'Likes_Received_Per_Day': float(input("Likes Received Per Day: ")),
        'Comments_Received_Per_Day': float(input("Comments Received Per Day: ")),
        'Messages_Sent_Per_Day': float(input("Messages Sent Per Day: ")),
        'Dominant_Emotion': input(
            "Dominant Emotion (Neutral/Happiness/Anxiety/Sadness/Boredom/Anger): "
        ).strip().lower(),
    }

# In a notebook kernel __name__ is "__main__", so this block runs whenever the
# cell is executed and blocks on the interactive prompts.
if __name__ == "__main__":
    user_data = get_user_data()


    # One-row DataFrame with one column per collected field.
    user_df = pd.DataFrame([user_data])


In [None]:
# Show the one-row frame built from the user's answers.
user_df

In [None]:
#Encode categorical variables
# A LabelEncoder fitted on this single row would map every answer to 0, so
# look the answers up in the encoders that are already fitted. Matching is
# case- and whitespace-insensitive because get_user_data() lower-cases gender.
for column in ['Gender', 'Platform']:
    encoder = label_encoders[column]
    # LabelEncoder's code for a class is its position in classes_.
    codes_by_name = {
        str(class_name).strip().lower(): code
        for code, class_name in enumerate(encoder.classes_)
    }
    entered = user_df[column].astype(str).str.strip().str.lower()
    unknown = sorted(set(entered) - set(codes_by_name))
    if unknown:
        raise ValueError(
            f"Unknown {column} value(s) {unknown}; expected one of {list(encoder.classes_)}"
        )
    user_df[column] = entered.map(codes_by_name)

In [None]:
#Normalize numeric features
# Scale the user's row (not test_data) with the already-fitted scaler, so the
# values fed to the models are on the same scale as the data they were
# trained and evaluated on. The scaler is not refitted: a MinMaxScaler fitted
# on a single row would map every value to 0.
numeric_columns = [
    'Age',
    'Daily_Usage_Time (minutes)',
    'Posts_Per_Day',
    'Likes_Received_Per_Day',
    'Comments_Received_Per_Day',
    'Messages_Sent_Per_Day',
]
user_df[numeric_columns] = scaler.transform(user_df[numeric_columns])


In [None]:
# Model inputs for the user row: every column except the target.
user_df_X = user_df.drop(columns=['Dominant_Emotion'])
# The emotion the user typed in, kept separately from the features.
user_df_Y = user_df['Dominant_Emotion']

In [None]:
# Predict the user's emotion id with every fitted model.
# NOTE: these assignments reuse (and overwrite) the y_pred_test_* names that
# held the test-set predictions above.

# Regressors return continuous values: round to the nearest class id, as the
# evaluation cells do, instead of truncating toward zero with a bare int cast.
y_pred_test_linear_reg_gender = np.round(lin_reg_gender.predict(user_df_X)).astype(int)
y_pred_test_multi = np.round(lin_reg_multi.predict(user_df_X)).astype(int)
y_pred_test_poly = np.round(lin_reg_poly.predict(user_df_X)).astype(int)

# Classifiers already return class ids; the cast only normalises the dtype.
y_pred_test_log = log_reg.predict(user_df_X).astype(int)
y_pred_test_knn_euclidean = knn_euclidean.predict(user_df_X).astype(int)
y_pred_test_knn_manhattan = knn_manhattan.predict(user_df_X).astype(int)
y_pred_test_knn_cosine = knn_cosine.predict(user_df_X).astype(int)
y_pred_test_dt = dt.predict(user_df_X).astype(int)
y_pred_test_rf = rf.predict(user_df_X).astype(int)


In [None]:
# Integer class id -> emotion name, read from the encoder that produced the
# ids the models were trained on. LabelEncoder stores its classes in sorted
# order, so a hand-written table in any other order decodes predictions to
# the wrong emotion.
emotion_mapping_reverse = {
    class_id: str(emotion_name)
    for class_id, emotion_name in enumerate(label_encoder_emotion.classes_)
}


In [None]:
# Translate each classifier's predicted class ids into emotion names.
# The three regression models are left out of this step.
# NOTE(review): presumably because their integer-cast outputs can fall
# outside the mapping's keys and raise KeyError -- confirm before re-enabling.
to_emotion = emotion_mapping_reverse.__getitem__

y_pred_log_emotions = list(map(to_emotion, y_pred_test_log))
y_pred_knn_euclidean_emotions = list(map(to_emotion, y_pred_test_knn_euclidean))
y_pred_knn_manhattan_emotions = list(map(to_emotion, y_pred_test_knn_manhattan))
y_pred_knn_cosine_emotions = list(map(to_emotion, y_pred_test_knn_cosine))
y_pred_dt_emotions = list(map(to_emotion, y_pred_test_dt))
y_pred_rf_emotions = list(map(to_emotion, y_pred_test_rf))

# Print one line per classifier.
for heading, emotions in (
    ("Predicted Emotions (Logistic Regression):", y_pred_log_emotions),
    ("Predicted Emotions (KNN - Euclidean Distance):", y_pred_knn_euclidean_emotions),
    ("Predicted Emotions (KNN - Manhattan Distance):", y_pred_knn_manhattan_emotions),
    ("Predicted Emotions (KNN - Cosine Distance):", y_pred_knn_cosine_emotions),
    ("Predicted Emotions (Decision Trees):", y_pred_dt_emotions),
    ("Predicted Emotions (Random Forest):", y_pred_rf_emotions),
):
    print(heading, emotions)