In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
import warnings

# Set the OMP_NUM_THREADS environment variable to "16" for this process.
# NOTE(review): this assignment runs after scikit-learn has already been imported above;
# OpenMP runtimes commonly read this variable when they are first loaded, so the setting
# may need to move above the sklearn import to take effect — TODO confirm.
os.environ["OMP_NUM_THREADS"] = "16"
# Ignore every warning of category UserWarning from this point on.
# NOTE(review): the filter is not restricted to a module or message, so unrelated
# UserWarnings are hidden as well — consider narrowing it.
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Load the questionnaire answers from the CSV file into a DataFrame
answers_df = pd.read_csv(filepath_or_buffer="./Resources/mental_health_finaldata_1.csv")
# Preview the first five rows
answers_df.head(5)

In [None]:
# Label-encode the Age bands: each distinct band is replaced by an integer category code.
# The categorical Series is kept in `age_categories` so the codes can be mapped back later.
age_categories = answers_df["Age"].astype("category")
# Show the categorical view of the column to get familiar with the data
print(age_categories)
# Pull out the integer code of every row's category and overwrite the Age column with it
age_codes = age_categories.cat.codes
answers_df["Age"] = age_codes
print(answers_df["Age"])

In [None]:
# Label-encode the Gender, Occupation and Days Indoors columns in the same way as Age.
# Each categorical Series is kept under its own name so its codes can be traced back to labels.
gender_categories, occupation_categories, days_categories = (
    answers_df[column].astype("category")
    for column in ("Gender", "Occupation", "Days Indoors")
)
# Overwrite each text column with the integer codes of its categories
answers_df["Gender"] = gender_categories.cat.codes
answers_df["Occupation"] = occupation_categories.cat.codes
answers_df["Days Indoors"] = days_categories.cat.codes
answers_df.head()

In [None]:
# One-hot encode the remaining columns so they become numerical data.
# Columns to expand into indicator columns:
cols = [
    "Growing Stress",
    "Quarantine Frustrations",
    "Changes Habits",
    "Mental Health History",
    "Weight Change",
    "Mood Swings",
    "Coping Struggles",
    "Work Interest",
    "Social Weakness",
]
# get_dummies replaces each listed column with one indicator column per distinct value
answers_df = pd.get_dummies(data=answers_df, columns=cols)
answers_df.head()

In [None]:
# Inertia recorded for each candidate number of clusters
inertia = []

# Candidate cluster counts: 1 through 10
k = [*range(1, 11)]

# Fit one K-means model per candidate k on the encoded answers_df DataFrame and
# record the inertia reported by the fitted model's `inertia_` attribute.
# `k_model` is left bound to the last model fitted.
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=2).fit(answers_df)
    inertia.append(k_model.inertia_)

In [None]:
# Pair every evaluated k with its inertia in a two-column DataFrame
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

In [None]:
# Elbow curve for the unscaled data: inertia against k, with one x tick per evaluated k
df_elbow.plot(kind="line", x="k", y="inertia", title="Elbow Curve", xticks=k)

In [None]:
# All the other columns are 0s and 1s. We should try scaling the first four columns to see if it improves the model.

# create a labelling function
def label(col, df=None):
    """Label-encode one column in place with its integer category codes.

    The column is converted to a pandas categorical and overwritten with the
    resulting ``.cat.codes`` values.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``answers_df`` is
        used, which is what the original one-argument calls rely on.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Fall back to the global frame only when no explicit frame was supplied
    target = answers_df if df is None else df
    target[col] = target[col].astype('category').cat.codes

# Start over from the original questionnaire answers so the label-encoded columns can be scaled
answers_df = pd.read_csv("./Resources/mental_health_finaldata_1.csv")

# Columns that are label-encoded and then standardised
cols_to_scale = ["Age", "Gender", "Occupation", "Days Indoors"]
# Columns left unscaled; they are one-hot encoded at the end of this cell
cols_to_leave = [
    "Growing Stress",
    "Quarantine Frustrations",
    "Changes Habits",
    "Mental Health History",
    "Weight Change",
    "Mood Swings",
    "Coping Struggles",
    "Work Interest",
    "Social Weakness",
]

# Label-encode the columns to be scaled (label() rewrites the global answers_df in place)
label("Age")
label("Gender")
label("Occupation")
label("Days Indoors")

# Standardise the label-encoded columns
scaler = StandardScaler()
answers_scaled = scaler.fit_transform(answers_df[cols_to_scale])
# fit_transform returns a bare array, so rebuild a DataFrame around it. Reuse the index of
# answers_df: the join below aligns rows on the index, and a fresh default index would only
# line up by coincidence.
answers_scaled_df = pd.DataFrame(
    answers_scaled,
    columns=cols_to_scale,
    index=answers_df.index,
)
# Attach the unscaled answer columns to the scaled ones, row for row
answers_result_df = answers_scaled_df.join(answers_df[cols_to_leave])

# One-hot encode the unscaled answer columns
answers_result_df = pd.get_dummies(answers_result_df, columns=cols_to_leave)
answers_result_df.head()


In [None]:
# Repeat the K-means sweep, this time on the scaled data
# Inertia recorded for each candidate number of clusters
inertia = []

# Candidate cluster counts: 1 through 10
k = [*range(1, 11)]

# Fit one K-means model per candidate k on the answers_result_df DataFrame and
# record the inertia reported by the fitted model's `inertia_` attribute.
# `k_model` is left bound to the last model fitted.
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=2).fit(answers_result_df)
    inertia.append(k_model.inertia_)

In [None]:
# Pair every evaluated k with the inertia obtained on the scaled data
elbow_data_scaled = dict(k=k, inertia=inertia)
df_elbow_scaled = pd.DataFrame(data=elbow_data_scaled)

# Preview the first rows
df_elbow_scaled.head()

In [None]:
# Elbow curve for the scaled data: inertia against k, with one x tick per evaluated k
df_elbow_scaled.plot(kind="line", x="k", y="inertia", title="Elbow Curve", xticks=k)

In [None]:
# NOTE(review): consider moving this import to the notebook's first cell with the other imports.
import matplotlib.pyplot as plt

# Side-by-side comparison of the elbow curves from the unscaled and the scaled runs.
# The two panels keep independent y axes.
fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(20, 8))
fig.suptitle('Elbow Curves')

# (axes, elbow DataFrame, panel title) for each panel
panels = (
    (ax_original, df_elbow, "Original Data"),
    (ax_scaled, df_elbow_scaled, "Scaled Data"),
)
for ax, elbow_df, panel_title in panels:
    ax.plot(elbow_df["k"], elbow_df["inertia"])
    ax.set_title(panel_title)
    # Label both axes and tick every evaluated k so each panel can be read on its own
    ax.set_xlabel("k")
    ax.set_ylabel("inertia")
    ax.set_xticks(elbow_df["k"])
plt.show()