In [1]:
# Silence TensorFlow's native (C++) log output: level 3 filters INFO, WARNING and ERROR.
# This cell runs before the cell that imports tensorflow.
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [2]:
import tensorflow as tf
import pandas as pd
import seaborn as sn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential

In [3]:
# Read the dataset from the current working directory.
dataset_path = os.path.join(os.getcwd(), "StressLevelDataset.csv")
rawdf = pd.read_csv(dataset_path, encoding="utf8")
# Print the schema of the file as loaded (before any column is removed).
rawdf.info()
# see readme on why this is removed
rawdf = rawdf.drop(columns=["teacher_student_relationship", "stress_level"])
# check for null data before continuing with preprocessing
rawdf.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   anxiety_level                 1100 non-null   int64
 1   self_esteem                   1100 non-null   int64
 2   mental_health_history         1100 non-null   int64
 3   depression                    1100 non-null   int64
 4   headache                      1100 non-null   int64
 5   blood_pressure                1100 non-null   int64
 6   sleep_quality                 1100 non-null   int64
 7   breathing_problem             1100 non-null   int64
 8   noise_level                   1100 non-null   int64
 9   living_conditions             1100 non-null   int64
 10  safety                        1100 non-null   int64
 11  basic_needs                   1100 non-null   int64
 12  academic_performance          1100 non-null   int64
 13  study_load                    110

anxiety_level                 0
self_esteem                   0
mental_health_history         0
depression                    0
headache                      0
blood_pressure                0
sleep_quality                 0
breathing_problem             0
noise_level                   0
living_conditions             0
safety                        0
basic_needs                   0
academic_performance          0
study_load                    0
future_career_concerns        0
social_support                0
peer_pressure                 0
extracurricular_activities    0
bullying                      0
dtype: int64

In [4]:
# Target column the model predicts; every remaining column of rawdf is a feature.
main_label = "anxiety_level"
main_features = list(rawdf.columns)
main_features.remove(main_label)

print(f"Label: {main_label}", f"Features: {main_features}", sep="\n")

Label: anxiety_level
Features: ['self_esteem', 'mental_health_history', 'depression', 'headache', 'blood_pressure', 'sleep_quality', 'breathing_problem', 'noise_level', 'living_conditions', 'safety', 'basic_needs', 'academic_performance', 'study_load', 'future_career_concerns', 'social_support', 'peer_pressure', 'extracurricular_activities', 'bullying']


In [5]:
# look at our dataset before choosing type of preprocessing (normalization or standardization)
# (shows the first 10 rows of the raw, unscaled columns)
rawdf.head(10)

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,safety,basic_needs,academic_performance,study_load,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying
0,14,20,0,11,2,1,2,4,2,3,3,2,3,2,3,2,3,3,2
1,15,8,1,15,5,3,1,4,3,1,2,2,1,4,5,1,4,5,5
2,12,18,1,14,2,1,2,2,2,2,3,2,2,3,2,2,3,2,2
3,16,12,1,15,4,3,1,3,4,2,2,2,2,4,4,1,4,4,5
4,16,28,0,7,2,3,5,1,3,2,4,3,4,3,2,1,5,0,5
5,20,13,1,21,3,3,1,4,3,2,2,1,2,5,5,1,4,4,5
6,4,26,0,6,1,2,4,1,1,4,4,4,5,1,1,3,2,2,1
7,17,3,1,22,4,3,1,5,3,1,1,1,1,3,4,1,4,4,5
8,13,22,1,12,3,1,2,4,3,3,3,3,3,3,3,3,3,2,2
9,6,8,0,27,4,3,1,2,0,5,2,2,2,2,5,1,5,3,4


In [6]:
# Range of the raw label; needed later to interpret the model's scaled output.
label_min = rawdf[main_label].min()
label_max = rawdf[main_label].max()
print(f"anxiety level:\n\tmin: {label_min}\n\tmax: {label_max}")

anxiety level:
	min: 0
	max: 21


In [7]:
# Min-max scale every column (the label included) into [0, 1]. The fitted scaler is
# kept so that predictions can be mapped back to the original units later on.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# held-out rows influence the scaling parameters — consider fitting on the training split only.
normalizer = MinMaxScaler(feature_range=(0, 1))
normalizer.fit(rawdf)
dfnormarr = normalizer.transform(rawdf)
newdf = pd.DataFrame(data=dfnormarr, columns=rawdf.columns, index=rawdf.index)

In [8]:
# Preview the scaled frame (default: first five rows) to confirm values now lie in [0, 1].
newdf.head()

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,safety,basic_needs,academic_performance,study_load,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying
0,0.666667,0.666667,0.0,0.407407,0.4,0.0,0.4,0.8,0.4,0.6,0.6,0.4,0.6,0.4,0.6,0.666667,0.6,0.6,0.4
1,0.714286,0.266667,1.0,0.555556,1.0,1.0,0.2,0.8,0.6,0.2,0.4,0.4,0.2,0.8,1.0,0.333333,0.8,1.0,1.0
2,0.571429,0.6,1.0,0.518519,0.4,0.0,0.4,0.4,0.4,0.4,0.6,0.4,0.4,0.6,0.4,0.666667,0.6,0.4,0.4
3,0.761905,0.4,1.0,0.555556,0.8,1.0,0.2,0.6,0.8,0.4,0.4,0.4,0.4,0.8,0.8,0.333333,0.8,0.8,1.0
4,0.761905,0.933333,0.0,0.259259,0.4,1.0,1.0,0.2,0.6,0.4,0.8,0.6,0.8,0.6,0.4,0.333333,1.0,0.0,1.0


In [9]:
# Sanity check on the scaled data: True would mean at least one value is below zero.
has_negative = (newdf.to_numpy() < 0).any()
print(has_negative)

False


In [10]:
# # numpy implementation
# # the data does not have the same maximum/minimum per column so we will normalize (min-max scale)
# scaler = MinMaxScaler()
# newdf = scaler.fit_transform(rawdf)

# labelindex = rawdf.columns.get_loc(main_label)

# rawfeatures = np.delete(newdf, labelindex, axis=1) # get all arrays based on the rule given
# rawlabels = newdf[:, labelindex] # access column with the label based on its index

# print(rawfeatures)
# print(np.info(rawfeatures))

# print(rawlabels)
# print(np.info(rawlabels))

In [11]:
# First 10 scaled rows, for side-by-side comparison with the raw rows shown earlier.
newdf.head(10)

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,safety,basic_needs,academic_performance,study_load,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying
0,0.666667,0.666667,0.0,0.407407,0.4,0.0,0.4,0.8,0.4,0.6,0.6,0.4,0.6,0.4,0.6,0.666667,0.6,0.6,0.4
1,0.714286,0.266667,1.0,0.555556,1.0,1.0,0.2,0.8,0.6,0.2,0.4,0.4,0.2,0.8,1.0,0.333333,0.8,1.0,1.0
2,0.571429,0.6,1.0,0.518519,0.4,0.0,0.4,0.4,0.4,0.4,0.6,0.4,0.4,0.6,0.4,0.666667,0.6,0.4,0.4
3,0.761905,0.4,1.0,0.555556,0.8,1.0,0.2,0.6,0.8,0.4,0.4,0.4,0.4,0.8,0.8,0.333333,0.8,0.8,1.0
4,0.761905,0.933333,0.0,0.259259,0.4,1.0,1.0,0.2,0.6,0.4,0.8,0.6,0.8,0.6,0.4,0.333333,1.0,0.0,1.0
5,0.952381,0.433333,1.0,0.777778,0.6,1.0,0.2,0.8,0.6,0.4,0.4,0.2,0.4,1.0,1.0,0.333333,0.8,0.8,1.0
6,0.190476,0.866667,0.0,0.222222,0.2,0.5,0.8,0.2,0.2,0.8,0.8,0.8,1.0,0.2,0.2,1.0,0.4,0.4,0.2
7,0.809524,0.1,1.0,0.814815,0.8,1.0,0.2,1.0,0.6,0.2,0.2,0.2,0.2,0.6,0.8,0.333333,0.8,0.8,1.0
8,0.619048,0.733333,1.0,0.444444,0.6,0.0,0.4,0.8,0.6,0.6,0.6,0.6,0.6,0.6,0.6,1.0,0.6,0.4,0.4
9,0.285714,0.266667,0.0,1.0,0.8,1.0,0.2,0.4,0.0,1.0,0.4,0.4,0.4,0.4,1.0,0.333333,1.0,0.6,0.8


In [12]:
# Separate the model inputs (feature columns) from the target (label column)
# ahead of the train/test split.
featuresdf = newdf.loc[:, main_features]
labelsdf = newdf.loc[:, main_label]

In [13]:
# Split the scaled data 80/20 into train and held-out test sets.
# random_state is fixed so the same rows land in each set on every
# Restart & Run All; without it every downstream metric changes from run to run.
trainx, testx, trainy, testy = train_test_split(
    featuresdf.to_numpy(),
    labelsdf.to_numpy(),
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Sanity check: the split was given .to_numpy() inputs, so NumPy arrays are expected back.
type(trainx)

numpy.ndarray

In [15]:
# Report the shape of each split on a single line.
split_arrays = (("trainx", trainx), ("testx", testx), ("trainy", trainy), ("testy", testy))
print(", ".join(f"{name}: {arr.shape}" for name, arr in split_arrays))

trainx: (880, 18), testx: (220, 18), trainy: (880,), testy: (220,)


In [16]:
# Small fully-connected regression network: three 64-unit ReLU layers followed by a
# single linear output unit.
model = Sequential([
    # One input per feature column. `shape` must be a tuple: a bare int is not a
    # valid shape and is rejected by newer Keras releases.
    Input(shape=(trainx.shape[1],)),
    Dense(64, activation="relu"),
    Dropout(0.2),  # randomly zero 20% of activations while training to limit overfitting
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(64, activation="relu"),
    Dense(1, activation="linear"),  # one unbounded value: the min-max scaled anxiety score
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1216      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 9601 (37.50 KB)
Trainable params: 9601 (37

In [17]:
# Train with Adam on mean squared error (a regression loss).
# NOTE(review): "accuracy" is a classification metric; with a continuous target and a
# linear output it is unlikely to be informative — consider "mae" instead, and update
# the evaluation cells that unpack/print this metric at the same time.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

In [18]:
# Fit on the training split for 30 epochs (no validation data is passed to fit here).
model.fit(trainx, trainy, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fa4f41033d0>

In [19]:
# Score the model on the held-out split. evaluate() returns the loss followed by the
# metrics given to compile(), hence the two-value unpacking.
loss, accuracy = model.evaluate(testx, testy, verbose=2)
# Format with a float spec rather than slicing str(value): slicing truncates instead of
# rounding and drops the exponent of small values (str(2.886e-05)[:5] == '2.886').
print(f'test loss: {loss:.0%} ({loss:.3f})')
print(f'test accuracy: {accuracy:.0%} ({accuracy:.3f})')
# small loss & accuracy means model makes small errors on a lot of data
# see https://datascience.stackexchange.com/questions/42599/what-is-the-relationship-between-the-accuracy-and-the-loss-in-deep-learning

7/7 - 0s - loss: 0.0289 - accuracy: 0.0455 - 108ms/epoch - 15ms/step
test loss: 3% (0.028)
test accuracy: 5% (0.045)


In [20]:
# Evaluate again, this time keeping the raw list that evaluate() returns
# ([loss, metric] in compile() order) and printing it unformatted.
validation = model.evaluate(testx, testy, verbose=2)
print(validation)

7/7 - 0s - loss: 0.0289 - accuracy: 0.0455 - 44ms/epoch - 6ms/step
[0.028861641883850098, 0.04545454680919647]


In [38]:
# Draw one random row from the scaled data, renumber it to index 0 and remove the
# label column so that only the feature columns remain as model input.
df_test = (
    newdf.sample()
    .reset_index(drop=True)
    .drop(columns=[main_label])
)
df_test.head()

Unnamed: 0,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,safety,basic_needs,academic_performance,study_load,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying
0,0.1,1.0,0.888889,1.0,1.0,0.2,0.6,0.6,0.4,0.2,0.4,0.2,0.8,0.8,0.333333,1.0,1.0,0.8


In [39]:
# Predict the (still min-max scaled) label for the sampled row and show the raw array.
check = model.predict(x=df_test)
print(check)

[[0.6303442]]


In [40]:
# Inspect the container type that predict() returned.
type(check)

numpy.ndarray

In [41]:
# inverse_transform needs every column the scaler was fitted on, in the fitted order,
# so the predicted label has to be put back alongside the scaled features first.
# A new frame is built instead of inserting into df_test: the in-place insert (with
# allow_duplicates=True) silently added a second label column when this cell was
# re-run and left df_test unusable as model input.
scaled_row = df_test.assign(**{main_label: check.ravel()})[newdf.columns]
resultarr = normalizer.inverse_transform(scaled_row)
df_result = pd.DataFrame(resultarr, columns=scaled_row.columns, index=scaled_row.index)

In [42]:
# Show the row mapped back to original units; the label column holds the model's prediction.
df_result.head(10)

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,safety,basic_needs,academic_performance,study_load,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying
0,13.237228,3.0,1.0,24.0,5.0,3.0,1.0,3.0,3.0,2.0,1.0,2.0,1.0,4.0,4.0,1.0,5.0,5.0,4.0


In [43]:
# Pull the de-normalized prediction out of the result frame as a plain Python float
# and report it next to the raw (scaled) network output.
raw_guess = check[0][0]
anxiety_scaled = float(df_result.at[0, main_label])
print(f"Raw Guess: {raw_guess}, GAD-7 Scaled Guess: {anxiety_scaled:.2f}")

Raw Guess: 0.6303442120552063, GAD-7 Scaled Guess: 13.24
