In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('survey.csv')

In [3]:
dataset

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


In [4]:
dataset = dataset.drop(['Timestamp', 'Country', 'state', 'self_employed', 'leave', 'comments', 'obs_consequence', 'anonymity', 'no_employees', 'remote_work', 'tech_company', 'benefits', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview', 'mental_vs_physical'], axis = 1)

In [5]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,Female,No,Yes,Often,Not sure,No,Yes,No,No
1,44,M,No,No,Rarely,No,Don't know,Don't know,Maybe,No
2,32,Male,No,No,Rarely,No,No,No,No,No
3,31,Male,Yes,Yes,Often,Yes,No,No,Yes,Yes
4,31,Male,No,No,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,male,No,Yes,,No,No,No,No,No
1255,32,Male,Yes,Yes,Often,Yes,No,No,No,No
1256,34,male,Yes,Yes,Sometimes,Yes,No,No,Yes,Yes
1257,46,f,No,No,,Yes,No,No,Yes,No


In [6]:
dataset.isna().sum()

Age                            0
Gender                         0
family_history                 0
treatment                      0
work_interfere               264
care_options                   0
wellness_program               0
seek_help                      0
mental_health_consequence      0
phys_health_consequence        0
dtype: int64

In [7]:
dataset[dataset.work_interfere.isna()]

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
19,36,Male,Yes,No,,No,Yes,No,No,No
26,33,male,No,No,,Not sure,No,Don't know,Maybe,Maybe
37,38,Male,No,No,,Yes,No,No,Maybe,No
38,50,M,No,No,,Yes,No,Don't know,No,No
41,35,Male,No,No,,Not sure,Don't know,Yes,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...
1226,39,Male,No,No,,Not sure,Yes,Don't know,Maybe,Maybe
1229,39,Male,No,No,,No,No,No,Yes,No
1244,32,female,No,No,,No,No,Don't know,Maybe,No
1254,26,male,No,Yes,,No,No,No,No,No


In [8]:
dataset.work_interfere.fillna(value = "Adaptive", inplace = True)

In [9]:
dataset.duplicated().sum()

46

In [10]:
dataset.drop_duplicates(inplace = True)

In [11]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,Female,No,Yes,Often,Not sure,No,Yes,No,No
1,44,M,No,No,Rarely,No,Don't know,Don't know,Maybe,No
2,32,Male,No,No,Rarely,No,No,No,No,No
3,31,Male,Yes,Yes,Often,Yes,No,No,Yes,Yes
4,31,Male,No,No,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,male,No,Yes,Adaptive,No,No,No,No,No
1255,32,Male,Yes,Yes,Often,Yes,No,No,No,No
1256,34,male,Yes,Yes,Sometimes,Yes,No,No,Yes,Yes
1257,46,f,No,No,Adaptive,Yes,No,No,Yes,No


In [12]:
dataset.Gender.value_counts()

Gender
Male                                              579
male                                              200
Female                                            121
M                                                 112
female                                             62
F                                                  38
m                                                  34
f                                                  15
Make                                                4
Male                                                3
Woman                                               3
Cis Male                                            2
Man                                                 2
Female (trans)                                      2
Female                                              2
Trans woman                                         1
msle                                                1
male leaning androgynous                            1
Neuter               

In [13]:
dataset['Gender'].replace(['Male ', 'male', 'M', 'm', 'Male', 'Cis Male','Man', 'cis male', 'Mail', 'Male-ish', 'Male (CIS)',
                    'Cis Man', 'msle', 'Malr', 'Mal', 'maile', 'Make',], 'Male', inplace = True)

dataset['Gender'].replace(['Female ', 'female', 'F', 'f', 'Woman', 'Female','femail', 'Cis Female', 'cis-female/femme',
                    'Femake', 'Female (cis)','woman',], 'Female', inplace = True)

dataset["Gender"].replace(['Female (trans)', 'queer/she/they', 'non-binary','fluid', 'queer', 'Androgyne', 'Trans-female', 'male leaning androgynous',
                      'Agender', 'A little about you', 'Nah', 'All','ostensibly male, unsure what that really means',
                      'Genderqueer', 'Enby', 'p', 'Neuter', 'something kinda male?','Guy (-ish) ^_^', 'Trans woman',], 'Others', inplace = True)

In [14]:
dataset.Gender.value_counts()

Gender
Male      945
Female    247
Others     21
Name: count, dtype: int64

In [15]:
dataset.Age.value_counts()

Age
 29             80
 32             78
 26             73
 33             68
 34             65
 27             65
 31             64
 28             63
 30             60
 25             57
 35             51
 23             49
 24             46
 37             41
 38             39
 36             35
 39             33
 40             32
 43             28
 41             21
 22             21
 42             20
 21             16
 45             12
 46             12
 44             11
 19              9
 18              7
 48              6
 20              6
 50              5
 51              5
 49              4
 56              4
 57              3
 54              3
 55              3
 47              2
 60              2
 99999999999     1
 5               1
-1               1
 11              1
 8               1
 61              1
 53              1
-29              1
-1726            1
 65              1
 62              1
 58              1
 329             1
 72     

In [16]:
dataset['Age'].replace([dataset['Age'][dataset['Age'] < 18]], 18, inplace = True)
dataset['Age'].replace([dataset['Age'][dataset['Age'] > 72]], 72, inplace = True)

In [17]:
dataset.Age.value_counts()

Age
29    80
32    78
26    73
33    68
34    65
27    65
31    64
28    63
30    60
25    57
35    51
23    49
24    46
37    41
38    39
36    35
39    33
40    32
43    28
41    21
22    21
42    20
21    16
18    13
45    12
46    12
44    11
19     9
48     6
20     6
50     5
51     5
49     4
56     4
55     3
72     3
57     3
54     3
47     2
60     2
58     1
62     1
65     1
53     1
61     1
Name: count, dtype: int64

In [18]:
dataset.isna().sum()

Age                          0
Gender                       0
family_history               0
treatment                    0
work_interfere               0
care_options                 0
wellness_program             0
seek_help                    0
mental_health_consequence    0
phys_health_consequence      0
dtype: int64

In [19]:
dataset.Age.describe()

count    1213.000000
mean       32.136851
std         7.574236
min        18.000000
25%        27.000000
50%        31.000000
75%        36.000000
max        72.000000
Name: Age, dtype: float64

In [20]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,Female,No,Yes,Often,Not sure,No,Yes,No,No
1,44,Male,No,No,Rarely,No,Don't know,Don't know,Maybe,No
2,32,Male,No,No,Rarely,No,No,No,No,No
3,31,Male,Yes,Yes,Often,Yes,No,No,Yes,Yes
4,31,Male,No,No,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,Male,No,Yes,Adaptive,No,No,No,No,No
1255,32,Male,Yes,Yes,Often,Yes,No,No,No,No
1256,34,Male,Yes,Yes,Sometimes,Yes,No,No,Yes,Yes
1257,46,Female,No,No,Adaptive,Yes,No,No,Yes,No


In [21]:
dataset.family_history.value_counts()

family_history
No     733
Yes    480
Name: count, dtype: int64

In [22]:
dataset.treatment.value_counts()

treatment
Yes    624
No     589
Name: count, dtype: int64

In [23]:
dataset.work_interfere.value_counts()

work_interfere
Sometimes    456
Adaptive     244
Never        206
Rarely       167
Often        140
Name: count, dtype: int64

In [24]:
dataset.care_options.value_counts()

care_options
No          474
Yes         435
Not sure    304
Name: count, dtype: int64

In [25]:
dataset.wellness_program.value_counts()

wellness_program
No            806
Yes           224
Don't know    183
Name: count, dtype: int64

In [26]:
dataset.seek_help.value_counts()

seek_help
No            614
Don't know    355
Yes           244
Name: count, dtype: int64

In [27]:
dataset.mental_health_consequence.value_counts()

mental_health_consequence
Maybe    464
No       461
Yes      288
Name: count, dtype: int64

In [28]:
dataset.phys_health_consequence.value_counts()

phys_health_consequence
No       888
Maybe    264
Yes       61
Name: count, dtype: int64

In [31]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,Female,No,Yes,Often,Not sure,No,Yes,No,No
1,44,Male,No,No,Rarely,No,Don't know,Don't know,Maybe,No
2,32,Male,No,No,Rarely,No,No,No,No,No
3,31,Male,Yes,Yes,Often,Yes,No,No,Yes,Yes
4,31,Male,No,No,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,Male,No,Yes,Adaptive,No,No,No,No,No
1255,32,Male,Yes,Yes,Often,Yes,No,No,No,No
1256,34,Male,Yes,Yes,Sometimes,Yes,No,No,Yes,Yes
1257,46,Female,No,No,Adaptive,Yes,No,No,Yes,No


In [32]:
dataset.Age.value_counts()

Age
29    80
32    78
26    73
33    68
34    65
27    65
31    64
28    63
30    60
25    57
35    51
23    49
24    46
37    41
38    39
36    35
39    33
40    32
43    28
41    21
22    21
42    20
21    16
18    13
45    12
46    12
44    11
19     9
48     6
20     6
50     5
51     5
49     4
56     4
55     3
72     3
57     3
54     3
47     2
60     2
58     1
62     1
65     1
53     1
61     1
Name: count, dtype: int64

In [33]:
dataset.Gender.value_counts()

Gender
Male      945
Female    247
Others     21
Name: count, dtype: int64

In [35]:
Gender_map = {'Male': 0, 'Female': 1, 'Others': 2}
dataset['Gender'] = dataset['Gender'].map(Gender_map)

In [36]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,No,Yes,Often,Not sure,No,Yes,No,No
1,44,0,No,No,Rarely,No,Don't know,Don't know,Maybe,No
2,32,0,No,No,Rarely,No,No,No,No,No
3,31,0,Yes,Yes,Often,Yes,No,No,Yes,Yes
4,31,0,No,No,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,No,Yes,Adaptive,No,No,No,No,No
1255,32,0,Yes,Yes,Often,Yes,No,No,No,No
1256,34,0,Yes,Yes,Sometimes,Yes,No,No,Yes,Yes
1257,46,1,No,No,Adaptive,Yes,No,No,Yes,No


In [37]:
dataset.family_history.value_counts()

family_history
No     733
Yes    480
Name: count, dtype: int64

In [38]:
family_map = {'No': 0, 'Yes': 1}
dataset['family_history'] = dataset['family_history'].map(family_map)

In [39]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,Yes,Often,Not sure,No,Yes,No,No
1,44,0,0,No,Rarely,No,Don't know,Don't know,Maybe,No
2,32,0,0,No,Rarely,No,No,No,No,No
3,31,0,1,Yes,Often,Yes,No,No,Yes,Yes
4,31,0,0,No,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,Yes,Adaptive,No,No,No,No,No
1255,32,0,1,Yes,Often,Yes,No,No,No,No
1256,34,0,1,Yes,Sometimes,Yes,No,No,Yes,Yes
1257,46,1,0,No,Adaptive,Yes,No,No,Yes,No


In [40]:
dataset.treatment.value_counts()

treatment
Yes    624
No     589
Name: count, dtype: int64

In [41]:
treatment_map = {'No': 0, 'Yes': 1}
dataset['treatment'] = dataset['treatment'].map(treatment_map)

In [42]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,Often,Not sure,No,Yes,No,No
1,44,0,0,0,Rarely,No,Don't know,Don't know,Maybe,No
2,32,0,0,0,Rarely,No,No,No,No,No
3,31,0,1,1,Often,Yes,No,No,Yes,Yes
4,31,0,0,0,Never,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,Adaptive,No,No,No,No,No
1255,32,0,1,1,Often,Yes,No,No,No,No
1256,34,0,1,1,Sometimes,Yes,No,No,Yes,Yes
1257,46,1,0,0,Adaptive,Yes,No,No,Yes,No


In [43]:
dataset.work_interfere.value_counts()

work_interfere
Sometimes    456
Adaptive     244
Never        206
Rarely       167
Often        140
Name: count, dtype: int64

In [44]:
work_map = {'Never': 0, 'Sometimes': 1, 'Rarely': 2, 'Often': 3, 'Adaptive': 4}
dataset['work_interfere'] = dataset['work_interfere'].map(work_map)

In [45]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,3,Not sure,No,Yes,No,No
1,44,0,0,0,2,No,Don't know,Don't know,Maybe,No
2,32,0,0,0,2,No,No,No,No,No
3,31,0,1,1,3,Yes,No,No,Yes,Yes
4,31,0,0,0,0,No,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,No,No,No,No,No
1255,32,0,1,1,3,Yes,No,No,No,No
1256,34,0,1,1,1,Yes,No,No,Yes,Yes
1257,46,1,0,0,4,Yes,No,No,Yes,No


In [46]:
dataset.care_options.value_counts()

care_options
No          474
Yes         435
Not sure    304
Name: count, dtype: int64

In [48]:
care_map = {'No': 0, 'Yes': 1, 'Not sure': 2}
dataset['care_options'] = dataset['care_options'].map(care_map)

In [49]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,3,2,No,Yes,No,No
1,44,0,0,0,2,0,Don't know,Don't know,Maybe,No
2,32,0,0,0,2,0,No,No,No,No
3,31,0,1,1,3,1,No,No,Yes,Yes
4,31,0,0,0,0,0,Don't know,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,0,No,No,No,No
1255,32,0,1,1,3,1,No,No,No,No
1256,34,0,1,1,1,1,No,No,Yes,Yes
1257,46,1,0,0,4,1,No,No,Yes,No


In [50]:
dataset.wellness_program.value_counts()

wellness_program
No            806
Yes           224
Don't know    183
Name: count, dtype: int64

In [51]:
wellness_map = {'No': 0, 'Yes': 1, "Don't know": 2}
dataset['wellness_program'] = dataset['wellness_program'].map(wellness_map)

In [52]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,3,2,0,Yes,No,No
1,44,0,0,0,2,0,2,Don't know,Maybe,No
2,32,0,0,0,2,0,0,No,No,No
3,31,0,1,1,3,1,0,No,Yes,Yes
4,31,0,0,0,0,0,2,Don't know,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,0,0,No,No,No
1255,32,0,1,1,3,1,0,No,No,No
1256,34,0,1,1,1,1,0,No,Yes,Yes
1257,46,1,0,0,4,1,0,No,Yes,No


In [53]:
dataset.seek_help.value_counts()

seek_help
No            614
Don't know    355
Yes           244
Name: count, dtype: int64

In [56]:
seek_map = {'No': 0, 'Yes': 1, "Don't know": 2}
dataset['seek_help'] = dataset['seek_help'].map(seek_map)

In [57]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,3,2,0,1,No,No
1,44,0,0,0,2,0,2,2,Maybe,No
2,32,0,0,0,2,0,0,0,No,No
3,31,0,1,1,3,1,0,0,Yes,Yes
4,31,0,0,0,0,0,2,2,No,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,0,0,0,No,No
1255,32,0,1,1,3,1,0,0,No,No
1256,34,0,1,1,1,1,0,0,Yes,Yes
1257,46,1,0,0,4,1,0,0,Yes,No


In [58]:
dataset.mental_health_consequence.value_counts()

mental_health_consequence
Maybe    464
No       461
Yes      288
Name: count, dtype: int64

In [59]:
mental_map = {'No': 0, 'Yes': 1, 'Maybe': 2}
dataset['mental_health_consequence'] = dataset['mental_health_consequence'].map(mental_map)

In [60]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,3,2,0,1,0,No
1,44,0,0,0,2,0,2,2,2,No
2,32,0,0,0,2,0,0,0,0,No
3,31,0,1,1,3,1,0,0,1,Yes
4,31,0,0,0,0,0,2,2,0,No
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,0,0,0,0,No
1255,32,0,1,1,3,1,0,0,0,No
1256,34,0,1,1,1,1,0,0,1,Yes
1257,46,1,0,0,4,1,0,0,1,No


In [61]:
dataset.phys_health_consequence.value_counts()

phys_health_consequence
No       888
Maybe    264
Yes       61
Name: count, dtype: int64

In [62]:
phys_map = {'No': 0, 'Yes': 1, 'Maybe': 2}
dataset['phys_health_consequence'] = dataset['phys_health_consequence'].map(mental_map)

In [63]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence
0,37,1,0,1,3,2,0,1,0,0
1,44,0,0,0,2,0,2,2,2,0
2,32,0,0,0,2,0,0,0,0,0
3,31,0,1,1,3,1,0,0,1,1
4,31,0,0,0,0,0,2,2,0,0
...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,0,0,0,0,0
1255,32,0,1,1,3,1,0,0,0,0
1256,34,0,1,1,1,1,0,0,1,1
1257,46,1,0,0,4,1,0,0,1,0


In [64]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_dataset = scaler.fit_transform(dataset)
print(scaled_dataset)

[[ 0.64232951  1.63859152 -0.80922377 ...  0.24601861 -1.14797928
  -0.58665804]
 [ 1.56689636 -0.51250319 -0.80922377 ...  1.39822161  1.14231491
  -0.58665804]
 [-0.01807539 -0.51250319 -0.80922377 ... -0.90618438 -1.14797928
  -0.58665804]
 ...
 [ 0.24608657 -0.51250319  1.23575213 ... -0.90618438 -0.00283219
   0.62151888]
 [ 1.83105832  1.63859152 -0.80922377 ... -0.90618438 -0.00283219
  -0.58665804]
 [-0.94264224 -0.51250319  1.23575213 ... -0.90618438  1.14231491
  -0.58665804]]


In [65]:
from sklearn.cluster import KMeans
scaled_dataset = pd.DataFrame(scaled_dataset, columns =['Age', 'Gender', 'family_history', 'treatment', 'work_interfere', 'care_options', 'wellness_program', 'seek_help', 'mental_health_consequence', 'phys_health_consequence'])

scaler = StandardScaler()
scaled_features = scaler.fit_transform(scaled_dataset)

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(scaled_features)

scaled_dataset['Cluster'] = kmeans.labels_

print(scaled_dataset)



           Age    Gender  family_history  treatment  work_interfere  \
0     0.642330  1.638592       -0.809224   0.971550        0.860872   
1     1.566896 -0.512503       -0.809224  -1.029283        0.142195   
2    -0.018075 -0.512503       -0.809224  -1.029283        0.142195   
3    -0.150156 -0.512503        1.235752   0.971550        0.860872   
4    -0.150156 -0.512503       -0.809224  -1.029283       -1.295159   
...        ...       ...             ...        ...             ...   
1208 -0.810561 -0.512503       -0.809224   0.971550        1.579549   
1209 -0.018075 -0.512503        1.235752   0.971550        0.860872   
1210  0.246087 -0.512503        1.235752   0.971550       -0.576482   
1211  1.831058  1.638592       -0.809224  -1.029283        1.579549   
1212 -0.942642 -0.512503        1.235752   0.971550       -0.576482   

      care_options  wellness_program  seek_help  mental_health_consequence  \
0         1.445958         -0.654938   0.246019                  -1.1

In [66]:
dataset['Cluster'] = scaled_dataset['Cluster'].values

In [67]:
dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence,Cluster
0,37,1,0,1,3,2,0,1,0,0,3
1,44,0,0,0,2,0,2,2,2,0,5
2,32,0,0,0,2,0,0,0,0,0,6
3,31,0,1,1,3,1,0,0,1,1,2
4,31,0,0,0,0,0,2,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1254,26,0,0,1,4,0,0,0,0,0,0
1255,32,0,1,1,3,1,0,0,0,0,2
1256,34,0,1,1,1,1,0,0,1,1,2
1257,46,1,0,0,4,1,0,0,1,0,4


In [68]:
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

In [69]:
train_dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence,Cluster
932,43,0,0,1,1,1,0,1,1,2,8
854,31,0,1,0,4,2,0,0,2,0,4
55,27,0,1,1,2,1,0,0,2,0,2
533,23,0,0,0,2,0,0,2,0,0,6
503,25,0,0,0,0,2,2,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1076,43,0,1,1,1,2,0,2,2,2,8
326,30,1,0,0,0,1,1,1,2,2,9
787,25,0,1,1,1,1,2,2,2,0,7
915,30,0,0,1,1,0,0,0,1,2,9


In [70]:
test_dataset

Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,care_options,wellness_program,seek_help,mental_health_consequence,phys_health_consequence,Cluster
11,29,0,0,0,0,2,0,0,0,0,6
23,41,0,0,1,0,0,0,2,2,0,0
24,33,0,1,1,2,2,2,1,0,0,7
25,35,0,1,1,1,1,0,2,1,0,7
28,34,0,0,1,1,2,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1236,60,0,0,1,3,0,1,2,2,2,8
1239,30,0,0,0,1,0,0,0,2,0,6
1247,36,0,0,1,3,0,0,0,1,0,0
1254,26,0,0,1,4,0,0,0,0,0,0


In [71]:
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('Cluster')
test_labels = test_features.pop('Cluster')

In [72]:
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.asarray(train_features))
print(normalizer.mean.numpy())
first = np.array(train_features[:1])

with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalizer(first).numpy())

[[32.169067    0.22783503  0.4020618   0.5041237   1.8329895   0.85670096
   0.49484524  0.7927834   0.9907217   0.46494842]]
First example: [[43  0  0  1  1  1  0  1  1  2]]

Normalized: [[ 1.44 -0.5  -0.82  0.99 -0.59  0.18 -0.66  0.24  0.01  1.88]]


In [73]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(train_features.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [74]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [75]:
model.fit(train_features, train_labels, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7ce5dc1913f0>

In [76]:
loss, accuracy = model.evaluate(test_features, test_labels)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.8971193432807922


In [77]:
LABEL = ["ayam", "bebek", "cicak", "domba", "elang", "flamingo", "gajah", "harimau", "ikan", "jerapah"]
new_data = [[49,  0,  1,  0,  4,  2,  0,  1,  1,  2]]

prediksi = model.predict(new_data)
predicted_classes = np.argmax(prediksi, axis=1)
predicted_classes = LABEL[predicted_classes[0]]
print(predicted_classes)
print(prediksi)


ikan
[[6.4067065e-04 3.5990240e-06 2.0905407e-03 2.9118839e-04 1.0316965e-01
  4.1584604e-04 5.3912384e-04 6.7965424e-04 8.2802355e-01 6.4146131e-02]]


In [78]:
import pickle

In [79]:
with open("model_klasifikasi.pkl", "wb") as model_file:
  pickle.dump(model, model_file)