In [None]:
import random
import numpy as np
import tensorflow as tf

# Seed TensorFlow, NumPy and Python's built-in RNG with the same value so
# repeated runs start from the same random state.
tf.random.set_seed(1693)
np.random.seed(1693)
random.seed(1693)

from tensorflow import keras
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
import matplotlib.pyplot as pyplot
import datetime as dt
from datetime import datetime
import sklearn as sk
from sklearn.model_selection import train_test_split

Read and Prep Data

In [None]:
# Load the raw dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='stackoverflow_full.csv')

In [None]:
# Keep only the columns that are used as model inputs.
X = df.loc[:, [
    'Age',
    'EdLevel',
    'Employment',
    'Gender',
    'MainBranch',
    'YearsCode',
    'YearsCodePro',
    'PreviousSalary',
    'ComputerSkills',
]]

In [None]:
# The prediction target is the 'Employed' column.
y = df.loc[:, 'Employed']

In [None]:
# One-hot encode the categorical feature columns.
X = pd.get_dummies(data=X)

In [None]:
# One-hot encode the target so it has one column per class.
y = pd.get_dummies(data=y)

In [None]:
# Cast the one-hot target columns to integers, then show the result.
y = y.astype(dtype=int)
y

Unnamed: 0,0,1
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
73457,0,1
73458,0,1
73459,0,1
73460,0,1


In [None]:
# Display the encoded feature frame to check the generated dummy columns.
X

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Age_<35,Age_>35,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,Gender_Man,Gender_NonBinary,Gender_Woman,MainBranch_Dev,MainBranch_NotDev
0,1,7,4,51552.0,4,True,False,True,False,False,False,False,True,False,False,True,False
1,1,12,5,46482.0,12,True,False,False,False,False,False,True,True,False,False,True,False
2,1,15,6,77290.0,7,True,False,True,False,False,False,False,True,False,False,True,False
3,1,9,6,46135.0,13,True,False,False,False,False,False,True,True,False,False,True,False
4,0,40,30,160932.0,2,False,True,False,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73457,1,7,2,41058.0,13,True,False,False,False,False,False,True,True,False,False,True,False
73458,1,21,16,115000.0,11,False,True,False,False,False,False,True,True,False,False,True,False
73459,1,4,3,57720.0,12,True,False,False,False,False,False,True,True,False,False,True,False
73460,1,5,1,70000.0,15,True,False,False,False,False,False,True,True,False,False,True,False


In [None]:
# Cast every feature column (including the boolean dummies) to integers.
X = X.astype(dtype=int)

Train and Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1693
)

In [None]:
# Display the training features to check the split.
X_train

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Age_<35,Age_>35,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,Gender_Man,Gender_NonBinary,Gender_Woman,MainBranch_Dev,MainBranch_NotDev
51182,1,7,3,53522,7,1,0,0,0,0,0,1,1,0,0,1,0
21841,0,6,3,6600,12,1,0,0,0,0,0,1,1,0,0,1,0
26342,1,30,23,175000,14,0,1,1,0,0,0,0,1,0,0,1,0
31242,1,35,22,91883,9,0,1,0,0,0,1,0,1,0,0,1,0
67052,1,32,30,63984,7,0,1,0,0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38115,1,5,3,22644,7,1,0,0,0,1,0,0,1,0,0,1,0
53968,1,12,6,89581,9,1,0,1,0,0,0,0,1,0,0,1,0
72750,1,7,2,36259,7,1,0,1,0,0,0,0,1,0,0,1,0
4613,0,15,12,97704,19,1,0,0,0,1,0,0,1,0,0,1,0


Define Model

In [None]:
# Feed-forward classifier: two ReLU hidden layers followed by a 2-unit softmax
# output, one unit per one-hot target column.
#
# NOTE(review): the features are fed to the network unscaled (PreviousSalary is
# in the tens of thousands while the dummy columns are 0/1). Consider
# standardising the inputs before training -- TODO confirm the impact.
model = Sequential()
model.add(Dense(units=20,
                # Take the input width from the training data rather than
                # hard-coding 17, so the model still builds if the encoded
                # columns change.
                input_dim=X_train.shape[1],
                activation='relu',
                name="HL1"))
model.add(Dense(units=10,
                activation='relu',
                name="HL2"))
model.add(Dense(2,
                activation='softmax',
                name="OL"))

# Compile model
model.compile(loss='CategoricalCrossentropy',
              optimizer = "adam",
              metrics=['accuracy'])

# Inspect model summary
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 HL1 (Dense)                 (None, 20)                360       
                                                                 
 HL2 (Dense)                 (None, 10)                210       
                                                                 
 OL (Dense)                  (None, 2)                 22        
                                                                 
Total params: 592 (2.31 KB)
Trainable params: 592 (2.31 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Fit Model

In [None]:
# Train the network and keep the returned history object in `estimate`.
estimate = model.fit(
    x=X_train,
    y=y_train,
    epochs=5,              # more epochs will take longer to run
    verbose=0,             # change to 1 or 2 to see more details
    validation_split=0.2,  # TODO: no TensorBoard callback is attached yet
)


Test Model

In [None]:
# Score the trained model on the held-out split; the result is [loss, accuracy].
model.evaluate(x=X_test, y=y_test, batch_size=24)



[3.322233200073242, 0.7020735740661621]

In [None]:
# Predicted class probabilities for each test row (one column per class).
pred_prob = model.predict(x=X_test)
print(pred_prob)

[[1.2600625e-02 9.8739928e-01]
 [9.9258262e-01 7.4173799e-03]
 [8.7512818e-31 9.9999994e-01]
 ...
 [9.9964976e-01 3.5023954e-04]
 [3.0200636e-07 9.9999964e-01]
 [3.3666220e-04 9.9966335e-01]]


In [None]:
# Take the most probable class for each row, then count how often each
# class is predicted.
pred_class = np.argmax(pred_prob, axis=-1)
np.unique(pred_class, return_counts=True)

(array([0, 1]), array([12189,  9850]))

Create Data to Show Gender Bias

In [None]:
# Start an empty frame with the same columns as X_test so the hand-built
# rows below share the model's input layout.
X_bias = pd.DataFrame(columns=X_test.columns)
X_bias

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Age_<35,Age_>35,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,Gender_Man,Gender_NonBinary,Gender_Woman,MainBranch_Dev,MainBranch_NotDev


In [None]:
# Three otherwise identical applicants that differ only in the gender dummies.
# Reference applicant: gender = Man.
new_row1 = {
    'Employment': 1,
    'YearsCode': 13,
    'YearsCodePro': 7,
    'PreviousSalary': 86478,
    'ComputerSkills': 16,
    'Age_<35': 1,
    'Age_>35': 0,
    'EdLevel_Master': 1,
    'EdLevel_NoHigherEd': 0,
    'EdLevel_Other': 0,
    'EdLevel_PhD': 0,
    'EdLevel_Undergraduate': 0,
    'Gender_Man': 1,
    'Gender_NonBinary': 0,
    'Gender_Woman': 0,
    'MainBranch_Dev': 1,
    'MainBranch_NotDev': 0,
}
# Same applicant with gender = Woman. Overriding existing keys keeps the key order.
new_row2 = {**new_row1, 'Gender_Man': 0, 'Gender_Woman': 1}
# Same applicant with gender = NonBinary.
new_row3 = {**new_row1, 'Gender_Man': 0, 'Gender_NonBinary': 1}

In [None]:
# Wrap each applicant dict in its own one-row DataFrame.
new_row1_df, new_row2_df, new_row3_df = (
    pd.DataFrame([row]) for row in (new_row1, new_row2, new_row3)
)

In [None]:
# Stack the three applicant rows in a single concat instead of appending them
# one by one to the empty placeholder frame. Concatenating onto an empty frame
# is deprecated in recent pandas, leaves object-dtype columns, and made this
# cell append three more rows every time it was re-run.
# reindex() pins the column order to the layout the model was trained on.
X_bias = pd.concat(
    [new_row1_df, new_row2_df, new_row3_df],
    ignore_index=True,
).reindex(columns=X_test.columns)
X_bias

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Age_<35,Age_>35,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,Gender_Man,Gender_NonBinary,Gender_Woman,MainBranch_Dev,MainBranch_NotDev
0,1,13,7,86478,16,1,0,1,0,0,0,0,1,0,0,1,0
1,1,13,7,86478,16,1,0,1,0,0,0,0,0,0,1,1,0
2,1,13,7,86478,16,1,0,1,0,0,0,0,0,1,0,1,0


In [None]:
# Make sure every column is an integer before the frame is passed to the model.
X_bias = X_bias.astype(dtype=int)

Explore Gender Bias Results

In [None]:
# Class probabilities for the three applicants that differ only in gender.
pred_prob1 = model.predict(x=X_bias)
pred_prob1



array([[0.6048658 , 0.39513415],
       [0.82460326, 0.1753968 ],
       [0.38214144, 0.6178586 ]], dtype=float32)

Create Data to Show Age Bias

In [None]:
# Start an empty frame with the same columns as X_test so the hand-built
# rows below share the model's input layout.
X_bias2 = pd.DataFrame(columns=X_test.columns)
X_bias2

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Age_<35,Age_>35,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,Gender_Man,Gender_NonBinary,Gender_Woman,MainBranch_Dev,MainBranch_NotDev


In [None]:
# Two otherwise identical applicants that differ only in the age dummies.
# Reference applicant: Age_<35.
new_row4 = {
    'Employment': 1,
    'YearsCode': 13,
    'YearsCodePro': 7,
    'PreviousSalary': 86478,
    'ComputerSkills': 16,
    'Age_<35': 1,
    'Age_>35': 0,
    'EdLevel_Master': 1,
    'EdLevel_NoHigherEd': 0,
    'EdLevel_Other': 0,
    'EdLevel_PhD': 0,
    'EdLevel_Undergraduate': 0,
    'Gender_Man': 1,
    'Gender_NonBinary': 0,
    'Gender_Woman': 0,
    'MainBranch_Dev': 1,
    'MainBranch_NotDev': 0,
}
# Same applicant moved to Age_>35. Overriding existing keys keeps the key order.
new_row5 = {**new_row4, 'Age_<35': 0, 'Age_>35': 1}

In [None]:
# Wrap each applicant dict in its own one-row DataFrame.
new_row4_df, new_row5_df = (
    pd.DataFrame([row]) for row in (new_row4, new_row5)
)

In [None]:
# Stack the two applicant rows in a single concat instead of appending them
# one by one to the empty placeholder frame. Concatenating onto an empty frame
# is deprecated in recent pandas, leaves object-dtype columns, and made this
# cell append two more rows every time it was re-run.
# reindex() pins the column order to the layout the model was trained on.
X_bias2 = pd.concat(
    [new_row4_df, new_row5_df],
    ignore_index=True,
).reindex(columns=X_test.columns)
X_bias2

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Age_<35,Age_>35,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,Gender_Man,Gender_NonBinary,Gender_Woman,MainBranch_Dev,MainBranch_NotDev
0,1,13,7,86478,16,1,0,1,0,0,0,0,1,0,0,1,0
1,1,13,7,86478,16,0,1,1,0,0,0,0,1,0,0,1,0


In [None]:
# Make sure every column is an integer before the frame is passed to the model.
X_bias2 = X_bias2.astype(dtype=int)

Explore Age Bias Results

In [None]:
# Class probabilities for the two applicants that differ only in age group.
pred_prob2 = model.predict(x=X_bias2)
pred_prob2



array([[0.6046324 , 0.3953676 ],
       [0.26971015, 0.7302899 ]], dtype=float32)

In [None]:
# Count how many rows of the raw data fall into each gender category.
np.unique(df['Gender'].to_numpy(), return_counts=True)

(array(['Man', 'NonBinary', 'Woman'], dtype=object),
 array([68573,  1371,  3518]))

In [None]:
# Count how many rows of the raw data fall into each age bracket.
np.unique(df['Age'].to_numpy(), return_counts=True)

(array(['<35', '>35'], dtype=object), array([47819, 25643]))