TEAM: Sehba Wani, Oghenetega Ojegun, Vennela Gedipudi


DATASET: https://www.kaggle.com/datasets/osmi/mental-health-in-tech-survey

QUESTION: In the context of working at a tech job, does the likelihood of seeking treatment for burnout vary based on whether or not an individual has sought mental health help before?

Import libraries


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

Import the data and read it

In [None]:
# Load the raw survey responses into a DataFrame.
# NOTE(review): the path is absolute (filesystem root) — confirm the CSV is staged there.
SURVEY_CSV = '/survey.csv'
pre_df = pd.read_csv(SURVEY_CSV)

In [None]:
# Preview the first five raw rows.
pre_df.head(n=5)

Information about our particular question


Target:
Has the worker sought treatment for a mental health condition (`treatment`: Yes or No)? → Categorical value

Features:
Customer Characteristics of Distinction: age, gender, country, state
Customer job-related characteristics: work interference, remote, tech company(main feature / categorical value), coworkers, leave
Customer history: family history, mental health, physical health, bipolar disorder, obs consequence(memory impairment, logical function and agitation)
Other: comments


cleaning the df


In [None]:
# Remove the survey columns that are not used in this analysis.
# NOTE(review): Country, state and comments are NOT in this list, so they stay in survey_df.
# NOTE(review): confirm every label below exists in the CSV — DataFrame.drop raises KeyError for a missing one.
unused_columns = [
    'Timestamp', 'self_employed', 'no_employees', 'benefits', 'care_options',
    'wellness_program', 'anonymity', 'supervisor', 'mental_health_interview',
    'phys_health_interview', 'mental_vs_physical', 'outcome',
]
survey_df = pre_df.drop(columns=unused_columns)

In [None]:
# Show a short preview instead of rendering the whole frame.
survey_df.head()

cleaning code


In [None]:
# Keep only rows with no missing value in any remaining column.
survey_df = survey_df.dropna()

# Rows whose Gender answer is exactly 'Male' or 'Female'.
# NOTE(review): no later visible cell reads survey_filter; they all keep using survey_df,
# so this filter currently has no effect on the models — confirm whether that is intended.
binary_gender_mask = survey_df['Gender'].isin(['Male', 'Female'])
survey_filter = survey_df[binary_gender_mask]
len(survey_filter)


In [None]:
# Integer-encode the categorical answer columns so their dtype becomes int.
columns_to_encode = ['Gender','family_history','treatment','remote_work','work_interfere','tech_company','seek_help','leave','mental_health_consequence','phys_health_consequence','coworkers','obs_consequence']

# Fit a separate encoder per column and keep it, so every integer code can be
# translated back to its original answer via label_encoders[column].classes_
# (a single shared encoder only remembers the last column it was fit on).
# Run this cell once per fresh survey_df: re-running it fits on the integer
# codes and loses the original answer strings.
label_encoders = {}
for column in columns_to_encode:
    le = LabelEncoder()
    survey_df[column] = le.fit_transform(survey_df[column])
    label_encoders[column] = le

survey_df.info()

In [None]:
# Print the column dtypes and non-null counts of survey_df
# (the same summary is already printed at the end of the previous cell).
survey_df.info()

In [None]:
# Show a short preview of the encoded frame instead of rendering the whole frame.
survey_df.head()

# Setting up the model


In [None]:
# Target: the encoded `treatment` column.
# Features: every other *numeric* column. Text columns that were neither dropped
# nor label-encoded (e.g. Country, state, comments) are excluded here because
# StandardScaler cannot convert strings to floats.
X = survey_df.drop(columns=['treatment']).select_dtypes(include='number')
y = survey_df['treatment']

In [None]:
# Preview the first five feature rows.
X.head(n=5)

In [None]:
# Number of rows whose encoded treatment value is 0.
int((survey_df["treatment"] == 0).sum())

In [None]:
# Number of rows whose encoded treatment value is 1.
int((survey_df["treatment"] == 1).sum())

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Standardise the features with statistics learned from the training split only,
# then fit a default logistic regression and predict the held-out rows.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

LogReg = LogisticRegression().fit(X_train, y_train)
y_pred = LogReg.predict(X_test)

In [None]:
# Report the fitted model's class labels, intercept and coefficients.
fitted_params = (
    ('Classes', LogReg.classes_),
    ('Intercept', LogReg.intercept_),
    ("Coefficients", LogReg.coef_),
)
for label, value in fitted_params:
    print(label, value)

In [None]:
# Accuracy on the held-out split, followed by the per-class report.
test_accuracy = LogReg.score(X_test, y_test)
print("Accuracy", test_accuracy)

test_predictions = LogReg.predict(X_test)
print(classification_report(y_test, test_predictions))

In [None]:
# Majority-class baseline: always predicts the most frequent training label.
# It is fit and scored here so the logistic regression's accuracy has a
# reference point; an unfitted DummyClassifier provides no baseline at all.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
print("Baseline accuracy:", dummy_clf.score(X_test, y_test))

In [None]:
# Accuracy, precision and recall of the held-out predictions.
metric_functions = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
)
for label, metric in metric_functions:
    print(label, metric(y_test, y_pred))

# Confusion Matrix

In [None]:
# Confusion matrix of the held-out predictions (rows: actual, columns: predicted).
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_mat

In [None]:
# Dummy classifier in SKlearn
# NOTE(review): this cell re-imports DummyClassifier and re-creates dummy_clf exactly as an
# earlier cell does. The instance created here is unfitted, so it must be fit before any
# predict/score call; nothing in the cells that follow uses it.
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")

In [None]:
# Heatmap of the confusion matrix for the overall model.
# Tick labels are handed to sns.heatmap directly: ticks set with plt.xticks /
# plt.yticks *before* the heatmap call are replaced by the heatmap's own ticks.
categories = [0, 1]  # encoded treatment classes
fig, ax = plt.subplots()
sns.heatmap(
    pd.DataFrame(conf_mat),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    xticklabels=categories,
    yticklabels=categories,
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after the title and labels exist so they are accounted for.
fig.tight_layout()

F1 score

In [None]:
# Summary metrics for the held-out predictions, including the F1 score
# (harmonic mean of precision and recall) that this section is about.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

# FAIRNESS W/ DEMOGRAPHIC VARIABLE: GENDER


In [None]:
# Quick look at the data (still label-encoded); a short preview instead of the whole frame.
survey_df.head()

## Male only

In [None]:
# Select the rows whose original (pre-encoding) Gender answer is exactly 'Male'.
# survey_df keeps pre_df's row index, so the raw answer is looked up there rather
# than hard-coding the integer LabelEncoder happened to assign to 'Male' — that
# integer changes whenever the set of Gender answers in the data changes.
male_mask = pre_df.loc[survey_df.index, "Gender"].eq("Male")
df_male = survey_df[male_mask]

# Print the number of rows in the filtered DataFrame to see what we get
print("Number of rows with 'Male' gender:", len(df_male))

In [None]:
# Re-standardise whatever X_train / X_test currently hold with a fresh scaler.
# NOTE(review): the next cell rebuilds X_train / X_test from df_male and scales them again,
# so the arrays produced here are overwritten before anything reads them — this cell
# looks redundant; confirm and remove.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Male-only model.
# Features are every numeric column except the target; text columns that were
# never encoded cannot be standardised by StandardScaler, so they are excluded.
# NOTE(review): this fits a separate model on the male subset rather than evaluating
# the overall model on it — confirm that is the intended fairness comparison.
X = df_male.drop("treatment", axis=1).select_dtypes(include='number')
y = df_male.treatment

# 75/25 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
LogReg = LogisticRegression()

# Scale using statistics from the training split only.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

LogReg.fit(X_train, y_train)

y_pred = LogReg.predict(X_test)

In [None]:
# Report the fitted model's class labels, intercept and coefficients.
fitted_params = (
    ('Classes', LogReg.classes_),
    ('Intercept', LogReg.intercept_),
    ("Coefficients", LogReg.coef_),
)
for label, value in fitted_params:
    print(label, value)

In [None]:
# Accuracy on the held-out split, followed by the per-class report.
test_accuracy = LogReg.score(X_test, y_test)
print("Accuracy", test_accuracy)

test_predictions = LogReg.predict(X_test)
print(classification_report(y_test, test_predictions))

In [None]:
# Confusion matrix of the held-out predictions (rows: actual, columns: predicted).
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_mat

In [None]:
# Heatmap of the current confusion matrix.
# The axes of a confusion matrix are the *treatment* classes (0 / 1), not the
# encoded Gender values 4 / 8. Labels are passed to sns.heatmap directly because
# ticks set with plt.xticks / plt.yticks before the heatmap call are replaced by it.
categories = [0, 1]  # encoded treatment classes
fig, ax = plt.subplots()
sns.heatmap(
    pd.DataFrame(conf_mat),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    xticklabels=categories,
    yticklabels=categories,
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after the title and labels exist so they are accounted for.
fig.tight_layout()

## Female only

In [None]:
# Female-only model.
# Select rows by the original (pre-encoding) Gender answer: survey_df keeps pre_df's
# row index, so the raw answer is looked up there instead of hard-coding the integer
# LabelEncoder happened to assign to 'Female'.
female_mask = pre_df.loc[survey_df.index, "Gender"].eq("Female")
df_female = survey_df[female_mask]

# Features are every numeric column except the target; text columns that were
# never encoded cannot be standardised by StandardScaler, so they are excluded.
# NOTE(review): this fits a separate model on the female subset rather than evaluating
# the overall model on it — confirm that is the intended fairness comparison.
X = df_female.drop("treatment", axis=1).select_dtypes(include='number')
y = df_female.treatment

# 75/25 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
LogReg = LogisticRegression()

# Scale using statistics from the training split only.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

LogReg.fit(X_train, y_train)

y_pred = LogReg.predict(X_test)

In [None]:
# Report how many rows ended up in the female subset.
female_row_count = df_female.shape[0]
print("Number of rows with 'Female' gender:", female_row_count)

Number of rows with 'Female' gender: 202


In [None]:
# Accuracy on the held-out split, followed by the per-class report.
test_accuracy = LogReg.score(X_test, y_test)
print("Accuracy", test_accuracy)

test_predictions = LogReg.predict(X_test)
print(classification_report(y_test, test_predictions))

In [None]:
# Confusion matrix of the current held-out predictions, drawn as a heatmap.
conf_mat = confusion_matrix(y_test, y_pred)

# The axes of a confusion matrix are the *treatment* classes (0 / 1), not the
# encoded Gender values 4 / 8. Labels are passed to sns.heatmap directly because
# ticks set with plt.xticks / plt.yticks before the heatmap call are replaced by it.
categories = [0, 1]  # encoded treatment classes
fig, ax = plt.subplots()
sns.heatmap(
    pd.DataFrame(conf_mat),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    xticklabels=categories,
    yticklabels=categories,
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after the title and labels exist so they are accounted for.
fig.tight_layout()