In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the labelled training data and the unlabelled data to be scored.
train = pd.read_csv(
    filepath_or_buffer="../input/treatment-prediction-of-employees/training.csv"
)
new_data = pd.read_csv(
    filepath_or_buffer="../input/treatment-prediction-of-employees/test_1.csv"
)

In [None]:
# Work on copies so the frames as loaded from disk stay untouched.
df, df_new = train.copy(), new_data.copy()

In [None]:
# Preview the first rows of the working training frame.
df.head()

In [None]:
# Keep the 'S.No' column aside as its own Series before the frame is
# reduced to the modelling columns (it is not referenced again in the cells shown here).
s_no=df['S.No']

In [None]:
# Report (rows, columns) for the training data, then for the data to be scored.
for frame in (train, new_data):
    print(frame.shape)

In [None]:
## The data contains many categorical variables, so we look at how many unique values each categorical column has.

In [None]:
# Predictor columns shared by the training frame and the frame to be scored.
feature_columns = [
    'Age', 'Gender', 'Country', 'state', 'self_employed', 'family_history',
    'work_interfere', 'remote_work', 'tech_company', 'benefits', 'care_options',
    'wellness_program', 'seek_help', 'anonymity', 'leave',
    'mental_health_consequence', 'phys_health_consequence', 'coworkers',
    'supervisor', 'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical', 'obs_consequence',
]

# The training frame keeps the 'treatment' target as its FIRST column; the
# positional X / y split later in the notebook relies on that order.
# .copy() detaches the selections from the frames they were sliced from, so
# the column assignments in later cells do not raise SettingWithCopyWarning.
df = df[['treatment'] + feature_columns].copy()
df_new = df_new[feature_columns].copy()

In [None]:
# Names of the string-typed (object dtype) columns of each working frame.
# The dtypes are read from the working frames themselves rather than from the
# raw `train` / `new_data` frames, so the lists always describe the data that
# the following cells actually iterate over.
categorical_df = df.select_dtypes(include='object').columns.tolist()
categorical_df_new = df_new.select_dtypes(include='object').columns.tolist()

In [None]:
# Show every distinct value of each categorical training column.
for col_name in categorical_df:
    print(col_name, df[col_name].unique())

In [None]:
# Show every distinct value of each categorical column in the data to be scored.
for col_name in categorical_df_new:
    print(col_name, df_new[col_name].unique())

In [None]:
## Gender contains many different spellings of the same answer, so we consolidate the values into three categories.

In [None]:
# Distinct Gender values in the training frame, then in the frame to be scored.
for frame in (df, df_new):
    print(frame['Gender'].unique())

In [None]:
# Collapse the free-text Gender answers into three labels.  'transgender' is
# kept as the label for every answer that is neither clearly male nor clearly
# female, so the resulting category names are unchanged.
#
# Fix: answers that explicitly say "cis" ('Cis Female', 'Cis Male',
# 'Male (CIS)', 'Female (cis)', 'cis male', 'cis-female/femme', 'Cis Man')
# used to be labelled 'transgender'; cisgender respondents are now mapped to
# 'female' / 'male'.  All other spellings keep their previous label.
train_gender_groups = {
    'female': [
        'Female', 'F', 'Woman', 'f', 'Femake', 'woman', 'Female ',
        'Cis Female', 'cis-female/femme', 'Female (cis)',
    ],
    'male': [
        'M', 'Male', 'm', 'Mal', 'Make', 'Male ', 'Man', 'msle', 'Mail',
        'Malr', 'maile', 'Cis Male', 'Male (CIS)', 'cis male',
    ],
    'transgender': [
        'Trans-female', 'something kinda male?', 'non-binary', 'Nah', 'All',
        'Enby', 'fluid', 'Genderqueer', 'Androgyne', 'Agender',
        'Guy (-ish) ^_^', 'male leaning androgynous', 'Trans woman', 'Neuter',
        'Female (trans)', 'queer', 'A little about you', 'Male-ish',
        'queer/she/they',
    ],
}
new_gender_groups = {
    'female': ['female', 'Female', 'F', 'Woman', 'femail', 'f'],
    'male': ['Male', 'M', 'male', 'Male ', 'm', 'Cis Man'],
    'transgender': ['p', 'ostensibly male, unsure what that really means'],
}


def _spelling_to_label(groups):
    """Invert {label: [raw spellings]} into {raw spelling: label}."""
    return {raw: label for label, raws in groups.items() for raw in raws}


df['Gender'] = df['Gender'].replace(_spelling_to_label(train_gender_groups))
df_new['Gender'] = df_new['Gender'].replace(_spelling_to_label(new_gender_groups))

In [None]:
# Confirm which Gender values remain after consolidation, in both frames.
for frame in (df, df_new):
    print(frame['Gender'].unique())

In [None]:
## Look at the value counts of the categorical variables to check that the data is not biased.
## Many categorical variables record the answer as 'don't know', which is treated here as a null value.
## A few categorical variables have the answer 'some of them', which indicates a partial or complete yes.

In [None]:
# Remove the two location columns from both frames.
location_columns = ['state', 'Country']
df = df.drop(columns=location_columns)
df_new = df_new.drop(columns=location_columns)

In [None]:
## state is removed because it has too many missing values.
## Country is removed because it has too many unique values.

In [None]:
# Print the answer frequencies of several survey columns; the last column's
# counts are left as the cell's displayed value.
printed_columns = [
    'mental_health_consequence',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'mental_health_interview',
]
for col_name in printed_columns:
    print(df[col_name].value_counts())
df['wellness_program'].value_counts()

In [None]:
# Encode the Yes/No columns of the training frame (target included) as 1/0.
# Any value other than 'Yes' or 'No' becomes NaN, as Series.map does for unmapped keys.
yes_no_codes = {'Yes': 1, 'No': 0}
for col_name in ['remote_work', 'family_history', 'tech_company', 'obs_consequence', 'treatment']:
    df[col_name] = df[col_name].map(yes_no_codes)

In [None]:
# Encode the Yes/No predictor columns of the frame to be scored as 1/0
# (it has no 'treatment' column to encode).
yes_no_codes = {'Yes': 1, 'No': 0}
for col_name in ['remote_work', 'family_history', 'tech_company', 'obs_consequence']:
    df_new[col_name] = df_new[col_name].map(yes_no_codes)

In [None]:
## The coworkers and supervisor variables have a special category, 'Some of them'.
## 'Some of them' is treated as yes, because these variables record whether the respondent would discuss mental health issues or not,
## and 'Some of them' says that they do discuss them with a few people.

In [None]:
# Collapse 'Some of them' and 'Yes' into '1' and 'No' into '0' for both
# columns in both frames.  The codes are strings, so the columns keep object
# dtype and are still picked up by the one-hot encoding step later on.
discussion_codes = {'Some of them': '1', 'No': '0', 'Yes': '1'}
for frame in (df, df_new):
    for col_name in ('coworkers', 'supervisor'):
        frame[col_name] = frame[col_name].replace(discussion_codes)

In [None]:
## The categorical columns now still contain the answers 'don't know' and 'maybe'.
## Since many columns contain 'don't know' (null values), imputing them with a statistical method might introduce too much bias into the data.
## Apart from statistical methods, we could use random imputation, which might also bias the data.
## For health data, sensitivity matters: not biasing the data and not losing data are both extremely important.
## Therefore all missing values recorded as 'don't know' are given special treatment in one-hot encoding by creating a dedicated column for these null values.

Data Analysis

In [None]:
# Check the raw Age distribution for outliers.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
# The bin count is fixed because automatic bin selection can produce an
# enormous number of bins when extreme outliers stretch the value range.
sns.histplot(df['Age'], bins=50, kde=True, stat='density')

In [None]:
# Frequency of each distinct Age value in the training frame.
df['Age'].value_counts()

In [None]:
# Display the most frequent Age value(s); the extreme ages are replaced
# with the mode in the next cell.
df['Age'].mode()

In [None]:
# Overwrite the four implausible Age entries with 29
# (presumably the mode shown by the previous cell - confirm against its output).
implausible_ages = [99999999999, -29, 329, -1726]
df['Age'] = df['Age'].replace(implausible_ages, 29)

In [None]:
# Age distribution after the implausible values were replaced.
# histplot (KDE overlay, density scale) replaces the deprecated sns.distplot.
sns.histplot(df['Age'], kde=True, stat='density')

In [None]:
## Removing just a few outliers has completely changed the age distribution, which was misleading earlier.

In [None]:
# Most frequent Age value(s) in the frame to be scored, displayed for reference.
# Note: the replacement in the next cell uses a literal value, not `age_mode`.
age_mode=df_new['Age'].mode()
age_mode

In [None]:
# Overwrite the invalid Age of -1 with 32
# (presumably the mode displayed by the previous cell - confirm against its output).
df_new['Age'] = df_new['Age'].replace(-1, 32)

In [None]:
# Age distribution of the frame to be scored after fixing the invalid entry.
# histplot (KDE overlay, density scale) replaces the deprecated sns.distplot.
sns.histplot(df_new['Age'], kde=True, stat='density')

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string-typed columns at this point; recent pandas
# versions raise when DataFrame.corr() meets non-numeric data, so the numeric
# columns are selected explicitly.  The matrix is computed once and reused
# for the plot (seaborn is already imported as `sns` at the top of the notebook).
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Mean of family_history (coded 1/0) for each treatment outcome.
sns.barplot(data=df, y='family_history', x='treatment')

In [None]:
## We can see that people who have a family history of medical issues were more likely to need treatment.

In [None]:
# Positional split: column 0 is the target ('treatment' was placed first when
# the columns were selected), every remaining column is a predictor.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
# Names of the remaining string-typed columns of the training frame.
object_summary = df.describe(include='object')
categorical_columns = object_summary.columns.to_list()

In [None]:
# One-hot encode the categorical predictors, dropping the first level of each.
# The column list must be passed as `columns=`: the second positional
# parameter of pd.get_dummies is `prefix`, so passing the list positionally
# only worked because the prefix list happened to line up with the object columns.
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

In [None]:
# Names of the remaining string-typed columns of the frame to be scored.
object_summary_new = df_new.describe(include='object')
categorical_columns2 = object_summary_new.columns.to_list()

In [None]:
# One-hot encode the frame to be scored so that it matches the training
# design matrix `X` column for column.
# * `columns=` is passed by keyword (the second positional parameter of
#   pd.get_dummies is `prefix`, not the column list).
# * All levels are encoded here and the frame is then aligned to X.columns:
#   levels that were dropped or never seen in training are discarded, and
#   training levels absent from this data become all-zero columns.  Dropping
#   the "first" level independently could drop a different level than in
#   training whenever the two frames do not contain the same categories.
df_new = pd.get_dummies(df_new, columns=categorical_columns2, drop_first=False)
df_new = df_new.reindex(columns=X.columns, fill_value=0)

In [None]:
# (rows, columns) of the encoded training design matrix.
X.shape

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Estimate the mutual information between every predictor and the target.
# The estimator is randomised, so a fixed seed (the same 42 used for the
# train/test split) keeps the ranking reproducible across runs.
mutual_info = mutual_info_classif(X, y, random_state=42)

In [None]:
# Label the mutual-information scores with their feature names and show
# them from highest to lowest.
mutual_data=pd.Series(mutual_info,index=X.columns)
mutual_data.sort_values(ascending=False)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Fit an extra-trees ensemble on all predictors to obtain feature importances.
# The ensemble is randomised; seeding it (same 42 as the train/test split)
# makes the importance ranking reproducible.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)

In [None]:
# Horizontal bar chart of the (up to) 37 largest feature importances.
ranked_features = pd.Series(model.feature_importances_, index=X.columns)
top_ranked = ranked_features.nlargest(37)
top_ranked.plot.barh()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the seed fixes which rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used for the feature matrices in the following cells.
sc=StandardScaler()

In [None]:
# Fit the scaler on the training split only, apply it to both splits, and
# wrap the resulting arrays back into DataFrames carrying the feature names.
feature_names = X_test.columns
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(sc.transform(X_test), columns=feature_names)

In [None]:
# Scale the frame to be scored with the statistics learned from the training
# split.  Calling fit_transform here would refit `sc` on the new data, so the
# new rows would be standardised differently from the data the models were
# trained on (and the fitted training scaler would be overwritten).
# The columns are first aligned by NAME to the training features; relabelling
# the array positionally would silently mislabel features if the order or the
# set of dummy columns differed.
df_new = df_new.reindex(columns=X_test.columns, fill_value=0)
df_new = pd.DataFrame(sc.transform(df_new), columns=X_test.columns)

In [None]:
from sklearn.linear_model import LogisticRegression
# Model 1: logistic regression with default settings, fit on the scaled training split.
model1= LogisticRegression()
model1.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of model 1 on the data it was trained on.
y_pred_train= model1.predict(X_train)
accuracy_score(y_train,y_pred_train)

In [None]:
# Accuracy of model 1 on the held-out test split.
y_pred_test= model1.predict(X_test)
accuracy_score(y_test,y_pred_test)

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
# 10-fold cross-validated recall and precision for model 1.
# A single cross_validate call scores both metrics on the same fitted models,
# instead of repeating the whole 10-fold fit once per metric.
# NOTE(review): the folds are drawn from the unscaled X, whereas model1 above
# was fit on the standardised training split - confirm this is intended.
cv_scores = cross_validate(estimator=model1, X=X, y=y, cv=10, scoring=('recall', 'precision'))
rec = cv_scores['test_recall'].mean()
pre = cv_scores['test_precision'].mean()
print(rec)
print(pre)
# F1 as the harmonic mean of the averaged precision and recall.
f1score = 2*((pre*rec)/(pre+rec))
f1score

In [None]:
from sklearn.svm import SVC
# Unfitted support-vector classifier used as the base estimator of the grid search.
model2=SVC()

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the SVC: every combination of C and kernel.
param = dict(
    C=[1, 10, 50, 100],
    kernel=['linear', 'rbf', 'sigmoid'],
)

In [None]:
# Exhaustive search over `param`, scored by accuracy with 10-fold cross-validation.
my_grid=GridSearchCV(estimator=model2,param_grid=param,scoring='accuracy',cv=10)

In [None]:
# Run the grid search on the scaled training split.
my_grid.fit(X_train,y_train)

In [None]:
# Best hyper-parameter combination and its mean cross-validated accuracy, one per line.
print(my_grid.best_params_, my_grid.best_score_, sep='\n')

In [None]:
# Model 2: SVC with fixed hyper-parameters, fit on the scaled training split.
# NOTE(review): C and kernel are hard-coded, presumably copied from the grid
# search output above - confirm they still match my_grid.best_params_.
model2=SVC(C=1,kernel='sigmoid')
model2.fit(X_train,y_train)

In [None]:
# Accuracy of model 2 on the data it was trained on.
y_pred_train= model2.predict(X_train)
accuracy_score(y_train,y_pred_train)

In [None]:
# Accuracy of model 2 (the SVC) on the held-out test split.
# Fix: this cell previously predicted with model1, so it reported the
# logistic-regression test accuracy again instead of the SVC's.
y_pred_test = model2.predict(X_test)
accuracy_score(y_test, y_pred_test)

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
# 10-fold cross-validated recall and precision for model 2.
# A single cross_validate call scores both metrics on the same fitted models,
# instead of repeating the whole 10-fold fit once per metric.
# NOTE(review): the folds are drawn from the unscaled X, whereas model2 above
# was fit on the standardised training split - confirm this is intended.
cv_scores = cross_validate(estimator=model2, X=X, y=y, cv=10, scoring=('recall', 'precision'))
rec = cv_scores['test_recall'].mean()
pre = cv_scores['test_precision'].mean()
print(rec)
print(pre)
# F1 as the harmonic mean of the averaged precision and recall.
f1score = 2*((pre*rec)/(pre+rec))
f1score

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Model 3: AdaBoost with default settings, fit on the scaled training split.
# The seed (same 42 as the train/test split) makes the fitted ensemble - and
# therefore the persisted predictions - reproducible across runs.
model3 = AdaBoostClassifier(random_state=42)
model3.fit(X_train, y_train)

In [None]:
## using pipelines to predict new data

In [None]:
from sklearn.pipeline import make_pipeline
# Chain the scaler and model 3 into one estimator and fit it on the training split.
# NOTE(review): X_train was already standardised in an earlier cell, so the
# scaler step here is fit on scaled data; the pipeline therefore expects
# pre-scaled input rather than raw features - confirm this is intended.
# NOTE(review): the pipeline is built from the existing `sc` and `model3`
# objects, so fitting it presumably refits those same objects - verify.
pipe=make_pipeline(sc,model3)
pipe.fit(X_train,y_train)

In [None]:
# Pipeline predictions for the training split (displayed as the cell output).
y_pred_train= pipe.predict(X_train)
y_pred_train

In [None]:
# Pipeline predictions for the held-out test split (displayed as the cell output).
# Predict through `pipe`, exactly as for the training split, so that train and
# test rows pass through the same preprocessing step before reaching model3;
# calling model3 directly skipped the pipeline's scaler.
y_pred = pipe.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the pipeline on the training split (y_pred_train comes from pipe.predict above).
accuracy_score(y_train,y_pred_train)

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
# 10-fold cross-validated recall and precision for model 3.
# A single cross_validate call scores both metrics on the same fitted models,
# instead of repeating the whole 10-fold fit once per metric (which, for a
# randomised estimator, could also score the two metrics on different fits).
cv_scores = cross_validate(estimator=model3, X=X, y=y, cv=10, scoring=('recall', 'precision'))
rec = cv_scores['test_recall'].mean()
pre = cv_scores['test_precision'].mean()
print(rec)
print(pre)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test-split predictions `y_pred` against the true labels.
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# F1 as the harmonic mean of the most recently computed mean precision and mean recall.
f1score= 2*((pre*rec)/(pre+rec))
f1score

In [None]:
## Both logistic regression and AdaBoost perform equally well in terms of precision as well as recall.

In [None]:
import pickle
# Persist the fitted pipeline.  The context manager flushes and closes the
# file as soon as the dump finishes; with a handle that is never closed, the
# buffered data may not be fully written when the next cell reads the file back.
with open("treatment.pickle", "wb") as output:
    pickle.dump(pipe, output)

In [None]:
# Load the persisted pipeline back from disk, closing the file afterwards.
# The handle gets its own name so that `model` (the fitted extra-trees
# classifier from the feature-importance step) is no longer overwritten by a
# file object.
# NOTE: pickle.load can execute arbitrary code; it is only used here on the
# file this notebook has just written itself.
with open("treatment.pickle", "rb") as model_file:
    treat = pickle.load(model_file)

In [None]:
# Display the reloaded pipeline's predictions for the new data.
treat.predict(df_new)

In [None]:
# Keep the predictions for the new data in `final_pred`.
final_pred=treat.predict(df_new)

In [None]:
# Attach the predictions to the scored frame as a new 'treatment' column.
# Note: df_new now carries an extra non-feature column, so the predict cells
# above cannot simply be re-run on it without a fresh kernel run.
df_new['treatment']=final_pred

In [None]:
# Display the scored frame (scaled features plus the predicted 'treatment').
df_new