### Imports and helpers

In [15]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn import datasets
from catboost import CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor

import seaborn as sns
import scipy.stats as st

import warnings
warnings.filterwarnings('ignore')

import os

import matplotlib.pyplot as plt

In [2]:
# Read back the current pandas display limits; only the last expression
# (the column limit) is echoed as the cell output.
pd.get_option("display.max_rows")
pd.get_option("display.max_columns")

20

### Load pickled data

In [4]:
# Load the merged DALY dataset. The context manager closes the file handle as
# soon as the frame is read; a bare open() call would leave it open.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles produced by a trusted step of this project.
with open("./pickles/DALY_merged.pkl", 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [5]:
# Summarise the frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14880 entries, 0 to 14879
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Code                            14880 non-null  object 
 1   Year                            14880 non-null  int64  
 2   Schizophrenia                   14880 non-null  float64
 3   Bipolar disorder                14880 non-null  float64
 4   Eating disorders                14880 non-null  float64
 5   Anxiety disorders               14880 non-null  float64
 6   Drug use disorders              14880 non-null  float64
 7   Depressive disorders            14880 non-null  float64
 8   Alcohol use disorders           14880 non-null  float64
 9   Mental disorders                14880 non-null  float64
 10  Disability-Adjusted Life Years  14880 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [6]:
# Preview the first rows.
# NOTE(review): the rows displayed share the same Code/Year while other columns
# differ — possibly duplicated keys from the upstream merge; confirm.
df.head()

Unnamed: 0,Code,Year,Schizophrenia,Bipolar disorder,Eating disorders,Anxiety disorders,Drug use disorders,Depressive disorders,Alcohol use disorders,Mental disorders,Disability-Adjusted Life Years
0,AFG,1990,0.228979,0.721207,0.131001,4.835127,0.454202,5.125291,0.444036,16.659229,1.69667
1,AFG,1990,0.228979,0.721207,0.131001,4.835127,0.454202,5.125291,0.444036,16.659229,1.417621
2,AFG,1990,0.228979,0.721207,0.131001,4.835127,0.454202,5.125291,0.444036,12.411191,1.69667
3,AFG,1990,0.228979,0.721207,0.131001,4.835127,0.454202,5.125291,0.444036,12.411191,1.417621
4,AFG,1990,0.216352,0.581938,0.102983,3.537832,0.43711,4.725167,1.152445,16.659229,1.69667


In [7]:
# Preview the last rows
df.tail()

Unnamed: 0,Code,Year,Schizophrenia,Bipolar disorder,Eating disorders,Anxiety disorders,Drug use disorders,Depressive disorders,Alcohol use disorders,Mental disorders,Disability-Adjusted Life Years
14875,ZWE,2015,0.209359,0.560882,0.09961,3.315701,0.599604,3.548613,1.734969,11.156429,2.193166
14876,ZWE,2016,0.209979,0.561768,0.100821,3.32423,0.603658,3.557508,1.689281,11.164133,2.279813
14877,ZWE,2017,0.210631,0.562612,0.101671,3.330569,0.608096,3.564138,1.651805,11.170427,2.364265
14878,ZWE,2018,0.211237,0.563283,0.102398,3.3175,0.609065,3.563141,1.686711,11.158765,2.472949
14879,ZWE,2019,0.211969,0.56382,0.102902,3.283934,0.610644,3.554571,1.776729,11.119021,2.525892


### Modeling 

In [8]:
# Name of the column to predict; all remaining columns are used as features
target = 'Disability-Adjusted Life Years'

In [9]:
# Label-encoding helper
def label_encoding(old_column):
    """Replace each distinct label in ``old_column`` with an integer code.

    A fresh ``LabelEncoder`` is fitted on the column and used to transform
    that same column, so the codes only reflect values present in it.

    Parameters
    ----------
    old_column : array-like
        One-dimensional collection of labels to encode.

    Returns
    -------
    array-like
        The encoded labels, one code per input element.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(old_column)

In [10]:
# Label encode categorical (string-valued) columns.
# A column counts as categorical when its first value is a string. The check
# is positional (.iloc[0]) so it does not require the index to contain the
# label 0, uses isinstance() so str subclasses are recognised, and is skipped
# entirely for an empty frame instead of raising.
for col in df.columns:
    if len(df) and isinstance(df[col].iloc[0], str):
        df[col] = label_encoding(df[col])

In [11]:
# Build the model inputs as plain NumPy arrays:
# X holds every column except the target, y holds the target column.
X = df.drop(columns=[target]).values
y = df[target].values

In [16]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2022,
)

In [20]:
# Baseline model: ordinary least squares linear regression.
# n_jobs only controls how many jobs are used for the computation, not the
# fitted model, so there is nothing to tune here. The search wrapper is kept
# so clf1 exposes the same RandomizedSearchCV interface as the other models,
# but it is limited to the four distinct values rather than 100 redundant draws.
model_1 = LinearRegression()

params = {
    "n_jobs": [1, 2, 3, 4]
}

# Random Search Training with 5 folds Cross Validation
# (seeded with the same value as the train/test split for reproducibility)
clf1 = RandomizedSearchCV(model_1, params, cv=5, n_iter=4, random_state=2022)

clf1.fit(X_train, y_train)

y_pred = clf1.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred)
results = mean_squared_error(y_test, y_pred)
print(results)

2.6885261425953018


In [21]:
# Random forest regressor tuned by randomized search.
# Both the estimator and the search are seeded (same value as the train/test
# split) so the sampled candidates and the fitted forests are reproducible.
model_2 = RandomForestRegressor(random_state=2022)

# scipy's randint(low, high) samples integers in [low, high): high is exclusive
params = {
    "n_estimators": st.randint(50, 150),
    "max_depth": st.randint(2, 40),
    "min_samples_split": st.randint(2, 40),
    "min_samples_leaf": st.randint(2, 20),
}

# Random Search Training with 5 folds Cross Validation
clf2 = RandomizedSearchCV(
    model_2, params, cv=5, n_jobs=1, n_iter=100, random_state=2022
)

clf2.fit(X_train, y_train)

y_pred = clf2.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred)
results = mean_squared_error(y_test, y_pred)
print(results)

1.5790314915928862


In [22]:
# K-nearest-neighbours regressor tuned by randomized search.
# NOTE(review): distances are computed on the raw feature values; Year spans
# 1990-2019 while the prevalence columns are much smaller, so Year may dominate
# the metric — consider scaling the features; confirm intent.
model_3 = KNeighborsRegressor()

# scipy's randint(low, high) samples integers in [low, high): high is exclusive
params = {
    "n_neighbors": st.randint(3, 10),
    "p": st.randint(1, 5),  # power parameter of the Minkowski metric
}

# Random Search Training with 5 folds Cross Validation
# (seeded so the sampled candidates are the same on every run)
clf3 = RandomizedSearchCV(
    model_3, params, cv=5, n_jobs=1, n_iter=100, random_state=2022
)

clf3.fit(X_train, y_train)

y_pred = clf3.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred)
results = mean_squared_error(y_test, y_pred)
print(results)

1.7874686968239537


In [23]:
# Gradient-boosted trees (XGBoost) regressor tuned by randomized search.
# Both the estimator and the search are seeded (same value as the train/test
# split) so results are reproducible.
model_4 = XGBRegressor(random_state=2022)

# scipy conventions used below:
#   randint(low, high)  -> integers in [low, high)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
#   expon(loc, scale)   -> exponential with the given scale, shifted by loc
#   beta(a, b)          -> floats in [0, 1]
# scale_pos_weight is intentionally not searched: XGBoost documents it as a
# positive/negative class balance weight for unbalanced classification, which
# is not a meaningful knob for this regression target.
params = {
    "n_estimators": st.randint(3, 40),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),  # i.e. [0.05, 0.45]
    "colsample_bytree": st.beta(10, 1),
    "subsample": st.beta(10, 1),
    "gamma": st.uniform(0, 10),
    "min_child_weight": st.expon(0, 50),
}

# Random Search Training with 5 folds Cross Validation
clf4 = RandomizedSearchCV(
    model_4, params, cv=5, n_jobs=1, n_iter=100, random_state=2022
)

clf4.fit(X_train, y_train)

y_pred = clf4.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred)
results = mean_squared_error(y_test, y_pred)
print(results)

1.549464288583638


In [24]:
# save models
# Each fitted search object is written inside a context manager so its file
# handle is flushed and closed right after the dump; bare open() calls would
# leave all four handles open.
fitted_searches = {
    './pickles/clf1_lr.pkl': clf1,
    './pickles/clf2_rf.pkl': clf2,
    './pickles/clf3_kn.pkl': clf3,
    './pickles/clf4_xgb.pkl': clf4,
}
for pkl_path, fitted_search in fitted_searches.items():
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(fitted_search, pkl_file)