In [38]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [21]:
train_data = pd.read_csv('./train_model.csv')

In [22]:
train_data.head()

Unnamed: 0,PatientGuid,DMIndicator,Gender,Age,BMI,SmokingStatus,SeverityName,ActiveIngredient
0,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine
1,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,celecoxib
2,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine
3,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,celecoxib
4,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine


In [23]:
train_data.shape

(163531, 8)

In [24]:
train_data.isnull().sum()


PatientGuid         0
DMIndicator         0
Gender              0
Age                 0
BMI                 0
SmokingStatus       0
SeverityName        0
ActiveIngredient    0
dtype: int64

In [25]:
train_data.dtypes

PatientGuid          object
DMIndicator         float64
Gender               object
Age                 float64
BMI                 float64
SmokingStatus        object
SeverityName         object
ActiveIngredient     object
dtype: object

In [26]:
model_data = train_data.copy()

In [27]:
model_data.head()

Unnamed: 0,PatientGuid,DMIndicator,Gender,Age,BMI,SmokingStatus,SeverityName,ActiveIngredient
0,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine
1,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,celecoxib
2,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine
3,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,celecoxib
4,5BC4324E-B5D5-4AAB-A000-003EACACE12F,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine


In [28]:
model_data = model_data.drop(['PatientGuid'], axis=1)


In [29]:
model_data.head()

Unnamed: 0,DMIndicator,Gender,Age,BMI,SmokingStatus,SeverityName,ActiveIngredient
0,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine
1,1.0,F,84.0,31.381,smoker,Mild,celecoxib
2,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine
3,1.0,F,84.0,31.381,smoker,Mild,celecoxib
4,1.0,F,84.0,31.381,smoker,Mild,QUEtiapine


In [30]:
model_data.dtypes

DMIndicator         float64
Gender               object
Age                 float64
BMI                 float64
SmokingStatus        object
SeverityName         object
ActiveIngredient     object
dtype: object

### one-hot encoding for columns of the type object to convert them into a numerical format suitable for machine learning models.

In [31]:
# Using pandas to one-hot encode the object columns
model_data_encoded = pd.get_dummies(model_data, columns=['Gender', 'SmokingStatus', 'SeverityName', 'ActiveIngredient'])

model_data_encoded['DMIndicator'] = model_data_encoded['DMIndicator'].astype(int)
model_data_encoded['Age'] = model_data_encoded['Age'].astype(int)

In [32]:
model_data_encoded.shape

(163531, 234)

In [33]:
model_data_encoded.head()

Unnamed: 0,DMIndicator,Age,BMI,Gender_F,Gender_M,SmokingStatus_non-smoker,SmokingStatus_smoker,SeverityName_Mild,SeverityName_Modest,SeverityName_Severe,SeverityName_Very Mild,ActiveIngredient_ALPRAZolam,ActiveIngredient_APAP/ASA/caffeine,ActiveIngredient_APAP/butalbital/caffeine,ActiveIngredient_APAP/chlorpheniramine/phenylephrine,ActiveIngredient_ARIPiprazole,ActiveIngredient_DULoxetine,ActiveIngredient_EPINEPHrine-lidocaine,ActiveIngredient_FLUoxetine,ActiveIngredient_HYDROcodone,ActiveIngredient_HYDROcodone-pseudoephedrine,ActiveIngredient_HYDROmorphone,ActiveIngredient_PACLitaxel,ActiveIngredient_PARoxetine,ActiveIngredient_QUEtiapine,ActiveIngredient_SUMAtriptan,ActiveIngredient_Td,ActiveIngredient_acetaminophen,ActiveIngredient_acetaminophen-HYDROcodone,ActiveIngredient_acetaminophen-codeine,ActiveIngredient_acetaminophen-oxyCODONE,ActiveIngredient_acetaminophen-propoxyphene,ActiveIngredient_acetaminophen-traMADol,ActiveIngredient_alendronate,ActiveIngredient_alfuzosin,ActiveIngredient_aliskiren,ActiveIngredient_aloe vera topical,ActiveIngredient_amLODIPine,ActiveIngredient_amLODIPine-benazepril,ActiveIngredient_amitriptyline,ActiveIngredient_amoxicillin,ActiveIngredient_amoxicillin-clavulanate,ActiveIngredient_amoxicillin/clarithromycin/lansoprazole,ActiveIngredient_ampicillin-sulbactam,ActiveIngredient_aspirin,ActiveIngredient_atorvastatin,ActiveIngredient_azelastine nasal,ActiveIngredient_azithromycin,ActiveIngredient_bacitracin topical,ActiveIngredient_bacitracin/neomycin/polymyxin B topical,ActiveIngredient_benazepril,ActiveIngredient_benzonatate,ActiveIngredient_benzoyl peroxide topical,ActiveIngredient_bimatoprost ophthalmic,ActiveIngredient_bismuth subsalicylate,ActiveIngredient_bisoprolol-hydrochlorothiazide,ActiveIngredient_buPROPion,ActiveIngredient_busPIRone,ActiveIngredient_butorphanol,ActiveIngredient_caffeine-ergotamine,ActiveIngredient_carBAMazepine,ActiveIngredient_ceFAZolin,ActiveIngredient_cefTRIAXone,ActiveIngredient_cefaclor,ActiveIngredient_cefadroxil,ActiveIngredient_cefdinir,ActiveIngredient_cefprozil,ActiveIngredient_cefuroxime,ActiveIngredient_celecoxib,ActiveIngredient_cephalexin,ActiveIngredient_chlorophyllin,ActiveIngredient_chlorpheniramine-HYDROcodone,ActiveIngredient_chlorproMAZINE,ActiveIngredient_chlorzoxazone,ActiveIngredient_cilostazol,ActiveIngredient_ciprofloxacin,ActiveIngredient_citalopram,ActiveIngredient_clarithromycin,ActiveIngredient_clindamycin,ActiveIngredient_cloNIDine,ActiveIngredient_clonazePAM,ActiveIngredient_codeine,ActiveIngredient_codeine-guaiFENesin,ActiveIngredient_codeine/guaiFENesin/PSE,ActiveIngredient_cortisone,ActiveIngredient_cyclobenzaprine,ActiveIngredient_desvenlafaxine,ActiveIngredient_dextromethorphan,ActiveIngredient_diatrizoate,ActiveIngredient_diazepam,ActiveIngredient_dicyclomine,ActiveIngredient_diltiazem,ActiveIngredient_diphenhydrAMINE,ActiveIngredient_divalproex sodium,ActiveIngredient_docusate,ActiveIngredient_doxycycline,"ActiveIngredient_emollients, topical",ActiveIngredient_enalapril,ActiveIngredient_enalapril-hydrochlorothiazide,ActiveIngredient_erythromycin,ActiveIngredient_erythromycin ophthalmic,ActiveIngredient_escitalopram,ActiveIngredient_esomeprazole,ActiveIngredient_eszopiclone,ActiveIngredient_ethinyl estradiol-norethindrone,ActiveIngredient_etodolac,ActiveIngredient_ezetimibe,ActiveIngredient_ezetimibe-simvastatin,ActiveIngredient_fenofibrate,ActiveIngredient_fentaNYL,ActiveIngredient_finasteride,ActiveIngredient_fluoride,ActiveIngredient_fluticasone nasal,ActiveIngredient_fluticasone-salmeterol,ActiveIngredient_fosinopril,ActiveIngredient_furosemide,ActiveIngredient_gabapentin,ActiveIngredient_gemfibrozil,ActiveIngredient_gentamicin,ActiveIngredient_griseofulvin,ActiveIngredient_guanFACINE,ActiveIngredient_haloperidol,ActiveIngredient_heparin flush,ActiveIngredient_hydrochlorothiazide-lisinopril,ActiveIngredient_hydrochlorothiazide-triamterene,ActiveIngredient_hydrocortisone topical,ActiveIngredient_ibuprofen,ActiveIngredient_indomethacin,"ActiveIngredient_influenza virus vaccine, inactivated",ActiveIngredient_iodine I 131 tositumomab,ActiveIngredient_iodine topical,ActiveIngredient_iron polysaccharide,ActiveIngredient_isradipine,ActiveIngredient_ketorolac,ActiveIngredient_labetalol,ActiveIngredient_lamoTRIgine,ActiveIngredient_leuprolide,ActiveIngredient_levETIRAcetam,ActiveIngredient_levofloxacin,ActiveIngredient_lidocaine,ActiveIngredient_lisdexamfetamine,ActiveIngredient_lisinopril,ActiveIngredient_losartan,ActiveIngredient_meclizine,ActiveIngredient_medroxyPROGESTERone,ActiveIngredient_meloxicam,ActiveIngredient_meperidine,ActiveIngredient_mepivacaine,ActiveIngredient_metaxalone,ActiveIngredient_methadone,ActiveIngredient_methocarbamol,ActiveIngredient_methylPREDNISolone,ActiveIngredient_methylergonovine,ActiveIngredient_metoclopramide,ActiveIngredient_metoprolol,ActiveIngredient_metroNIDAZOLE,ActiveIngredient_miconazole topical,ActiveIngredient_mirtazapine,ActiveIngredient_montelukast,ActiveIngredient_morphine,ActiveIngredient_moxifloxacin,ActiveIngredient_mupirocin topical,ActiveIngredient_nabumetone,ActiveIngredient_nafcillin,ActiveIngredient_naloxone-pentazocine,ActiveIngredient_naproxen,ActiveIngredient_neomycin,ActiveIngredient_neomycin topical,ActiveIngredient_niacin,ActiveIngredient_nitrofurantoin,ActiveIngredient_nystatin,ActiveIngredient_obsolete,ActiveIngredient_olmesartan,ActiveIngredient_omega-3 polyunsaturated fatty acids,ActiveIngredient_omeprazole,ActiveIngredient_ondansetron,ActiveIngredient_oxaprozin,ActiveIngredient_oxyCODONE,ActiveIngredient_oxybutynin,ActiveIngredient_oxytocin,ActiveIngredient_paliperidone,ActiveIngredient_pantoprazole,ActiveIngredient_pegfilgrastim,ActiveIngredient_penicillamine,ActiveIngredient_penicillin V potassium,ActiveIngredient_pentazocine,ActiveIngredient_phenytoin,ActiveIngredient_phytonadione,ActiveIngredient_piperacillin-tazobactam,ActiveIngredient_povidone iodine ophthalmic,ActiveIngredient_povidone iodine topical,ActiveIngredient_pravastatin,ActiveIngredient_pregabalin,ActiveIngredient_procaine penicillin,ActiveIngredient_prochlorperazine,ActiveIngredient_propranolol,ActiveIngredient_pseudoephedrine,ActiveIngredient_quinapril,ActiveIngredient_ramipril,ActiveIngredient_ranitidine,ActiveIngredient_risedronate,ActiveIngredient_risperiDONE,ActiveIngredient_rivastigmine,ActiveIngredient_rizatriptan,ActiveIngredient_rosuvastatin,ActiveIngredient_sertraline,ActiveIngredient_silver sulfADIAZINE topical,ActiveIngredient_simvastatin,ActiveIngredient_sulfacetamide sodium ophthalmic,ActiveIngredient_sulfacetamide sodium topical,ActiveIngredient_sulfamethoxazole-trimethoprim,ActiveIngredient_sulfanilamide topical,ActiveIngredient_sulfur topical,ActiveIngredient_tamoxifen,ActiveIngredient_terazosin,ActiveIngredient_terbinafine,ActiveIngredient_terbinafine topical,ActiveIngredient_tetanus toxoid,ActiveIngredient_tetracycline,ActiveIngredient_tetracycline topical,ActiveIngredient_theophylline,ActiveIngredient_thioridazine,ActiveIngredient_tiZANidine,ActiveIngredient_tolmetin,ActiveIngredient_topiramate,ActiveIngredient_traMADol,ActiveIngredient_trifluoperazine,ActiveIngredient_valsartan,ActiveIngredient_vancomycin,ActiveIngredient_vardenafil,ActiveIngredient_venlafaxine,ActiveIngredient_verapamil,ActiveIngredient_warfarin,ActiveIngredient_zolpidem
0,1,84,31.381,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,84,31.381,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,1,84,31.381,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,1,84,31.381,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,1,84,31.381,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# MODEL

In [34]:
#Split data for data_v1
X  = model_data_encoded.drop('DMIndicator', axis=1)
y  = model_data_encoded['DMIndicator']

In [35]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
random_search = RandomizedSearchCV(estimator=model, 
                                   param_distributions=param_grid, 
                                   n_iter=50,  # Number of parameter settings to sample
                                   cv=3, 
                                   n_jobs=-1, 
                                   verbose=2, 
                                   scoring='accuracy',
                                   random_state=42) 

In [40]:
# Initialize the Random Forest Classifier
model = RandomForestClassifier()


In [None]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [21]:
# Fit the grid search to the data
# grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 1.3min
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 1.4min
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 1.4min
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 2.3min
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 2.5min
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 2.5min
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time= 1.

KeyboardInterrupt: 