In [1]:
import pandas as pd
import numpy as np

### Model 1.1.1:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at SOC level
 - Model: Random Forest, 100 estimators
 - Input Data: 2008, Quarter 2
 
### Model 1.2.1:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at HLGT level
 - Model: Random Forest, 100 estimators
 - Input Data: 2008, Quarter 2
 
### Model 1.3.1:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at HLT level
 - Model: Random Forest, 100 estimators
 - Input Data: 2008, Quarter 2
 
(Convention: Model_X_Y_Z where X is data input size (rows), Y is feature size (cols), and Z is model type (e.g. RF))

In [2]:
# --- Run configuration ---
# Whether this is the first run: True starts a fresh results table,
# False loads the existing one from `file_for_results`.
firstrun = True

# Outcome column used as the prediction target, and the label under which
# this run's metrics are stored in the results table.
test_col = 'seriousness_lifethreatening'
new_col_name = 'Model 1.1 (Life Threatening)'

# Input table, results table, and pickled-model locations.
file_to_read = 'progress/modeling/df_ML_model_1_1.csv'
file_for_results = 'progress/modeling/model_1_1_RF_results.csv'
model_filename = './progress/modeling/RF_1_1.pkl'

In [3]:
# Load the table produced by the wrangling step; the first CSV column is the index.
df = pd.read_csv(file_to_read, index_col=0)

# All outcome columns. Only `test_col` is kept (as the target); the other
# outcomes are dropped so the frame holds just the features plus the target.
all_cols = [
    'serious',
    'seriousness_congential_anomali',
    'seriousness_death',
    'seriousness_disabling',
    'seriousness_hospitalization',
    'seriousness_lifethreatening',
    'seriousness_other',
]
drop_cols = [outcome for outcome in all_cols if outcome != test_col]
df = df.drop(columns=drop_cols)

df.head()

Unnamed: 0,ABACAVIR,ABACAVIR SULFATE,ACETAMINOPHEN AND CODEINE,ADENOSINE,ALCOHOL,ALENDRONATE SODIUM,ALISKIREN HEMIFUMARATE AN,ALUMINUM HYDROXIDE AND MA,AMLODIPINE BESYLATE AND V,ANTI-THYMOCYTE GLOBULIN (,...,PRODUCT ISSUES,PSYCHIATRIC DISORDERS,RENAL AND URINARY DISORDERS,REPRODUCTIVE SYSTEM AND BREAST DISORDERS,RESPIRATORY,SKIN AND SUBCUTANEOUS TISSUE DISORDERS,SOCIAL CIRCUMSTANCES,SURGICAL AND MEDICAL PROCEDURES,VASCULAR DISORDERS,seriousness_lifethreatening
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
from sklearn.model_selection import train_test_split

  from collections import Sequence


In [5]:
# Split features / target into train and test parts: 33% of the rows are held
# out for testing, and the fixed random_state makes the split repeatable.
# The target is cast to a categorical dtype before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[test_col]),
    df[test_col].astype('category'),
    test_size=0.33,
    random_state=189,
)

In [6]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [7]:
# Random forest classifier (`clf` = "classifier") with 100 trees and a fixed seed.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)

In [8]:
# Fit the forest on the training split only; the test split stays unseen.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Class predictions from the fitted model for the training rows and the
# held-out test rows (both are scored in the cells below).
preds_train, preds_test = clf.predict(X_train), clf.predict(X_test)

In [10]:
# Peek at the predicted class probabilities for the first 10 test rows
# (one row per observation, one column per class).
clf.predict_proba(X_test)[:10]

array([[0.99, 0.01],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.77, 0.23],
       [0.94, 0.06],
       [0.84, 0.16],
       [0.97, 0.03],
       [1.  , 0.  ],
       [0.93, 0.07]])

In [12]:
# Persist the fitted classifier to `model_filename`.
# The import lives in this cell because `pickle` is not imported anywhere else
# in the notebook; with it commented out the cell fails with a NameError on a
# fresh kernel.
import pickle

with open(model_filename, "wb") as fp:  # binary mode is required for pickle
    pickle.dump(clf, fp)

In [11]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [12]:
# Training-set confusion matrix as a labelled table: built from the true
# labels (first argument) and the model's predictions (second argument).
cm_train = pd.DataFrame(
    confusion_matrix(y_train, preds_train),
    index=['Negative', 'Positive'],
    columns=['Negative', 'Positive'],
)
print("Confusion Matrix (Train):")
cm_train

Confusion Matrix (Train):


Unnamed: 0,Negative,Positive
Negative,44773,0
Positive,31,1750


In [13]:
# Test-set confusion matrix as a labelled table: built from the true
# labels (first argument) and the model's predictions (second argument).
cm_test = pd.DataFrame(
    confusion_matrix(y_test, preds_test),
    index=['Negative', 'Positive'],
    columns=['Negative', 'Positive'],
)
print("Confusion Matrix (Test):")
cm_test

Confusion Matrix (Test):


Unnamed: 0,Negative,Positive
Negative,21960,29
Positive,839,103


In [14]:
# Accuracy = diagonal of the confusion matrix (correct predictions)
# divided by the number of rows in the corresponding split.
acc_train = np.trace(cm_train.values) / len(X_train)
acc_test = np.trace(cm_test.values) / len(X_test)

# Fraction of positive labels in the training target, for comparison with
# the accuracies above.
fraction_pos = sum(y_train) / len(y_train)

print('Train Accuracy is {a:0.2f}%'.format(a=acc_train * 100))
print('Test Accuracy is {a:0.2f}%'.format(a=acc_test * 100))
print('Percent Positive: {a:0.2f}%'.format(a=fraction_pos * 100))

Train Accuracy is 99.93%
Test Accuracy is 96.21%
Percent Positive: 3.83%


In [15]:
# Per-class precision, recall, F-score and support for the training split,
# laid out with one row per metric and one column per class.
prfs_train = pd.DataFrame(
    np.array(precision_recall_fscore_support(y_train, preds_train)),
    index=['Precision', 'Recall', 'F-score', 'Support'],
    columns=['Negative', 'Positive'],
)
print("Precision, Recall, F-Score (Train):")
prfs_train

Precision, Recall, F-Score (Train):


Unnamed: 0,Negative,Positive
Precision,0.999308,1.0
Recall,1.0,0.982594
F-score,0.999654,0.991221
Support,44773.0,1781.0


In [16]:
# Per-class precision, recall, F-score and support for the test split,
# laid out with one row per metric and one column per class.
prfs_test = pd.DataFrame(
    np.array(precision_recall_fscore_support(y_test, preds_test)),
    index=['Precision', 'Recall', 'F-score', 'Support'],
    columns=['Negative', 'Positive'],
)
print("Precision, Recall, F-Score (Test):")
prfs_test

Precision, Recall, F-Score (Test):


Unnamed: 0,Negative,Positive
Precision,0.9632,0.780303
Recall,0.998681,0.109342
F-score,0.98062,0.191806
Support,21989.0,942.0


In [17]:
# Pair every feature (column of the training frame) with the importance
# score the fitted forest assigned to it.
list(zip(X_train.columns, clf.feature_importances_))

[('ABACAVIR', 0.0014815507988388999),
 ('ABACAVIR SULFATE', 0.0023639059754465837),
 ('ACETAMINOPHEN AND CODEINE', 4.381993587993433e-06),
 ('ADENOSINE', 0.0017621163654919629),
 ('ALCOHOL', 0.0034663669814863733),
 ('ALENDRONATE SODIUM', 0.002946209165999521),
 ('ALISKIREN HEMIFUMARATE', 0.00019395872696145546),
 ('ALUMINUM HYDROXIDE AND MA', 1.0929388750281025e-06),
 ('AMLODIPINE BESYLATE AND B', 0.001533448445859485),
 ('ANTI-THYMOCYTE GLOBULIN (', 0.0),
 ('ANTIFUNGAL POWDER SPRAY', 0.0),
 ('ANTIFUNGAL SPRAY LIQUID', 0.0),
 ('ASPIRIN', 0.005635698256669289),
 ('ASPIRIN AND CAFFEINE', 0.0005963617110759723),
 ('ATHLETES FOOT SPRAY', 0.002261501176388885),
 ('ATORVASTATIN CALCIUM', 0.0031007946817767653),
 ('ATROPA BELLADONNA', 0.0),
 ('AVOBENZONE OCTISALATE OCT', 6.64900902306175e-08),
 ('AZITHROMYCIN DIHYDRATE', 0.0007205794042272484),
 ('BETHANECHOL CHLORIDE 50 M', 7.031339866474282e-07),
 ('BOSENTAN', 0.002738776616380701),
 ('BRIMONIDINE TARTRATE', 0.002584087463178371),
 ('BUPIV

## Save Results

In [18]:
# On the first run start from an empty table; otherwise continue from the
# results file, using its first column as the index.
results_prev = (
    pd.DataFrame()
    if firstrun
    else pd.read_csv(file_for_results, index_col=0)
)
results_prev

Unnamed: 0_level_0,Model 1.3 (Serious),Model 1.3 (Death),Model 1.3 (Disabling),Model 1.3 (Hospital)
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Train_acc,99.563947,99.79164,99.92267,99.578984
Train_Recall_0,0.994647,0.999364,0.999912,0.99898
Train_Recall_1,0.997091,0.982839,0.971606,0.985785
Train_Precision_0,0.998004,0.998354,0.999296,0.995483
Train_Precision_1,0.99221,0.99331,0.99636,0.996766
Train_Fscore_0,0.996323,0.998859,0.999604,0.997229
Train_Fscore_1,0.994644,0.988047,0.983827,0.991245
Test_acc,86.782085,94.670969,97.771576,83.463434
Test_Recall_0,0.89925,0.990409,0.999106,0.947709
Test_Recall_1,0.822029,0.508418,0.107273,0.486664


In [19]:
# Collect this run's metrics in the row order used by the results table:
# accuracy, then recall / precision / F-score for each class (negative first),
# for the training split and then the test split, followed by the share of
# positive labels.
#
# Metrics are looked up by row label with .loc instead of integer keys: the
# metric frames are indexed by 'Precision' / 'Recall' / 'F-score' / 'Support',
# and treating integer keys as positions on a label-indexed Series is
# deprecated in recent pandas.
results_new = [
    acc_train * 100,
    prfs_train.loc['Recall', 'Negative'], prfs_train.loc['Recall', 'Positive'],
    prfs_train.loc['Precision', 'Negative'], prfs_train.loc['Precision', 'Positive'],
    prfs_train.loc['F-score', 'Negative'], prfs_train.loc['F-score', 'Positive'],
    acc_test * 100,
    prfs_test.loc['Recall', 'Negative'], prfs_test.loc['Recall', 'Positive'],
    prfs_test.loc['Precision', 'Negative'], prfs_test.loc['Precision', 'Positive'],
    prfs_test.loc['F-score', 'Negative'], prfs_test.loc['F-score', 'Positive'],
    fraction_pos * 100,
]

In [20]:
# Add this run's metrics as a new column.
# Work on a copy: a plain `results = results_prev` would alias the previous
# table, so adding columns here would silently modify `results_prev` as well.
results = results_prev.copy()
results[new_col_name] = results_new

# Set index
# On the first run the table has no metric labels yet, so attach them (in the
# same order as `results_new`) and use them as the index.
if firstrun:
    results['names'] = [
        'Train_acc',
        'Train_Recall_0', 'Train_Recall_1',
        'Train_Precision_0', 'Train_Precision_1',
        'Train_Fscore_0', 'Train_Fscore_1',
        'Test_acc',
        'Test_Recall_0', 'Test_Recall_1',
        'Test_Precision_0', 'Test_Precision_1',
        'Test_Fscore_0', 'Test_Fscore_1',
        'Percent_positive',
    ]
    results = results.set_index('names')
# Show
results

Unnamed: 0_level_0,Model 1.3 (Serious),Model 1.3 (Death),Model 1.3 (Disabling),Model 1.3 (Hospital),Model 1.3 (Life Threatening)
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Train_acc,99.563947,99.79164,99.92267,99.578984,99.933411
Train_Recall_0,0.994647,0.999364,0.999912,0.99898,1.0
Train_Recall_1,0.997091,0.982839,0.971606,0.985785,0.982594
Train_Precision_0,0.998004,0.998354,0.999296,0.995483,0.999308
Train_Precision_1,0.99221,0.99331,0.99636,0.996766,1.0
Train_Fscore_0,0.996323,0.998859,0.999604,0.997229,0.999654
Train_Fscore_1,0.994644,0.988047,0.983827,0.991245,0.991221
Test_acc,86.782085,94.670969,97.771576,83.463434,96.214731
Test_Recall_0,0.89925,0.990409,0.999106,0.947709,0.998681
Test_Recall_1,0.822029,0.508418,0.107273,0.486664,0.109342


In [21]:
# Save it
#results.to_csv(file_for_results)