In [1]:
import pandas as pd
import numpy as np

### Model 1.1.NB:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at SOC level
 - Model: Naive Bayes, no priors
 - Input Data: 2008, Quarter 2
 
### Model 1.2.NB:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at HLGT level
 - Model: Naive Bayes, no priors
 - Input Data: 2008, Quarter 2
 
### Model 1.3.NB:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at HLT level
 - Model: Naive Bayes, no priors
 - Input Data: 2008, Quarter 2
 
(Convention: Model_X_Y_Z where X is data input size (rows), Y is feature size (cols), and Z is model type (e.g. RF))

In [2]:
# Run configuration: the outcome column to model, the label for its column in
# the results table, and the input/output file locations.
test_col = 'seriousness_lifethreatening'
new_col_name = 'Model 1.3 (Life Threatening)'
file_to_read = 'progress/modeling/df_ML_model_1_3.csv'
# Results table path: read back in the "Save Results" section (unless firstrun
# is True) and written at the end of the notebook. It must be defined here,
# otherwise those cells raise NameError on a fresh kernel.
file_for_results = 'progress/modeling/model_1_3_NB_results.csv'
# Set to True only when the results file does not exist yet
firstrun = False

In [3]:
# Load the wrangled modeling table; the first CSV column is used as the row index
df = pd.read_csv(file_to_read, index_col = 0)

# Outcome columns present in the table; only test_col is kept as the target
all_cols = [
    'serious',
    'seriousness_congential_anomali',
    'seriousness_death',
    'seriousness_disabling',
    'seriousness_hospitalization',
    'seriousness_lifethreatening',
    'seriousness_other',
]

# Remove every outcome column other than the one being modeled in this run
drop_cols = []
for outcome in all_cols:
    if outcome != test_col:
        drop_cols.append(outcome)
df = df.drop(columns = drop_cols)

df.head()

Unnamed: 0,ABACAVIR,ABACAVIR SULFATE,ACETAMINOPHEN AND CODEINE,ADENOSINE,ALCOHOL,ALENDRONATE SODIUM,ALISKIREN HEMIFUMARATE,ALUMINUM HYDROXIDE AND MA,AMLODIPINE BESYLATE AND B,ANTI-THYMOCYTE GLOBULIN (,...,VULVOVAGINAL DISORDERS NEC,VULVOVAGINAL SIGNS AND SYMPTOMS,WALDENSTROM'S MACROGLOBULINAEMIAS,WATER AND ELECTROLYTE ANALYSES NEC,WATER SOLUBLE VITAMIN DEFICIENCIES,WHITE BLOOD CELL ABNORMAL FINDINGS NEC,WHITE BLOOD CELL ANALYSES,WITHDRAWAL AND REBOUND EFFECTS,YERSINIA INFECTIONS,seriousness_lifethreatening
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
from sklearn.model_selection import train_test_split

  from collections import Sequence


In [5]:
# Separate the predictors from the outcome column, then hold out 33% of the
# rows for testing; the fixed random_state keeps the split reproducible.
features = df.drop([test_col], axis = 1)
target = df[test_col].astype('category')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = 0.33,
    random_state = 189,
)

In [6]:
# Load scikit's Naive Bayes Library
from sklearn.naive_bayes import GaussianNB

In [7]:
# Create a Gaussian Naive Bayes classifier named NB. priors = None means no
# class priors are supplied by hand (per the scikit-learn docs they are then
# derived from the training labels).
NB = GaussianNB(priors = None)

In [8]:
# Train the classifier on the training split only; the test split stays unseen
NB.fit(X_train, y_train)

GaussianNB(priors=None)

In [9]:
# Apply the trained classifier to both splits; the train predictions are kept
# so train and test metrics can be compared side by side further down.
preds_train = NB.predict(X_train)
preds_test = NB.predict(X_test)

In [10]:
# View the predicted class probabilities of the first 10 test observations
# (one row per observation, one column per class)
NB.predict_proba(X_test)[0:10]

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [11]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [12]:
# Confusion matrix for the training split, labelled Negative/Positive on both axes
class_labels = ['Negative', 'Positive']
cm_train = pd.DataFrame(
    confusion_matrix(y_train, preds_train),
    index = class_labels,
    columns = class_labels,
)
print("Confusion Matrix (Train):")
cm_train

Confusion Matrix (Train):


Unnamed: 0,Negative,Positive
Negative,6281,38492
Positive,0,1781


In [13]:
    # Confusion matrix for test set
    cm_test = confusion_matrix(y_test, preds_test)
    cm_test = pd.DataFrame(cm_test)
    cm_test.columns = ['Negative', 'Positive']
    cm_test.index = cm_test.columns
print("Confusion Matrix (Test):")
cm_test

Confusion Matrix (Test):


Unnamed: 0,Negative,Positive
Negative,3064,18925
Positive,105,837


In [14]:
# Model accuracy: correctly classified rows (the confusion-matrix diagonal)
# divided by the number of rows in the split.
acc_train = np.asarray(cm_train).trace()/len(X_train)
acc_test = np.asarray(cm_test).trace()/len(X_test)
# Share of positive labels in the training split, for comparison with accuracy
fraction_pos = sum(y_train)/len(y_train)

# Report all three values as percentages with two decimals
for template, value in [
    ('Train Accuracy is {a:0.2f}%', acc_train),
    ('Test Accuracy is {a:0.2f}%', acc_test),
    ('Percent Positive: {a:0.2f}%', fraction_pos),
]:
    print(template.format(a = value*100))

Train Accuracy is 17.32%
Test Accuracy is 17.01%
Percent Positive: 3.83%


In [15]:
# Per-class precision, recall, F-score and support for the training split,
# arranged as one row per metric and one column per class.
prfs_train = pd.DataFrame(
    np.array(list(precision_recall_fscore_support(y_train, preds_train))),
    index = ['Precision','Recall','F-score','Support'],
    columns = ['Negative', 'Positive'],
)
print("Precision, Recall, F-Score (Train):")
prfs_train

Precision, Recall, F-Score (Train):


Unnamed: 0,Negative,Positive
Precision,1.0,0.044223
Recall,0.140285,1.0
F-score,0.246053,0.084701
Support,44773.0,1781.0


In [16]:
# Output metrics for the test set: one row per metric, one column per class
precision, recall, fscore, support = precision_recall_fscore_support(y_test, preds_test)
prfs_test = pd.DataFrame(np.array([precision, recall, fscore, support]))
prfs_test.index = ['Precision','Recall','F-score','Support']
prfs_test.columns = ['Negative', 'Positive']
print("Precision, Recall, F-Score (Test):")
prfs_test

Precision, Recall, F-Score (Test):


Unnamed: 0,Negative,Positive
Precision,0.966867,0.042354
Recall,0.139342,0.888535
F-score,0.243581,0.080854
Support,21989.0,942.0


In [17]:
# View the fitted per-class mean of each feature (one row per class, one column
# per feature). These are model parameters, not feature-relevance scores.
NB.theta_

array([[1.38699663e-02, 1.64384785e-02, 6.70046680e-05, ...,
        1.26638823e-02, 8.06289505e-03, 2.23348893e-05],
       [2.02133633e-02, 3.64963504e-02, 0.00000000e+00, ...,
        2.52667041e-02, 8.42223470e-03, 0.00000000e+00]])

In [18]:
# View the fitted per-class variance of each feature (one row per class, one
# column per feature). These are model parameters, not importance scores.
# NOTE(review): newer scikit-learn releases expose this attribute as var_ —
# confirm against the installed version before upgrading.
NB.sigma_

array([[1.67598164e-02, 1.97418486e-02, 6.70115595e-05, ...,
        2.08567683e-02, 8.35525438e-03, 2.23457716e-05],
       [2.87885116e-02, 3.51643782e-02, 1.13811206e-08, ...,
        5.83172479e-02, 8.35131204e-03, 1.13811206e-08]])

## Save Results

In [19]:
# Start from an empty table on the very first run; otherwise reload the results
# saved by earlier runs, using the first CSV column as the index.
results_prev = pd.DataFrame() if firstrun else pd.read_csv(file_for_results, index_col = 0)
results_prev

Unnamed: 0_level_0,Model 1.3 (Serious),Model 1.3 (Death),Model 1.3 (Disabling),Model 1.3 (Hospital)
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Train_acc,65.120076,24.962409,18.932852,76.060059
Train_Recall_0,0.425513,0.177634,0.169217,0.775625
Train_Recall_1,0.981275,0.999265,1.0,0.713486
Train_Precision_0,0.97079,0.999603,1.0,0.894618
Train_Precision_1,0.538725,0.104497,0.028996,0.503479
Train_Fscore_0,0.591682,0.301661,0.289453,0.830883
Train_Fscore_1,0.695576,0.189207,0.056358,0.590363
Test_acc,65.003707,24.133269,18.90454,74.466879
Test_Recall_0,0.43021,0.171398,0.174434,0.763159
Test_Recall_1,0.97032,0.942761,0.783636,0.687767


In [20]:
# Set up the new results as a list, ordered as: accuracy, recall (neg, pos),
# precision (neg, pos), F-score (neg, pos) for train, the same seven values for
# test, then the percentage of positive labels.
# Metrics are looked up by row label with .loc: an integer key on a
# string-labelled Series relies on deprecated positional fallback.
results_new = [
    acc_train*100,
    prfs_train.loc['Recall', 'Negative'], prfs_train.loc['Recall', 'Positive'],
    prfs_train.loc['Precision', 'Negative'], prfs_train.loc['Precision', 'Positive'],
    prfs_train.loc['F-score', 'Negative'], prfs_train.loc['F-score', 'Positive'],
    acc_test*100,
    prfs_test.loc['Recall', 'Negative'], prfs_test.loc['Recall', 'Positive'],
    prfs_test.loc['Precision', 'Negative'], prfs_test.loc['Precision', 'Positive'],
    prfs_test.loc['F-score', 'Negative'], prfs_test.loc['F-score', 'Positive'],
    fraction_pos*100
]

In [21]:
# Row labels for the results table, in the same order as the values in results_new
metric_names = ['Train_acc',
                'Train_Recall_0', 'Train_Recall_1', 'Train_Precision_0',
                'Train_Precision_1', 'Train_Fscore_0', 'Train_Fscore_1',
                'Test_acc',
                'Test_Recall_0', 'Test_Recall_1','Test_Precision_0',
                'Test_Precision_1', 'Test_Fscore_0', 'Test_Fscore_1',
                'Percent_positive']

# Add the new results to a copy, so the table loaded earlier (results_prev) is
# left untouched and this cell gives the same answer when re-run.
results = results_prev.copy()
results[new_col_name] = results_new

# Set index
if firstrun:
    # A brand-new table has no metric labels yet, so attach them here;
    # on later runs they come from the index of the saved file.
    results['names'] = metric_names
    results = results.set_index('names')
# Show
results

Unnamed: 0_level_0,Model 1.3 (Serious),Model 1.3 (Death),Model 1.3 (Disabling),Model 1.3 (Hospital),Model 1.3 (Life Threatening)
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Train_acc,65.120076,24.962409,18.932852,76.060059,17.317524
Train_Recall_0,0.425513,0.177634,0.169217,0.775625,0.140285
Train_Recall_1,0.981275,0.999265,1.0,0.713486,1.0
Train_Precision_0,0.97079,0.999603,1.0,0.894618,1.0
Train_Precision_1,0.538725,0.104497,0.028996,0.503479,0.044223
Train_Fscore_0,0.591682,0.301661,0.289453,0.830883,0.246053
Train_Fscore_1,0.695576,0.189207,0.056358,0.590363,0.084701
Test_acc,65.003707,24.133269,18.90454,74.466879,17.011905
Test_Recall_0,0.43021,0.171398,0.174434,0.763159,0.139342
Test_Recall_1,0.97032,0.942761,0.783636,0.687767,0.888535


In [22]:
# Save the updated results table (now including this run's column) to
# file_for_results, the same path earlier results are loaded from when
# firstrun is False.
results.to_csv(file_for_results)