In [1]:
import pandas as pd
import numpy as np

### Model 1.1.LR:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at SOC level
 - Model: Logistic Regression, Ridge
 - Input Data: 2008, Quarter 2
 
### Model 1.2.LR:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at HLGT level
 - Model: Logistic Regression, Ridge
 - Input Data: 2008, Quarter 2
 
### Model 1.3.LR:
 - Input Features: generic_name (reduced Dim), drug_char, drug_indication (reduced Dim), admin route, reaction_medDRA at HLT level
 - Model: Logistic Regression, Ridge
 - Input Data: 2008, Quarter 2
 
(Convention: Model_X_Y_Z where X is data input size (rows), Y is feature size (cols), and Z is model type (e.g. RF))

In [2]:
# Run configuration: which outcome is modelled and where inputs/outputs live.
test_col = 'seriousness_lifethreatening'        # outcome column used as the label
new_col_name = 'Model 1.3 (Life Threatening)'   # header for this run's column in the results table
file_to_read = 'progress/modeling/df_ML_model_1_3.csv'
# Must stay defined: the "Save Results" cells read from and write to this path.
# Leaving it commented out makes a fresh-kernel run fail with NameError.
file_for_results = 'progress/modeling/model_1_3_LR_results.csv'
# True  -> start a brand-new results table;
# False -> reload the table already saved at file_for_results and add a column to it.
firstrun = False

In [3]:
# Read in the data from Wrangling.
# The CSV path comes from `file_to_read`; the file's first column becomes the row index.
df = pd.read_csv(file_to_read, index_col = 0)

In [4]:
from helper_funcs.model_preds import LR_pred

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


In [5]:
# Run the project helper `LR_pred` (imported from helper_funcs.model_preds) on the full
# table with `test_col` as the outcome, unpacking its six return values.
# NOTE(review): every name bound here (LR, results, cm_train, cm_test, prfs_train,
# prfs_test) is reassigned by later cells in this notebook, so these values are
# overwritten on a top-to-bottom run — confirm whether this call is still needed.
LR, results, cm_train, cm_test, prfs_train, prfs_test = LR_pred(df, test_col)

In [3]:
# Reload the wrangled modeling table; the first CSV column is the row index
df = pd.read_csv(file_to_read, index_col=0)

# All outcome columns in the table. Only `test_col` is kept (as the label);
# the remaining outcomes are removed so they cannot be used as predictors.
all_cols = [
    'serious',
    'seriousness_congential_anomali',
    'seriousness_death',
    'seriousness_disabling',
    'seriousness_hospitalization',
    'seriousness_lifethreatening',
    'seriousness_other',
]
drop_cols = [outcome for outcome in all_cols if outcome != test_col]
df = df.drop(columns=drop_cols)

df.head()

Unnamed: 0,ABACAVIR,ABACAVIR SULFATE,ACETAMINOPHEN AND CODEINE,ADENOSINE,ALCOHOL,ALENDRONATE SODIUM,ALISKIREN HEMIFUMARATE,ALUMINUM HYDROXIDE AND MA,AMLODIPINE BESYLATE AND B,ANTI-THYMOCYTE GLOBULIN (,...,VULVOVAGINAL DISORDERS NEC,VULVOVAGINAL SIGNS AND SYMPTOMS,WALDENSTROM'S MACROGLOBULINAEMIAS,WATER AND ELECTROLYTE ANALYSES NEC,WATER SOLUBLE VITAMIN DEFICIENCIES,WHITE BLOOD CELL ABNORMAL FINDINGS NEC,WHITE BLOOD CELL ANALYSES,WITHDRAWAL AND REBOUND EFFECTS,YERSINIA INFECTIONS,seriousness_lifethreatening
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
from sklearn.model_selection import train_test_split

  from collections import Sequence


In [5]:
# Separate predictors from the label (cast to categorical), then hold out 33% of
# the rows for testing. The fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified although the positive class is rare
# (the accuracy cell reports under 4% positives) — consider whether that matters.
features = df.drop(columns=[test_col])
labels = df[test_col].astype('category')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=189,
)

In [6]:
# Load scikit's logistic regression library
from sklearn.linear_model import LogisticRegression

In [7]:
# Create an L2-penalised (ridge) logistic regression model.
# - `penalty` is passed by keyword: scikit-learn made estimator constructor
#   arguments keyword-only, so the positional form fails on current releases.
# - `solver='liblinear'` pins the solver shown in the fitted-model repr of this
#   notebook; the library default has since changed, which would alter the fit.
LR = LogisticRegression(penalty='l2', solver='liblinear', random_state=0)

In [8]:
# Train the classifier on the training split.
# The call is the cell's last expression, so the fitted estimator's repr is shown as output.
LR.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Hard class predictions from the fitted model: training split first, then the held-out split
preds_train, preds_test = (LR.predict(split) for split in (X_train, X_test))

In [10]:
# Peek at the predicted class probabilities for the first ten test observations
LR.predict_proba(X_test)[:10]

array([[0.97742184, 0.02257816],
       [0.96176344, 0.03823656],
       [0.98024311, 0.01975689],
       [0.93021467, 0.06978533],
       [0.98914076, 0.01085924],
       [0.99825951, 0.00174049],
       [0.96363557, 0.03636443],
       [0.98348351, 0.01651649],
       [0.98238639, 0.01761361],
       [0.98906457, 0.01093543]])

In [11]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [12]:
# Training-set confusion matrix as a labelled table
# (true labels are passed first, predictions second)
cm_train = pd.DataFrame(
    confusion_matrix(y_train, preds_train),
    index=['Negative', 'Positive'],
    columns=['Negative', 'Positive'],
)
print("Confusion Matrix (Train):")
cm_train

Confusion Matrix (Train):


Unnamed: 0,Negative,Positive
Negative,44651,122
Positive,1585,196


In [13]:
# Test-set confusion matrix as a labelled table
# (true labels are passed first, predictions second)
cm_test = pd.DataFrame(
    confusion_matrix(y_test, preds_test),
    index=['Negative', 'Positive'],
    columns=['Negative', 'Positive'],
)
print("Confusion Matrix (Test):")
cm_test

Confusion Matrix (Test):


Unnamed: 0,Negative,Positive
Negative,21854,135
Positive,879,63


In [14]:
# Accuracy = confusion-matrix diagonal (correct predictions) divided by the number of rows
acc_train = np.diag(cm_train.values).sum() / len(X_train)
acc_test = np.diag(cm_test.values).sum() / len(X_test)

# Share of positive labels in the training split
fraction_pos = sum(y_train) / len(y_train)

print('Train Accuracy is {a:0.2f}%'.format(a = 100 * acc_train))
print('Test Accuracy is {a:0.2f}%'.format(a = 100 * acc_test))
print('Percent Positive: {a:0.2f}%'.format(a = 100 * fraction_pos))

Train Accuracy is 96.33%
Test Accuracy is 95.58%
Percent Positive: 3.83%


In [15]:
# Per-class precision, recall, F-score and support for the training split,
# arranged with one row per metric and one column per class
prfs_train = pd.DataFrame(
    np.array(list(precision_recall_fscore_support(y_train, preds_train))),
    index=['Precision','Recall','F-score','Support'],
    columns=['Negative', 'Positive'],
)
print("Precision, Recall, F-Score (Train):")
prfs_train

Precision, Recall, F-Score (Train):


Unnamed: 0,Negative,Positive
Precision,0.965719,0.616352
Recall,0.997275,0.110051
F-score,0.981244,0.186756
Support,44773.0,1781.0


In [16]:
# Per-class precision, recall, F-score and support for the test split,
# arranged with one row per metric and one column per class
prfs_test = pd.DataFrame(
    np.array(list(precision_recall_fscore_support(y_test, preds_test))),
    index=['Precision','Recall','F-score','Support'],
    columns=['Negative', 'Positive'],
)
print("Precision, Recall, F-Score (Test):")
prfs_test

Precision, Recall, F-Score (Test):


Unnamed: 0,Negative,Positive
Precision,0.961334,0.318182
Recall,0.993861,0.066879
F-score,0.977327,0.110526
Support,21989.0,942.0


In [17]:
# Show the fitted model's raw coefficient array.
# NOTE(review): no feature names are attached here, so this does not by itself
# map weights to columns; pair it with X_train.columns if a feature listing is wanted.
LR.coef_

array([[-0.05756435,  0.5455549 , -0.0612878 , ..., -0.21852241,
        -0.10259404, -0.01669519]])

## Save Results

In [18]:
# First run: begin with an empty table. Otherwise reload the results saved so far
# (the first CSV column is the index).
results_prev = pd.DataFrame() if firstrun else pd.read_csv(file_for_results, index_col=0)
results_prev

Unnamed: 0_level_0,Model 1.3 (Serious),Model 1.3 (Death),Model 1.3 (Disabling),Model 1.3 (Hospital)
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Train_acc,87.4318,95.085277,97.673669,83.103493
Train_Recall_0,0.876632,0.991878,0.999252,0.946371
Train_Recall_1,0.870934,0.523658,0.06921,0.46935
Train_Precision_0,0.908539,0.955914,0.977413,0.848316
Train_Precision_1,0.828386,0.860943,0.696429,0.736204
Train_Fscore_0,0.8923,0.973564,0.988211,0.894665
Train_Fscore_1,0.849127,0.65122,0.125908,0.573242
Test_acc,86.123588,94.117134,97.461951,81.392002
Test_Recall_0,0.863215,0.988874,0.997364,0.938349
Test_Recall_1,0.858352,0.462722,0.049091,0.43101


In [19]:
# Collect this run's metrics as a flat list, in the same order as the results
# table rows: accuracy, recall (neg, pos), precision (neg, pos), F-score (neg, pos)
# for train, then the same for test, and finally the percentage of positive labels.
# Values are looked up by row/column label with .loc: the metric tables are indexed
# by strings, and integer keys such as ['Negative'][1] rely on pandas' deprecated
# positional fallback.
results_new = [
    acc_train*100,
    prfs_train.loc['Recall', 'Negative'], prfs_train.loc['Recall', 'Positive'],
    prfs_train.loc['Precision', 'Negative'], prfs_train.loc['Precision', 'Positive'],
    prfs_train.loc['F-score', 'Negative'], prfs_train.loc['F-score', 'Positive'],
    acc_test*100,
    prfs_test.loc['Recall', 'Negative'], prfs_test.loc['Recall', 'Positive'],
    prfs_test.loc['Precision', 'Negative'], prfs_test.loc['Precision', 'Positive'],
    prfs_test.loc['F-score', 'Negative'], prfs_test.loc['F-score', 'Positive'],
    fraction_pos*100
]

In [20]:
# Add this run's metrics as a new column.
# Work on a copy so the previously displayed `results_prev` frame is not mutated,
# which also keeps this cell safe to re-run.
results = results_prev.copy()
results[new_col_name] = results_new

# On the first run the table has no row labels yet: name the rows after the metrics,
# in the same order as `results_new`, using an index called 'names'.
if firstrun:
    results.index = pd.Index(['Train_acc',
                              'Train_Recall_0', 'Train_Recall_1', 'Train_Precision_0',
                              'Train_Precision_1', 'Train_Fscore_0', 'Train_Fscore_1',
                              'Test_acc',
                              'Test_Recall_0', 'Test_Recall_1','Test_Precision_0',
                              'Test_Precision_1', 'Test_Fscore_0', 'Test_Fscore_1',
                              'Percent_positive'],
                             name='names')
# Show
results

Unnamed: 0_level_0,Model 1.3 (Serious),Model 1.3 (Death),Model 1.3 (Disabling),Model 1.3 (Hospital),Model 1.3 (Life Threatening)
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Train_acc,87.4318,95.085277,97.673669,83.103493,96.33329
Train_Recall_0,0.876632,0.991878,0.999252,0.946371,0.997275
Train_Recall_1,0.870934,0.523658,0.06921,0.46935,0.110051
Train_Precision_0,0.908539,0.955914,0.977413,0.848316,0.965719
Train_Precision_1,0.828386,0.860943,0.696429,0.736204,0.616352
Train_Fscore_0,0.8923,0.973564,0.988211,0.894665,0.981244
Train_Fscore_1,0.849127,0.65122,0.125908,0.573242,0.186756
Test_acc,86.123588,94.117134,97.461951,81.392002,95.578038
Test_Recall_0,0.863215,0.988874,0.997364,0.938349,0.993861
Test_Recall_1,0.858352,0.462722,0.049091,0.43101,0.066879


In [21]:
# Save it: write the updated results table to the CSV at `file_for_results`
# (the same path the results are reloaded from when `firstrun` is False).
results.to_csv(file_for_results)