In [97]:
import pandas as pd
import numpy as np
import copy
import sklearn 
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
import sklearn.metrics as metrics 
import scipy
import scipy.stats as st

# from sklearn.exceptions import DeprecationWarning
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [2]:
# Load Data
df_features_any = pd.read_csv('features/df_features_any.csv')

In [3]:
df_features_any.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,...,AVG_CREAT,MAX_CREAT,MIN_UREA,AVG_UREA,MAX_UREA,MIN_INR,AVG_INR,MAX_INR,THIRTYDAYREADMIT,ONEYEARREADMIT
0,23,152223,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,NOT STATED,CATHOLIC,MARRIED,WHITE,...,0.72,0.8,14,17.25,20,1.5,1.75,2.0,False,False
1,24,161859,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,NOT STATED,PROTESTANT QUAKER,SINGLE,WHITE,...,0.92,1.0,10,11.5,13,1.0,1.0,1.0,False,False
2,25,129635,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,NOT STATED,UNOBTAINABLE,MARRIED,WHITE,...,1.19,1.6,12,39.56,54,0.9,1.0,1.1,False,False
3,26,197661,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,NOT STATED,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,...,1.31,1.4,28,31.5,36,1.2,1.77,2.5,False,False
4,28,162569,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,NOT STATED,CATHOLIC,MARRIED,WHITE,...,1.05,1.2,13,19.0,25,1.1,1.25,1.4,False,False


In [4]:
# One Hot Encoding
df_features = pd.get_dummies(df_features_any, drop_first = True)
df_features.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADLOS,EDLOS,AVG_WEIGHT,DIABETES,CHF,ASTHMA,CVA,ARRHYTHMIA,...,ETHNICITY_MULTI RACE ETHNICITY,ETHNICITY_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER,ETHNICITY_OTHER,ETHNICITY_PATIENT DECLINED TO ANSWER,ETHNICITY_PORTUGUESE,ETHNICITY_SOUTH AMERICAN,ETHNICITY_UNABLE TO OBTAIN,ETHNICITY_UNKNOWN/NOT SPECIFIED,ETHNICITY_WHITE,GENDER_M
0,23,152223,5.5,0.0,158.95,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1
1,24,161859,2.86,0.0,264.55,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1
2,25,129635,3.53,0.14,221.34,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1
3,26,197661,6.99,0.0,176.37,False,False,False,False,False,...,0,0,0,0,0,0,0,1,0,1
4,28,162569,5.36,0.0,211.64,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1


In [5]:
df_final = df_features
df_final.ONEYEARREADMIT = df_final.ONEYEARREADMIT.astype(int)
df_final.THIRTYDAYREADMIT = df_final.THIRTYDAYREADMIT.astype(int)

In [6]:
df_final.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADLOS,EDLOS,AVG_WEIGHT,DIABETES,CHF,ASTHMA,CVA,ARRHYTHMIA,...,ETHNICITY_MULTI RACE ETHNICITY,ETHNICITY_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER,ETHNICITY_OTHER,ETHNICITY_PATIENT DECLINED TO ANSWER,ETHNICITY_PORTUGUESE,ETHNICITY_SOUTH AMERICAN,ETHNICITY_UNABLE TO OBTAIN,ETHNICITY_UNKNOWN/NOT SPECIFIED,ETHNICITY_WHITE,GENDER_M
0,23,152223,5.5,0.0,158.95,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1
1,24,161859,2.86,0.0,264.55,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1
2,25,129635,3.53,0.14,221.34,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1
3,26,197661,6.99,0.0,176.37,False,False,False,False,False,...,0,0,0,0,0,0,0,1,0,1
4,28,162569,5.36,0.0,211.64,False,False,False,False,False,...,0,0,0,0,0,0,0,0,1,1


In [7]:
df_final[['ELIXHAUSER_SCORE']] = df_final[['ELIXHAUSER_SCORE']].fillna(value=0)
df_final[['CHARLSON_SCORE']] = df_final[['CHARLSON_SCORE']].fillna(value=0)

In [8]:
df_final.isnull().values.any()

False

# MI

In [9]:
df_mi_thirtyday = pd.read_csv('features/MI/MI_py_THIRTYDAYREADMIT.csv')
df_mi_onyear = pd.read_csv('features/MI/MI_py_ONEYEARREADMIT.csv')

In [10]:
#df_mi_thirtyday
df_mi_thirtyday.sort_values('MI', ascending=False).head()

Unnamed: 0,FEATURE,MI
0,ELIXHAUSER_SCORE,0.00761
1,MAX_UREA,0.00672
2,CHARLSON_SCORE,0.00626
3,EDLOS,0.00576
4,MAX_CREAT,0.00532


In [11]:
def TopK (readm_type, k):
    
    if (readm_type == 'THIRTYDAYREADMIT'):
        tk = df_mi_thirtyday.sort_values('MI', ascending=False).head(n=k)
        dfList = tk['FEATURE'].tolist()    
    elif (readm_type == 'ONEYEARREADMIT'):
        tk = df_mi_onyear.sort_values('MI', ascending=False).head(n=k)
        dfList = tk['FEATURE'].tolist()  
    elif (readm_type == 'THIRTYDAYREADMIT_ALL'):
        tk = df_mi_all_thirtyday.sort_values('MI', ascending=False).head(n=k)
        dfList = tk['FEATURE'].tolist()    
    elif (readm_type == 'ONEYEARREADMIT_ALL'):
        tk = df_mi_all_onyear.sort_values('MI', ascending=False).head(n=k)
        dfList = tk['FEATURE'].tolist()   
    else:
        return
    
    return dfList


In [12]:
mi_list = TopK('THIRTYDAYREADMIT', 20)
df_topk = df_final[mi_list]
df_topk.head()

Unnamed: 0,ELIXHAUSER_SCORE,MAX_UREA,CHARLSON_SCORE,EDLOS,MAX_CREAT,AVG_UREA,CABG,AVG_CREAT,ICULOS,AVG_HGB,ADLOS,MIN_UREA,MAX_HGB,MIN_CREAT,DISCHARGE_LOCATION_LONG TERM CARE HOSPITAL,MAX_POT,MAX_CALC_BLD,MIN_CALC_BLD,ETHNICITY_BLACK,AVG_CALC_BLD
0,-1.0,20,0.0,0.0,0.8,17.25,True,0.72,1.2641,9.3,5.5,14,10.7,0.7,0,4.1,8.0,8.0,0,8.0
1,0.0,13,1.0,0.0,1.0,11.5,False,0.92,0.5124,14.33,2.86,10,14.5,0.9,0,4.1,9.7,9.2,0,9.47
2,0.0,54,1.0,0.14,1.6,39.56,False,1.19,3.5466,11.04,3.53,12,13.1,0.7,0,6.5,8.5,7.4,0,7.95
3,12.0,36,2.0,0.0,1.4,31.5,False,1.31,2.1407,11.87,6.99,28,12.3,1.2,0,4.7,8.8,8.0,0,8.36
4,3.0,25,3.0,0.0,1.2,19.0,True,1.05,1.1224,11.0,5.36,13,11.2,0.9,0,4.7,8.0,8.0,0,8.0


In [13]:
df_topk.shape

(12918, 20)

In [14]:
mi_list_ONE = TopK('ONEYEARREADMIT', 20)
df_topk_oneyear = df_final[mi_list_ONE]
df_topk_oneyear.head()

Unnamed: 0,CHARLSON_SCORE,CABG,MAX_CREAT,ELIXHAUSER_SCORE,AVG_CREAT,EDLOS,MAX_UREA,MIN_CREAT,AVG_UREA,ETHNICITY_BLACK,MIN_UREA,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ETHNICITY_UNKNOWN/NOT SPECIFIED,MAX_POT,ICULOS,MAX_HGB,AVG_INR,ADLOS,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,MAX_INR
0,0.0,True,0.8,-1.0,0.72,0.0,20,0.7,17.25,0,14,0,0,4.1,1.2641,10.7,1.75,5.5,0,2.0
1,1.0,False,1.0,0.0,0.92,0.0,13,0.9,11.5,0,10,0,0,4.1,0.5124,14.5,1.0,2.86,1,1.0
2,1.0,False,1.6,0.0,1.19,0.14,54,0.7,39.56,0,12,1,0,6.5,3.5466,13.1,1.0,3.53,0,1.1
3,2.0,False,1.4,12.0,1.31,0.0,36,1.2,31.5,0,28,0,1,4.7,2.1407,12.3,1.77,6.99,1,2.5
4,3.0,True,1.2,3.0,1.05,0.0,25,0.9,19.0,0,13,0,0,4.7,1.1224,11.2,1.25,5.36,0,1.4


## Hard EM

In [15]:
# split train test 
X_train, X_test, Y_train, Y_test = train_test_split(df_topk_oneyear, df_final[['ONEYEARREADMIT']], stratify=df_final[['ONEYEARREADMIT']], test_size=0.3)

In [16]:
X_train.shape

(9042, 20)

In [17]:
X_test.shape

(3876, 20)

In [18]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [19]:
index_data =range(0,9042)
column_split = ['ETHNICITY_BLACK']
df_cohort = pd.DataFrame(index=index_data, columns=column_split)
df_cohort.head()

Unnamed: 0,ETHNICITY_BLACK
0,
1,
2,
3,
4,


In [20]:
for i in range(9042):
    if X_train['ETHNICITY_BLACK'][i] == 0:
        df_cohort['ETHNICITY_BLACK'][i] = 0
    elif X_train['ETHNICITY_BLACK'][i] == 1:
        df_cohort['ETHNICITY_BLACK'][i] = 1

df_cohort.head()

Unnamed: 0,ETHNICITY_BLACK
0,0
1,0
2,0
3,0
4,0


In [21]:
data = X_train.as_matrix()
print(data.shape)

target = (Y_train.as_matrix()).ravel()
print(target.shape)

cohort = (df_cohort.as_matrix()).ravel()
print(cohort.shape)

test_data = X_test.as_matrix()
print(test_data.shape)

test_target = (Y_test.as_matrix()).ravel()
print(test_target.shape)

(9042, 20)
(9042,)
(9042,)
(3876, 20)
(3876,)


In [22]:
data

array([[3.0, False, 1.7, ..., 93.94, 1, 4.9],
       [3.0, False, 3.4, ..., 3.5, 0, 1.5],
       [3.0, False, 1.7, ..., 4.92, 0, 1.2],
       ..., 
       [2.0, False, 1.1, ..., 4.24, 0, 1.2],
       [1.0, True, 1.3, ..., 5.05, 1, 1.4],
       [1.0, True, 1.3, ..., 8.73, 1, 2.2]], dtype=object)

In [23]:
target[0:10]

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 0])

In [27]:
clfFn_test = LR(data, target)
# clfFn_test.get_params()
clfFn_test

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
clfFn_test.score(data, target)

0.83952665339526655

In [29]:
clfFn_test.predict(data)[0:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [30]:
index_counter = 1
num_change = 0
while True:
    print('ITERATION:', index_counter)
    data_1 = data[np.where(cohort == 0)]
#     print(data_1.shape)
    target_1 = target[np.where(cohort == 0)]
#     print(target_1.shape)


    data_2 = data[np.where(cohort == 1)]
    target_2 = target[np.where(cohort == 1)]
#     print(target_2.shape)


    cohort1_Model = LR(data_1, target_1)

    cohort2_Model = LR(data_2, target_2)
    
    cohort1_pred = cohort1_Model.predict(data_1) #reshape(-1,1)
    cohort1_current_accuracy = metrics.accuracy_score(target_1, cohort1_pred)
    print('Cohort 1 Accuracy:', cohort1_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_1)/len(target_1)))*100,'%')
    print('cohort size:', (len(target_1)/len(target))*100,'%')
    print(" ")
    cohort2_pred = cohort2_Model.predict(data_2) #reshape(-1,1)
    cohort2_current_accuracy = metrics.accuracy_score(target_2, cohort2_pred)
    print('Cohort 2 Accuracy:', cohort2_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_2)/len(target_2)))*100,'%')
    print('cohort size:', (len(target_2)/len(target))*100,'%')


    oldcohort = copy.deepcopy(cohort)
    
    # reassign based on whichever model gives higher probability
    for i in range(len(target)):
        curr_data = data[i]
        if (target[i] == 0):  
            if np.any((cohort1_Model.predict_proba(curr_data)[:,0]) > (cohort2_Model.predict_proba(curr_data)[:,0])):
                cohort[i] = 0
            else:
                cohort[i] = 1
        else:
            if np.any((cohort1_Model.predict_proba(curr_data)[:,1]) > (cohort2_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 0
            else:
                cohort[i] = 1
                    
    if sum(np.absolute(np.subtract(oldcohort, cohort))) == 0 or index_counter == 50:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
        print("break")
        break
    else:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
    index_counter += 1
    print(" ")
    print(" ")
    print(" ")


ITERATION: 1
Cohort 1 Accuracy: 84.9721926399 %
Not Readmitted %: 84.8656963673 %
cohort size: 93.46383543463836 %
 
Cohort 2 Accuracy: 69.5431472081 %
Not Readmitted %: 58.5448392555 %
cohort size: 6.536164565361647 %
num_change 1688
 
 
 
ITERATION: 2
Cohort 1 Accuracy: 99.9462148716 %
Not Readmitted %: 98.6688180718 %
cohort size: 82.24950232249503 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 11.214953271 %
cohort size: 17.750497677504974 %
num_change 12
 
 
 
ITERATION: 3
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 98.7070707071 %
cohort size: 82.11678832116789 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 11.6883116883 %
cohort size: 17.88321167883212 %
num_change 5
 
 
 
ITERATION: 4
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 98.7072448155 %
cohort size: 82.12784782127848 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 11.6336633663 %
cohort size: 17.872152178721525 %
num_change 4
 
 
 
ITERATION: 5
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 98.6944818304 %
cohort si

In [31]:
sum(target_1)

95

In [32]:
sum(target_2)

1429

In [33]:
cohort=cohort.astype('int')
cohort_clf = LR(data, cohort)

In [34]:
# predict cohorts for test set 
test_cohort = cohort_clf.predict(X_test)
test_cohort[0:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [35]:
testdata_1 = test_data[np.where(test_cohort == 0)]
testtarget_1 = test_target[np.where(test_cohort == 0)]

testdata_2 = test_data[np.where(test_cohort == 1)]
testtarget_2 = test_target[np.where(test_cohort == 1)]

In [36]:
# cohort 1
test_cohort1_pred = cohort1_Model.predict(testdata_1)
test_cohort1_acc = metrics.accuracy_score(testtarget_1, test_cohort1_pred)
print('Accuracy', test_cohort1_acc*100,'%')
print('Cohort fraction:', (len(testtarget_1)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_1)/len(testtarget_1)))*100,'%')
print("Readmitted:", (sum(testtarget_1)/len(testtarget_1))*100,'%')
print('Predicted readmitted:', (sum(test_cohort1_pred)/len(test_cohort1_pred))*100,'%')
print()

# cohort 2
test_cohort2_pred = cohort2_Model.predict(testdata_2)
test_cohort2_acc = metrics.accuracy_score(testtarget_2, test_cohort2_pred)
print('Accuracy', test_cohort2_acc*100,'%')
print('Cohort fraction:', (len(testtarget_2)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_2)/len(testtarget_2)))*100,'%')
print("Readmitted:", (sum(testtarget_2)/len(testtarget_2))*100,'%')
print('Predicted readmitted:', (sum(test_cohort2_pred)/len(test_cohort2_pred))*100,'%')
print()
print()

# total
total_accuracy = (test_cohort1_acc*(len(testtarget_1)/len(test_target))) + (test_cohort2_acc*(len(testtarget_2)/len(test_target))) 
print('Total Accuracy:', total_accuracy)

Accuracy 83.4899328859 %
Cohort fraction: 0.9610423116615067
Not Readmitted: 84.2684563758 %
Readmitted: 15.7315436242 %
Predicted readmitted: 1.31543624161 %

Accuracy 57.6158940397 %
Cohort fraction: 0.03895768833849329
Not Readmitted: 55.6291390728 %
Readmitted: 44.3708609272 %
Predicted readmitted: 60.2649006623 %


Total Accuracy: 0.824819401445


## Hard EM - Random Initilization

In [37]:
# split train test 
X_train, X_test, Y_train, Y_test = train_test_split(df_topk_oneyear, df_final[['ONEYEARREADMIT']], stratify=df_final[['ONEYEARREADMIT']], test_size=0.3)

In [38]:
X_train.shape

(9042, 20)

In [39]:
X_test.shape

(3876, 20)

In [40]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [41]:
# randomly initialize cohort 
cohort = np.random.randint(2, size=9042)
cohort[0:10]

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [42]:
data = X_train.as_matrix()
print(data.shape)

target = (Y_train.as_matrix()).ravel()
print(target.shape)

test_data = X_test.as_matrix()
print(test_data.shape)

test_target = (Y_test.as_matrix()).ravel()
print(test_target.shape)

print(cohort.shape)

(9042, 20)
(9042,)
(3876, 20)
(3876,)
(9042,)


In [43]:
data

array([[1.0, False, 0.7, ..., 26.67, 1, 2.3],
       [6.0, False, 1.6, ..., 5.96, 0, 1.4],
       [3.0, False, 1.1, ..., 1.16, 0, 1.9],
       ..., 
       [6.0, False, 0.8, ..., 4.82, 1, 0.9],
       [0.0, True, 1.4, ..., 10.06, 0, 1.4],
       [3.0, False, 1.9, ..., 5.19, 1, 1.2]], dtype=object)

In [44]:
target[0:10]

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [45]:
clfFn_test = LR(data, target)
# clfFn_test.get_params()
clfFn_test

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
clfFn_test.score(data, target)

0.83731475337314754

In [47]:
clfFn_test.predict(data)[0:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [48]:
index_counter = 1
num_change = 0
while True:
    print('ITERATION:', index_counter)
    data_1 = data[np.where(cohort == 0)]
#     print(data_1.shape)
    target_1 = target[np.where(cohort == 0)]
#     print(target_1.shape)


    data_2 = data[np.where(cohort == 1)]
    target_2 = target[np.where(cohort == 1)]
#     print(target_2.shape)


    cohort1_Model = LR(data_1, target_1)

    cohort2_Model = LR(data_2, target_2)
    
    cohort1_pred = cohort1_Model.predict(data_1) #reshape(-1,1)
    cohort1_current_accuracy = metrics.accuracy_score(target_1, cohort1_pred)
    print('Cohort 1 Accuracy:', cohort1_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_1)/len(target_1)))*100,'%')
    print('cohort size:', (len(target_1)/len(target))*100,'%')
    print(" ")
    cohort2_pred = cohort2_Model.predict(data_2) #reshape(-1,1)
    cohort2_current_accuracy = metrics.accuracy_score(target_2, cohort2_pred)
    print('Cohort 2 Accuracy:', cohort2_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_2)/len(target_2)))*100,'%')
    print('cohort size:', (len(target_2)/len(target))*100,'%')


    oldcohort = copy.deepcopy(cohort)
    
    # reassign based on whichever model gives higher probability
    for i in range(len(target)):
        curr_data = data[i]
        if (target[i] == 0):  
            if np.any((cohort1_Model.predict_proba(curr_data)[:,0]) > (cohort2_Model.predict_proba(curr_data)[:,0])):
                cohort[i] = 0
            else:
                cohort[i] = 1
        else:
            if np.any((cohort1_Model.predict_proba(curr_data)[:,1]) > (cohort2_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 0
            else:
                cohort[i] = 1
                    
    if sum(np.absolute(np.subtract(oldcohort, cohort))) == 0 or index_counter == 50:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
        print("break")
        break
    else:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
    index_counter += 1
    print(" ")
    print(" ")
    print(" ")


ITERATION: 1
Cohort 1 Accuracy: 83.4363957597 %
Not Readmitted %: 82.9946996466 %
cohort size: 50.07741650077416 %
 
Cohort 2 Accuracy: 83.7616304829 %
Not Readmitted %: 83.2964111653 %
cohort size: 49.92258349922584 %
num_change 4267
 
 
 
ITERATION: 2
Cohort 1 Accuracy: 99.7322623829 %
Not Readmitted %: 81.499330656 %
cohort size: 41.307232913072326 %
 
Cohort 2 Accuracy: 99.811569625 %
Not Readmitted %: 84.3037497645 %
cohort size: 58.69276708692767 %
num_change 21
 
 
 
ITERATION: 3
Cohort 1 Accuracy: 99.8927613941 %
Not Readmitted %: 81.4209115282 %
cohort size: 41.25193541251935 %
 
Cohort 2 Accuracy: 99.8493975904 %
Not Readmitted %: 84.3561746988 %
cohort size: 58.74806458748064

  np.exp(prob, prob)


 %
num_change 20
 
 
 
ITERATION: 4
Cohort 1 Accuracy: 99.9195710456 %
Not Readmitted %: 81.3404825737 %
cohort size: 41.25193541251935 %
 
Cohort 2 Accuracy: 99.9435240964 %
Not Readmitted %: 84.4126506024 %
cohort size: 58.74806458748064 %

  np.exp(prob, prob)



num_change 7
 
 
 
ITERATION: 5
Cohort 1 Accuracy: 99.946366318 %
Not Readmitted %: 81.3622955216 %
cohort size: 41.24087591240876 %
 
Cohort 2 Accuracy: 99.9058912102 %
Not Readmitted %: 84.3967626576 %
cohort size: 58.75912408759124 %


  np.exp(prob, prob)


num_change 5
 
 
 
ITERATION: 6
Cohort 1 Accuracy: 99.9463806971 %
Not Readmitted %: 81.3941018767 %
cohort size: 41.25193541251935 %
 
Cohort 2 Accuracy: 99.7176204819 %
Not Readmitted %: 84.375 %
cohort size: 58.74806458748064 %


  np.exp(prob, prob)


num_change 6
 
 
 
ITERATION: 7
Cohort 1 Accuracy: 99.9464094319 %
Not Readmitted %: 81.4040728832 %
cohort size: 41.27405441274055 %
 
Cohort 2 Accuracy: 99.9246704331 %
Not Readmitted %: 84.3691148776 %
cohort size: 58.72594558725945 %


  np.exp(prob, prob)


num_change 5
 
 
 
ITERATION: 8
Cohort 1 Accuracy: 99.9463950683 %
Not Readmitted %: 81.3722862503 %
cohort size: 41.26299491262995 %
 
Cohort 2 Accuracy: 99.905855771 %
Not Readmitted %: 84.3908868386 %

  np.exp(prob, prob)



cohort size: 58.73700508737005 %
num_change 4
 
 
 
ITERATION: 9
Cohort 1 Accuracy: 99.946366318 %
Not Readmitted %: 81.3622955216 %
cohort size: 41.24087591240876 %
 
Cohort 2 Accuracy: 99.9058912102 %
Not Readmitted %: 84.3967626576 %
cohort size: 58.75912408759124

  np.exp(prob, prob)


 %
num_change 3
 
 
 
ITERATION: 10
Cohort 1 Accuracy: 99.8927038627 %
Not Readmitted %: 81.3572961373 %
cohort size: 41.22981641229816 %
 
Cohort 2 Accuracy: 99.9247271359 %
Not Readmitted %: 84.3996989085 %
cohort size: 58.770183587701844

  np.exp(prob, prob)


 %
num_change 5
 
 
 
ITERATION: 11
Cohort 1 Accuracy: 99.8926174497 %
Not Readmitted %: 81.3691275168 %
cohort size: 41.19663791196638 %
 
Cohort 2 Accuracy: 99.9623848035 %
Not Readmitted %: 84.3896934361 %
cohort size: 58.803362088033616 %


  np.exp(prob, prob)


num_change 0
break


In [49]:
sum(target_1)

694

In [50]:
sum(target_2)

830

In [51]:
cohort=cohort.astype('int')
cohort_clf = LR(data, cohort)

In [52]:
# predict cohorts for test set 
test_cohort = cohort_clf.predict(X_test)
test_cohort[0:100]

array([1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0])

In [53]:
testdata_1 = test_data[np.where(test_cohort == 0)]
testtarget_1 = test_target[np.where(test_cohort == 0)]

testdata_2 = test_data[np.where(test_cohort == 1)]
testtarget_2 = test_target[np.where(test_cohort == 1)]

In [54]:
# cohort 1
test_cohort1_pred = cohort1_Model.predict(testdata_1)
test_cohort1_acc = metrics.accuracy_score(testtarget_1, test_cohort1_pred)
print('Accuracy', test_cohort1_acc*100,'%')
print('Cohort fraction:', (len(testtarget_1)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_1)/len(testtarget_1)))*100,'%')
print("Readmitted:", (sum(testtarget_1)/len(testtarget_1))*100,'%')
print('Predicted readmitted:', (sum(test_cohort1_pred)/len(test_cohort1_pred))*100,'%')
print()

# cohort 2
test_cohort2_pred = cohort2_Model.predict(testdata_2)
test_cohort2_acc = metrics.accuracy_score(testtarget_2, test_cohort2_pred)
print('Accuracy', test_cohort2_acc*100,'%')
print('Cohort fraction:', (len(testtarget_2)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_2)/len(testtarget_2)))*100,'%')
print("Readmitted:", (sum(testtarget_2)/len(testtarget_2))*100,'%')
print('Predicted readmitted:', (sum(test_cohort2_pred)/len(test_cohort2_pred))*100,'%')
print()
print()

# total
total_accuracy = (test_cohort1_acc*(len(testtarget_1)/len(test_target))) + (test_cohort2_acc*(len(testtarget_2)/len(test_target))) 
print('Total Accuracy:', total_accuracy)

Accuracy 73.8717339667 %
Cohort fraction: 0.434468524251806
Not Readmitted: 79.1567695962 %
Readmitted: 20.8432304038 %
Predicted readmitted: 10.3919239905 %

Accuracy 83.6678832117 %
Cohort fraction: 0.5655314757481941
Not Readmitted: 86.2226277372 %
Readmitted: 13.7773722628 %
Predicted readmitted: 5.74817518248 %


Total Accuracy: 0.794117647059


## Hard EM - Random Initilization (k = 3)

In [73]:
# split train test 
X_train, X_test, Y_train, Y_test = train_test_split(df_topk_oneyear, df_final[['ONEYEARREADMIT']], stratify=df_final[['ONEYEARREADMIT']], test_size=0.3)

In [74]:
X_train.shape

(9042, 20)

In [75]:
X_test.shape

(3876, 20)

In [76]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [77]:
# randomly initialize cohort 
cohort = np.random.randint(3, size=9042)
cohort[0:10]

array([1, 0, 1, 1, 0, 2, 1, 2, 0, 0])

In [78]:
data = X_train.as_matrix()
print(data.shape)

target = (Y_train.as_matrix()).ravel()
print(target.shape)

test_data = X_test.as_matrix()
print(test_data.shape)

test_target = (Y_test.as_matrix()).ravel()
print(test_target.shape)

print(cohort.shape)

(9042, 20)
(9042,)
(3876, 20)
(3876,)
(9042,)


In [79]:
data

array([[0.0, False, 1.1, ..., 6.78, 0, 1.1],
       [1.0, False, 0.8, ..., 4.67, 0, 1.1],
       [2.0, False, 0.8, ..., 1.97, 0, 1.1],
       ..., 
       [1.0, True, 0.7, ..., 4.32, 0, 1.2],
       [3.0, True, 2.5, ..., 6.85, 0, 1.6],
       [1.0, False, 1.9, ..., 28.03, 0, 2.7]], dtype=object)

In [80]:
target[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [81]:
clfFn_test = LR(data, target)
# clfFn_test.get_params()
clfFn_test

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [82]:
clfFn_test.score(data, target)

0.83842070338420704

In [83]:
clfFn_test.predict(data)[0:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [84]:
index_counter = 1
num_change = 0
while True:
    print('ITERATION:', index_counter)
    data_1 = data[np.where(cohort == 0)]
    target_1 = target[np.where(cohort == 0)]

    data_2 = data[np.where(cohort == 1)]
    target_2 = target[np.where(cohort == 1)]
    
    data_3 = data[np.where(cohort == 2)]
    target_3 = target[np.where(cohort == 2)]


    cohort1_Model = LR(data_1, target_1)

    cohort2_Model = LR(data_2, target_2)

    cohort3_Model = LR(data_3, target_3)


    cohort1_pred = cohort1_Model.predict(data_1) #reshape(-1,1)
    cohort1_current_accuracy = metrics.accuracy_score(target_1, cohort1_pred)
    print('Cohort 1 Accuracy:', cohort1_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_1)/len(target_1)))*100,'%')
    print('cohort size:', (len(target_1)/len(target))*100,'%')
    print(" ")
    cohort2_pred = cohort2_Model.predict(data_2) #reshape(-1,1)
    cohort2_current_accuracy = metrics.accuracy_score(target_2, cohort2_pred)
    print('Cohort 2 Accuracy:', cohort2_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_2)/len(target_2)))*100,'%')
    print('cohort size:', (len(target_2)/len(target))*100,'%')
    print(" ")
    cohort3_pred = cohort3_Model.predict(data_3) #reshape(-1,1)
    cohort3_current_accuracy = metrics.accuracy_score(target_3, cohort3_pred)
    print('Cohort 3 Accuracy:', cohort3_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_3)/len(target_3)))*100,'%')
    print('cohort size:', (len(target_3)/len(target))*100,'%')


    oldcohort = copy.deepcopy(cohort)
    
    # reassign based on whichever model gives higher probability
    for i in range(len(target)):
        curr_data = data[i]
        if (target[i] == 0):  
            if (cohort1_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,0]) > ((cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 1
            else:
                cohort[i] = 2
        else:
            if (cohort1_Model.predict_proba(curr_data)[:,1]) > ((cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 1
            else:
                cohort[i] = 2
                    
    if sum(np.absolute(np.subtract(oldcohort, cohort))) == 0 or index_counter == 50 or ((cohort3_current_accuracy+cohort2_current_accuracy+cohort1_current_accuracy) == 3):
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
        print("break")
        break
    else:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
    index_counter += 1
    print(" ")
    print(" ")
    print(" ")


ITERATION: 1
Cohort 1 Accuracy: 83.7998024366 %
Not Readmitted %: 83.0754033586 %
cohort size: 33.58770183587702 %
 
Cohort 2 Accuracy: 83.2268907563 %
Not Readmitted %: 82.2521008403 %
cohort size: 32.90201282902013 %
 
Cohort 3 Accuracy: 84.4884488449 %
Not Readmitted %: 84.0924092409 %
cohort size: 33.51028533510285 %
num_change 8521
 
 
 
ITERATION: 2
Cohort 1 Accuracy: 99.9241418547 %
Not Readmitted %: 82.8750237057 %
cohort size: 58.316744083167436 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 79.7247706422 %
cohort size: 12.05485512054855 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 85.0690556178 %
cohort size: 29.62840079628401 %


  np.exp(prob, prob)


num_change 1934
 
 
 
ITERATION: 3
Cohort 1 Accuracy: 99.6224990562 %
Not Readmitted %: 82.8237070593 %
cohort size: 58.593231585932315 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 86.5930599369 %
cohort size: 28.04689228046892 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 77.3178807947 %
cohort size: 13.35987613359876 %
num_change 1182
 
 
 
ITERATION: 4
Cohort 1 Accuracy: 99.8694273456 %
Not Readmitted %: 82.3913448983 %
cohort size: 59.289980092899796 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 84.0972871843 %
cohort size: 23.645211236452113 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 84.4458846403 %
cohort size: 17.064808670648087 

  np.exp(prob, prob)


%
num_change 1461
 
 
 
ITERATION: 5
Cohort 1 Accuracy: 99.9814367923 %
Not Readmitted %: 82.3835158715 %
cohort size: 59.577527095775274 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 86.1725184222 %
cohort size: 25.514266755142668 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 81.0089020772 %
cohort size: 14.908206149082062 

  np.exp(prob, prob)


%
num_change 1160
 
 
 
ITERATION: 6
Cohort 1 Accuracy: 99.926199262 %
Not Readmitted %: 81.8265682657 %
cohort size: 59.9424905994249 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 88.9416231609 %
cohort size: 23.302366733023668 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 79.801980198 %
cohort size: 16.755142667551425

  np.exp(prob, prob)


 %
num_change 1271
 
 
 
ITERATION: 7
Cohort 1 Accuracy: 99.8865140912 %
Not Readmitted %: 80.9154529979 %
cohort size: 58.471577084715776 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 90.9262759924 %
cohort size: 23.401902234019023 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 80.2928615009 %
cohort size: 18.126520681265205 %

  np.exp(prob, prob)



num_change 1307
 
 
 
ITERATION: 8
Cohort 1 Accuracy: 99.9210422424 %
Not Readmitted %: 79.3130675089 %
cohort size: 56.027427560274276 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 93.336376084 %
cohort size: 24.231364742313648 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 81.512605042 %
cohort size: 19.74120769741208 %


  np.exp(prob, prob)


num_change 877
 
 
 
ITERATION: 9
Cohort 1 Accuracy: 99.8992950655 %
Not Readmitted %: 78.9526686808 %
cohort size: 54.91041804910418 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 94.1126997477 %
cohort size: 26.299491262994913 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 80.0470865215 %
cohort size: 18.790090687900907 

  np.exp(prob, prob)


%
num_change 567
 
 
 
ITERATION: 10
Cohort 1 Accuracy: 99.9194847021 %
Not Readmitted %: 78.8647342995 %
cohort size: 54.943596549435966 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.7221552071 %
cohort size: 27.504976775049766 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 81.5374921235 %
cohort size: 17.551426675514268 %
num_change 899
 
 
 
ITERATION: 11
Cohort 1 Accuracy: 99.8151950719 %
Not Readmitted %: 78.6858316222 %
cohort size: 53.85976553859766 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.6861702128 %
cohort size: 24.95023224950232 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 83.2463465553 %


  np.exp(prob, prob)


cohort size: 21.190002211900023 %
num_change 939
 
 
 
ITERATION: 12
Cohort 1 Accuracy: 99.9373171751 %
Not Readmitted %: 78.8549937317 %
cohort size: 52.93076752930767 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 93.7523242841 %
cohort size: 29.738995797389954 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 78.0472239949 %
cohort size: 17.330236673302366 %
num_change 780
 
 
 
ITERATION: 13
Cohort 1 Accuracy: 99.937225361 %
Not Readmitted %: 79.1588198368 %
cohort size: 52.85335102853351 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.9679762298 %
cohort size: 33.49922583499226 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 74.4732576985 %
cohort size: 13.647423136474233 %
num_change 656
 
 
 
ITERATION: 14
Cohort 1 Accuracy: 99.9791492911 %
Not Readmitted %: 79.7956630525 %
cohort size: 53.04136253041363 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.9090380703 %
cohort size: 38.0557398805574 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 61.3664596273 %
cohort size

  np.exp(prob, prob)


25.580623755806236 %
num_change 750
 
 
 
ITERATION: 28
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 94.7984790875 %
cohort size: 72.71621322716213 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 88.5486018642 %
cohort size: 8.305684583056845 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 36.1305361305 %
cohort size: 18.97810218978102 

  np.exp(prob, prob)


%
num_change 304
 
 
 
ITERATION: 29
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 94.6795461515 %
cohort size: 72.1300597213006 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 84.8878394333 %
cohort size: 9.367396593673966 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 37.298266587 %
cohort size: 18.502543685025437 

  np.exp(prob, prob)


%
num_change 387
 
 
 
ITERATION: 30
Cohort 1 Accuracy: 99.9846059113 %
Not Readmitted %: 94.6274630542 %
cohort size: 71.84251271842513 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 73.5454545455 %
cohort size: 12.165450121654501 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 38.8658367911 %
cohort size: 15.992037159920372 %


  np.exp(prob, prob)


num_change 547
 
 
 
ITERATION: 31
Cohort 1 Accuracy: 99.9842866122 %
Not Readmitted %: 94.3746071653 %
cohort size: 70.38265870382658 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 68.177613321 %
cohort size: 11.955319619553197 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 48.5284909205 %
cohort size: 17.662021676620217

  np.exp(prob, prob)


 %
num_change 981
 
 
 
ITERATION: 32
Cohort 1 Accuracy: 99.9842470069 %
Not Readmitted %: 94.3761814745 %
cohort size: 70.20570670205707 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 44.3796394486 %
cohort size: 20.85821720858217 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 85.396039604 %
cohort size: 8.936076089360762 %


  np.exp(prob, prob)


num_change 690
 
 
 
ITERATION: 33
Cohort 1 Accuracy: 99.9185269676 %
Not Readmitted %: 94.0198794199 %
cohort size: 67.87215217872152 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 43.6847599165 %
cohort size: 21.190002211900023 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 92.1132457027 %
cohort size: 10.937845609378456 %


  np.exp(prob, prob)


num_change 1716
 
 
 
ITERATION: 34
Cohort 1 Accuracy: 99.8915793278 %
Not Readmitted %: 92.3744127214 %
cohort size: 61.20327361203274 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 42.3860911271 %
cohort size: 18.447246184472462 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 92.3369565217 %
cohort size: 20.349480203494803

  np.exp(prob, prob)


 %
num_change 1703
 
 
 
ITERATION: 35
Cohort 1 Accuracy: 99.9800319489 %
Not Readmitted %: 91.0343450479 %
cohort size: 55.38597655385976 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 53.6811594203 %
cohort size: 19.077637690776378 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 88.046773495 %
cohort size: 25.536385755363856 

  np.exp(prob, prob)


%
num_change 1332
 
 
 
ITERATION: 36
Cohort 1 Accuracy: 99.8965231788 %
Not Readmitted %: 90.4387417219 %
cohort size: 53.43950453439504 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 63.3695215119 %
cohort size: 27.504976775049766 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 91.2362159025 %
cohort size:

  np.exp(prob, prob)


 19.055518690555186 %
num_change 1121
 
 
 
ITERATION: 37
Cohort 1 Accuracy: 99.9566630553 %
Not Readmitted %: 89.4041170098 %
cohort size: 51.03959301039593 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 56.6253574833 %
cohort size: 23.202831232028313 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 94.6328896522 %
cohort size: 25.757575757575758 %

  np.exp(prob, prob)



num_change 1394
 
 
 
ITERATION: 38
Cohort 1 Accuracy: 99.9775381851 %
Not Readmitted %: 88.2749326146 %
cohort size: 49.23689449236895 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 68.9978370584 %
cohort size: 30.679053306790532 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 92.1806167401 %
cohort size: 20.08405220084052 

  np.exp(prob, prob)


%
num_change 1767
 
 
 
ITERATION: 39
Cohort 1 Accuracy: 99.9298081423 %
Not Readmitted %: 86.5699578849 %
cohort size: 47.26830347268303 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 79.7002366553 %
cohort size: 42.05927892059279 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 81.5544041451 %
cohort size: 10.672417606724176 %


  np.exp(prob, prob)


num_change 860
 
 
 
ITERATION: 40
Cohort 1 Accuracy: 99.9018404908 %
Not Readmitted %: 84.6625766871 %
cohort size: 45.06746295067463 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 80.0050365147 %
cohort size: 43.91727493917275 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 89.4578313253 %
cohort size: 11.015262110152621

  np.exp(prob, prob)


 %
num_change 1506
 
 
 
ITERATION: 41
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 80.8041861746 %
cohort size: 40.15704490157045 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 83.2136667492 %
cohort size: 44.66932094669321 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 89.139941691 %
cohort size: 15.173634151736342 %


  np.exp(prob, prob)


num_change 1977
 
 
 
ITERATION: 42
Cohort 1 Accuracy: 99.7406807131 %
Not Readmitted %: 73.8087520259 %
cohort size: 34.11855784118558 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 86.9310183013 %
cohort size: 47.13558947135589 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 90.6194690265 %
cohort size: 18.745852687458527 %
num_change 2518
 
 
 
ITERATION: 43
Cohort 1 Accuracy: 99.9159310635 %
Not Readmitted %: 58.6380832282 %
cohort size: 26.310550763105507 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 90.7053034445 %
cohort size: 40.45565140455651 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 93.3444259567 %
cohort size: 33.23379783233798 %
num_change 1860
 
 
 
ITERATION: 44
Cohort 1 Accuracy: 99.5764705882 %
Not Readmitted %: 49.0823529412 %
cohort size: 23.501437735014377 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 94.5180447693 %
cohort size: 48.41849148418492 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 92.0441118551 %
cohort size: 28.08007078080071 %
num_cha

  np.exp(prob, prob)


%
num_change 1679
 
 
 
ITERATION: 46
Cohort 1 Accuracy: 99.4799791992 %
Not Readmitted %: 39.365574623 %
cohort size: 21.26741871267419 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 98.1900452489 %
cohort size: 65.99203715992037 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 78.2986111111 %
cohort size: 12.740544127405443 %
num_change 2894
 
 
 
ITERATION: 47
Cohort 1 Accuracy: 99.8936735779 %
Not Readmitted %: 36.2041467305 %
cohort size: 20.802919708029197 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 98.499849985 %
cohort size: 36.86131386861314 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 92.842215256 %
cohort size: 42.33576642335766 %
num_change 1546
 
 
 
ITERATION: 48
Cohort 1 Accuracy: 99.9461497038 %
Not Readmitted %: 34.9488422186 %
cohort size: 20.537491705374915 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 98.7545630234 %
cohort size: 51.504092015040925 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 89.7943037975 %
cohort size: 27.958416279584164 %


  np.exp(prob, prob)


num_change 1646
 
 
 
ITERATION: 49
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 35.1103931072 %
cohort size: 20.537491705374915 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 98.8742046011 %
cohort size: 67.78367617783675 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 76.3257575758 %
cohort size:

  np.exp(prob, prob)


 11.678832116788321 %
num_change 548
 
 
 
ITERATION: 50
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 35.5913978495 %
cohort size: 20.5706702057067 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 98.9485078089 %
cohort size: 71.52178721521787 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 63.9160839161 %
cohort size:

  np.exp(prob, prob)


 7.907542579075426 %
num_change 306
break


In [85]:
sum(target_1)

1198

In [86]:
sum(target_2)

68

In [87]:
sum(target_3)

258

In [88]:
cohort=cohort.astype('int')
cohort_clf = LR(data, cohort)

In [89]:
# predict cohorts for test set 
test_cohort = cohort_clf.predict(X_test)
test_cohort[0:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1])

In [90]:
testdata_1 = test_data[np.where(test_cohort == 0)]
testtarget_1 = test_target[np.where(test_cohort == 0)]

testdata_2 = test_data[np.where(test_cohort == 1)]
testtarget_2 = test_target[np.where(test_cohort == 1)]

testdata_3 = test_data[np.where(test_cohort == 2)]
testtarget_3 = test_target[np.where(test_cohort == 2)]

In [91]:
# cohort 1
test_cohort1_pred = cohort1_Model.predict(testdata_1)
test_cohort1_acc = metrics.accuracy_score(testtarget_1, test_cohort1_pred)
print('Accuracy', test_cohort1_acc*100,'%')
print('Cohort fraction:', (len(testtarget_1)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_1)/len(testtarget_1)))*100,'%')
print("Readmitted:", (sum(testtarget_1)/len(testtarget_1))*100,'%')
print('Predicted readmitted:', (sum(test_cohort1_pred)/len(test_cohort1_pred))*100,'%')
print()

# cohort 2
test_cohort2_pred = cohort2_Model.predict(testdata_2)
test_cohort2_acc = metrics.accuracy_score(testtarget_2, test_cohort2_pred)
print('Accuracy', test_cohort2_acc*100,'%')
print('Cohort fraction:', (len(testtarget_2)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_2)/len(testtarget_2)))*100,'%')
print("Readmitted:", (sum(testtarget_2)/len(testtarget_2))*100,'%')
print('Predicted readmitted:', (sum(test_cohort2_pred)/len(test_cohort2_pred))*100,'%')
print()

# cohort 3
test_cohort3_pred = cohort3_Model.predict(testdata_3)
test_cohort3_acc = metrics.accuracy_score(testtarget_3, test_cohort3_pred)
print('Accuracy', test_cohort3_acc*100,'%')
print('Cohort fraction:', (len(testtarget_3)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_3)/len(testtarget_3)))*100,'%')
print("Readmitted:", (sum(testtarget_3)/len(testtarget_3))*100,'%')
print('Predicted readmitted:', (sum(test_cohort3_pred)/len(test_cohort3_pred))*100,'%')
print()
print()

# total
total_accuracy = (test_cohort1_acc*(len(testtarget_1)/len(test_target))) + (test_cohort2_acc*(len(testtarget_2)/len(test_target))) + (test_cohort3_acc*(len(testtarget_3)/len(test_target))) 
print('Total Accuracy:', total_accuracy)

Accuracy 57.0093457944 %
Cohort fraction: 0.13802889576883384
Not Readmitted: 64.1121495327 %
Readmitted: 35.8878504673 %
Predicted readmitted: 31.7757009346 %

Accuracy 85.7964869776 %
Cohort fraction: 0.8519091847265222
Not Readmitted: 86.3718958207 %
Readmitted: 13.6281041793 %
Predicted readmitted: 0.757116898849 %

Accuracy 28.2051282051 %
Cohort fraction: 0.010061919504643963
Not Readmitted: 71.7948717949 %
Readmitted: 28.2051282051 %
Predicted readmitted: 89.7435897436 %


Total Accuracy: 0.812435500516


In [26]:
def LR(features, target):
    """
    INPUT
        features
        target
    
    OUTPUT
        optimized clf
    """

    # Create regularization penalty space
    penalty = ['l1', 'l2']

    # Create regularization hyperparameter space
    C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Create hyperparameter options
    hyperparameters = dict(C=C, penalty=penalty)
    
    clf = GridSearchCV(estimator=LogisticRegression(), 
                    param_grid=hyperparameters,
                    cv=3,
                    refit = True)

    clf.fit(features, target)
    optimised_logistic_regression = clf.best_estimator_
    return optimised_logistic_regression
    

In [110]:
def randomSplitCI(data, topKfeatures, readm_type, clf, num_run, **params):
    """
    INPUT
        data: 2D numpy array. Pre-processed data
        clf: string. Name of the classifier from ['LR', 'SVM', 'NB']
        num_run: int. How many times you want to run for random evaluation?
        params: string->real. Hyper-parameter of classifier. PS: c=1.0, r=0.01
    
    OUTPUT
        train_scores: list. Results of trials
        test_scores: list. Results of trials
        train_mean: scalar. Average accuracy
        test_mean: scalar. Average accuracy
        train_ci: scalar. Confidence Interval
        test_ci: scalar. Confidence Interval
    """
    
    features = topKfeatures.as_matrix()
    
    if (readm_type == 'THIRTYDAYREADMIT'):
        target = data[['THIRTYDAYREADMIT']].as_matrix()
        notread = len(data.loc[data['THIRTYDAYREADMIT'] == False])
        majority_class = notread/len(data['THIRTYDAYREADMIT'])
    elif (readm_type == 'ONEYEARREADMIT'):
        target = (data[['ONEYEARREADMIT']].as_matrix()).ravel()
        notread = len(data.loc[data['ONEYEARREADMIT'] == False])
        majority_class = notread/len(data['ONEYEARREADMIT'])
    else:
        return
   
    # Create regularization penalty space
    penalty = ['l1', 'l2']

    # Create regularization hyperparameter space
    C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Create hyperparameter options
    hyperparameters = dict(C=C, penalty=penalty)
    
    
    train_scores = []
    test_scores = []
    train_scores_auc = []
    test_scores_auc = []
    
    test_scores_pre = []
    train_scores_pre = []
    train_mean_pre = 0
    test_mean_pre = 0
    
    test_scores_rec = []
    train_scores_rec = []
    train_mean_rec = 0
    test_mean_rec = 0
    
    train_mean = 0
    test_mean = 0
    train_mean_auc = 0
    test_mean_auc = 0   
    train_ci = 0
    test_ci = 0
    
    
    
    if (clf == 'LR'):
#         clfFn = LogisticRegression(C=params["c"])
        clfFn = GridSearchCV(estimator=LogisticRegression(), param_grid=hyperparameters, cv=3, refit = True)
    elif (clf == 'NB'):
        clfFn = GaussianNB()
    elif (clf == 'DM'):
        clfFn = DummyClassifier(strategy='most_frequent',random_state=0)
    else:
        return
    
    for i in range(num_run):
        # x_train,      x_test,        y_train,      y_test
        features_train, features_test, target_train, target_test = train_test_split(features, target, stratify=target, test_size=0.3)
        # train_features, test_features, target_train, target_test = train_test_split(features, target, test_size=0.3)
    
        # train the features and target datasets and fit to a model
        clfModel = clfFn.fit(features_train, target_train)
                
        # predict target with feature test set using trained model
        target_pred = clfModel.predict(features_test)  # x_test
        
        # predict target with feature train test set using trained model
        feature_pred = clfModel.predict(features_train)  #x_train
        
        train_scores.append(metrics.accuracy_score(target_train, feature_pred))
        test_scores.append(metrics.accuracy_score(target_test, target_pred))
        train_scores_auc.append(metrics.roc_auc_score(target_train, feature_pred))
        test_scores_auc.append(metrics.roc_auc_score(target_test, target_pred))
        
        train_scores_pre.append(metrics.precision_score(target_train, feature_pred))
        test_scores_pre.append(metrics.precision_score(target_test, target_pred))
        train_scores_rec.append(metrics.recall_score(target_train, feature_pred))
        test_scores_rec.append(metrics.recall_score(target_test, target_pred))


        
    train_mean = np.mean(train_scores)
    test_mean = np.mean(test_scores)
    
    train_mean_auc = np.mean(train_scores_auc)
    test_mean_auc = np.mean(test_scores_auc)
    
    train_mean_pre = np.mean(train_scores_pre)
    test_mean_pre = np.mean(test_scores_pre) 
    
    train_mean_rec = np.mean(train_scores_rec)
    test_mean_rec = np.mean(test_scores_rec) 
    
    train_ci_full = st.t.interval(0.95, len(train_scores)-1, loc=train_mean, scale = st.sem(train_scores))
    test_ci_full = st.t.interval(0.95, len(test_scores)-1, loc=test_mean, scale = st.sem(test_scores))

    train_ci = train_mean - train_ci_full[0]
    test_ci = test_mean - test_ci_full[0]
    
    
    notread = len(data.loc[data['ONEYEARREADMIT'] == False])
    majority_class = notread/len(data['ONEYEARREADMIT'])
    print('not readmitted:',majority_class)

    
    ###########         end         ###########
    return train_scores,test_scores,train_mean,test_mean, train_mean_auc, test_mean_auc, train_mean_pre, test_mean_pre, train_mean_rec, test_mean_rec, train_ci,test_ci 

In [111]:
train_scores,test_scores,train_mean,test_mean, train_mean_auc, test_mean_auc, train_mean_pre, test_mean_pre, train_mean_rec, test_mean_rec, train_ci,test_ci = randomSplitCI(df_final, df_topk_oneyear, 'ONEYEARREADMIT' , 'LR', 10, c=1.0)
print("Train\
    \nResult of trails:{0} \
    \nAverage Accuracy: {1} \
    \nAverage AUC: {2} \
    \nAverage Precision: {3} \
    \nAverage Recall: {4} \
    \nConfidence Interval: {5}\n".format(train_scores, train_mean, train_mean_auc, train_mean_pre, train_mean_rec, train_ci)
     )
print("Test\
    \nResult of trails:{0} \
    \nAverage Accuracy: {1} \
    \nAverage AUC: {2} \
    \nAverage Precision: {3} \
    \nAverage Recall: {4} \
    \nConfidence Interval: {5}".format(test_scores, test_mean, test_mean_auc, test_mean_pre, test_mean_rec, test_ci)
     )

not readmitted: 0.8314754605976157
Train    
Result of trails:[0.83797832337978329, 0.83853129838531293, 0.83764653837646541, 0.83709356337093566, 0.83775713337757129, 0.83864189338641892, 0.83676177836761778, 0.83598761335987615, 0.83930546339305467, 0.83831010838310105]     
Average Accuracy: 0.8378013713780138     
Average AUC: 0.545617499628189     
Average Precision: 0.6096989271560729     
Average Recall: 0.1048556430446194     
Confidence Interval: 0.0007028918263267681

Test    
Result of trails:[0.83746130030959753, 0.83926728586171306, 0.83642930856553144, 0.83771929824561409, 0.83694530443756454, 0.836171310629515, 0.83720330237358098, 0.83952528379772962, 0.83333333333333337, 0.83668730650154799]     
Average Accuracy: 0.8370743034055728     
Average AUC: 0.5448536053318914     
Average Precision: 0.5957911334831103     
Average Recall: 0.10413476263399694     
Confidence Interval: 0.0012342430584847408


## Hard EM - Random Initilization (k = 3)

In [112]:
# split train test 
X_train, X_test, Y_train, Y_test = train_test_split(df_topk_oneyear, df_final[['ONEYEARREADMIT']], stratify=df_final[['ONEYEARREADMIT']], test_size=0.3)

In [113]:
X_train.shape

(9042, 20)

In [114]:
X_test.shape

(3876, 20)

In [115]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [116]:
# randomly initialize cohort 
cohort = np.random.randint(3, size=9042)
cohort[0:10]

array([0, 0, 0, 1, 2, 1, 1, 0, 2, 2])

In [117]:
data = X_train.as_matrix()
print(data.shape)

target = (Y_train.as_matrix()).ravel()
print(target.shape)

test_data = X_test.as_matrix()
print(test_data.shape)

test_target = (Y_test.as_matrix()).ravel()
print(test_target.shape)

print(cohort.shape)

(9042, 20)
(9042,)
(3876, 20)
(3876,)
(9042,)


In [118]:
data

array([[2.0, False, 5.5, ..., 5.96, 0, 1.0],
       [7.0, False, 0.7, ..., 2.85, 0, 1.1],
       [4.0, False, 2.0, ..., 2.01, 0, 1.0],
       ..., 
       [2.0, False, 1.0, ..., 4.49, 0, 1.3],
       [1.0, False, 1.0, ..., 2.0, 0, 1.1],
       [0.0, True, 1.9, ..., 11.31, 0, 3.2]], dtype=object)

In [119]:
target[0:10]

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [120]:
clfFn_test = LR(data, target)
# clfFn_test.get_params()
clfFn_test

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [121]:
clfFn_test.score(data, target)

0.8388630833886308

In [122]:
clfFn_test.predict(data)[0:20]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [123]:
index_counter = 1
num_change = 0
while True:
    print('ITERATION:', index_counter)
    data_1 = data[np.where(cohort == 0)]
    target_1 = target[np.where(cohort == 0)]

    data_2 = data[np.where(cohort == 1)]
    target_2 = target[np.where(cohort == 1)]
    
    data_3 = data[np.where(cohort == 2)]
    target_3 = target[np.where(cohort == 2)]


    cohort1_Model = LR(data_1, target_1)

    cohort2_Model = LR(data_2, target_2)

    cohort3_Model = LR(data_3, target_3)


    cohort1_pred = cohort1_Model.predict(data_1) #reshape(-1,1)
    cohort1_current_accuracy = metrics.accuracy_score(target_1, cohort1_pred)
    print('Cohort 1 Accuracy:', cohort1_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_1)/len(target_1)))*100,'%')
    print('cohort size:', (len(target_1)/len(target))*100,'%')
    print(" ")
    cohort2_pred = cohort2_Model.predict(data_2) #reshape(-1,1)
    cohort2_current_accuracy = metrics.accuracy_score(target_2, cohort2_pred)
    print('Cohort 2 Accuracy:', cohort2_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_2)/len(target_2)))*100,'%')
    print('cohort size:', (len(target_2)/len(target))*100,'%')
    print(" ")
    cohort3_pred = cohort3_Model.predict(data_3) #reshape(-1,1)
    cohort3_current_accuracy = metrics.accuracy_score(target_3, cohort3_pred)
    print('Cohort 3 Accuracy:', cohort3_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_3)/len(target_3)))*100,'%')
    print('cohort size:', (len(target_3)/len(target))*100,'%')


    oldcohort = copy.deepcopy(cohort)
    
    # reassign based on whichever model gives higher probability
    for i in range(len(target)):
        curr_data = data[i]
        if (target[i] == 0):  
            if (cohort1_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,0]) > ((cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 1
            else:
                cohort[i] = 2
        else:
            if (cohort1_Model.predict_proba(curr_data)[:,1]) > ((cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 1
            else:
                cohort[i] = 2
                    
    if sum(np.absolute(np.subtract(oldcohort, cohort))) == 0 or index_counter == 50 or ((cohort3_current_accuracy+cohort2_current_accuracy+cohort1_current_accuracy) == 3):
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
        print("break")
        break
    else:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
    index_counter += 1
    print(" ")
    print(" ")
    print(" ")


ITERATION: 1
Cohort 1 Accuracy: 84.2174629325 %
Not Readmitted %: 82.7347611203 %
cohort size: 33.565582835655825 %
 
Cohort 2 Accuracy: 83.7908496732 %
Not Readmitted %: 82.9411764706 %
cohort size: 33.842070338420704 %
 
Cohort 3 Accuracy: 83.7801153716 %
Not Readmitted %: 83.7801153716 %
cohort size: 32.59234682592347 %
num_change 8799
 
 
 
ITERATION: 2
Cohort 1 Accuracy: 99.9518459069 %
Not Readmitted %: 84.3017656501 %
cohort size: 68.90068568900686 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 82.3899371069 %
cohort size: 5.2753815527538155 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 80.2141327623 %
cohort size: 25.823932758239327 %
num_change 1626
 
 
 
ITERATION: 3
Cohort 1 Accuracy: 99.9679127226 %
Not Readmitted %: 84.2291031606 %
cohort size: 68.93386418933865 %
 
Cohort 2 Accuracy: 99.8910081744 %
Not Readmitted %: 93.4604904632 %
cohort size: 20.294182702941825 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 56.7761806982 %
cohort size: 10.771953107719531 %
num_c

  np.exp(prob, prob)


num_change 1227
 
 
 
ITERATION: 8
Cohort 1 Accuracy: 99.2766436264 %
Not Readmitted %: 87.86368751 %
cohort size: 68.8011501880115 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 85.5151515152 %
cohort size: 18.248175182481752 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 54.7395388557 %
cohort size: 12.950674629506748 %
num_change 1797
 
 
 
ITERATION: 9
Cohort 1 Accuracy: 99.7007978723 %
Not Readmitted %: 89.2453457447 %
cohort size: 66.53395266533953 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 75.4072924748 %
cohort size: 14.255695642556956 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 67.7605066206 %
cohort size: 19.210351692103515 

  np.exp(prob, prob)


%
num_change 1994
 
 
 
ITERATION: 10
Cohort 1 Accuracy: 97.2648995441 %
Not Readmitted %: 91.4232652372 %
cohort size: 65.50541915505418 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 79.1007493755 %
cohort size: 13.282459632824597 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 60.1147028154 %
cohort size: 21.21212121212121 %
num_change 4909
 
 
 
ITERATION: 11
Cohort 1 Accuracy: 95.7719714964 %
Not Readmitted %: 88.2185273159 %
cohort size: 46.56049546560496 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 76.3661202186 %
cohort size: 8.09555408095554 %
 
Cohort 3 Accuracy: 99.9756097561 %
Not Readmitted %: 79.1463414634 %
cohort size: 45.343950453439504 %
num_change 2816
 
 
 
ITERATION: 12
Cohort 1 Accuracy: 99.3300852619 %
Not Readmitted %: 85.0487210719 %
cohort size: 36.31939836319398 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 79.4392523364 %
cohort size: 7.100199071001991 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 82.3885848319 %
cohort size: 56.58040256580402 %

  np.exp(prob, prob)


num_change 1900
 
 
 
ITERATION: 23
Cohort 1 Accuracy: 98.0526315789 %
Not Readmitted %: 72.6052631579 %
cohort size: 42.026100420261 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.7780452576 %
cohort size: 22.970581729705817 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 89.4786729858 %
cohort size: 35.00331785003318 %
num_change 1389
 
 
 
ITERATION: 24
Cohort 1 Accuracy: 99.5045045045 %
Not Readmitted %: 75.990990991 %
cohort size: 49.1041804910418 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 93.1478968792 %
cohort size: 16.30170316301703 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 88.5869565217 %
cohort size: 34.59411634594116 %
num_change 1948
 
 
 
ITERATION: 25
Cohort 1 Accuracy: 99.5538559592 %
Not Readmitted %: 77.3741236456 %
cohort size: 52.05706702057067 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 95.6065318818 %
cohort size: 28.445034284450344 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 80.3743618832 %
cohort size: 19.49789869497899 %
num_change 1

  np.exp(prob, prob)


%
num_change 1392
 
 
 
ITERATION: 27
Cohort 1 Accuracy: 99.7057200662 %
Not Readmitted %: 80.4855618907 %
cohort size: 60.13050210130502 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 95.1142355009 %
cohort size: 31.464277814642777 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 57.3684210526 %
cohort size: 8.4052200840522 %
num_change 1008
 
 
 
ITERATION: 28
Cohort 1 Accuracy: 99.7747747748 %
Not Readmitted %: 81.9473319473 %
cohort size: 63.83543463835435 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.298136646 %
cohort size: 26.708692767086927 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 65.3801169591 %
cohort size: 9.455872594558725 %
num_change 1146
 
 
 
ITERATION: 29
Cohort 1 Accuracy: 99.601527478 %
Not Readmitted %: 83.2807570978 %
cohort size: 66.61136916611369 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 94.1709111488 %
cohort size: 19.542136695421366 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 66.9329073482 %
cohort size: 13.84649413846494 %
num_chang

  np.exp(prob, prob)


 %
num_change 1106
 
 
 
ITERATION: 31
Cohort 1 Accuracy: 99.9838657631 %
Not Readmitted %: 85.5598580187 %
cohort size: 68.54678168546782 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 92.522889115 %
cohort size: 21.74297721742977 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 45.1025056948 %
cohort size: 9.710241097102411 

  np.exp(prob, prob)


%
num_change 564
 
 
 
ITERATION: 32
Cohort 1 Accuracy: 99.9678766463 %
Not Readmitted %: 86.1387728879 %
cohort size: 68.85644768856449 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 84.1584158416 %
cohort size: 24.574209245742093 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 47.9797979798 %
cohort size: 6.569343065693431 %


  np.exp(prob, prob)


num_change 737
 
 
 
ITERATION: 33
Cohort 1 Accuracy: 99.9683394016 %
Not Readmitted %: 87.7156878265 %
cohort size: 69.86286219862862 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 85.3750674582 %
cohort size: 20.49325370493254 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 45.2981651376 %
cohort size:

  np.exp(prob, prob)


 9.64388409643884 %
num_change 534
 
 
 
ITERATION: 34
Cohort 1 Accuracy: 100.0 %
Not Readmitted %: 88.7222742893 %
cohort size: 70.8029197080292 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 87.8233438486 %
cohort size: 17.529307675293076 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 42.2748815166 %
cohort size: 11.667772616677727 

  np.exp(prob, prob)


%
num_change 529
break


In [124]:
sum(target_1)

722

In [125]:
sum(target_2)

193

In [126]:
sum(target_3)

609

In [127]:
cohort=cohort.astype('int')
cohort_clf = LR(data, cohort)

In [128]:
# predict cohorts for test set 
test_cohort = cohort_clf.predict(X_test)
test_cohort[0:100]

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [129]:
testdata_1 = test_data[np.where(test_cohort == 0)]
testtarget_1 = test_target[np.where(test_cohort == 0)]

testdata_2 = test_data[np.where(test_cohort == 1)]
testtarget_2 = test_target[np.where(test_cohort == 1)]

testdata_3 = test_data[np.where(test_cohort == 2)]
testtarget_3 = test_target[np.where(test_cohort == 2)]

In [130]:
# cohort 1
test_cohort1_pred = cohort1_Model.predict(testdata_1)
test_cohort1_acc = metrics.accuracy_score(testtarget_1, test_cohort1_pred)
print('Accuracy', test_cohort1_acc*100,'%')
print('Cohort fraction:', (len(testtarget_1)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_1)/len(testtarget_1)))*100,'%')
print("Readmitted:", (sum(testtarget_1)/len(testtarget_1))*100,'%')
print('Predicted readmitted:', (sum(test_cohort1_pred)/len(test_cohort1_pred))*100,'%')
print()

# cohort 2
test_cohort2_pred = cohort2_Model.predict(testdata_2)
test_cohort2_acc = metrics.accuracy_score(testtarget_2, test_cohort2_pred)
print('Accuracy', test_cohort2_acc*100,'%')
print('Cohort fraction:', (len(testtarget_2)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_2)/len(testtarget_2)))*100,'%')
print("Readmitted:", (sum(testtarget_2)/len(testtarget_2))*100,'%')
print('Predicted readmitted:', (sum(test_cohort2_pred)/len(test_cohort2_pred))*100,'%')
print()

# cohort 3
test_cohort3_pred = cohort3_Model.predict(testdata_3)
test_cohort3_acc = metrics.accuracy_score(testtarget_3, test_cohort3_pred)
print('Accuracy', test_cohort3_acc*100,'%')
print('Cohort fraction:', (len(testtarget_3)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_3)/len(testtarget_3)))*100,'%')
print("Readmitted:", (sum(testtarget_3)/len(testtarget_3))*100,'%')
print('Predicted readmitted:', (sum(test_cohort3_pred)/len(test_cohort3_pred))*100,'%')
print()
print()

# total
total_accuracy = (test_cohort1_acc*(len(testtarget_1)/len(test_target))) + (test_cohort2_acc*(len(testtarget_2)/len(test_target))) + (test_cohort3_acc*(len(testtarget_3)/len(test_target))) 
print('Total Accuracy:', total_accuracy)

Accuracy 82.0842714699 %
Cohort fraction: 0.8021155830753354
Not Readmitted: 85.1399163718 %
Readmitted: 14.8600836282 %
Predicted readmitted: 8.78095850756 %

Accuracy 70.7349081365 %
Cohort fraction: 0.19659442724458204
Not Readmitted: 75.3280839895 %
Readmitted: 24.6719160105 %
Predicted readmitted: 6.16797900262 %

Accuracy 60.0 %
Cohort fraction: 0.0012899896800825593
Not Readmitted: 40.0 %
Readmitted: 60.0 %
Predicted readmitted: 100.0 %


Total Accuracy: 0.798245614035


## Hard EM - Random Initilization (k = 5)

In [337]:
# split train test 
X_train, X_test, Y_train, Y_test = train_test_split(df_topk_oneyear, df_final[['ONEYEARREADMIT']], stratify=df_final[['ONEYEARREADMIT']], test_size=0.3)

In [338]:
X_train.shape

(9042, 20)

In [339]:
X_test.shape

(3876, 20)

In [340]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [341]:
# randomly initialize cohort 
cohort = np.random.randint(5, size=9042)
cohort[0:10]

array([0, 4, 0, 4, 1, 1, 2, 2, 0, 3])

In [342]:
data = X_train.as_matrix()
print(data.shape)

target = (Y_train.as_matrix()).ravel()
print(target.shape)

test_data = X_test.as_matrix()
print(test_data.shape)

test_target = (Y_test.as_matrix()).ravel()
print(test_target.shape)

print(cohort.shape)

(9042, 20)
(9042,)
(3876, 20)
(3876,)
(9042,)


In [343]:
data

array([[2.0, True, 1.1, ..., 5.6, 0, 1.5],
       [0.0, False, 1.1, ..., 9.07, 0, 1.1],
       [2.0, False, 1.0, ..., 6.08, 0, 2.3],
       ..., 
       [5.0, False, 4.0, ..., 8.44, 0, 1.3],
       [9.0, True, 1.4, ..., 10.39, 0, 3.8],
       [0.0, True, 1.0, ..., 9.87, 1, 1.4]], dtype=object)

In [344]:
target[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [345]:
clfFn_test = LR(data, target)
# clfFn_test.get_params()
clfFn_test

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [346]:
clfFn_test.score(data, target)

0.83731475337314754

In [347]:
clfFn_test.predict(data)[0:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [348]:
index_counter = 1
num_change = 0
while True:
    print('ITERATION:', index_counter)
    data_1 = data[np.where(cohort == 0)]
    target_1 = target[np.where(cohort == 0)]

    data_2 = data[np.where(cohort == 1)]
    target_2 = target[np.where(cohort == 1)]
    
    data_3 = data[np.where(cohort == 2)]
    target_3 = target[np.where(cohort == 2)]
    
    data_4 = data[np.where(cohort == 3)]
    target_4 = target[np.where(cohort == 3)]
    
    data_5 = data[np.where(cohort == 4)]
    target_5 = target[np.where(cohort == 4)]

    if sum(target_1) == 0 or sum(target_2) == 0 or sum(target_3) == 0 or sum(target_4) == 0 or sum(target_5) == 0:
        break

    cohort1_Model = LR(data_1, target_1)

    cohort2_Model = LR(data_2, target_2)

    cohort3_Model = LR(data_3, target_3)

    cohort4_Model = LR(data_4, target_4)

    cohort5_Model = LR(data_5, target_5)


    cohort1_pred = cohort1_Model.predict(data_1) #reshape(-1,1)
    cohort1_current_accuracy = metrics.accuracy_score(target_1, cohort1_pred)
    print('Cohort 1 Accuracy:', cohort1_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_1)/len(target_1)))*100,'%')
    print('cohort size:', (len(target_1)/len(target))*100,'%')
    print(" ")
    cohort2_pred = cohort2_Model.predict(data_2) #reshape(-1,1)
    cohort2_current_accuracy = metrics.accuracy_score(target_2, cohort2_pred)
    print('Cohort 2 Accuracy:', cohort2_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_2)/len(target_2)))*100,'%')
    print('cohort size:', (len(target_2)/len(target))*100,'%')
    print(" ")
    cohort3_pred = cohort3_Model.predict(data_3) #reshape(-1,1)
    cohort3_current_accuracy = metrics.accuracy_score(target_3, cohort3_pred)
    print('Cohort 3 Accuracy:', cohort3_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_3)/len(target_3)))*100,'%')
    print('cohort size:', (len(target_3)/len(target))*100,'%')
    print(" ")
    cohort4_pred = cohort4_Model.predict(data_4) #reshape(-1,1)
    cohort4_current_accuracy = metrics.accuracy_score(target_4, cohort4_pred)
    print('Cohort 4 Accuracy:', cohort4_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_4)/len(target_4)))*100,'%')
    print('cohort size:', (len(target_4)/len(target))*100,'%')
    print(" ")
    cohort5_pred = cohort5_Model.predict(data_5) #reshape(-1,1)
    cohort5_current_accuracy = metrics.accuracy_score(target_5, cohort5_pred)
    print('Cohort 5 Accuracy:', cohort5_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_5)/len(target_5)))*100,'%')
    print('cohort size:', (len(target_5)/len(target))*100,'%')


    oldcohort = copy.deepcopy(cohort)
    
    # reassign based on whichever model gives higher probability
    for i in range(len(target)):
        curr_data = data[i]
        if (target[i] == 0):  
            if (cohort1_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0]) and (cohort4_Model.predict_proba(curr_data)[:,0]) and (cohort5_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,0]) > ((cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0]) and (cohort4_Model.predict_proba(curr_data)[:,0]) and (cohort5_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 1
            elif (cohort3_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort4_Model.predict_proba(curr_data)[:,0]) and (cohort5_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 2
            elif (cohort4_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0]) and (cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort5_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 3
            else:
                cohort[i] = 4


        else:
            if (cohort1_Model.predict_proba(curr_data)[:,1]) > ((cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1]) and (cohort4_Model.predict_proba(curr_data)[:,1]) and (cohort5_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1]) and (cohort4_Model.predict_proba(curr_data)[:,1]) and (cohort5_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 1
            elif (cohort3_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort4_Model.predict_proba(curr_data)[:,1]) and (cohort5_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 2
            elif (cohort4_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1]) and (cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort5_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 3
            else:
                cohort[i] = 4


                    
    if sum(np.absolute(np.subtract(oldcohort, cohort))) == 0 or index_counter == 50 or ((cohort3_current_accuracy+cohort2_current_accuracy+cohort1_current_accuracy+cohort4_current_accuracy+cohort5_current_accuracy) > 4.99):
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
        print("break")
        break
    else:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
    index_counter += 1
    print(" ")
    print(" ")
    print(" ")


ITERATION: 1
Cohort 1 Accuracy: 85.4961832061 %
Not Readmitted %: 84.8964013086 %
cohort size: 20.283123202831234 %
 
Cohort 2 Accuracy: 82.8680897646 %
Not Readmitted %: 82.8680897646 %
cohort size: 20.205706702057068 %
 
Cohort 3 Accuracy: 83.826754386 %
Not Readmitted %: 82.8399122807 %
cohort size: 20.17252820172528 %
 
Cohort 4 Accuracy: 84.5637583893 %
Not Readmitted %: 83.0536912752 %
cohort size: 19.774386197743862 %
 
Cohort 5 Accuracy: 82.7586206897 %
Not Readmitted %: 82.0237422272 %
cohort size: 19.564255695642558 %
num_change 16074
 
 
 
ITERATION: 2
Cohort 1 Accuracy: 99.8627080831 %
Not Readmitted %: 92.3116526515 %
cohort size: 64.44370714443707 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 50.5202312139 %
cohort size: 9.566467595664676 %
 
Cohort 3 Accuracy: 99.8783454988 %
Not Readmitted %: 81.8734793187 %
cohort size: 9.090909090909092 %
 
Cohort 4 Accuracy: 100.0 %
Not Readmitted %: 85.3846153846 %
cohort size: 5.7509400575094 %
 
Cohort 5 Accuracy: 100.0 %
Not R

In [349]:
sum(target_1)

448

In [350]:
sum(target_2)

428

In [351]:
sum(target_3)

149

In [352]:
sum(target_4)

76

In [353]:
sum(target_5)

423

In [354]:
cohort=cohort.astype('int')
cohort_clf = LR(data, cohort)

In [355]:
# predict cohorts for test set 
test_cohort = cohort_clf.predict(X_test)
test_cohort[0:100]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [358]:
np.any(test_cohort == 2)

True

In [359]:
testdata_1 = test_data[np.where(test_cohort == 0)]
testtarget_1 = test_target[np.where(test_cohort == 0)]

testdata_2 = test_data[np.where(test_cohort == 1)]
testtarget_2 = test_target[np.where(test_cohort == 1)]

testdata_3 = test_data[np.where(test_cohort == 2)]
testtarget_3 = test_target[np.where(test_cohort == 2)]

testdata_4 = test_data[np.where(test_cohort == 3)]
testtarget_4 = test_target[np.where(test_cohort == 3)]

testdata_5 = test_data[np.where(test_cohort == 4)]
testtarget_5 = test_target[np.where(test_cohort == 4)]

In [360]:
# cohort 1
test_cohort1_pred = cohort1_Model.predict(testdata_1)
test_cohort1_acc = metrics.accuracy_score(testtarget_1, test_cohort1_pred)
print('Accuracy', test_cohort1_acc*100,'%')
print('Cohort fraction:', (len(testtarget_1)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_1)/len(testtarget_1)))*100,'%')
print("Readmitted:", (sum(testtarget_1)/len(testtarget_1))*100,'%')
print('Predicted readmitted:', (sum(test_cohort1_pred)/len(test_cohort1_pred))*100,'%')
print()

# cohort 2
test_cohort2_pred = cohort2_Model.predict(testdata_2)
test_cohort2_acc = metrics.accuracy_score(testtarget_2, test_cohort2_pred)
print('Accuracy', test_cohort2_acc*100,'%')
print('Cohort fraction:', (len(testtarget_2)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_2)/len(testtarget_2)))*100,'%')
print("Readmitted:", (sum(testtarget_2)/len(testtarget_2))*100,'%')
print('Predicted readmitted:', (sum(test_cohort2_pred)/len(test_cohort2_pred))*100,'%')
print()

# cohort 3
test_cohort3_pred = cohort3_Model.predict(testdata_3)
test_cohort3_acc = metrics.accuracy_score(testtarget_3, test_cohort3_pred)
print('Accuracy', test_cohort3_acc*100,'%')
print('Cohort fraction:', (len(testtarget_3)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_3)/len(testtarget_3)))*100,'%')
print("Readmitted:", (sum(testtarget_3)/len(testtarget_3))*100,'%')
print('Predicted readmitted:', (sum(test_cohort3_pred)/len(test_cohort3_pred))*100,'%')
print()
print()

# cohort 4
test_cohort4_pred = cohort4_Model.predict(testdata_4)
test_cohort4_acc = metrics.accuracy_score(testtarget_4, test_cohort4_pred)
print('Accuracy', test_cohort4_acc*100,'%')
print('Cohort fraction:', (len(testtarget_4)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_4)/len(testtarget_4)))*100,'%')
print("Readmitted:", (sum(testtarget_4)/len(testtarget_4))*100,'%')
print('Predicted readmitted:', (sum(test_cohort4_pred)/len(test_cohort4_pred))*100,'%')
print()
print()


# cohort 5
test_cohort5_pred = cohort5_Model.predict(testdata_5)
test_cohort5_acc = metrics.accuracy_score(testtarget_5, test_cohort5_pred)
print('Accuracy', test_cohort5_acc*100,'%')
print('Cohort fraction:', (len(testtarget_5)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_5)/len(testtarget_5)))*100,'%')
print("Readmitted:", (sum(testtarget_5)/len(testtarget_5))*100,'%')
print('Predicted readmitted:', (sum(test_cohort5_pred)/len(test_cohort5_pred))*100,'%')
print()
print()


# total

total_accuracy = (test_cohort1_acc*(len(testtarget_1)/len(test_target))) + (test_cohort2_acc*(len(testtarget_2)/len(test_target))) + (test_cohort3_acc*(len(testtarget_3)/len(test_target)))  + (test_cohort5_acc*(len(testtarget_5)/len(test_target))) + (test_cohort4_acc*(len(testtarget_4)/len(test_target))) 
print('Total Accuracy:', total_accuracy)

Accuracy 68.409466153 %
Cohort fraction: 0.9375644994840041
Not Readmitted: 84.3423225096 %
Readmitted: 15.6576774904 %
Predicted readmitted: 22.3720418272 %

Accuracy 60.1123595506 %
Cohort fraction: 0.04592363261093911
Not Readmitted: 73.0337078652 %
Readmitted: 26.9662921348 %
Predicted readmitted: 20.7865168539 %

Accuracy 33.3333333333 %
Cohort fraction: 0.0007739938080495357
Not Readmitted: 100.0 %
Readmitted: 0.0 %
Predicted readmitted: 66.6666666667 %


Accuracy 50.0 %
Cohort fraction: 0.0010319917440660474
Not Readmitted: 50.0 %
Readmitted: 50.0 %
Predicted readmitted: 100.0 %


Accuracy 63.1578947368 %
Cohort fraction: 0.014705882352941176
Not Readmitted: 40.350877193 %
Readmitted: 59.649122807 %
Predicted readmitted: 89.4736842105 %


Total Accuracy: 0.679050567595


## Hard EM - Random Initilization (k = 4)

In [315]:
# split train test 
X_train, X_test, Y_train, Y_test = train_test_split(df_topk_oneyear, df_final[['ONEYEARREADMIT']], stratify=df_final[['ONEYEARREADMIT']], test_size=0.3)

In [316]:
X_train.shape

(9042, 20)

In [317]:
X_test.shape

(3876, 20)

In [318]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [319]:
# randomly initialize cohort 
cohort = np.random.randint(4, size=9042)
cohort[0:10]

array([2, 1, 3, 2, 0, 0, 2, 3, 2, 2])

In [320]:
data = X_train.as_matrix()
print(data.shape)

target = (Y_train.as_matrix()).ravel()
print(target.shape)

test_data = X_test.as_matrix()
print(test_data.shape)

test_target = (Y_test.as_matrix()).ravel()
print(test_target.shape)

print(cohort.shape)

(9042, 20)
(9042,)
(3876, 20)
(3876,)
(9042,)


In [321]:
data

array([[2.0, False, 1.1, ..., 3.3, 0, 1.3],
       [5.0, False, 2.4, ..., 8.73, 1, 2.2],
       [5.0, False, 2.8, ..., 10.09, 0, 1.0],
       ..., 
       [0.0, False, 1.1, ..., 1.83, 0, 1.1],
       [4.0, False, 11.2, ..., 15.39, 1, 1.9],
       [0.0, False, 1.1, ..., 3.15, 1, 1.3]], dtype=object)

In [322]:
target[0:10]

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [323]:
clfFn_test = LR(data, target)
# clfFn_test.get_params()
clfFn_test

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [324]:
clfFn_test.score(data, target)

0.83709356337093566

In [325]:
clfFn_test.predict(data)[0:20]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [326]:
index_counter = 1
num_change = 0
while True:
    print('ITERATION:', index_counter)
    data_1 = data[np.where(cohort == 0)]
    target_1 = target[np.where(cohort == 0)]

    data_2 = data[np.where(cohort == 1)]
    target_2 = target[np.where(cohort == 1)]
    
    data_3 = data[np.where(cohort == 2)]
    target_3 = target[np.where(cohort == 2)]
    
    data_4 = data[np.where(cohort == 3)]
    target_4 = target[np.where(cohort == 3)]
    
    
    if sum(target_1) == 0 or sum(target_2) == 0 or sum(target_3) == 0 or sum(target_4) == 0:
        break
    
    cohort1_Model = LR(data_1, target_1)

    cohort2_Model = LR(data_2, target_2)

    cohort3_Model = LR(data_3, target_3)

    cohort4_Model = LR(data_4, target_4)



    cohort1_pred = cohort1_Model.predict(data_1) #reshape(-1,1)
    cohort1_current_accuracy = metrics.accuracy_score(target_1, cohort1_pred)
    print('Cohort 1 Accuracy:', cohort1_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_1)/len(target_1)))*100,'%')
    print('cohort size:', (len(target_1)/len(target))*100,'%')
    print(" ")
    cohort2_pred = cohort2_Model.predict(data_2) #reshape(-1,1)
    cohort2_current_accuracy = metrics.accuracy_score(target_2, cohort2_pred)
    print('Cohort 2 Accuracy:', cohort2_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_2)/len(target_2)))*100,'%')
    print('cohort size:', (len(target_2)/len(target))*100,'%')
    print(" ")
    cohort3_pred = cohort3_Model.predict(data_3) #reshape(-1,1)
    cohort3_current_accuracy = metrics.accuracy_score(target_3, cohort3_pred)
    print('Cohort 3 Accuracy:', cohort3_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_3)/len(target_3)))*100,'%')
    print('cohort size:', (len(target_3)/len(target))*100,'%')
    print(" ")
    cohort4_pred = cohort4_Model.predict(data_4) #reshape(-1,1)
    cohort4_current_accuracy = metrics.accuracy_score(target_4, cohort4_pred)
    print('Cohort 4 Accuracy:', cohort4_current_accuracy*100, '%')
    print("Not Readmitted %:", (1 - (sum(target_4)/len(target_4)))*100,'%')
    print('cohort size:', (len(target_4)/len(target))*100,'%')
    print(" ")



    oldcohort = copy.deepcopy(cohort)
    
    # reassign based on whichever model gives higher probability
    for i in range(len(target)):
        curr_data = data[i]
        if (target[i] == 0):  
            if (cohort1_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0]) and (cohort4_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,0]) > ((cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort3_Model.predict_proba(curr_data)[:,0]) and (cohort4_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 1
            elif (cohort3_Model.predict_proba(curr_data)[:,0]) > ((cohort2_Model.predict_proba(curr_data)[:,0]) and (cohort1_Model.predict_proba(curr_data)[:,0]) and (cohort4_Model.predict_proba(curr_data)[:,0])) :
                cohort[i] = 2
            else:
                cohort[i] = 3
        else:
            if (cohort1_Model.predict_proba(curr_data)[:,1]) > ((cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1]) and (cohort4_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 0
            elif (cohort2_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort3_Model.predict_proba(curr_data)[:,1]) and (cohort4_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 1
            elif (cohort3_Model.predict_proba(curr_data)[:,1]) > ((cohort1_Model.predict_proba(curr_data)[:,1]) and (cohort2_Model.predict_proba(curr_data)[:,1]) and (cohort4_Model.predict_proba(curr_data)[:,1])):
                cohort[i] = 2
            else:
                cohort[i] = 3


                    
    if sum(np.absolute(np.subtract(oldcohort, cohort))) == 0 or index_counter == 50:  # or ((cohort3_current_accuracy+cohort2_current_accuracy+cohort1_current_accuracy+cohort4_current_accuracy) > 3.999):
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
        print("break")
        break
    else:
        num_change = sum(np.absolute(np.subtract(oldcohort, cohort)))
        print('num_change', num_change)
    index_counter += 1
    print(" ")
    print(" ")
    print(" ")


ITERATION: 1
Cohort 1 Accuracy: 84.4048653345 %
Not Readmitted %: 83.2319721981 %
cohort size: 25.458969254589693 %
 
Cohort 2 Accuracy: 83.592321755 %
Not Readmitted %: 82.9524680073 %
cohort size: 24.198186241981862 %
 
Cohort 3 Accuracy: 83.8808585195 %
Not Readmitted %: 83.3990363557 %
cohort size: 25.24883875248839 %
 
Cohort 4 Accuracy: 83.5610401058 %
Not Readmitted %: 82.9881004848 %
cohort size: 25.094005750940056 %
 
num_change 12431
 
 
 
ITERATION: 2
Cohort 1 Accuracy: 99.8266597331 %
Not Readmitted %: 86.9301438724 %
cohort size: 63.802256138022564 %
 
Cohort 2 Accuracy: 100.0 %
Not Readmitted %: 73.4732824427 %
cohort size: 17.38553417385534 %
 
Cohort 3 Accuracy: 100.0 %
Not Readmitted %: 84.1849148418 %
cohort size: 4.545454545454546 %
 
Cohort 4 Accuracy: 100.0 %
Not Readmitted %: 77.6744186047 %
cohort size: 14.26675514266755 %
 
num_change 2554
 
 
 
ITERATION: 3
Cohort 1 Accuracy: 99.6481970097 %
Not Readmitted %: 86.4204045734 %
cohort size: 62.87325812873258 %
 
C

In [327]:
sum(target_1)

767

In [328]:
sum(target_2)

621

In [329]:
sum(target_3)

0

In [330]:
sum(target_4)

136

In [331]:
cohort=cohort.astype('int')
cohort_clf = LR(data, cohort)

In [332]:
cohort[1:1000]

array([0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 2, 1, 0, 3, 1, 0, 2, 1, 1, 0, 2, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 3, 0, 0, 0, 3,
       0, 0, 1, 1, 0, 1, 0, 2, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 2, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 3, 0, 0, 1, 0, 0, 1, 1, 3, 1, 3, 3, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 1, 1, 3, 1, 1, 0, 0, 2, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 0,
       1, 1, 3, 0, 0, 1, 0, 0, 1, 1, 0, 3, 1, 1, 0, 1, 0, 0, 3, 3, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 3, 2, 0, 0, 3, 0, 0, 1, 0, 1, 3, 0,
       0, 0, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 1, 0, 3, 1, 1, 0, 0, 0, 2, 0,
       1, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 1,
       0, 1, 1, 3, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 1, 0, 0, 2,
       3, 1, 1, 0, 1, 3, 2, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0, 0,
       1, 1,

In [333]:
# predict cohorts for test set 
test_cohort = cohort_clf.predict(X_test)
test_cohort[0:100]

array([1, 0, 0, 3, 1, 0, 0, 1, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 3,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 3, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1])

In [334]:
np.any(test_cohort == 2)

True

In [335]:
testdata_1 = test_data[np.where(test_cohort == 0)]
testtarget_1 = test_target[np.where(test_cohort == 0)]

testdata_2 = test_data[np.where(test_cohort == 1)]
testtarget_2 = test_target[np.where(test_cohort == 1)]

testdata_3 = test_data[np.where(test_cohort == 2)]
testtarget_3 = test_target[np.where(test_cohort == 2)]

testdata_4 = test_data[np.where(test_cohort == 3)]
testtarget_4 = test_target[np.where(test_cohort == 3)]

In [336]:
# cohort 1
test_cohort1_pred = cohort1_Model.predict(testdata_1)
test_cohort1_acc = metrics.accuracy_score(testtarget_1, test_cohort1_pred)
print('Accuracy', test_cohort1_acc*100,'%')
print('Cohort fraction:', (len(testtarget_1)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_1)/len(testtarget_1)))*100,'%')
print("Readmitted:", (sum(testtarget_1)/len(testtarget_1))*100,'%')
print('Predicted readmitted:', (sum(test_cohort1_pred)/len(test_cohort1_pred))*100,'%')
print()

# cohort 2
test_cohort2_pred = cohort2_Model.predict(testdata_2)
test_cohort2_acc = metrics.accuracy_score(testtarget_2, test_cohort2_pred)
print('Accuracy', test_cohort2_acc*100,'%')
print('Cohort fraction:', (len(testtarget_2)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_2)/len(testtarget_2)))*100,'%')
print("Readmitted:", (sum(testtarget_2)/len(testtarget_2))*100,'%')
print('Predicted readmitted:', (sum(test_cohort2_pred)/len(test_cohort2_pred))*100,'%')
print()

# cohort 3
test_cohort3_pred = cohort3_Model.predict(testdata_3)
test_cohort3_acc = metrics.accuracy_score(testtarget_3, test_cohort3_pred)
print('Accuracy', test_cohort3_acc*100,'%')
print('Cohort fraction:', (len(testtarget_3)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_3)/len(testtarget_3)))*100,'%')
print("Readmitted:", (sum(testtarget_3)/len(testtarget_3))*100,'%')
print('Predicted readmitted:', (sum(test_cohort3_pred)/len(test_cohort3_pred))*100,'%')
print()
print()

# cohort 5
test_cohort5_pred = cohort5_Model.predict(testdata_5)
test_cohort5_acc = metrics.accuracy_score(testtarget_5, test_cohort5_pred)
print('Accuracy', test_cohort5_acc*100,'%')
print('Cohort fraction:', (len(testtarget_5)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_5)/len(testtarget_5)))*100,'%')
print("Readmitted:", (sum(testtarget_5)/len(testtarget_5))*100,'%')
print('Predicted readmitted:', (sum(test_cohort5_pred)/len(test_cohort5_pred))*100,'%')
print()
print()

# cohort 4
test_cohort4_pred = cohort4_Model.predict(testdata_4)
test_cohort4_acc = metrics.accuracy_score(testtarget_4, test_cohort4_pred)
print('Accuracy', test_cohort4_acc*100,'%')
print('Cohort fraction:', (len(testtarget_4)/len(test_target)))
print("Not Readmitted:", (1 - (sum(testtarget_4)/len(testtarget_4)))*100,'%')
print("Readmitted:", (sum(testtarget_4)/len(testtarget_4))*100,'%')
print('Predicted readmitted:', (sum(test_cohort4_pred)/len(test_cohort4_pred))*100,'%')
print()
print()



# total
total_accuracy = (test_cohort1_acc*(len(testtarget_1)/len(test_target))) + (test_cohort2_acc*(len(testtarget_2)/len(test_target))) + (test_cohort3_acc*(len(testtarget_3)/len(test_target)))  + (test_cohort4_acc*(len(testtarget_4)/len(test_target))) 
print('Total Accuracy:', total_accuracy)

Accuracy 80.8510638298 %
Cohort fraction: 0.5699174406604747
Not Readmitted: 84.5178813943 %
Readmitted: 15.4821186057 %
Predicted readmitted: 7.46944318696 %

Accuracy 71.8348002708 %
Cohort fraction: 0.381062951496388
Not Readmitted: 79.6885578876 %
Readmitted: 20.3114421124 %
Predicted readmitted: 10.6973595125 %

Accuracy 94.1176470588 %
Cohort fraction: 0.0043859649122807015
Not Readmitted: 94.1176470588 %
Readmitted: 5.88235294118 %
Predicted readmitted: 0.0 %


Accuracy 93.7192118227 %
Cohort fraction: 0.20949432404540763
Not Readmitted: 93.7192118227 %
Readmitted: 6.28078817734 %
Predicted readmitted: 0.0 %


Accuracy 93.6416184971 %
Cohort fraction: 0.04463364293085655
Not Readmitted: 94.2196531792 %
Readmitted: 5.78034682081 %
Predicted readmitted: 0.578034682081 %


Total Accuracy: 0.78044375645
