In [1]:
import pandas as pd 
import numpy as np
import gzip
import math
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA


In [16]:
feature_arr_sfs=np.zeros([45,1])
feature_arr_sbs=np.zeros([45,1])

def scale(X_orig):
    nom = (X_orig-X_orig.min(axis=0))
    print("shape of nom",np.shape(nom))
    denom = (X_orig.max(axis=1)-X_orig.min(axis=1))
    denom[denom==0] = 1
    nom[np.reshape(denom,(45,))==0]=0.5
    ret_val=np.transpose(nom)/denom
    return np.transpose(ret_val) 
'''
def regression(X,y,T_x,T_y):
    clf = Ridge(alpha=1.0)
    clf.fit(np.transpose(X),np.transpose(y)) 
    print("Accuracy",clf.score(np.transpose(T_x),np.transpose(T_y)))
    print("Coefs",clf.coef_)
    print("features that matter",np.argwhere(clf.coef_))


def ridge_class(X,y,T_x,T_y):
    clf = RidgeClassifier().fit(np.transpose(X), np.transpose(y))
    print(clf.score(np.transpose(T_x),np.transpose(T_y))) 

'''


dataframe = pd.read_csv("dataset-curation/curated_data_v0.csv")  
dataframe.replace('?',np.NaN,inplace=True)
dataframe.nunique()
selected_encounters = dataframe.groupby('patient_nbr', as_index=False)['encounter_id'].idxmin()
curated_df = dataframe.loc[selected_encounters]
use_dataframe=curated_df


def shuffle(df):
    return df.sample(frac=1).reset_index(drop=True)

# drop columns with specified label
def dropColumn(df, colLabels):
    for colLabel in colLabels:
        if colLabel in df.columns:
            df.drop([colLabel], axis=1, inplace=True)
    return df

# convert feature from categorical to binary
def categoricalToBinary(df, colLabel):
    if colLabel in df.columns:
        df = pandas.get_dummies(df, columns=[colLabel])
    return df

# return features (X) and target (Y) from dataframe
def seperateDataTargets(df, targetLabel):
    X = df.drop(targetLabel, axis=1).values
    Y = df[targetLabel].values.ravel()
    return X, Y

# convert to binary classes: 1 (positive) or 0 (negative)
def convertToBinaryClass(y):
    for i in range(len(y)):
        if y[i] != 1:
            y[i] = 0
        else:
            y[i] = 1
    return np.array(y)

target_name = 'readmitted'
use_dataframe = shuffle(use_dataframe)
use_dataframe = dropColumn(use_dataframe, ['encounter_id', 'patient_nbr', 'weight','payer_code', 'medical_specialty'])

obj_to_category_list = use_dataframe.select_dtypes(include=['object']).columns.tolist()

for obj in obj_to_category_list:
    use_dataframe[obj] = use_dataframe[obj].astype('category')


# save label encodings to le_dict
le_dict = {}
cols_for_le = obj_to_category_list

for col in cols_for_le:
    le_dict[col] = dict(enumerate(use_dataframe[col].cat.categories))


# label encoding
for col in cols_for_le:
    use_dataframe[col] = use_dataframe[col].cat.codes

cols_for_he = [ 'race' , 'gender', 'age' ,'glyburide-metformin','insulin','miglitol','acarbose','rosiglitazone','pioglitazone','glyburide','glipizide','glimepiride','chlorpropamide','nateglinide','repaglinide','metformin','A1Cresult','max_glu_serum'] # can try: 'diag_1', 'diag_2', 'diag_3'
use_dataframe = pd.get_dummies(use_dataframe, columns=cols_for_he, dummy_na=True)

# one hot encoding
# cols_for_he = ['RACE','ETHNICITY','Molecular_Profile','Mutation','Drugs','Type'] # removed 'BIRTHPLACE'
# use_dataframe = pandas.get_dummies(use_dataframe, columns=cols_for_he, dummy_na=True)


# convert boolean classes to int (0 - False, 1 - True)
# use_dataframe['Diabetic'] = use_dataframe['Diabetic'].astype(int)
# use_dataframe['Hypertension'] = use_dataframe['Hypertension'].astype(int)


# seperate X, Y and binarize outcome Y
X, Y = seperateDataTargets(use_dataframe, [target_name])

skiplabels = ['diag_1', 'diag_2', 'diag_3']
keys = list(le_dict.keys())
#pp.pprint([{k: le_dict[k]} for k in keys if k not in skiplabels])


feature_names = use_dataframe.drop([target_name, 'encounter_id', 'patient_nbr','weight','payer_code'], axis=1, errors='ignore').columns.tolist()
class_names = ['>30', '<30', 'NO'] # >30 readmission, <30 days readmission, No readmission

# number of rows and columns
num_cols = use_dataframe.shape[1]
num_rows = use_dataframe.shape[0]

# print basic data set characteristics
print('\n'.join(feature_names))

print(use_dataframe.head())
use_dataframe=use_dataframe.as_matrix()
use_dataframe=np.transpose(use_dataframe)
rows=np.shape(use_dataframe)[0]
cols=np.shape(use_dataframe)[1]
data_y=use_dataframe[rows-1,:]
use_dataframe=np.delete(use_dataframe,rows-1,axis=0)
print("leftover features",np.shape(use_dataframe)[0])
#use_dataframe=scale(use_dataframe)
#test=use_dataframe[:,90000:]
# data_y=np.reshape(data_y,(1,cols))
data_y[data_y < 2] = 1
#test_l=data_y[:,90000:]
print("shape of data", np.shape(use_dataframe))
print("shape of data_y", np.shape(data_y))
'''
pca = PCA(n_components=1)
trans_use_dataframe=pca.fit_transform(np.transpose(use_dataframe))
print(pca.explained_variance_ratio_)
'''

trans_use_dataframe=np.transpose(use_dataframe)


admission_type_id
discharge_disposition_id
admission_source_id
time_in_hospital
num_lab_procedures
num_procedures
num_medications
number_outpatient
number_emergency
number_inpatient
diag_1
diag_2
diag_3
number_diagnoses
acetohexamide
tolbutamide
troglitazone
tolazamide
examide
citoglipton
glipizide-metformin
glimepiride-pioglitazone
metformin-rosiglitazone
metformin-pioglitazone
change
diabetesMed
race_-1.0
race_0.0
race_1.0
race_2.0
race_3.0
race_4.0
race_nan
gender_0.0
gender_1.0
gender_2.0
gender_nan
age_0.0
age_1.0
age_2.0
age_3.0
age_4.0
age_5.0
age_6.0
age_7.0
age_8.0
age_9.0
age_nan
glyburide-metformin_0.0
glyburide-metformin_1.0
glyburide-metformin_2.0
glyburide-metformin_3.0
glyburide-metformin_nan
insulin_0.0
insulin_1.0
insulin_2.0
insulin_3.0
insulin_nan
miglitol_0.0
miglitol_1.0
miglitol_2.0
miglitol_3.0
miglitol_nan
acarbose_0.0
acarbose_1.0
acarbose_2.0
acarbose_nan
rosiglitazone_0.0
rosiglitazone_1.0
rosiglitazone_2.0
rosiglitazone_3.0
rosiglitazone_nan
pioglitazone_0.0



In [17]:
np.bincount(data_y)

array([    0, 71518], dtype=int64)

In [None]:
clf = RidgeClassifier()
scores = cross_val_score(clf, trans_use_dataframe, data_y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))