In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
pd.set_option("max_columns", 50)

diab_final = pd.read_csv("data/clean_data_2class.csv")

from sklearn.model_selection import train_test_split

# columns need to be deleted since they are not adding any value
del_cols = ["Unnamed: 0", "encounter_id", "patient_nbr", "weight", "discharge_disposition", 
            "admission_source", "payer_code", "medical_specialty", "admisison_type"]

for col in del_cols:
    if col in diab_final.columns:
        diab_final.drop(col, inplace=True, axis = 1)
        #del diab_final[col]

In [2]:
# split predictors and outcome
predictors = diab_final.drop("readmitted", axis = 1)
outcome = diab_final["readmitted"]

In [12]:
# list selected from feature importance from random forest
pick_list = ["number_inpatient", "num_medications", "number_emergency", "number_outpatient", "num_lab_procedures", 
             "number_diagnoses", "age", "time_in_hospital", "num_procedures", "diag_1_NP", "admission_type_HMO Referral", 
             "diabetesMed_Yes", "admission_type_Transfer from another health care facility", "admission_type_Physician Referral", 
             "gender_Male", "diag_2_CSD", "metformin_No", "race_Caucasian", "gender_Female", "insulin_Steady", "insulin_No", 
             "metformin_Steady", "diag_1_ENMI", "diag_3_ENMI", "A1Cresult_None", "admission_type_Clinic Referral", "diag_1_CSD", 
             "diag_3_CSD", "diag_2_ENMI", "diabetesMed_No", "insulin_Down", "change_Ch", "diag_1_PCC", "race_AfricanAmerican", 
             "change_No", "max_glu_serum_None", "diag_2_RSD", "diag_2_SSD"]

In [13]:
predictors.head()

Unnamed: 0,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,admission_type
0,Caucasian,Female,1,3,59,0,18,0,0,0,ENMI,ENMI,ENMI,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,Physician Referral
1,AfricanAmerican,Female,2,2,11,5,13,2,0,1,PCC,ENMI,CFI,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,Physician Referral
2,Caucasian,Male,3,2,44,1,16,0,0,0,IPD,ENMI,CSD,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,Physician Referral
3,Caucasian,Male,4,1,51,0,8,0,0,0,NP,NP,ENMI,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,Physician Referral
4,Caucasian,Male,5,3,31,6,16,0,0,0,CSD,CSD,ENMI,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,Clinic Referral


In [14]:
dummy_predictor = pd.get_dummies(predictors)
dummy_predictor = dummy_predictor[pick_list]

In [15]:
dummy_predictor.head()

Unnamed: 0,number_inpatient,num_medications,number_emergency,number_outpatient,num_lab_procedures,number_diagnoses,age,time_in_hospital,num_procedures,diag_1_NP,admission_type_HMO Referral,diabetesMed_Yes,admission_type_Transfer from another health care facility,admission_type_Physician Referral,gender_Male,diag_2_CSD,metformin_No,race_Caucasian,gender_Female,insulin_Steady,insulin_No,metformin_Steady,diag_1_ENMI,diag_3_ENMI,A1Cresult_None,admission_type_Clinic Referral,diag_1_CSD,diag_3_CSD,diag_2_ENMI,diabetesMed_No,insulin_Down,change_Ch,diag_1_PCC,race_AfricanAmerican,change_No,max_glu_serum_None,diag_2_RSD,diag_2_SSD
0,0,18,0,0,59,9,1,3,0,0,0,1,0,1,0,0,1,1,1,0,0,0,1,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0
1,1,13,0,2,11,6,2,2,5,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0,0
2,0,16,0,0,44,7,3,2,1,0,0,1,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0
3,0,8,0,0,51,5,4,1,0,1,0,1,0,1,1,0,1,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,16,0,0,31,9,5,3,6,0,0,1,0,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,1,0,0


In [16]:
col_list = list(dummy_predictor.columns)

In [9]:
len(col_list)
dummy_predictor.shape

(98052, 38)

In [17]:
from sklearn import preprocessing

In [18]:
dummmy_scaled = preprocessing.scale(dummy_predictor)

In [21]:
test_size_range = np.arange(.4,0,-0.05)
tuning_parameter = []

for test_size in test_size_range:
    X_train, X_test, Y_train, Y_test = train_test_split(dummy_predictor,
                                                   outcome,
                                                   stratify=outcome,
                                                   test_size=test_size,
                                                   random_state = 0)
    X_train = preprocessing.scale(X_train)
    svm_clf = svm.SVC(decision_function_shape = 'ovo', cache_size = 7000, verbose = 4).fit(X_train, Y_train)
    clf_train = svm_clf.score(X_train, Y_train)
    clf_test = svm_clf.score(X_test, Y_test)
    tuning_parameter.append([1-test_size, clf_train, clf_test])
    print("Training size:%.2f, Prediction Accuracy:%.2f" % (1-test_size, clf_rate*100))

[LibSVM]Training size:0.60, Prediction Accuracy:53.27
[LibSVM]Training size:0.65, Prediction Accuracy:53.27
[LibSVM]Training size:0.70, Prediction Accuracy:53.27
[LibSVM]Training size:0.75, Prediction Accuracy:53.27
[LibSVM]Training size:0.80, Prediction Accuracy:53.27
[LibSVM]Training size:0.85, Prediction Accuracy:53.27
[LibSVM]Training size:0.90, Prediction Accuracy:53.27
[LibSVM]Training size:0.95, Prediction Accuracy:53.27


In [19]:
X_test.shape

(4903, 38)

In [20]:
tuning_parameter

[[0.59999999999999998, 0.53269931924224267],
 [0.64999999999999991, 0.53262041434773744],
 [0.69999999999999996, 0.5324993200979059],
 [0.75, 0.53316472219955946],
 [0.79999999999999993, 0.53286420886237318],
 [0.84999999999999987, 0.53331520261082399],
 [0.89999999999999991, 0.53375484397307771],
 [0.94999999999999996, 0.53273506016724459]]

In [None]:
#X_train = pd.get_dummies(X_train)

In [None]:
from sklearn import svm

In [None]:
clf = svm.SVC(decision_function_shape = 'ovo', verbose = 4).fit(X_train, Y_train)

In [None]:
clf.fit(X_train, Y_train)

In [None]:
X_test.shape

In [None]:
pred = clf.predict(X_test)

In [None]:
clf.score(X_train, Y_train)