In [None]:
'''Predict hospital readmission probabilities for diabetes patients.
I have used the Python libraries pandas, NumPy and scikit-learn to predict the readmission probabilities.
a.) Data preprocessing: I preprocessed the data to clean it and to handle the missing values.
For example: the age is given as a range such as [10-20). I used string operations to extract the two bounds and approximated the age by their midpoint, 15.
b.) I converted the categorical variables using LabelEncoder.
c.) It is interesting to note that the data set is unbalanced: there are very few readmitted cases.
So I used SMOTE (Synthetic Minority Over-sampling Technique) to balance the classes.
d.) Finally, I used random forests to predict the readmission probabilities.
'''

In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

In [37]:
#load data set
# NOTE(review): absolute, machine-specific path - the notebook cannot run on
# another machine until this is made configurable.
data  = pd.read_csv("C://Users//jchin//Desktop//data-challenge//data-challenge//training_data.csv")

#data preprocessing - Split the data
# .copy() gives each half its own frame, so the column edits below act on real
# data rather than on a slice of `data` (the SettingWithCopyWarning pattern).
# NOTE(review): the column at position 21 lands in neither slice and is
# therefore dropped - confirm that this is intentional.
data1 = data.iloc[:,:21].copy()
data2 = data.iloc[:,22:].copy()
del data

# Identifier columns and columns not used as features are removed.
data1 = data1.drop(['encounter_id','patient_nbr', 'weight', 'payer_code','medical_specialty'], axis=1)

#label encoder to convert categorical variables
labelencoder = LabelEncoder()
data1['race'] = labelencoder.fit_transform(data1['race'])
data1['gender'] = labelencoder.fit_transform(data1['gender'])

#Extract age - approximate an age bracket "[a-b)" by its midpoint (a+b)/2
f = lambda x: sum(map(int, x.strip('[]()').split('-')))/2
data1['age'] = data1['age'].apply(f)

#label encoder
# Every remaining column (including the last one, which is later used as the
# target) gets its own freshly fitted LabelEncoder.
data2 = data2.apply(LabelEncoder().fit_transform)

data = pd.concat([data1,data2],axis = 1)


# create a function converter to handle missing data
def converter(x):
    """Return ``x`` when it is a real number, otherwise the sentinel ``-1``.

    Python ``int``/``float`` values and NumPy integer/floating scalars are
    returned untouched. Everything else (strings, ``None``, booleans) and
    NaN is replaced by ``-1`` so that missing or non-numeric cells all share
    one numeric placeholder.
    """
    # bool is a subclass of int; it is rejected explicitly so that it keeps
    # mapping to -1, as it did with the exact type comparison used before.
    if isinstance(x, bool):
        return -1
    # isinstance (rather than `type(x) is ...`) also accepts NumPy scalars
    # such as np.int64, which are not exactly `int`.
    if not isinstance(x, (int, float, np.integer, np.floating)):
        return -1
    # NaN is the only number that differs from itself; treat it as missing.
    if x != x:
        return -1
    return x
    
# Pass every cell of the training frame through converter, element by element;
# whatever converter rejects comes back as -1.
data = data.applymap(func=converter)


In [38]:
#train test split and Oversampling for undersampled data
# The last column of `data` is the target; everything before it is a feature.
# (The name `y_valiation` is kept as spelled because the evaluation cell
# refers to it.)
X_train,X_validation,y_train,y_valiation = train_test_split(data.iloc[:,:-1],data.iloc[:,-1],  test_size=0.1, random_state=42)

# Only the training split is oversampled; the validation split keeps the
# original class mix. `fit_resample` is the current imbalanced-learn method
# (the older `fit_sample` alias has been removed), and the fixed seed makes
# the synthetic samples reproducible across runs.
X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [39]:
#Random forest classifier including balanced class weight
# A fixed seed makes the fitted forest (and the metrics below) reproducible.
rf = RandomForestClassifier(class_weight="balanced_subsample", random_state=42)
rf.fit(X_train_resampled,y_train_resampled)

# Predict once and reuse the result for both training metrics.
# These numbers are computed on the very (resampled) data the forest was fit
# on, so they are an optimistic estimate.
train_predicted = rf.predict(X_train_resampled)
train_acc = accuracy_score(y_train_resampled,train_predicted)
train_conf = confusion_matrix(y_train_resampled,train_predicted)
print('train accuracy:',train_acc)
print('confusion matri for training data\n',train_conf)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but a confusion matrix built with the arguments swapped is
# transposed (rows would be predictions instead of true labels).
y_predicted = rf.predict(X_validation)
valid_acc = accuracy_score(y_valiation,y_predicted)

valid_conf = confusion_matrix(y_valiation,y_predicted)
print('validation set accuracy:',valid_acc)
print('validation set confusion matrix\n',valid_conf)

train accuracy: 0.99057922054
validation set accuracy: 0.880373372636


In [40]:
# Test data preparation
# NOTE(review): absolute, machine-specific paths - make these configurable.
test_data  = pd.read_csv("C://Users//jchin//Desktop//data-challenge//data-challenge//test_data.csv")

# The training file is read again only to rebuild the label encodings.
# Fitting new LabelEncoders on the test file would number the categories by
# the values that happen to occur in the test file, which can differ from the
# integer codes the model was trained on.
train_raw = pd.read_csv("C://Users//jchin//Desktop//data-challenge//data-challenge//training_data.csv")


def encode_like_training(column):
    """Encode a test column with the codes LabelEncoder gives the training column.

    The encoder is fitted on the training column with the same name
    (assumes both CSV files share column names - TODO confirm). Values that
    never occur in the training column are encoded as -1.
    """
    encoder = LabelEncoder().fit(train_raw[column.name])
    codes = pd.Series(-1, index=column.index)
    seen = column.isin(encoder.classes_)
    if seen.any():
        codes.loc[seen] = encoder.transform(column[seen])
    return codes


#data preprocessing - Split the data
# The first column is kept aside: it identifies each row in the output file.
encounter_id = test_data.iloc[:,0]
# .copy() gives each half its own frame, so the column edits below act on real
# data rather than on a slice of `test_data`.
# NOTE(review): the column at position 21 lands in neither slice and is
# therefore dropped, mirroring the training preprocessing.
t_data1 = test_data.iloc[:,:21].copy()
t_data2 = test_data.iloc[:,22:].copy()
del test_data

t_data1 = t_data1.drop(['encounter_id','patient_nbr', 'weight', 'payer_code','medical_specialty'], axis=1)

#label encoder to convert categorical variables
t_data1['race'] = encode_like_training(t_data1['race'])
t_data1['gender'] = encode_like_training(t_data1['gender'])

#Extract age - approximate an age bracket "[a-b)" by its midpoint (a+b)/2
f = lambda x: sum(map(int, x.strip('[]()').split('-')))/2
t_data1['age'] = t_data1['age'].apply(f)

#label encoder
t_data2 = t_data2.apply(encode_like_training)
del train_raw

test_data = pd.concat([t_data1,t_data2],axis = 1)


# create a function converter to handle missing data
# NOTE(review): this repeats the converter definition from the training-data
# cell; defining it once would be enough.
def converter(x):
    """Return ``x`` when it is a real number, otherwise the sentinel ``-1``.

    Python ``int``/``float`` values and NumPy integer/floating scalars are
    returned untouched. Everything else (strings, ``None``, booleans) and
    NaN is replaced by ``-1`` so that missing or non-numeric cells all share
    one numeric placeholder.
    """
    # bool is a subclass of int; it is rejected explicitly so that it keeps
    # mapping to -1, as it did with the exact type comparison used before.
    if isinstance(x, bool):
        return -1
    # isinstance (rather than `type(x) is ...`) also accepts NumPy scalars
    # such as np.int64, which are not exactly `int`.
    if not isinstance(x, (int, float, np.integer, np.floating)):
        return -1
    # NaN is the only number that differs from itself; treat it as missing.
    if x != x:
        return -1
    return x
    
# Pass every cell of the test frame through converter, element by element;
# whatever converter rejects comes back as -1.
test_data = test_data.applymap(func=converter)


In [41]:
#test data predictions
predicted_probailities = rf.predict_proba(test_data)

# The output has exactly two probability columns, so fail loudly if the model
# was trained on a different number of classes.
if predicted_probailities.shape[1] != 2:
    raise ValueError(
        "expected probabilities for 2 classes, got %d" % predicted_probailities.shape[1]
    )

# The frame is built column by column instead of np.append-ing the ids to the
# probabilities:
#  * the ids keep their own dtype instead of being coerced to float,
#  * `encounter_id` is not rebound, so the cell can be re-run safely,
#  * no `Series[:, np.newaxis]` indexing, which current pandas rejects.
# .ravel() also copes with an `encounter_id` that is already a column vector.
# Assumes rf.classes_ is ordered [not readmitted, readmitted] - TODO confirm.
fp = pd.DataFrame(
    {
        'encounter_id': np.asarray(encounter_id).ravel(),
        'not_readmitted_probability': predicted_probailities[:, 0],
        're_admitted_probability': predicted_probailities[:, 1],
    },
    columns=['encounter_id', 'not_readmitted_probability', 're_admitted_probability'],
)
# NOTE(review): absolute, machine-specific path; the row index is also written
# as an unnamed first column (to_csv default) - confirm that is wanted.
fp.to_csv("C://Users//jchin//Desktop//data-challenge//data-challenge//suhas_chowdary.csv")