In [1]:
#libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#scikit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn import preprocessing
from sklearn import metrics

%matplotlib inline

In [2]:
df = pd.read_csv("cleaned_data_v5.csv", encoding="Latin-1")

In [3]:
df.shape

(99492, 38)

In [4]:
df.columns

Index(['Unnamed: 0', 'race', 'gender', 'age', 'days_in_hospital',
       'num_lab_procedures', 'num_not_lab_procedures',
       'num_current_medications', 'num_outpatient_appointments',
       'num_previous_emergencies', 'num_inpatient_overnight_stays',
       'number_diagnoses', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change_in_meds', 'diabates_med_prescribed',
       'readmitted'],
      dtype='object')

In [5]:
df = df.drop(['Unnamed: 0'], axis=1)

In [6]:
list_of_meds = ['metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

# renaming values in medicines
# change in dosage('dosage_increased', 'dosage_decreased') -> 1
# no change or not prescribed 'not_prescribed', no_change_dosage -> 0
new_values_meds = {"dosage_increased":3, "dosage_decreased":1 ,"no_change_dosage":2, "not_prescribed":0}
for medicine in list_of_meds:
    df[medicine] = df[medicine].replace(new_values_meds)

In [7]:
# renaming 'readmitted'
# readmitted ('>30', '<30') -> 1
# not readmitted  ('NO') -> 0
df.readmitted = df.readmitted.replace({'NO': 0, '>30': 2, '<30':1})

In [8]:
# renaming 'diabetes_med_prescribed'
# 'Yes' -> 1 
# 'No' -> 0
df = df.rename(columns={'diabates_med_prescribed': 'diabetes_med_prescribed'})
df.diabetes_med_prescribed = df.diabetes_med_prescribed.replace({'Yes': 1, 'No':0})

In [9]:
# renaming 'change_in_meds'
# 'Ch' -> 1
# 'No' -> 0
df.change_in_meds = df.change_in_meds.replace({'Ch': 1, 'No': 0})

In [10]:
#renaming 'age'
#[70-80) -> 70
#[60-70) -> 60
#[50-60) -> 90
#[80-90) -> 80
#[40-50) -> 40
#[30-40) -> 30
#[90-100) -> 90
#[20-30) -> 20
#[10-20) -> 10
#[0-10)-> 0
age_mapping = {'[70-80)': 70, '[60-70)': 60, '[50-60)':90, '[80-90)':80, '[40-50)': 40, '[30-40)': 30,'[90-100)': 90, '[20-30)': 20, '[10-20)': 10, '[0-10)': 0}
df.age = df.age.replace(age_mapping)

In [11]:
#before
print(df.race.value_counts())
print(df.gender.value_counts())

Caucasian          76099
AfricanAmerican    19210
Hispanic            2037
Other               1505
Asian                641
Name: race, dtype: int64
Female    53575
Male      45917
Name: gender, dtype: int64


In [12]:
#before
df.head()

Unnamed: 0,race,gender,age,days_in_hospital,num_lab_procedures,num_not_lab_procedures,num_current_medications,num_outpatient_appointments,num_previous_emergencies,num_inpatient_overnight_stays,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change_in_meds,diabetes_med_prescribed,readmitted
0,Caucasian,Female,0,1,41,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Caucasian,Female,10,3,59,0,18,0,0,0,...,0,3,0,0,0,0,0,1,1,2
2,AfricanAmerican,Female,20,2,11,5,13,2,0,1,...,0,0,0,0,0,0,0,0,1,0
3,Caucasian,Male,30,2,44,1,16,0,0,0,...,0,3,0,0,0,0,0,1,1,0
4,Caucasian,Male,40,1,51,0,8,0,0,0,...,0,2,0,0,0,0,0,1,1,0


In [13]:
x = df.columns.values
le_gender = preprocessing.LabelEncoder()
le_race = preprocessing.LabelEncoder()
x_gender = le_gender.fit_transform(df.gender.values)
x_race = le_race.fit_transform(df.race.values)
df['gender'] = x_gender
df['race'] = x_race

In [None]:
#after
print(df.gender.value_counts())
print(df.race.value_counts())

0    53575
1    45917
Name: gender, dtype: int64
2    76099
0    19210
3     2037
4     1505
1      641
Name: race, dtype: int64


We can deduct that:
In race: Caucasian = 2, AfricanAmerican = 0, Hispanic = 3, Other = 4, 1 = Asian
In gender: Female = 0, Male = 1

In [None]:
df.to_csv("cleaned_data_BN_v1.csv")

In [None]:
#after
df.head()

Unnamed: 0,race,gender,age,days_in_hospital,num_lab_procedures,num_not_lab_procedures,num_current_medications,num_outpatient_appointments,num_previous_emergencies,num_inpatient_overnight_stays,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change_in_meds,diabetes_med_prescribed,readmitted
0,2,0,0,1,41,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,10,3,59,0,18,0,0,0,...,0,3,0,0,0,0,0,1,1,2
2,0,0,20,2,11,5,13,2,0,1,...,0,0,0,0,0,0,0,0,1,0
3,2,1,30,2,44,1,16,0,0,0,...,0,3,0,0,0,0,0,1,1,0
4,2,1,40,1,51,0,8,0,0,0,...,0,2,0,0,0,0,0,1,1,0


Data Preprocessing

In [None]:
#separating samples from labels
samples = df.iloc[:,0:99492]
labels = df['readmitted']
print(samples.shape)

(99492, 37)


In [None]:
# dividing df in test data and training data
x_train, x_test, y_train, y_test = train_test_split(samples, labels, test_size=0.7, train_size= 0.3, random_state=0)
print(x_train.shape)
print(x_test.shape)
#https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/

(29847, 37)
(69645, 37)


Data Scaling and Normalisation

In [None]:
#normalisation

#scaling
scaler = preprocessing.MinMaxScaler()
x_train_norm = scaler.fit_transform(x_train)

#transforming
x_test_norm = scaler.transform(x_test)
print(x_train_norm)
print(x_test_norm)

[[0.5        0.         0.33333333 ... 0.         1.         0.        ]
 [0.5        0.         0.66666667 ... 0.         1.         0.        ]
 [0.         0.         0.77777778 ... 0.         0.         1.        ]
 ...
 [0.         1.         0.44444444 ... 0.         1.         0.        ]
 [0.5        1.         0.66666667 ... 1.         1.         0.        ]
 [0.5        1.         0.66666667 ... 0.         1.         0.        ]]
[[0.5        0.         0.44444444 ... 1.         1.         1.        ]
 [0.         1.         0.77777778 ... 1.         1.         0.        ]
 [1.         1.         0.66666667 ... 0.         1.         0.        ]
 ...
 [0.5        0.         1.         ... 1.         1.         0.        ]
 [0.5        0.         0.77777778 ... 1.         1.         0.        ]
 [0.         1.         0.66666667 ... 0.         1.         1.        ]]


  return self.partial_fit(X, y)


Data Modelling

In [None]:
#KNN modelling
knn = KNeighborsClassifier()

#Choosing Hyper Parameters

#Hyper Parameters Set
parameters = {'n_neighbors':[1, 5, 10, 15, 20, 25, 30],
          'leaf_size':[1, 5, 10, 20, 30],
          'weights':['uniform', 'distance'],
          'algorithm':['ball_tree','kd_tree','brute']}

#Making models with hyper parameters sets
grid = GridSearchCV(knn, param_grid=parameters, scoring='accuracy', n_jobs=5)
#Sources used: 
# https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn
# https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/

In [None]:

#Learning
grid.fit(x_train, y_train)

#Selection of best hyperparameters
print("Selected parameters")
print(grid.best_params_)

#Sources used:
# https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn
# https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/



In [None]:

#Prediction
y_prediction = grid.predict(x_test)

#accuracy score
print("Accuracy score: ")
print(metrics.accuracy_score(y_test, y_prediction))

#Confusion matrix
print("Confusion matrix: ")
metrics.confusion_matrix(y_test, y_prediction)

#classification report
print("Classification report: ")
metrics.classification_report(y_test, y_prediction)

#Source used: https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn

In [None]:
#printing mean scores
scores = grid.grid_scores_
neighbor_range = [1, 5, 10, 15, 20, 25, 30] #same as n_neighbours in parameters
grid_means = [result.mean_validation_score for result in scores]
print(grid_mean_scores)

#Source used: # https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/

In [None]:
plt.plot(neighbor_range, grid_means)
plt.xlabel('K neighbor value')
plt.ylabel('Cross-Validated Accuracy')

In [None]:
cross_val = cross_val_score(grid, samples, labels, test_size=0.5, train_size=0.5, scoring='accuracy')
print(cross_val.mean())
#Source used: https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/

Sources used:
- Sebastian Raschka, 2015. Python Machine Learning.
- Validation and Model Selection notebook from week 7
- https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn
- https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/
- https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/
- https://medium.com/30-days-of-machine-learning/day-3-k-nearest-neighbors-and-bias-variance-tradeoff-75f84d515bdb
