In [None]:
#libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#scikit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import metrics

In [None]:
# Load the cleaned dataset from the working directory, decoding it as Latin-1.
df = pd.read_csv(
    "cleaned_data_v5.csv",
    encoding="Latin-1",
)

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# column labels of the loaded frame
df.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column written out by an
# earlier to_csv call -- TODO confirm). errors='ignore' keeps this cell
# re-runnable: without it, running the cell a second time raises KeyError
# because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# medicine columns whose string values are re-encoded as integers below
list_of_meds = ['metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

# renaming values in medicines (four distinct integer codes, not a binary flag):
# 'dosage_increased' -> 2
# 'no_change_dosage' -> 1
# 'not_prescribed'   -> 0
# 'dosage_decreased' -> -1
# NOTE(review): an earlier version of this comment described a binary
# changed(1) / unchanged(0) encoding -- confirm which encoding is intended.
new_values_meds = {"dosage_increased":2, "dosage_decreased":-1 ,"no_change_dosage":1, "not_prescribed":0}
# apply the same mapping to every medicine column; values not in the mapping are left as they are
for medicine in list_of_meds:
    df[medicine] = df[medicine].replace(new_values_meds)

In [None]:
# renaming 'readmitted' (three classes):
# 'NO'  -> 0 (not readmitted)
# '<30' -> 1 (readmitted)
# '>30' -> 2 (readmitted)
# NOTE(review): an earlier version of this comment mapped both '>30' and '<30'
# to 1 -- confirm whether a binary target was intended.
df.readmitted = df.readmitted.replace({'NO': 0, '>30': 2, '<30':1})

In [None]:
# Correct the misspelt column name 'diabates_med_prescribed', then encode
# the column as an integer flag:
#   'Yes' -> 1
#   'No'  -> 0
df = df.rename(columns={'diabates_med_prescribed': 'diabetes_med_prescribed'})
df['diabetes_med_prescribed'] = df['diabetes_med_prescribed'].replace(
    {'Yes': 1, 'No': 0}
)

In [None]:
# Encode 'change_in_meds' as an integer flag:
#   'Ch' -> 1
#   'No' -> 0
df['change_in_meds'] = df['change_in_meds'].replace(
    {'Ch': 1, 'No': 0}
)

In [None]:
#renaming 'age': every ten-year bracket is replaced by its lower bound
#[0-10)   -> 0
#[10-20)  -> 10
#[20-30)  -> 20
#[30-40)  -> 30
#[40-50)  -> 40
#[50-60)  -> 50
#[60-70)  -> 60
#[70-80)  -> 70
#[80-90)  -> 80
#[90-100) -> 90
# Fix: '[50-60)' used to be mapped to 90, which merged that bracket with
# '[90-100)' and made it impossible to tell the two age groups apart.
age_mapping = {
    '[0-10)': 0,
    '[10-20)': 10,
    '[20-30)': 20,
    '[30-40)': 30,
    '[40-50)': 40,
    '[50-60)': 50,
    '[60-70)': 60,
    '[70-80)': 70,
    '[80-90)': 80,
    '[90-100)': 90,
}
df.age = df.age.replace(age_mapping)

In [None]:
#before: category frequencies of 'race' and 'gender' prior to label encoding
for column_name in ('race', 'gender'):
    print(df[column_name].value_counts())

In [None]:
#before: first rows of the frame, shown ahead of the label-encoding cell below
df.head()

In [None]:
# Label-encode the categorical 'gender' and 'race' columns as integers.
# One encoder is kept per column so that the integer codes can be translated
# back to the original category names later (classes_ / inverse_transform).
# (The unused `x = df.columns.values` assignment was removed.)
le_gender = preprocessing.LabelEncoder()
le_race = preprocessing.LabelEncoder()
x_gender = le_gender.fit_transform(df.gender.values)
x_race = le_race.fit_transform(df.race.values)
df['gender'] = x_gender
df['race'] = x_race

# Print the category -> code mapping taken from the fitted encoders, so it
# does not have to be inferred by comparing value counts before and after.
print(dict(zip(le_gender.classes_, range(len(le_gender.classes_)))))
print(dict(zip(le_race.classes_, range(len(le_race.classes_)))))

In [None]:
#after: category frequencies of 'gender' and 'race' once they are label encoded
for column_name in ('gender', 'race'):
    print(df[column_name].value_counts())

We can deduce that:
In race: AfricanAmerican = 0, Asian = 1, Caucasian = 2, Hispanic = 3, Other = 4
In gender: Female = 0, Male = 1

In [None]:
#after: first rows of the frame, shown after 'gender' and 'race' were label encoded
df.head()

In [None]:
#separating samples from labels
# Fix: the previous `df.iloc[:, 0:99492]` sliced *columns*, so it kept every
# column of the frame -- including the target 'readmitted'. With the label
# present among the features the classifier can read the answer directly
# (label leakage). Drop the target column explicitly instead.
samples = df.drop(columns=['readmitted'])
labels = df['readmitted']
print(samples.shape)

In [None]:
# dividing df in test data and training data
# random_state pins the shuffle so the split (and every score computed from
# it) is the same on each Restart & Run All.
# NOTE(review): only 30% of the rows are used for training and 70% for
# testing -- confirm this ratio is intentional.
x_train, x_test, y_train, y_test = train_test_split(
    samples, labels, test_size=0.7, train_size=0.3, random_state=42
)
print(x_train.shape)
print(x_test.shape)

In [None]:
# try and experiment with the test size

In [None]:
#normalisation

# Learn the per-feature min/max from the training split only, so nothing
# from the test split influences the scaling parameters.
scaler = preprocessing.MinMaxScaler().fit(x_train)

# Apply the same learned scaling to both splits.
x_train_norm = scaler.transform(x_train)
x_test_norm = scaler.transform(x_test)

print(x_train_norm)
print(x_test_norm)

In [None]:
#KNN modelling
# Fit one classifier per candidate number of neighbours and report its
# accuracy on the test split.
# Fix: the models are now trained and evaluated on the min-max scaled arrays
# (x_train_norm / x_test_norm). Previously the unscaled x_train / x_test were
# used, so the normalisation step had no effect and the distance computation
# was dominated by the features with the largest numeric range.
for n_neighbours in [1, 5, 10, 15, 20, 25, 30]:
    knn = KNeighborsClassifier(n_neighbors=n_neighbours)
    knn.fit(x_train_norm, y_train)
    y_pred = knn.predict(x_test_norm)
    print("N neighbours " + str(n_neighbours))
    print("Found vs Actual ")
    print(np.sum(y_test == y_pred), len(y_test))
    # accuracy from the predictions already computed above; knn.score would
    # run the (expensive) neighbour search on the test split a second time
    print("Accuracy score " + str(metrics.accuracy_score(y_test, y_pred)))
    print("****************")
# after the loop, knn and y_pred belong to the last setting (30 neighbours)
#example above taken from lab7 file 5.0

In [None]:
# Confusion matrix for y_pred as left by the KNN loop above, i.e. the
# predictions of the last model fitted there (30 neighbours).
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# classification_report returns one multi-line string; as a bare cell
# expression it is shown as a repr with literal '\n' characters, so print it
# to get the aligned table. y_pred is the prediction of the last model fitted
# in the KNN loop (30 neighbours).
print(metrics.classification_report(y_test, y_pred))

In [None]:
#check for overfitting and underfitting -> DATA VIS!!

In [None]:
#cross-validation
# NOTE(review): this is a single 50/50 hold-out split, not k-fold
# cross-validation (sklearn.model_selection.cross_val_score would do that),
# and the resulting variables are not used by any later cell shown here.
# random_state makes the split reproducible across runs.
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    samples, labels, test_size=0.5, train_size=0.5, random_state=42
)


Source used: Sebastian Raschka, 2015. Python Machine Learning. 