In [1]:
#libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#scikit
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("cleaned_data_v5.csv", encoding="Latin-1")

In [3]:
df.shape

(99492, 38)

In [4]:
df = df.drop(['Unnamed: 0'], axis=1)

In [5]:
list_of_meds = ['metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

# renaming values in medicines
# medicine has been prescribed('dosage_increased', 'dosage_decreased', 'no_change_dosage') -> 1
# medicine has not 'not_prescribed' -> 0
new_values_meds = {"dosage_increased":1, "dosage_decreased":1 ,"no_change_dosage":1, "not_prescribed":0}
for medicine in list_of_meds:
    df[medicine] = df[medicine].replace(new_values_meds)

In [6]:
# renaming 'gender':
# 'Female' -> 0
# 'Male' -> 1
df.gender = df.gender.replace({'Female': 0, 'Male': 1})

In [7]:
# renaming 'readmitted'
# readmitted ('>30', '<30') -> 1
# not readmitted  ('NO') -> 0
df.readmitted = df.readmitted.replace({'NO': 0, '>30': 1, '<30':1})

In [8]:
# renaming 'diabetes_med_prescribed'
# 'Yes' -> 1 
# 'No' -> 0
df = df.rename(columns={'diabates_med_prescribed': 'diabetes_med_prescribed'})
df.diabetes_med_prescribed = df.diabetes_med_prescribed.replace({'Yes': 1, 'No':0})

In [9]:
# renaming 'change_in_meds'
# 'Ch' -> 1
# 'No' -> 0
df.change_in_meds = df.change_in_meds.replace({'Ch': 1, 'No': 0})

In [10]:
# renaming 'race'
# Caucasian -> 0
# AfricanAmerican -> 1
# Hispanic -> 2
# Other -> 3
# Asian -> 4

race_mapping = {'Caucasian':0, 'AfricanAmerican':1, 'Hispanic':2, 'Other':3, 'Asian':4 }
df.race = df.race.replace(race_mapping)

In [11]:
#renaming 'age'
#[70-80) -> 70
#[60-70) -> 60
#[50-60) -> 90
#[80-90) -> 80
#[40-50) -> 40
#[30-40) -> 30
#[90-100) -> 90
#[20-30) -> 20
#[10-20) -> 10
#[0-10)-> 0
age_mapping = {'[70-80)': 70, '[60-70)': 60, '[50-60)':90, '[80-90)':80, '[40-50)': 40, '[30-40)': 30,'[90-100)': 90, '[20-30)': 20, '[10-20)': 10, '[0-10)': 0}
df.age = df.age.replace(age_mapping)

In [12]:
#OneHotEncoder for race

In [13]:
#separating samples from labels
samples = df.iloc[:,0:99492]
labels = df['readmitted']
print(samples.shape)

(99492, 37)


In [14]:
# dividing df in test data and training data
x_train, x_test, y_train, y_test = train_test_split(samples, labels, test_size=0.67, train_size= 0.33)
print(x_train.shape)
print(x_test.shape)

(32832, 37)
(66660, 37)


In [15]:
#pre-processing

#scaling
scaler = preprocessing.StandardScaler()

#estimating means and standard deviation
print(scaler.fit(x_train))
print(scaler.mean_)

#transforming
x_train_standard_dev = scaler.transform(x_train)
x_test_standard_dev = scaler.transform(x_test)
print(x_train_standard_dev)
print(x_test_standard_dev)

StandardScaler(copy=True, with_mean=True, with_std=True)
[3.07535331e-01 4.59978070e-01 6.77518884e+01 4.42333699e+00
 4.32284661e+01 1.34143519e+00 1.60391386e+01 3.77710770e-01
 2.04617446e-01 6.33497807e-01 7.44194688e+00 1.96363304e-01
 1.52899610e-02 6.36574074e-03 8.83284600e-04 5.18701267e-02
 0.00000000e+00 1.25121832e-01 1.02765595e-01 2.43664717e-04
 7.23379630e-02 6.12816764e-02 3.04580897e-03 1.52290448e-04
 3.04580897e-05 1.82748538e-04 0.00000000e+00 0.00000000e+00
 5.33168860e-01 6.88352827e-03 1.21832359e-04 0.00000000e+00
 0.00000000e+00 3.04580897e-05 4.63480750e-01 7.68884016e-01
 4.63541667e-01]
[[ 1.06539571 -0.92291747 -2.07505294 ...  1.07591213  0.54825749
  -0.92955778]
 [-0.47316035 -0.92291747  0.12356867 ...  1.07591213  0.54825749
   1.07578035]
 [-0.47316035 -0.92291747 -1.52539754 ...  1.07591213  0.54825749
  -0.92955778]
 ...
 [-0.47316035 -0.92291747  0.12356867 ...  1.07591213  0.54825749
  -0.92955778]
 [-0.47316035  1.08352051 -1.52539754 ...  1.075

  return self.partial_fit(X, y)
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [16]:
# classifiers
LSVC = LinearSVC(dual=False)
KNN = KNeighborsClassifier(n_neighbors=10)
#SVC = SVC(dual=False)

In [17]:
#KNN modelling
KNN.fit(x_train, y_train)
y_pred_KNN = KNN.predict(x_test)
print(np.sum(y_test == y_pred_KNN), len(y_test))

44221 66660


In [18]:
#KNN accuracy
accuracy_score(y_test, y_pred_KNN)

0.6633813381338134

In [19]:
#LSVC modelling
LSVC.fit(x_train, y_train)
y_pred_LSVC= LSVC.predict(x_test)
print(np.sum(y_test == y_pred_LSVC), len(y_test))

66660 66660


In [20]:
accuracy_score(y_test, y_pred_LSVC)

1.0

In [21]:
#find other models

In [22]:
#check for overfitting and underfitting

In [23]:
#cross-validation 50%50

In [24]:
#data visualisation