In [None]:
import numpy as np
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
 
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


from sklearn.preprocessing import StandardScaler 

# import various functions from sklearn 


from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

# import function to perform feature selection
from sklearn.feature_selection import RFE

# Data Set Description


* **Encounter ID**	Unique identifier of an encounter
* **Patient number**	Unique identifier of a patient
* **Race**	Values: Caucasian, Asian, African American, Hispanic, and other
* **Gender**	Values: male, female, and unknown/invalid
* **Age**	Grouped in 10-year intervals: [0, 10), [10, 20), …, [90, 100)
* **Weight**	Weight in pounds
* **Admission type**	Integer identifier corresponding to 9 distinct values, for example, emergency, urgent, elective, newborn, and not available
* **Discharge disposition**	Integer identifier corresponding to 29 distinct values, for example, discharged to home, expired, and not available
* **Admission source**	Integer identifier corresponding to 21 distinct values, for example, physician referral, emergency room, and transfer from a hospital
* **Time in hospital**	Integer number of days between admission and discharge
* **Payer code**	Integer identifier corresponding to 23 distinct values, for example, Blue Cross/Blue Shield, Medicare, and self-pay Medical
* **Medical specialty**	Integer identifier of a specialty of the admitting physician, corresponding to 84 distinct values, for example, cardiology, internal medicine, family/general practice, and surgeon
* **Number of lab procedures**	Number of lab tests performed during the encounter
* **Number of procedures**	Number of procedures (other than lab tests) performed during the encounter
* **Number of medications**	Number of distinct generic names administered during the encounter
* **Number of outpatient visits** Number of outpatient visits of the patient in the year preceding the encounter
* **Number of emergency visits**	Number of emergency visits of the patient in the year preceding the encounter
* **Number of inpatient visits**	Number of inpatient visits of the patient in the year preceding the encounter
* **Diagnosis 1**	The primary diagnosis (coded as first three digits of ICD9); 848 distinct values
* **Diagnosis 2**	Secondary diagnosis (coded as first three digits of ICD9); 923 distinct values
* **Diagnosis 3** Additional secondary diagnosis (coded as first three digits of ICD9); 954 distinct values
* **Number of diagnoses**	Number of diagnoses entered to the system
* **Glucose serum test result**	Indicates the range of the result or if the test was not taken. Values: “>200,” “>300,” “normal,” and “none” if not measured
* **A1c test result**	Indicates the range of the result or if the test was not taken. Values: “>8” if the result was greater than 8%, “>7” if the result was greater than 7% but less than 8%, “normal” if the result was less than 7%, and “none” if not measured.
* **Change of medications**	Indicates if there was a change in diabetic medications (either dosage or generic name). Values: “change” and “no change”
* **Diabetes medications**	Indicates if there was any diabetic medication prescribed. Values: “yes” and “no”
* 24 features for medications	For the generic names: **metformin, repaglinide, nateglinide, chlorpropamide, glimepiride, acetohexamide, glipizide, glyburide, tolbutamide, pioglitazone, rosiglitazone, acarbose, miglitol, troglitazone, tolazamide, examide, sitagliptin, insulin, glyburide-metformin, glipizide-metformin, glimepiride- pioglitazone, metformin-rosiglitazone, and metformin- pioglitazone**, the feature indicates whether the drug was prescribed or there was a change in the dosage. Values: “up” if the dosage was increased during the encounter, “down” if the dosage was decreased, “steady” if the dosage did not change, and “no” if the drug was not prescribed
* **Readmitted**	Days to inpatient readmission. Values: “<30” if the patient was readmitted in less than 30 days, “>30” if the patient was readmitted in more than 30 days, and “No” for no record of readmission










# Data Dictionary

# admission_type_id	Description
1.	Emergency
2.	Urgent	
3.	Elective	
4.	Newborn	
5.	Not Available
6.	NULL	
7.	Trauma Center
8.	Not Mapped
		
# discharge_disposition_id	Description
1.	Discharged to home
2.	Discharged/transferred to another short term hospital
3.	Discharged/transferred to SNF
4.	Discharged/transferred to ICF
5. Discharged/transferred to another type of inpatient care institution
6.	Discharged/transferred to home with home health service
7.	Left AMA	
8.	Discharged/transferred to home under care of Home IV provider
9.	Admitted as an inpatient to this hospital
10.	Neonate discharged to another hospital for neonatal aftercare
11.	Expired	
12.	Still patient or expected to return for outpatient services
13.	Hospice / home
14.	Hospice / medical facility
15.	Discharged/transferred within this institution to Medicare approved swing bed
16.	Discharged/transferred/referred another institution for outpatient services
17.	Discharged/transferred/referred to this institution for outpatient services
18.	NULL	
19.	Expired at home. Medicaid only, hospice.
20.	Expired in a medical facility. Medicaid only, hospice.
21.	Expired, place unknown. Medicaid only, hospice.
22.	Discharged/transferred to another rehab fac including rehab units of a hospital .
23.	Discharged/transferred to a long term care hospital.
24.	Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.
25.	Not Mapped
26.	Unknown/Invalid
30.	Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere
27.	Discharged/transferred to a federal health care facility.
28.	Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital
29.	Discharged/transferred to a Critical Access Hospital (CAH).
		
# admission_source_id	Description
1.	 Physician Referral
2.	Clinic Referral
3.	HMO Referral
4.	Transfer from a hospital
5.	 Transfer from a Skilled Nursing Facility (SNF)
6.	 Transfer from another health care facility
7.	 Emergency Room
8.	 Court/Law Enforcement
9.	 Not Available
10.	 Transfer from critical access hospital
11.	Normal Delivery
12.	 Premature Delivery
13.	 Sick Baby	
14.	 Extramural Birth
15.	Not Available
17.	NULL	
18.	 Transfer From Another Home Health Agency
19.	Readmission to Same Home Health Agency
20.	 Not Mapped
21.	Unknown/Invalid
22.	 Transfer from hospital inpt/same fac reslt in a sep claim
23.	 Born inside this hospital
24.	 Born outside this hospital
25.	 Transfer from Ambulatory Surgery Center
26.	Transfer from Hospice



## Before I start: I noticed that missing values are represented by the "?" sign. So that missing information is handled correctly, I will replace them with np.nan

In [None]:
diabet = pd.read_csv('../input/diabetes/diabetic_data.csv')
diabet.head()


In [None]:
diabet = diabet.replace("?",np.nan)

df = diabet.copy()

In [None]:
#df = df.drop_duplicates(subset= ['patient_nbr'], keep = 'first')
#df.shape

In [None]:
df['readmitted'].value_counts()

In [None]:
diabet['readmitted'].value_counts()

In [None]:
df[df['readmitted'] == '<30']

In [None]:
len(diabet.select_dtypes('O').columns)

In [None]:
diabet['readmitted'].value_counts()/len(diabet)

In [None]:
diabet.info()

In [None]:
# Percentage of missing values, shown only for columns with more than one missing entry.
missing_counts = diabet.isnull().sum()
missing_counts[missing_counts>1]/len(diabet)*100

In [None]:
diabet.drop(['weight','payer_code','medical_specialty'],axis=1,inplace=True)






# **Exploratory Data Analysis and Visualizations**

In [None]:
 diabet['readmitted'].unique()

### **Readmitted**
This column is our target feature. It records the **"days to inpatient readmission"**:
* If the patient was readmitted in less than 30 days **"<30"**
* if the patient was readmitted in more than 30 days **">30"**
* If there is no record **"NO"**

We decided to reduce these values to two classes and map them according to the following rule:
* NO -> 0
* >30 -> 0
* <30 -> 1

In [None]:
diabet.head()

In [None]:
# Collapse the three readmission outcomes into a binary target:
# 1 = readmitted within 30 days, 0 = readmitted later or not at all.
# Only the target column is remapped, so an identical string in any other
# column can never be rewritten by accident. The explicit int cast makes the
# column numeric without relying on replace() downcasting object values.
# Values that are already encoded pass through unchanged, so the cell can be
# re-run safely.
readmitted_codes = {"NO": 0, "<30": 1, ">30": 0}
diabet["readmitted"] = (
    diabet["readmitted"]
    .map(lambda outcome: readmitted_codes.get(outcome, outcome))
    .astype(int)
)

print(diabet.readmitted.value_counts())

sns.countplot(x = "readmitted", data = diabet)
plt.title("Distribution of Target Values")
plt.show()

# Pie chart
diabet.readmitted.value_counts().plot.pie(autopct = "%.1f%%")
plt.title("Proportion of Target Value")
plt.show()

### **Race**
We have 5 different races value, these are;
- Caucasian          
- AfricanAmerican    
- Hispanic            
- Asian
- Other 

In [None]:
sns.countplot(x=diabet.race, data = diabet)
plt.xticks(rotation=90)
plt.title("Number of Race values")
plt.show()

print("Proportion of Race")
print(diabet.race.value_counts(normalize = True)*100)

- As we see, **Caucasians** make up 73 percent of all our data. 
And the other 22 percent is divided into **African Americans**, **Hispanics**, **Asians** and **Others**.<br>
- Here we decided to divide the values into 3 groups: **Caucasian**, **African American** and **Other**.

In [None]:
mapped_race = {"Asian":"Other","Hispanic":"Other"}
diabet.race = diabet.race.replace(mapped_race)

sns.countplot(x="race", data = diabet)
plt.title("Number of Race values")
plt.show()

print("Proportion of Race After the Mapping")
print(diabet.race.value_counts(normalize= True)*100)

In [None]:
sns.countplot(x="race", hue= "readmitted", data = diabet)
plt.title("Readmitted - Race")
plt.show()

In [None]:
sns.catplot(x = "race", y = "readmitted",
            data = diabet, kind = "bar", height= 5)
plt.title("Readmitted Probability")
plt.show()

- Most of the patients are **Caucasian**, followed by **African Americans**.
- Although the **Other** values are few than **Caucasian**, 
we see that the **Readmitted Probability** almost close to **Caucasian**.

- Most of the patients are **Caucasian**, followed by **African Americans**.
- Although the **Other** values are few than **Caucasian**, 
we see that the **Readmitted Probability** almost close to **Caucasian**.

### **Gender**

In [None]:
# Number of encounters per gender value.
sns.countplot(x = "gender", data = diabet)
plt.title("Distribution of Number of Gender")
plt.show()

# The label used to say "Race" (copied from the race cell) although the
# values printed below are the gender proportions.
print("Proportions of Gender Value")
print(diabet.gender.value_counts(normalize = True))

- When we looked up **Gender** values, there is only one entry for **Unknown/Invalid**. So we dropped them

In [None]:
diabet = diabet.drop(diabet.loc[diabet["gender"]=="Unknown/Invalid"].index, axis=0)

sns.countplot(x = "gender", data = diabet)
plt.title("Distribution of Number of Gender After Dropping")
plt.show()

sns.countplot(x = "gender", hue = "readmitted", data = diabet)
plt.title("Gender - Readmitted")
plt.show()

In [None]:
g = sns.catplot(x = "gender",y = "readmitted", 
                data = diabet, kind = "bar", height= 5)
g.set_ylabels("Readmitted Probability")
plt.show()

- We see a nearly equal distribution of **Gender**.
- Also, we can state that **Females** are a little more prone than **Males**.

## **Age**

In [None]:
sns.countplot(x="age", data = diabet)
plt.xticks(rotation = 90)
plt.show()

* To get rid of this parenthesis notation and make the **Age** variable a numeric value, we changed it according to the following rule.

In [None]:
# Represent every 10-year bracket by its midpoint: "[0-10)" -> 5, ..., "[90-100)" -> 95.
age_midpoints = {f"[{low}-{low + 10})": low + 5 for low in range(0, 100, 10)}

# Convert explicitly to int instead of relying on replace() to downcast the
# object column; otherwise age can stay an object column and end up among
# the categorical features later on. Already-numeric values pass through,
# so re-running the cell is harmless.
diabet["age"] = (
    diabet["age"]
    .map(lambda bracket: age_midpoints.get(bracket, bracket))
    .astype(int)
)

sns.countplot(x="age", data = diabet)
#plt.xticks(rotation = 90)
plt.show()

In [None]:
g = sns.catplot(x = "age", y = "readmitted", data = diabet, 
                   kind = "bar", height = 5)
g.set_ylabels("Readmitted Probability")
plt.show()

- we can understand that we have an **elderly population.**

### **Weight**

### **Admission Type ID**
Integer identifier corresponding to 9 distinct values;
- **Emergency**        : 1
- **Urgent**           : 2
- **Elective**         : 3
- **Newborn**          : 4
- **Not Available**    : 5
- **NULL**             : 6
- **Trauma Center**    : 7
- **Not Mapped**       : 8



In [None]:
sns.countplot(x = "admission_type_id", data = diabet)
plt.title("Distribution of Admission IDs")
plt.show()

print("Distribution of ID's")
print(diabet.admission_type_id.value_counts())

In here we need to do mapping for :
- **NULL**, **Not Available** and **Not Mapped** values.
- In addition, we will map **Urgent** value as **Emergency** because they have same meaning

In [None]:
mapped = {1.0:"Emergency",
          2.0:"Emergency",
          3.0:"Elective",
          4.0:"New Born",
          5.0:np.nan,
          6.0:np.nan,
          7.0:"Trauma Center",
          8.0:np.nan}

diabet.admission_type_id = diabet.admission_type_id.replace(mapped)

sns.countplot(x = "admission_type_id", data = diabet)
plt.title("-Distribution of Admission IDs-")
plt.show()

print("-Distribution of ID's-")
print(diabet.admission_type_id.value_counts())

In [None]:
g = sns.catplot(x = "admission_type_id", y ="readmitted", 
                    data = diabet, height = 6, kind = "bar")
g.set_ylabels("Readmitted Probability")
plt.show()

### **Discharge Disposition ID**
-Integer identifier corresponding to 29 distinct values. For example, discharged to home, expired, and not available

In [None]:
sns.countplot(x ="discharge_disposition_id", data = diabet)
plt.show()

When we look at the graph, we can see too many values. To get rid of that situation,
 we applied these rules: 
- If a description includes the word **"home"**, it is grouped into a single category
- If not, it is labelled **OTHER**
- NAN = 18, 25, 26

In [None]:
diabet['discharge_disposition_id'].unique()

In [None]:
mapped_discharge = {1:"Discharged to Home",
                    6:"Discharged to Home",
                    8:"Discharged to Home",
                    13:"Discharged to Home",
                    19:"Discharged to Home",
                    18:np.nan,25:np.nan,26:np.nan,
                    2:"Other",3:"Other",4:"Other",
                    5:"Other",7:"Other",9:"Other",
                    10:"Other",11:"Other",12:"Other",
                    14:"Other",15:"Other",16:"Other",
                    17:"Other",20:"Other",21:"Other",
                    22:"Other",23:"Other",24:"Other",
                    27:"Other",28:"Other",29:"Other",30:"Other"}

diabet["discharge_disposition_id"] = diabet["discharge_disposition_id"].replace(mapped_discharge)

- Now it will be more clear and readable

In [None]:
diabet['discharge_disposition_id'].unique()

In [None]:
sns.countplot(x ="discharge_disposition_id", data = diabet)
plt.show()

sns.countplot(x ="discharge_disposition_id", hue = "readmitted", data = diabet)
plt.show()

print("Proportions of ID's")
print(diabet.discharge_disposition_id.value_counts())

In [None]:
g = sns.catplot(x = "discharge_disposition_id", y="readmitted", 
                data = diabet, height = 5, kind ="bar")
g.set_ylabels("Probability of Readmitted")
plt.show()

### **Admission Source ID**

Integer identifier corresponding to 21 distinct values.For example, **physician referral, emergency room, and transfer from a hospital**


In [None]:
sns.countplot(x ="admission_source_id", data = diabet)
plt.show()

We can see that there is same problem here. Again we applied some map like:

- we'll put the similar ones together like **Referral or Transfer**
- we will replace **Null, Not Mapped, Unknown** values as NAN


In [None]:
mapped_adm = {1:"Referral",2:"Referral",3:"Referral",
              4:"Other",5:"Other",6:"Other",10:"Other",22:"Other",25:"Other",
              9:"Other",8:"Other",14:"Other",13:"Other",11:"Other",
              15:np.nan,17:np.nan,20:np.nan,21:np.nan,
              7:"Emergency"}
diabet.admission_source_id = diabet.admission_source_id.replace(mapped_adm)

sns.countplot(x = "admission_source_id", data = diabet)
plt.show()

sns.countplot(x = "admission_source_id", hue = "readmitted", data = diabet)
plt.title("Admission Source - Readmitted")
plt.show()


print(diabet.admission_source_id.value_counts())

In [None]:
g = sns.catplot(x = "admission_source_id", y ="readmitted", 
                    data = diabet, kind = "bar", height= 5)
g.set_ylabels("Probability of Readmission")
plt.show()

- We see that the **Readmitted Probability of Referral** is very close to that of **Emergency**, although **Emergency** has more samples than the others


## **Time in Hospital**
Integer number of days between admission and discharge. In short, it is the "treatment time".

In [None]:
sns.countplot(x="time_in_hospital", data = diabet,
              order = diabet.time_in_hospital.value_counts().index)
plt.show()

print(diabet.time_in_hospital.value_counts())


In [None]:
fig = plt.figure(figsize=(10,5))

# Length-of-stay distribution for encounters without an early readmission (target 0).
# `fill` is the current name of the deprecated `shade` argument.
ax = sns.kdeplot(diabet.loc[(diabet.readmitted == 0), "time_in_hospital"],
                 color = "b", fill = True, label = "Not Readmitted")

# Same distribution for encounters readmitted within 30 days (target 1).
ax = sns.kdeplot(diabet.loc[(diabet.readmitted == 1), "time_in_hospital"],
                 color = "r", fill = True, label = "Readmitted")
ax.legend(loc="upper right")

ax.set_xlabel("Time in Hospital")
# A KDE curve shows an estimated density, not a count of observations.
ax.set_ylabel("Density")
ax.set_title("Time in Hospital - Readmission")
plt.show()

- Most of people stayed 2 - 3 days in hospital


### **Number of Lab Procedures**
Number of lab tests performed during the encounter

In [None]:
plt.figure(figsize=(20,13))
sns.countplot(x = "num_lab_procedures", data = diabet)
plt.show()

print("Proportions of Column")
print(diabet.num_lab_procedures.value_counts().head(10))

In [None]:
fig = plt.figure(figsize=(10,5))

# Lab-procedure distribution for encounters without an early readmission (target 0).
# `fill` is the current name of the deprecated `shade` argument.
ax = sns.kdeplot(diabet.loc[(diabet.readmitted == 0), "num_lab_procedures"],
                 color = "b", fill = True, label = "Not Readmitted")

# Same distribution for encounters readmitted within 30 days (target 1).
ax = sns.kdeplot(diabet.loc[(diabet.readmitted == 1), "num_lab_procedures"],
                 color = "r", fill = True, label = "Readmitted")

ax.legend(loc="upper right")

ax.set_xlabel("Number of Lab Procedures")
# A KDE curve shows an estimated density, not a count of observations.
ax.set_ylabel("Density")
ax.set_title("Number of Lab Procedures - Readmission")

plt.show()

### **Number of Procedures**
Number of procedures (other than lab tests) performed during the encounter

In [None]:
sns.countplot(x = diabet.num_procedures, order = diabet.num_procedures.value_counts().index)
plt.title("Distribution of Number of Procedures")
plt.show()

print("Proportions of Values")
print(diabet.num_procedures.value_counts(normalize=True)*100)


In [None]:
sns.countplot(x = "num_procedures", hue = "readmitted", 
              data = diabet, order = diabet.num_procedures.value_counts().index)
plt.show()

In [None]:
sns.catplot(x = "num_procedures", y = "readmitted",
               data = diabet, kind = "bar", height = 5)
plt.title("Readmission Probability")
plt.show()

## **Number of Medications**
Number of distinct generic names administered during the encounter

In [None]:
plt.figure(figsize=(20,14))
sns.countplot(x="num_medications", data = diabet, 
        order = diabet.num_medications.value_counts().index)
plt.title("Distribution of Number of Medications")
plt.show()

print(diabet.num_medications.value_counts())

### **Diag1, Diag2 and Diag3**
- In diag section there are lots of ID that belong the specific name. So we'll map them

In [None]:
def map_diagnosis(data, cols):
    """Replace raw diagnosis codes with broad disease-group labels.

    Each column named in ``cols`` is expected to hold diagnosis codes as
    strings (for example ``"428"`` or ``"250.83"``), possibly with missing
    values. Codes containing the letters ``V`` or ``E`` and missing values
    fall into ``"Other"``, as does every numeric code outside the ranges
    listed below.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the diagnosis columns. It is not modified.
    cols : iterable of str
        Names of the diagnosis columns to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every column in ``cols`` holds one of the
        group labels instead of the raw code.
    """
    # Work on a copy: the function returns a frame, so the caller's input
    # should not be left half-converted as a side effect.
    grouped = data.copy()

    for col in cols:
        raw = grouped[col]

        # "V"/"E" codes are not numeric. na=False keeps missing entries out
        # of the mask instead of propagating NaN into it.
        supplementary = raw.str.contains("V", na=False) | raw.str.contains("E", na=False)

        # Blank out the non-numeric codes, then parse the rest. float64 is
        # used so fractional codes such as 250.83 are not rounded across a
        # range boundary. NaN fails every comparison below -> "Other".
        codes = raw.where(~supplementary).astype(np.float64)

        # The ranges do not overlap, so the order of the rules is irrelevant.
        conditions = [
            codes.between(390, 459) | (codes == 785),
            codes.between(460, 519) | (codes == 786),
            codes.between(520, 579) | (codes == 787),
            (codes >= 250) & (codes < 251),
            codes.between(800, 999),
            codes.between(710, 739),
            codes.between(580, 629) | (codes == 788),
            codes.between(140, 239),
        ]
        labels = [
            "Circulatory",
            "Respiratory",
            "Digestive",
            "Diabetes",
            "Injury",
            "Muscoloskeletal",
            "Genitourinary",
            "Neoplasms",
        ]

        grouped[col] = np.select(conditions, labels, default="Other")

    return grouped

In [None]:
diabet = map_diagnosis(diabet,["diag_1","diag_2","diag_3"])

In [None]:
def plot_diags(col,data):
    """Show a count plot of ``data[col]`` with the bars ordered by descending frequency."""
    by_frequency = data[col].value_counts().index
    sns.countplot(x = col, data = data, order = by_frequency)
    # Category names are long, so print them vertically.
    plt.xticks(rotation = 90)
    plt.title(col)
    plt.show()

diag_cols = ["diag_1","diag_2","diag_3"]

for diag in diag_cols:
    plot_diags(diag,diabet)


## **Diabetes medications**

- There were too many medications related to diabetes, and some of them have just one or two values, which would not have any impact on the model. So we decided to drop them.
- But firstly, lets look at the medications

In [None]:
drug_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
             'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
             'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
             'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 
             'metformin-rosiglitazone', 'metformin-pioglitazone']

def explore_drug(drugs):
    """For each drug column, plot its values split by readmission and print its value counts.

    Reads the notebook-level ``diabet`` frame.
    """
    for name in drugs:
        sns.countplot(x=name, hue="readmitted", data=diabet)
        plt.show()
        print(name.upper())
        print(diabet[name].value_counts())
    

explore_drug(drug_cols)

## **Change**
Indicates if there was a change in diabetic medications (either dosage or generic
name). Values: 
- **“change”** 
- **“no change”**

In [None]:
diabet.change.value_counts()

In [None]:
diabet.change = diabet.change.replace("Ch","Yes")

sns.countplot(x = "change", data = diabet)
plt.title("Proportions of Change Values")
plt.show()

sns.countplot(x = "change", hue = "readmitted", data = diabet)
plt.show()


In [None]:
g = sns.catplot(x = "change", y="readmitted", data = diabet, height = 6, kind ="bar")
g.set_ylabels("Probability of Readmitted")
plt.show()                                                                                

## **Glucose Serum Test Result**<br>
Indicates the range of the result or if the test was not taken.<br> 
Values: 
- “>200,” 
- “>300,”
- “normal,” 
- “none” if not measured<br>

We decided use the Glucose Serum Test Result like as follows:


In [None]:
# Ordinal encoding of the glucose serum test:
# 0 = not measured, 1 = normal, 2 = elevated (">200" and ">300" merged).
glu_codes = {">200": 2, ">300": 2, "Norm": 1, "None": 0}

# Recent pandas versions read the literal text "None" as a missing value, so
# "not measured" can arrive as NaN rather than the string "None". Both are
# encoded as 0 (assumes this column has no other source of missing values —
# TODO confirm). The explicit int cast keeps the column numeric without
# depending on replace() downcasting.
diabet["max_glu_serum"] = (
    diabet["max_glu_serum"]
    .map(lambda result: glu_codes.get(result, result))
    .fillna(0)
    .astype(int)
)

sns.countplot(x = "max_glu_serum", data = diabet)
plt.show()

sns.countplot(x = "max_glu_serum",hue = "readmitted", data = diabet)
plt.show()

print(diabet.max_glu_serum.value_counts())

## **A1c test result**

Indicates the range of the result or if the test was not taken. <br>
Values: 
- “>8” if the result was greater than 8%, 
- “>7” if the result was greater than 7% but less than 8%, 
- “normal” if the result was less than 7%, 
- “none” if not measured.

We decided use the A1c test result like as follows:

In [None]:
# Ordinal encoding of the A1c test:
# 0 = not measured, 1 = normal, 2 = elevated (">7" and ">8" merged).
a1c_codes = {">7": 2, ">8": 2, "Norm": 1, "None": 0}

# Recent pandas versions read the literal text "None" as a missing value, so
# "not measured" can arrive as NaN rather than the string "None". Both are
# encoded as 0 (assumes this column has no other source of missing values —
# TODO confirm). The explicit int cast keeps the column numeric without
# depending on replace() downcasting.
diabet["A1Cresult"] = (
    diabet["A1Cresult"]
    .map(lambda result: a1c_codes.get(result, result))
    .fillna(0)
    .astype(int)
)

sns.countplot(x = "A1Cresult", data = diabet)
plt.show()

sns.countplot(x = "A1Cresult",hue = "readmitted", data = diabet)
plt.show()

print(diabet.A1Cresult.value_counts())

## **Diabetes medications**
Indicates if there was any diabetic medication prescribed. Values: **“yes”** and **“no”**

In [None]:
# Number of encounters with and without a diabetes medication prescribed.
sns.countplot(x = "diabetesMed", data = diabet )
# The title used to read "Change Values" (copied from the "change" cell).
plt.title("Proportions of Diabetes Medication Values")
plt.show()

sns.countplot(x = "diabetesMed", hue = "readmitted", data = diabet)
plt.show()

print(diabet.diabetesMed.value_counts())

In [None]:
g = sns.catplot(x = "diabetesMed", y="readmitted", 
            data = diabet, height = 6, kind ="bar")
g.set_ylabels("Probability of Readmitted")
plt.show()

In [None]:
diabet.isnull().sum()

In [None]:
#from google.colab import files

#diabet.to_csv('submit.csv',index=0)

#files.download('submit.csv')

In [None]:
 diabet['race'] = diabet['race'].fillna(diabet['race'].mode()[0])

In [None]:
diabet['admission_type_id'] = diabet['admission_type_id'].fillna(diabet['admission_type_id'].mode()[0])

In [None]:
diabet['discharge_disposition_id'] = diabet['discharge_disposition_id'].fillna(diabet['discharge_disposition_id'].mode()[0])

In [None]:
diabet['admission_source_id'] = diabet['admission_source_id'].fillna(diabet['admission_source_id'].mode()[0])

In [None]:
diabet.head()

### heatmap

In [None]:
plt.figure(figsize=(13,10))
# Correlation is only defined for numeric columns. Selecting them explicitly
# avoids the error newer pandas versions raise when corr() meets the string
# columns that are still present in the frame.
sns.heatmap(diabet.select_dtypes(np.number).corr(),annot=True)

In [None]:
# Variance inflation factor (VIF) of every numeric predictor.
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Identifiers and the target are not predictors, so they are left out.
X = diabet.select_dtypes(np.number).drop(
    columns=['encounter_id', 'patient_nbr', 'readmitted'], errors='ignore')

# VIF regresses each column on all the others. The design matrix needs an
# explicit intercept column, otherwise the factors are inflated for
# variables whose mean is far from zero.
X = sm.add_constant(X)

vif = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
z = pd.DataFrame({'names':X.columns,'vif':vif})  # the 'const' row is the intercept we added and can be ignored
z

# Dropping the duplicates entries


In [None]:
cat_data = diabet.select_dtypes('O')

num_data = diabet.select_dtypes(np.number)

cat_data

In [None]:
cat_data = pd.get_dummies(cat_data,drop_first= True)
cat_data.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for i in cat_data:
    cat_data[i] = le.fit_transform(cat_data[i])

In [None]:
data = pd.concat([num_data,cat_data],axis=1)
data.head()

In [None]:
data.drop(['encounter_id','patient_nbr'],axis=1,inplace=True)

In [None]:
data.head()

# Splitting the dependent and independent variable

In [None]:
X = data.drop('readmitted',axis=1)

y = data['readmitted']

In [None]:
import statsmodels.api as sm

Xc = sm.add_constant(X)

model = sm.Logit(y,Xc.iloc[:,0:25]).fit()


In [None]:
model.summary()

# Splitting into train and test

In [None]:
#modelling now
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler 

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1)

In [None]:
SC = StandardScaler()

# The scaler is fitted on the training split only and then applied unchanged
# to the test split. The original row labels are kept so the scaled frames
# stay index-aligned with y_train / y_test.
X_train_scaled = pd.DataFrame(SC.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
X_test_scaled = pd.DataFrame(SC.transform(X_test),columns=X_test.columns,index=X_test.index)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
LR = LogisticRegression()

LR.fit(X_train_scaled,y_train)

In [None]:
LR.score(X_train_scaled,y_train)


In [None]:
LR.score(X_test_scaled,y_test)

In [None]:
# LR was fitted on the standardized features, so it has to predict on the
# standardized test set too; raw X_test is on a different scale.
y_pred = LR.predict(X_test_scaled)

print(classification_report(y_test,y_pred))

In [None]:
# Plot the confusion matrix for the predictions currently stored in y_pred
# (y_pred and y_test are produced by earlier cells).

# Cross-tabulate the actual and predicted target values.
plt.figure(figsize=(9,7))
cm = confusion_matrix(y_test, y_pred)

# Wrap the matrix in a labelled DataFrame: rows are the actual classes,
# columns are the predicted classes (binary target, hence 2 x 2).
conf_matrix = pd.DataFrame(data = cm,columns = ['Predicted:0','Predicted:1'], index = ['Actual:0','Actual:1'])

# Draw the matrix as a heatmap:
#   annot = True      prints the value in each cell
#   fmt = 'd'         formats those values as integers
#   cmap              a single-colour list, so every cell has the same colour
#   cbar = False      hides the colour bar
#   linewidths        width of the lines separating the cells
#   annot_kws         font size of the annotated values
sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap =['lightskyblue'], cbar = False, 
                linewidths = 0.1, annot_kws = {'size':25})

# Font size of the x-axis tick labels.
plt.xticks(fontsize = 20)

# Font size of the y-axis tick labels.
plt.yticks(fontsize = 20)


# Display the plot.
plt.show()
  

In [None]:
RF = RandomForestClassifier()

RF.fit(X_train_scaled,y_train)

In [None]:
RF.score(X_train_scaled,y_train)

In [None]:
RF.score(X_test_scaled,y_test)

In [None]:
# RF was fitted on the standardized features, so it has to predict on the
# standardized test set too; raw X_test is on a different scale.
y_pred = RF.predict(X_test_scaled)



In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
plt.figure(figsize=(12,24))
# Sort the importances together with their feature names. Sorting the scores
# on their own detaches them from the names and labels the bars wrongly.
ranked_importance = pd.Series(RF.feature_importances_, index=X_train.columns).sort_values()
sns.barplot(x=ranked_importance.values, y=ranked_importance.index);

In [None]:
z = pd.DataFrame([RF.feature_importances_,X_train.columns]).T

In [None]:
# z was built from [importances, feature names] and then transposed, so its
# first column holds the scores and its second the names. The labels must
# follow that order.
z.columns = ['importance','Feature']
# The transposed frame stores everything as objects; make the scores numeric
# so they are plotted as bar lengths.
z['importance'] = z['importance'].astype(float)
plt.figure(figsize=(13,12))

sns.barplot(y=z['Feature'],x=z['importance'])

In [None]:
diabet.head()

In [None]:
X.head()

In [None]:
important_features = pd.DataFrame({'Features': X_train.columns, 
                                   'Importance': RF.feature_importances_})

# sort the dataframe in the descending order according to the feature importance
important_features = important_features.sort_values('Importance', ascending = False)

# create a barplot to visualize the features based on their importance
plt.figure(figsize=(12,24))
sns.barplot(x = 'Importance', y = 'Features', data = important_features)

# add plot and axes labels
# set text size using 'fontsize'
plt.title('Feature Importance', fontsize = 15)
plt.xlabel('Importance', fontsize = 15)
plt.ylabel('Features', fontsize = 15)

# display the plot
plt.show()

In [None]:
# Define a function to plot a confusion matrix for a fitted model.
def plot_confusion_matrix(model, test_data, y_true=None):
    """Plot the confusion matrix of ``model`` on ``test_data`` as an annotated heatmap.

    Parameters
    ----------
    model : object
        Fitted binary classifier exposing ``predict``.
    test_data : array-like
        Feature matrix passed to ``model.predict``.
    y_true : array-like, optional
        True labels for ``test_data``. When omitted, the notebook-level
        ``y_test`` is used, as before.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # ListedColormap is not among the notebook's top-level imports, so it is
    # imported here; without it the heatmap call below raises NameError.
    from matplotlib.colors import ListedColormap

    if y_true is None:
        y_true = y_test

    # Predict the target values for the supplied test data.
    y_pred = model.predict(test_data)

    # Cross-tabulate the actual and predicted target values.
    cm = confusion_matrix(y_true, y_pred)

    # Wrap the matrix in a labelled DataFrame: rows are the actual classes,
    # columns are the predicted classes (binary target, hence 2 x 2).
    conf_matrix = pd.DataFrame(data = cm,columns = ['Predicted:0','Predicted:1'], index = ['Actual:0','Actual:1'])

    # Draw the matrix as a heatmap:
    #   annot = True      prints the value in each cell
    #   fmt = 'd'         formats those values as integers
    #   cmap              a single-colour map, so every cell has the same colour
    #   cbar = False      hides the colour bar
    #   linewidths        width of the lines separating the cells
    #   annot_kws         font size of the annotated values
    sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap = ListedColormap(['lightskyblue']), cbar = False, 
                linewidths = 0.1, annot_kws = {'size':25})

    # Font size of the x-axis tick labels.
    plt.xticks(fontsize = 20)

    # Font size of the y-axis tick labels.
    plt.yticks(fontsize = 20)

    # Display the plot.
    plt.show()
  

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`, drawn as a
# single-colour annotated heatmap.
plt.figure(figsize=(12, 8))

# counts of actual vs predicted labels
cm = confusion_matrix(y_test, y_pred)

# wrap the counts in a labelled frame: rows are actual classes, columns predicted
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# annot/fmt print the integer count in every cell; the one-entry colour list
# paints all cells the same colour, so the colour bar is switched off
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap=['lightskyblue'],
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)

# enlarge the tick labels on both axes
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()
  

In [None]:
# define a function to plot the ROC curve and print the ROC-AUC score
def plot_roc(model, test_data):
    """Plot the ROC curve of `model` on `test_data` and annotate it with the AUC.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    test_data : feature matrix passed to ``model.predict_proba``

    The true labels are read from the notebook-level ``y_test``. The figure is
    drawn on the current axes and is not shown here; call ``plt.show()`` (or
    rely on the notebook) to display it.
    """
    # imported locally: the notebook only imports roc_auc_score in a later
    # cell, so calling this function before that cell raised NameError
    from sklearn.metrics import roc_auc_score

    # probability of the positive class is column 1 of predict_proba
    y_pred_prob = model.predict_proba(test_data)[:,1]

    # roc_curve() returns the false positive rate, true positive rate and thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

    # plot the ROC curve
    plt.plot(fpr, tpr)

    # set limits for x and y axes
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])

    # diagonal reference line (a classifier with no discriminative power)
    plt.plot([0, 1], [0, 1],'r--')

    # add plot and axes labels
    # NOTE(review): the title says "Cancer" but the notebook works on a frame
    # named `diabet` - confirm and update the wording
    plt.title('ROC curve for Cancer Prediction Classifier', fontsize = 15)
    plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
    plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)

    # add the AUC score (rounded to 4 digits) to the plot
    # the label is built as one string; passing a tuple made matplotlib print
    # its repr, parentheses and quotes included
    auc = round(roc_auc_score(y_test, y_pred_prob), 4)
    plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {auc}')

    # plot the grid
    plt.grid(True)

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid handed to GridSearchCV for the LightGBM classifier.
#
# Changes from the previous literal:
#   * 'max_depth' was listed twice; Python keeps only the last occurrence, so
#     the first list ([5,7,8,9,10]) was silently discarded. Only the list that
#     was actually in effect is kept.
#   * repeated candidate values (0.034, 67, 34) are removed, since each repeat
#     made the search fit identical models again.
#   * 'criterion' is removed: it is a scikit-learn tree option, not a LightGBM
#     parameter, so it only doubled the number of fits.
#
# NOTE(review): the full Cartesian product is still about 20.7 million
# combinations, with up to 133400 trees each - consider a much smaller grid or
# RandomizedSearchCV before running this.
params = {
    'random_state': [1, 2, 3, 4],
    'learning_rate': [0.045, 0.034, 0.012, 0.0023],
    'n_estimators': [20000, 50000, 23000, 133400, 43560, 54643],
    'max_bin': [94, 75, 34, 56, 23, 66, 67, 13, 123, 89],
    'num_leaves': [10, 6, 7, 8, 9, 11, 12, 34, 45, 21],
    'max_depth': [27, 12, 15, 16, 17, 14, 21, 31, 24, 25],
    'reg_alpha': [8.457, 9.945, 6.57, 7.436, 9.8765, 10.23],
    'reg_lambda': [6.853, 4.5, 6.68, 10.35, 8.346, 9.35],
    'subsample': [0.749, 0.436, 0.235, 0.346, 0.264, 0.8568],
}
         



In [None]:
        

# LGBMClassifier is only imported by a later cell, so running the notebook
# top to bottom raised NameError here; import it where it is first used.
from lightgbm import LGBMClassifier

# exhaustive search over `params` with GridSearchCV's default cross-validation
model = LGBMClassifier()
grid = GridSearchCV(estimator=model,param_grid=params)
grid.fit(X,y)

In [None]:
# Parameter combination that scored best in the grid search above.
grid.best_params_

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

def cross_val(X, y, model, params, folds=9):
    """Fit ``model(**params)`` on every stratified fold and print its ROC-AUC.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` positional row selection
    model : estimator class, instantiated afresh for every fold
    params : dict of keyword arguments for ``model``
    folds : number of stratified, shuffled splits (fixed seed 21)

    Each fold's held-out part is passed to ``fit`` as ``eval_set`` together
    with ``early_stopping_rounds=100`` and ``verbose=400``, and is then scored
    with ``roc_auc_score`` on the positive-class probability.

    Returns
    -------
    The estimator fitted on the last fold only.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)

    for fold_no, (fit_rows, eval_rows) in enumerate(splitter.split(X, y)):
        print(f"Fold: {fold_no}")

        # slice out this fold's fitting and evaluation parts
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_eval, y_eval = X.iloc[eval_rows], y.iloc[eval_rows]

        alg = model(**params)
        alg.fit(
            X_fit,
            y_fit,
            eval_set=[(X_eval, y_eval)],
            early_stopping_rounds=100,
            verbose=400,
        )

        # column 1 of predict_proba is the positive-class probability
        positive_proba = alg.predict_proba(X_eval)[:, 1]
        fold_auc = roc_auc_score(y_eval, positive_proba)
        print(f"roc_auc_score: {fold_auc}")
        print("-" * 50)

    return alg


# Fixed LightGBM settings passed to cross_val as the model's keyword arguments.
lgb_params = dict(
    learning_rate=0.045,
    n_estimators=20000,
    max_bin=94,
    num_leaves=10,
    max_depth=27,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

from lightgbm import LGBMClassifier
# Cross-validate LightGBM on the full data; keeps the model from the last fold.
lgb_model = cross_val(X, y, model=LGBMClassifier, params=lgb_params)

### Using the oversampling method (SMOTE)

In [None]:
import imblearn

from imblearn.over_sampling import SMOTE


In [None]:
# Oversample with SMOTE; X and y are replaced by their resampled versions.
# NOTE(review): the resampling runs before the train/test split that follows,
# so synthetic rows can end up in the test set - consider splitting first and
# resampling only the training part.
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [None]:
# Hold out 10% of the resampled rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=121,
)

In [None]:
# Logistic regression baseline on the resampled split.
LR = LogisticRegression()

LR.fit(X_train,y_train)

y_pred_LR = LR.predict(X_test)

# report this model's own predictions; `y_pred` still holds the output of a
# different, earlier model
print(classification_report(y_test,y_pred_LR))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings, fitted on the resampled training split.
# fit() returns the estimator itself, so construction and fitting are chained.
dt = DecisionTreeClassifier().fit(X_train, y_train)

# score the tree on the held-out rows
y_pred_dt = dt.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_dt))

In [None]:
# Refit a default random forest, this time on the resampled training split.
RF = RandomForestClassifier()
RF.fit(X=X_train, y=y_train)

In [None]:
# (train accuracy, test accuracy) of the refitted forest
(RF.score(X=X_train, y=y_train), RF.score(X=X_test, y=y_test))

In [None]:
y_pred_RF = RF.predict(X_test)
# report the forest's own predictions rather than the stale `y_pred`
print(classification_report(y_test,y_pred_RF))

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with default settings on the resampled training split.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)


In [None]:
# XGBoost predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [None]:
# Predict again with the XGBoost model and print its per-class metrics.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

def cross_val(X, y, model, params, folds=9):
    """Fit ``model(**params)`` on every stratified fold and print its ROC-AUC.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` positional row selection
    model : estimator class, instantiated afresh for every fold
    params : dict of keyword arguments for ``model``
    folds : number of stratified, shuffled splits (fixed seed 21)

    Each fold's held-out part is passed to ``fit`` as ``eval_set`` together
    with ``early_stopping_rounds=100`` and ``verbose=400``, and is then scored
    with ``roc_auc_score`` on the positive-class probability.

    Returns
    -------
    The estimator fitted on the last fold only.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)

    for fold_no, (fit_rows, eval_rows) in enumerate(splitter.split(X, y)):
        print(f"Fold: {fold_no}")

        # slice out this fold's fitting and evaluation parts
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_eval, y_eval = X.iloc[eval_rows], y.iloc[eval_rows]

        alg = model(**params)
        alg.fit(
            X_fit,
            y_fit,
            eval_set=[(X_eval, y_eval)],
            early_stopping_rounds=100,
            verbose=400,
        )

        # column 1 of predict_proba is the positive-class probability
        positive_proba = alg.predict_proba(X_eval)[:, 1]
        fold_auc = roc_auc_score(y_eval, positive_proba)
        print(f"roc_auc_score: {fold_auc}")
        print("-" * 50)

    return alg


# Fixed LightGBM settings passed to cross_val as the model's keyword arguments.
lgb_params = dict(
    learning_rate=0.045,
    n_estimators=20000,
    max_bin=94,
    num_leaves=10,
    max_depth=27,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

from lightgbm import LGBMClassifier
# Cross-validate LightGBM on the SMOTE-resampled X, y; keeps the last fold's model.
# NOTE(review): the folds are cut from already-resampled data, so synthetic
# rows may sit in the evaluation folds - treat the printed AUCs with care.
lgb_model = cross_val(X, y, model=LGBMClassifier, params=lgb_params)

In [None]:
# (test accuracy, train accuracy) of the LightGBM model kept from the last fold
(lgb_model.score(X=X_test, y=y_test), lgb_model.score(X=X_train, y=y_train))

In [None]:
# LightGBM class predictions for the held-out rows.
y_pred_lgb = lgb_model.predict(X=X_test)
from sklearn.metrics import f1_score

In [None]:
# report the LightGBM predictions computed in the previous cell; `y_pred`
# holds the XGBoost output, not LightGBM's
print(classification_report(y_test,y_pred_lgb))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
# Gradient boosting with default settings on the resampled training split.
gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)
y_pred_gbc = gbc.predict(X_test)
# report this model's own predictions rather than the stale `y_pred`
print(classification_report(y_test,y_pred_gbc))

In [None]:
# AdaBoost with default settings on the resampled training split.
abc = AdaBoostClassifier()
abc.fit(X_train,y_train)
y_pred_ada = abc.predict(X_test)
# report this model's own predictions rather than the stale `y_pred`
print(classification_report(y_test,y_pred_ada))

# decision tree

In [None]:
# Confusion matrix for the decision-tree predictions (`y_pred_dt`), drawn as a
# single-colour annotated heatmap.
plt.figure(figsize=(9, 7))

# counts of actual vs predicted labels
cm = confusion_matrix(y_test, y_pred_dt)

# wrap the counts in a labelled frame: rows are actual classes, columns predicted
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# annot/fmt print the integer count in every cell; the one-entry colour list
# paints all cells the same colour, so the colour bar is switched off
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap=['lightskyblue'],
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)

# enlarge the tick labels on both axes
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()
  

# logistic regression

In [None]:
# Confusion matrix for the logistic-regression predictions (`y_pred_LR`), drawn
# as a single-colour annotated heatmap.
plt.figure(figsize=(9, 7))

# counts of actual vs predicted labels
cm = confusion_matrix(y_test, y_pred_LR)

# wrap the counts in a labelled frame: rows are actual classes, columns predicted
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# annot/fmt print the integer count in every cell; the one-entry colour list
# paints all cells the same colour, so the colour bar is switched off
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap=['lightskyblue'],
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)

# enlarge the tick labels on both axes
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()
  

In [None]:
# Draw one confusion-matrix heatmap per fitted model.
lis = [y_pred_ada,y_pred_RF,y_pred_gbc,y_pred_dt,y_pred_lgb,y_pred_LR]

# display names in the same order as `lis`, so each figure can be told apart
# (the plots were previously untitled and indistinguishable)
model_names = ['AdaBoost', 'Random Forest', 'Gradient Boosting',
               'Decision Tree', 'LightGBM', 'Logistic Regression']

for model_name, predictions in zip(model_names, lis):

    # count actual vs predicted labels for this model
    plt.figure(figsize=(9,7))
    cm = confusion_matrix(y_test, predictions)

    # label the matrix: rows are the actual classes, columns the predicted ones
    conf_matrix = pd.DataFrame(data = cm,columns = ['Predicted:0','Predicted:1'], index = ['Actual:0','Actual:1'])

    # 'annot' / 'fmt = d' print the integer count in each cell
    # the one-entry colour list paints every cell the same colour, which is
    # why the colour bar is switched off with 'cbar = False'
    # 'linewidths' is the width of the lines separating the cells
    # 'annot_kws' sets the font size of the printed counts
    sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap =['lightskyblue'], cbar = False,
                linewidths = 0.1, annot_kws = {'size':25})

    # identify which model this matrix belongs to
    plt.title(model_name, fontsize = 20)

    # enlarge the tick labels on both axes
    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)

    # display the plot
    plt.show()
  