In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
import missingno as msno
import tabulate as tb
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
import statsmodels.stats.multicomp as multi
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

In [None]:
# Read the Framingham CSV into `data`, preview the frame and report (rows, columns).
data = pd.read_csv(
    '../input/heart-disease-prediction-using-logistic-regression/framingham.csv'
)
display(data)
data.shape

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
data.dtypes

In [None]:
# Number of missing values per column.
# Uses the DataFrame method instead of np.sum(): np.sum on a DataFrame is routed
# through DataFrame.sum(axis=None), which is deprecated and slated to return a
# single scalar instead of per-column counts.
data.isnull().sum()

In [None]:
# Visualise the location of missing values across rows and columns with missingno.
msno.matrix(data)

In the above graph, white lines represent missing values and their location. As can be seen above, the variable with the most missing values is glucose while 9 other variables don't have any such as gender, age, smoking status. In this project, missing values will be imputed by the appropriate method.

In [None]:
#Descriptive statistics of numeric variables
# Rows with a missing value in any of these columns are dropped before summarising.
numeric_columns = ['age', 'cigsPerDay', 'totChol', 'sysBP',
                   'diaBP', 'BMI', 'heartRate', 'glucose']
complete_numeric = data[numeric_columns].dropna()
complete_numeric.describe()

The table of 10-year risk of coronary heart disease and gender shows the number of people in each group. By that, we can conclude that men in the data who have a 10-year risk of coronary heart disease are greater than women while for the other group that has no risk it is the opposite.

In [None]:
#Imputed data
# Build an imputed copy of `data`: mode for cigsPerDay and education, median for
# the continuous measurements, and 0 for BPMeds.
fill_values = {
    'cigsPerDay': data['cigsPerDay'].mode().iloc[0],
    'totChol': data['totChol'].median(),
    'BMI': data['BMI'].median(),
    'heartRate': data['heartRate'].median(),
    'glucose': data['glucose'].median(),
    'education': data['education'].mode().iloc[0],
    'BPMeds': 0,
}
data_wo_na = data.fillna(value=fill_values)

In [None]:
# Bucket age into decades once and attach the same labels to both frames.
bins = [29, 39, 49, 59, 69, 79]
labels = ['30-39', '40-49', '50-59', '60-69', '70-79']
age_groups = pd.cut(data.age, bins, labels=labels, include_lowest=True)
data['agerange'] = age_groups
data_wo_na['agerange'] = age_groups

In [None]:
# Columns that hold codes/flags rather than measurements; cast them to `category`
# in both the raw and the imputed frame.
# 'education' was previously listed twice: duplicate keys make the selection
# return two 'education' columns, which is redundant and can make the
# multi-column assignment fail ("Columns must be same length as key").
categorical_cols = ['male', 'education', 'currentSmoker', 'BPMeds',
                    'prevalentStroke', 'prevalentHyp', 'diabetes', 'TenYearCHD']

data[categorical_cols] = data[categorical_cols].astype('category')
print(data.dtypes)
data_wo_na[categorical_cols] = data_wo_na[categorical_cols].astype('category')
print(data_wo_na.dtypes)

In [None]:
# Mean of every numeric variable per TenYearCHD group.
# numeric_only=True is required because `data` now contains categorical columns
# (the flag columns and `agerange`); without it pandas >= 2.0 raises a TypeError
# instead of silently skipping them.
data.groupby('TenYearCHD').mean(numeric_only=True)

All numeric variables are higher in the group who have a 10-year risk of coronary heart disease.

In [None]:
#Descriptive statistics after imputation
numeric_columns = ['age', 'cigsPerDay', 'totChol', 'sysBP',
                   'diaBP', 'BMI', 'heartRate', 'glucose']
imputed_numeric = data_wo_na[numeric_columns]
imputed_numeric.describe()

**EDA-RESEARCH QUESTIONS**

*1) How does the 10-year risk of coronary heart disease change by other variables in the data?*

In [None]:
# Shared six-colour palette reused by the plots that follow, then a count plot
# of the target variable.
palette = sns.color_palette("mako_r", 6)
sns.catplot(
    data=data,
    x="TenYearCHD",
    kind="count",
    palette=palette,
)

In [None]:
# One-column frequency table of TenYearCHD, rendered as an annotated heatmap.
TenYearCHD_table = pd.crosstab(index=data['TenYearCHD'], columns='count')
sns.heatmap(
    TenYearCHD_table,
    annot=True,
    fmt='g',
    cmap=palette,
);

As it can be seen in the above graph and the frequency table, data consist of 3594 people who don't have a 10-year risk of coronary heart disease and 644 people who have the risk.

In [None]:
#GENDER
# Proportion of each TenYearCHD outcome computed within each value of `male`
# (value_counts is normalised per group).
x, y, hue = "TenYearCHD", "proportion", "male"
hue_order = ["Female", "Male"]

proportions = (
    data[x]
    .groupby(data[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=palette)

Most of the people who don't have a 10-year risk of coronary heart disease are female while the ones who have the risk are generally male.

In [None]:
from scipy.stats import chi2_contingency

# Gender x TenYearCHD contingency table; `c` is consumed by the chi-square cell that follows.
c = pd.crosstab(data['male'], data['TenYearCHD'])
sns.heatmap(
    c,
    annot=True,
    fmt='g',
    cmap=palette,
);

In [None]:
# Chi-square test of independence on the contingency table `c` from the preceding cell.
stat, p, dof, expected = chi2_contingency(c)

# interpret p-value at the 5% significance level
alpha = 0.05
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print("Chi-Square Test Result")
print("p value is " + str(p))
print(verdict)

Therefore, H0 was rejected, that is, the 10-year risk of coronary heart disease and gender have a significant relation.

In [None]:
#AGE
# Counts of TenYearCHD outcomes per age range, shown as an annotated heatmap.
age_CHD_table = pd.crosstab(data['agerange'], data['TenYearCHD'])
sns.heatmap(
    age_CHD_table,
    annot=True,
    fmt='g',
    cmap=palette,
);

In [None]:
# Proportion of each TenYearCHD outcome computed within each age range.
x, y, hue = "TenYearCHD", "proportion", "agerange"
hue_order = ["30-39", "40-49","50-59","60-69","70-79"]

proportions = (
    data[x]
    .groupby(data[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=palette)

In the above bar plots, the first one represents the frequency of 10-year risk of CHD in each age group while the second one shows the proportions.

By age frequency table and the first graph, it can be said that 40-49 age group is the most crowded one while age group 70-79 have only 2 people.
The age group and CHD table and the second graph show that age and the 10-year risk of coronary heart disease are directly proportional. In the age group 70-79, the risk increases to 50 percent while in the age group 30-39 it is less than 5 percent.

In [None]:
#EDUCATION
# Proportion of each TenYearCHD outcome computed within each education level.
x, y, hue = "TenYearCHD", "proportion", "education"
hue_order = [1, 2,3,4]

proportions = (
    data[x]
    .groupby(data[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=palette)

The proportion of the 10-year risk of coronary heart disease by education graph shows the percentages of 10-year risk of coronary heart disease in each education group. There is no huge difference between those percentages but the greatest risk is in education group 1 while the lowest is in group 2.

In [None]:
# Education x TenYearCHD contingency table, its heatmap, and a chi-square
# test of independence on the same table.
c = pd.crosstab(data['education'], data['TenYearCHD'])
sns.heatmap(c, annot=True, fmt='g', cmap=palette);
stat, p, dof, expected = chi2_contingency(c)

# interpret p-value at the 5% significance level
alpha = 0.05
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print("Chi-Square Test Result")
print("p value is " + str(p))
print(verdict)

Therefore, H0 was rejected, that is, the 10-year risk of coronary heart disease and education have a significant relation.

In [None]:
#PREVALENT STROKE
# Contingency table of previous stroke vs. TenYearCHD; `c` feeds the chi-square cell that follows.
c = pd.crosstab(data['prevalentStroke'], data['TenYearCHD'])
sns.heatmap(
    c,
    annot=True,
    fmt='g',
    cmap=palette,
);

The data consist of only 25 people who had a stroke before and 44 percent of them have a 10-year risk of coronary heart disease. This seems like a big percentage but since the sample is very small, it did not give a considerable meaning apart from this data.

In [None]:
# Chi-square test of independence on the contingency table `c` from the preceding cell.
stat, p, dof, expected = chi2_contingency(c)

# interpret p-value at the 5% significance level
alpha = 0.05
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print("Chi-Square Test Result")
print("p value is " + str(p))
print(verdict)

Therefore, H0 was rejected, that is, the 10-year risk of coronary heart disease and prevalent stroke have a significant relation.

In [None]:
#PREVALENT HYPERTENSION
# Contingency table of prevalent hypertension vs. TenYearCHD; `c` feeds the chi-square cell that follows.
c = pd.crosstab(data['prevalentHyp'], data['TenYearCHD'])
sns.heatmap(
    c,
    annot=True,
    fmt='g',
    cmap=palette,
);

In [None]:
# Chi-square test of independence on the contingency table `c` from the preceding cell.
stat, p, dof, expected = chi2_contingency(c)

# interpret p-value at the 5% significance level
alpha = 0.05
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print("Chi-Square Test Result")
print("p value is " + str(p))
print(verdict)

Therefore, H0 was rejected, that is, the 10-year risk of coronary heart disease and prevalent hypertension have a significant relation.

In [None]:
#DIABETES
# Contingency table of diabetes vs. TenYearCHD; `c` feeds the chi-square cell that follows.
c = pd.crosstab(data['diabetes'], data['TenYearCHD'])
sns.heatmap(
    c,
    annot=True,
    fmt='g',
    cmap=palette,
);

The table above shows the frequencies of people who have diabetes or not by the 10-year risk of coronary heart disease. And by that, it can be concluded that there is a huge difference in percentages for people who have diabetes. For people who don't have diabetes, the risk is 14.6 percent while for the other group the risk is 63.3 percent.

In [None]:
# Chi-square test of independence on the contingency table `c` from the preceding cell.
stat, p, dof, expected = chi2_contingency(c)

# interpret p-value at the 5% significance level
alpha = 0.05
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print("Chi-Square Test Result")
print("p value is " + str(p))
print(verdict)

Therefore, H0 was rejected, that is, the 10-year risk of coronary heart disease and diabetes have a significant relation.

In [None]:
#SMOKING STATUS
# Counts of TenYearCHD outcomes for current smokers and non-smokers.
sns.catplot(
    data=data,
    x="currentSmoker",
    hue="TenYearCHD",
    kind="count",
    palette=palette,
)

The above table shows the change in 10-year risk of coronary heart disease by smoking status. It can be said that smoking status has an inconsiderable effect on the risk in the data.

In [None]:
# Smoking status x TenYearCHD contingency table, its heatmap, and a chi-square
# test of independence on the same table.
c = pd.crosstab(data['currentSmoker'], data['TenYearCHD'])
sns.heatmap(c, annot=True, fmt='g', cmap=palette);

stat, p, dof, expected = chi2_contingency(c)

# interpret p-value at the 5% significance level
alpha = 0.05
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print("Chi-Square Test Result")
print("p value is " + str(p))
print(verdict)

Therefore, H0 could not be rejected; that is, the 10-year risk of coronary heart disease and smoking status do not have a significant relation.

In [None]:
#BODY MASS INDEX
# Mean BMI (rounded to whole units) for every age range x TenYearCHD combination.
# aggfunc is given as the string 'mean': passing the np.mean callable is
# deprecated in recent pandas and emits a FutureWarning; the result is the same.
ager_10ychd = pd.crosstab(
    index=data['agerange'],
    columns=data['TenYearCHD'],
    values=data['BMI'],
    aggfunc='mean',
).round(0)
sns.heatmap(ager_10ychd, cmap=palette, annot=True, fmt='g');

In [None]:
# BMI distribution per TenYearCHD outcome, split by age range.
sns.catplot(
    data=data,
    x="TenYearCHD",
    y="BMI",
    hue="agerange",
    kind="box",
    palette=palette,
)

The body mass index box plot shows the distributions for the 10-year risk of coronary heart disease by age group.

In the age group 30-39; There is not much difference in the median but for the risk group, it is slightly higher. Minimum, maximum values, and the first quartile are lower in the risk group but the third quartile is higher.

In the age group 40-49; The distribution for the risk group is almost the same as the group of people who don't have the risk.

In the age group 50-59; The distribution for the risk group is almost the same as the group of people who don't have the risk but the first and third quartile is slightly higher in the risk group.

In the age group 60-69; There is not much difference in the median and minimum value. Maximum values and the first quartile are lower in the risk group but the third quartile is higher.

In [None]:
#GLUCOSE, TOTAL CHOLESTEROL, SYSTOLIC BLOOD PRESSURE, DIASTOLIC BLOOD PRESSURE, HEART RATE
import matplotlib.pyplot as plt
import seaborn as sns

# 2x3 grid: one box plot per measurement, split by TenYearCHD.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Measurement -> grid position (same layout as before).
panel_positions = {
    "glucose": (0, 0),
    "totChol": (0, 1),
    "heartRate": (0, 2),
    "sysBP": (1, 0),
    "diaBP": (1, 1),
}
for measurement, (row, col) in panel_positions.items():
    sns.boxplot(ax=axes[row, col], data=data_wo_na, y=measurement,
                x='TenYearCHD', palette=palette)

# Only five variables are plotted; hide the unused sixth panel instead of
# leaving an empty set of axes in the figure.
axes[1, 2].set_visible(False)

The above box plots show the distributions in each variable by the 10-year risk of coronary heart disease.

**Glucose-10 Year Risk of Coronary Heart Disease:** The distributions for both the risk group and the group of people who don't have the risk are almost the same except the third quartile which is greater and the maximum value which is slightly greater for the risk group. Both groups have so many outliers.

**Total Cholesterol-10 Year Risk of Coronary Heart Disease:** The distributions for both the risk group and the group of people who don't have the risk are almost the same. Both groups have so many outliers.

**Heart Rate-10 Year Risk of Coronary Heart Disease:** The distributions for both the risk group and the group of people who don't have the risk are almost the same except for the third quartile, minimum and maximum values which are slightly greater for the risk group. Both groups have so many outliers especially the group of people who don't have the risk.

**Systolic Blood Pressure-10 Year Risk of Coronary Heart Disease:** In the group that doesn't have a 10-year risk of coronary heart disease, the median is about 130 while in the other group that has a 10-year risk of coronary heart disease it is almost 150. The minimum systolic blood pressure value for both two groups are the same while the maximum value is much higher in the risk group. Also, the first and third quartiles are so much higher in the risk group.

**Diastolic Blood Pressure-10 Year Risk of Coronary Heart Disease:** Like systolic blood pressure, diastolic blood pressure's median, max, first quartile, and third quartile values are higher for the risk group.

2) Does smoking status and the number of cigarettes smoked in a day affect heart rate and systolic blood pressure?

In [None]:
#Groups that have the range of cigarettes smoked in a day were created to see the results in graphs without a mess.
# pd.cut builds right-closed intervals (lower, upper]. The first edge is -1 so
# that exactly 0 cigarettes falls in its own '0' bin and 1 cigarette falls in
# '1-3'. With the previous edges [0, 1, 3, ...] and include_lowest=True, the
# first bin was [0, 1], so people smoking 1 cigarette a day were labelled '0'.
# NOTE(review): the label '27-30' actually covers 28-30 (27 belongs to '24-27');
# it is kept unchanged because the notebook's narrative refers to it by that name.
bins = [-1, 0, 3, 7, 11, 15, 19, 23, 27, 30, 39, 49, 80]
labels = ['0','1-3', '4-7', '8-11', '12-15', '16-19','20-23','24-27','27-30','31-39','40-49','50+']

# Raw frame: rows with missing cigsPerDay get a missing range.
data['cigrange'] = pd.cut(data['cigsPerDay'], bins, labels=labels)
# Imputed frame.
data_wo_na['cigrange1'] = pd.cut(data_wo_na['cigsPerDay'], bins, labels=labels)

data_wo_na[['cigrange1','cigsPerDay']]

In [None]:
#HEART RATE
# Mean heart rate (rounded) for every cigarette range x age range combination.
# aggfunc is given as the string 'mean': passing the np.mean callable is
# deprecated in recent pandas and emits a FutureWarning; the result is the same.
cgr_10ychd = pd.crosstab(
    index=data_wo_na['cigrange1'],
    columns=data_wo_na['agerange'],
    values=data_wo_na['heartRate'],
    aggfunc='mean',
).round(0)
sns.heatmap(cgr_10ychd, cmap=palette, annot=True, fmt='g');
# Heart-rate distribution per cigarette range (catplot opens its own figure).
sns.catplot(x="cigrange1", y="heartRate",aspect=1.5, kind="box", data=data_wo_na,palette="viridis")

The above box plot shows the distribution of heart rate and the range of cigarettes smoked in a day. Except for the 31-39 and 50+ groups, every group has almost the same median heart rate. The comments for each group were written as a comparison with the previous one.

* For the group who are non-smokers, the box plot shows a symmetric distribution. Also, this group has many outliers and this can be because of other variables such as age. Without outliers, the minimum value for this group is almost 45 which is very low even in resting. The maximum value is near 105 and the median is near 75.
* For the group 1-3, while the minimum heart rate increased, the maximum heart rate decreased. There is a left-skewed distribution in this group and this means that there is an agglomeration in between the median and the third quartile. So, we can say that in this group there are more people who have heartrate above the median than who have heartrate below the median.
* For the group 4-7, minimum and maximum values for heart rate are almost the same as the 1-3 group but the first and third quartiles are higher. This group also has an almost symmetric distribution.
* For the group 8-11, while the minimum value is lower the maximum value is higher than the previous group. The first and third quartiles are almost the same except the third quartile is a little lower in the 8-11. This group also looks symmetric.
* For the group 12-15 there is an increase in the minimum value but the maximum value remains the same as the previous group. The first and third quartiles are higher and there is a right-skewed distribution in this group. This means there is an agglomeration between the first quartile and the median.
* For the groups 16-19 the range of the minimum and maximum is getting the smallest. Also, the interquartile range is the smallest too. The distribution looks symmetric and there are only 2 outliers that are lower than the minimum value and very higher than the maximum. This can be a cause of other factors such as age.
* For the group 20-23 the distribution and descriptive statistics are almost equal to the group 12-15. But there are so many outliers in this case that are higher than the maximum value.
* For the group 24-27 the minimum value is much higher while the maximum is much lower than the previous one. The first quartile is almost the same but the third quartile is a little higher. There is a right-skewed distribution in this group which means there is an agglomeration in between the first quartile and the median.
* For the group 27-30 the minimum value is much lower while the maximum is much higher than the previous one. While the third quartile is almost the same, the first quartile is a little lower. There is also right-skewed distribution but not so obvious like the previous group.
* For the group 31-39 the median is lower than previous ones and there is a really obvious decrease in the maximum value which is confusing because it is expected that the number of cigarettes smoked and the heart rate are directly proportional. It can be because of other variables. The minimum value remains the same. There is a right-skewed distribution as well.
* For the group 40-49 there is almost the same distribution as the 27-30 except the first quartile and the minimum value which are a little higher.
* For the 50+ group the median and ranges between maximum and minimum values getting smaller. There is an obvious right-skewed distribution which means there is an agglomeration in between the first quartile and the median.

In [None]:
# Histogram of heart rate plus a Shapiro-Wilk normality test on the same column.
# Only the Shapiro result is echoed as the cell output (it is the last expression).
plt.hist(data_wo_na['heartRate'])
stats.shapiro(data_wo_na['heartRate'])

By the above histogram, it can be seen that heart rate is not normally distributed. Since the normality assumption of ANOVA is not satisfied, ANOVA cannot be performed. Instead, the non-parametric Kruskal-Wallis test was used to compare the median heart rate across the groups of cigarettes smoked in a day.

In [None]:
# Number of people in each cigarettes-per-day range (imputed frame).
data_wo_na['cigrange1'].value_counts()

In [None]:
# Kruskal-Wallis test: does heart rate differ by cigarettes smoked per day?
# Adjacent cigarette ranges are pooled in pairs (1st+2nd, 3rd+4th, ...), so each
# range is used exactly once.
# The previous version compared against the label '28-31', which does not exist
# among the cigrange1 categories, so that sample was empty; combined with the
# wrong pairings this dropped three ranges from the test and used the '40-49'
# range twice. It also relied on Series.append, which was removed in pandas 2.0.
# Pairing by category position avoids hard-coding the label strings.
cig_levels = list(data_wo_na['cigrange1'].cat.categories)

pooled_groups = []
for start in range(0, len(cig_levels), 2):
    level_pair = cig_levels[start:start + 2]
    sample = data_wo_na.loc[data_wo_na['cigrange1'].isin(level_pair), 'heartRate']
    if not sample.empty:  # skip pooled groups without any observation
        pooled_groups.append(sample)

#perform Kruskal-Wallis Test

stats.kruskal(*pooled_groups)

Since the p-value of the Kruskal Wallis H Test is smaller than 0.05, the null hypothesis can be rejected. This means the median heart rate differ by the number of cigarettes smoked in a day.

In [None]:
#SYSTOLIC BLOOD PRESSURE
# Mean systolic blood pressure (rounded) for every cigarette range x age range
# combination. aggfunc is given as the string 'mean': passing the np.mean
# callable is deprecated in recent pandas and emits a FutureWarning.
cgr_10ychdsys = pd.crosstab(
    index=data_wo_na['cigrange1'],
    columns=data_wo_na['agerange'],
    values=data_wo_na['sysBP'],
    aggfunc='mean',
).round(0)
sns.heatmap(cgr_10ychdsys, cmap=palette, annot=True, fmt='g');
# The box plot now reads the imputed frame (data_wo_na / cigrange1), the same
# source as the heatmap above and the heart-rate box plot, instead of the raw
# frame (data / cigrange).
sns.catplot(x="cigrange1", y="sysBP",aspect=1.5, kind="box", data=data_wo_na,palette="viridis")

The above box plot shows the distribution of systolic blood pressure and the range of cigarettes smoked in a day. The comments for each group were written as a comparison with the previous one.

* For the group who are non-smokers, the box plot shows a symmetric distribution. This group has many outliers and this can be because of other variables such as age. Without outliers, the minimum value for this group is almost 75. The maximum value is near 185 and the median is near 130.
* For the group 1-3, while the minimum systolic blood pressure(near 100) increased, the maximum systolic blood pressure(near 180) and the median(near 130) decreased. The distribution looks symmetric.
* For the group 4-7, minimum, maximum, first quartile, and third quartile are all lower than the previous group. This group also has an almost symmetric distribution.
* For the group 8-11, the minimum value and the maximum values are lower than the previous group. The first and third quartiles are almost the same. There is a right-skewed distribution which means there is an agglomeration between the first quartile and the median.
* For the group 12-15 there is a decrease in the minimum and the maximum values. The first and third quartiles are almost the same and there is an almost symmetric distribution in this group.
* For the groups 16-19 the minimum, maximum, first, and third quartiles and also median are higher. There is a right-skewed distribution.
* For the group 20-23 all descriptive statistics are lower than the previous group.
* For the group 24-27 the minimum value is much higher while the maximum is much lower than the previous one. The first quartile is almost the same but the third quartile is a little lower. There is a right-skewed distribution in this group which means there is an agglomeration in between the first quartile and the median.
* For the group 27-30 the minimum value is much lower while the maximum is much higher than the previous one. While the first quartile is almost the same, the first quartile is a little higher. There is an almost symmetric distribution.
* For the group 31-39 all descriptive statistics are higher except the first quartile and the distribution looks symmetric.
* For the group 40-49 all descriptive statistics are lower except the first quartile and the distribution looks symmetric.
* For the 50+ group the median and ranges between maximum and minimum values getting smaller. There is an obvious right-skewed distribution which means there is an agglomeration in between the first quartile and the median.

In [None]:
# Histogram of systolic blood pressure plus a Shapiro-Wilk normality test on the same column.
# Only the Shapiro result is echoed as the cell output (it is the last expression).
plt.hist(data_wo_na['sysBP'])
stats.shapiro(data_wo_na['sysBP'])

By the above histogram, it can be seen that systolic blood pressure is not normally distributed. Since the normality assumption of ANOVA is not satisfied, ANOVA cannot be performed. Instead, the non-parametric Kruskal-Wallis test was used to compare the median systolic blood pressure across the groups of cigarettes smoked in a day.

In [None]:
# Kruskal-Wallis test: does systolic blood pressure differ by cigarettes smoked per day?
# Adjacent cigarette ranges are pooled in pairs (1st+2nd, 3rd+4th, ...), so each
# range is used exactly once.
# The previous version compared against the label '28-31', which does not exist
# among the cigrange1 categories, so that sample was empty; combined with the
# wrong pairings this dropped three ranges from the test and used the '40-49'
# range twice. It also relied on Series.append, which was removed in pandas 2.0.
# Pairing by category position avoids hard-coding the label strings.
cig_levels = list(data_wo_na['cigrange1'].cat.categories)

pooled_groups = []
for start in range(0, len(cig_levels), 2):
    level_pair = cig_levels[start:start + 2]
    sample = data_wo_na.loc[data_wo_na['cigrange1'].isin(level_pair), 'sysBP']
    if not sample.empty:  # skip pooled groups without any observation
        pooled_groups.append(sample)

#perform Kruskal-Wallis Test
stats.kruskal(*pooled_groups)

Since the p-value of the Kruskal Wallis H Test is smaller than 0.05, the null hypothesis can be rejected. This means the median systolic blood pressure differs by the number of cigarettes smoked in a day.

3) Does smoking status and the number of cigarettes smoked change by gender, age and education?

In [None]:
# Proportion of smokers / non-smokers computed within each value of `male`
# (value_counts is normalised per group).
x, y, hue = "currentSmoker", "proportion", "male"
hue_order = ["Male", "Female"]

proportions = (
    data[x]
    .groupby(data[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=palette)

The above graph shows the proportion of people in the status smokers and non-smokers while the colors show the gender in each group. In this data, almost 60 percent of people who are smokers are female while more than 60 percent of people who are not smokers are male.

In [None]:
# Proportion of smokers / non-smokers computed within each age range
# (value_counts is normalised per group).
x, y, hue = "currentSmoker", "proportion", "agerange"
hue_order = ["30-39","40-49","50-59","60-69","70-79"]

proportions = (
    data[x]
    .groupby(data[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue, palette=palette)

Similar to the previous graph, the above graph shows the proportion for smoking status. But in this case, it shows the proportion of age groups instead of gender. In the x axis, 0 represents non-smokers while 1 represents smokers.

All people in the age group 70-79 are smokers by this graph. But it is not correct to conclude with this result since there are only 2 people in that group in the data.

In the age groups 60-69 and 50-59, the majority of people are smokers while for the age groups 40-49 and 30-39 it is the opposite. But generally, the highest percentage of people who are smokers are in the 60-69 age group if 70-79 will not be included.

In [None]:
# Number of people per cigarettes-per-day range, split by the `male` flag.
sns.catplot(
    data=data,
    x="cigrange",
    hue='male',
    kind="count",
    aspect=1.5,
    palette=palette,
)

The range of cigarettes smoked in a day and gender graph shows the following conclusions;

* Women in the data who don't smoke are more than 2 times of men who are non-smokers. Also, it can be said that the people who don't smoke are the majority.
* In the range of 1-15 cigarettes smoked in a day, the majority are women.
* In the range of 16-19 and 31-39 cigarettes smoked in a day, there is not any female.
* In the range of 20-40+ cigarettes smoked in a day, the majority are men.
A simple conclusion can be made by looking at these results. And this is that men tend to smoke more cigarettes than women in this data.

In [None]:
#EDUCATION
# NOTE: `cm` is imported here but not referenced in this cell.
from matplotlib import cm
# Number of rows per education level (a Series indexed by level).
df = data.groupby('education').size()

# Pie chart of the level counts, annotated with each level's percentage share.
df.plot(kind='pie', subplots=True, figsize=(8, 8),cmap="crest", autopct='%1.1f%%')
plt.title("Pie Chart of Education")
# Blank out the y-axis label.
plt.ylabel("")
plt.show()

In [None]:
# Bar plot of cigsPerDay per education level (seaborn's default bar estimate).
sns.catplot(
    data=data,
    x="education",
    y='cigsPerDay',
    kind="bar",
    aspect=1.5,
    palette=palette,
)

Actually, it is expected that while education levels go higher the number of cigarettes smoked per day will decrease. But above bar plot shows that there is not a relationship like that in this data. The education level that has the most cigarettes in a day is level 2 while the second one is level 4 and the least is level 3.

In [None]:
# Counts of smokers and non-smokers within each education level.
sns.catplot(
    data=data,
    x='education',
    hue="currentSmoker",
    kind="count",
    aspect=1.5,
    palette=palette,
)

The above plot shows the frequencies of smoking status in each education level. The highest percentage of smokers are in the education level 2 while the second one is in the level 4 and the least one is in the level 3.

4) Does age affect other variables?

In [None]:
#HEART RATE
# Heart-rate distribution per age range (imputed frame).
sns.catplot(
    data=data_wo_na,
    x="agerange",
    y="heartRate",
    kind="box",
    aspect=1.5,
    palette="viridis",
)

The medians of heart rate are almost the same for all age groups except 70-79. In each group, the distribution looks symmetric. The range for the group 70-79 is the smallest but since there are only 2 people in that group, it doesn't mean anything concrete.

It is known that the heart rate is not normal. So the assumption of ANOVA, normality was not provided. Because of this, ANOVA can not be performed. Instead of ANOVA, Non-parametric Kruskal was used to see the differences between median heart rates in each age group.

In [None]:
# One heart-rate sample per age range, then a Kruskal-Wallis test across them.
age_labels = ['30-39', '40-49', '50-59', '60-69', '70-79']
ar3, ar4, ar5, ar6, ar7 = [
    data_wo_na.loc[data_wo_na['agerange'] == age_label, 'heartRate']
    for age_label in age_labels
]
#perform Kruskal-Wallis Test
stats.kruskal(ar3, ar4, ar5, ar6, ar7)

Since the p-value of the Kruskal Wallis H Test is greater than 0.05, the null hypothesis cannot be rejected. This means heart rates don't change by age groups.

In [None]:
#SYSTOLIC BLOOD PRESSURE
# Systolic blood pressure distribution per age range (imputed frame).
sns.catplot(
    data=data_wo_na,
    x="agerange",
    y="sysBP",
    kind="box",
    aspect=1.5,
    palette="viridis",
)

Median of systolic blood pressure, interquartile ranges, minimum and maximum values are getting higher until the group 70-79 except for the 50-59's min value. The distributions of all groups look symmetric.

In [None]:
# One systolic-blood-pressure sample per age range, then a Kruskal-Wallis test across them.
age_labels = ['30-39', '40-49', '50-59', '60-69', '70-79']
ar3, ar4, ar5, ar6, ar7 = [
    data_wo_na.loc[data_wo_na['agerange'] == age_label, 'sysBP']
    for age_label in age_labels
]
#perform Kruskal-Wallis Test
stats.kruskal(ar3, ar4, ar5, ar6, ar7)

Since the p-value of the Kruskal Wallis H Test is smaller than 0.05, the null hypothesis can be rejected. This means systolic blood pressure differ by age groups.

In [None]:
# Glucose distribution per age range (imputed frame).
sns.catplot(
    data=data_wo_na,
    x="agerange",
    y="glucose",
    kind="box",
    aspect=1.5,
    palette="viridis",
)

Almost all descriptive statistics are equal for each group.

In [None]:
# Histogram of glucose in the imputed frame, used to eyeball normality.
plt.hist(data_wo_na['glucose'])

By the above histogram, it can be seen that glucose is not normal. So the assumption of ANOVA, normality was not provided. Because of this, ANOVA can not be performed. Instead of ANOVA, Non-parametric Kruskal was used to see the differences between median glucose in each age group.

In [None]:
# One glucose sample per age range, then a Kruskal-Wallis test across them.
age_labels = ['30-39', '40-49', '50-59', '60-69', '70-79']
ar3, ar4, ar5, ar6, ar7 = [
    data_wo_na.loc[data_wo_na['agerange'] == age_label, 'glucose']
    for age_label in age_labels
]
#perform Kruskal-Wallis Test
stats.kruskal(ar3, ar4, ar5, ar6, ar7)

Since the p-value of the Kruskal Wallis H Test is smaller than 0.05, the null hypothesis can be rejected. This means glucose differ by age groups.

**PRE-PROCESSING**

In [None]:
# Plot styling for the sections that follow.
plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# Rename the target to `y` and drop the helper range columns created for the EDA.
data_wo_na = (
    data_wo_na
    .rename(columns={'TenYearCHD': 'y'})
    .drop(columns=['agerange', 'cigrange1'])
)

# One indicator column per education level.
education = pd.get_dummies(data_wo_na['education'])
education.columns = ['education_1', 'education_2', 'education_3', 'education_4']
education

In [None]:
# Append the education indicator columns and drop the original categorical column.
data_wo_na = (
    pd.concat([data_wo_na, education], axis=1)
    .drop(columns='education')
)
data_wo_na

In [None]:
#OUTLIER DETECTION
# Split the frame into the continuous measurements (datanum) and everything else (datacat).
outlier_cols = ['cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
datanum = data_wo_na[outlier_cols].copy()
datacat = data_wo_na.drop(columns=outlier_cols).copy()

# Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
Q1 = datanum.quantile(0.25)
Q3 = datanum.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only rows with no outlying value in any of the measurement columns.
is_outlier = (datanum < lower_fence) | (datanum > upper_fence)
datanum = datanum[~is_outlier.any(axis=1)]
datanum.shape

In [None]:
#Descriptive statistics after deleting outliers
# Rows filtered out of `datanum` come back as NaN after the index-aligned concat,
# so dropna() removes them from the combined frame as well.
datanumcat = pd.concat([datacat, datanum], axis=1).dropna()
summary_cols = ['age', 'cigsPerDay', 'totChol', 'sysBP',
                'diaBP', 'BMI', 'heartRate', 'glucose']
datanumcat[summary_cols].describe()

In [None]:
#MULTICOLLINEARITY CHECK
# Correlation matrix of the continuous predictors, computed once and reused
# for the cell values and for both sets of tick labels.
corr_matrix = datanumcat[['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']].corr()

plt.figure(figsize=(6,5), dpi= 80)
sns.heatmap(corr_matrix, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns, cmap="viridis", center=0, annot=True)

# Decorations
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

There is a strong correlation between systolic and diastolic blood pressure, while all other pairs of variables are only weakly correlated.

In [None]:
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X, add_intercept=False):
    """Return the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric predictors, one column per variable.
    add_intercept : bool, default False
        If True, a column of ones is prepended to the design matrix before
        the VIFs are computed (the intercept itself is not reported).
        ``variance_inflation_factor`` works on the matrix exactly as given,
        so without an intercept column the values are inflated for variables
        whose mean is far from zero. The default keeps the previous
        behaviour (no intercept column).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variables`` and ``VIF``, with one row per column of ``X``.
    """
    design = X.values
    first_col = 0
    if add_intercept:
        design = np.column_stack([np.ones(len(X)), design])
        first_col = 1  # skip the intercept when indexing the predictors

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(design, first_col + i) for i in range(X.shape[1])]

    return(vif)
# TODO (translated from the original note): the VIF values before standardisation should also be shown here.
vif_columns = ['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
vif_data = datanumcat[vif_columns].copy()
calc_vif(vif_data)

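To check for multicollinearity, the VIF values of the variables were computed. As a rule of thumb, a VIF should stay below 5 or 10; in this analysis the threshold was set to 10. Since the VIF values in the table above are well above that threshold, scaling or another remedy has to be applied. (Note that these VIFs are computed without an intercept column, which by itself inflates the values for variables whose mean is far from zero.)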

In [None]:
#SCALING
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled to the [0, 1] range.
scaled_cols = ['totChol','sysBP','diaBP','BMI','heartRate','glucose']

# define min max scaler
scaler = MinMaxScaler()

# transform data
# fit_transform returns a bare array. The frame must be rebuilt with the
# ORIGINAL index: `datanumcat` has gaps in its index after outlier removal,
# and the index-aligned concat below would otherwise attach scaled values to
# the wrong rows and create NaN rows.
datanum2 = pd.DataFrame(
    scaler.fit_transform(datanumcat[scaled_cols]),
    columns=scaled_cols,
    index=datanumcat.index,
)
print(datanum2)

datacat2 = datanumcat.drop(columns=scaled_cols).copy()
data_log = pd.concat([datanum2, datacat2], axis=1)

In [None]:
# Drop any row that still contains a missing value, then recompute the VIFs
# on the scaled continuous columns.
data_log = data_log.dropna()
# TODO (translated from the original note): the VIF values before standardisation should also be shown here.
scaled_vif_columns = ['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
vif_data = data_log[scaled_vif_columns].copy()
calc_vif(vif_data)

In [None]:
#After deleting diaBP
# TODO (translated from the original note): the VIF values before standardisation should also be shown here.
reduced_vif_columns = ['totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']
vif_data = data_log[reduced_vif_columns].copy()
calc_vif(vif_data)

In [None]:
# After scaling and eliminating diaBP, all VIF values are below 10, so the
# remaining variables are treated as free of multicollinearity.
data_log = data_log.drop('diaBP', axis=1)

In [None]:
# Work on a copy so that `data_log` itself stays untouched.
data_final = data_log.copy()

# X: every column except the target; y: the target kept as a one-column frame.
X = data_final.drop(columns='y')
y = data_final[['y']]

In [None]:
#OVERSAMPLING FOR IMBALANCED DATA
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 

# sampling_strategy=1 resamples the minority class up to a 1:1 ratio.
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
X, y = ros.fit_resample(X, y)

# Count the class labels themselves. Iterating a one-column DataFrame yields
# its column name, so Counter(y) would report {'y': 1} instead of the class
# sizes. np.ravel works for both a DataFrame and a plain array.
print('Resampled dataset shape %s' % Counter(np.ravel(y).tolist()))

**NAIVE BAYES**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Features and target taken from the un-resampled table.
X = data_log.drop('y', axis=1).copy()
y = data_log['y'].copy()

# Split BEFORE oversampling: resampling first would put copies of the same
# minority row into both the training and the test set and inflate every
# test metric. Only the training fold is balanced; the test fold keeps the
# original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,stratify=y)
X_train, y_train = ros.fit_resample(X_train, y_train)
# Restore column names in case the resampler handed back a plain array.
X_train = pd.DataFrame(X_train, columns=X.columns)

#param_grid_nb = {
#    'var_smoothing': np.logspace(0,-9, num=100)
#}
#grid_search= GridSearchCV(GaussianNB(), param_grid_nb,cv=7)
#grid_search.fit(X_train,y_train)
#grid_search.best_params_
modelnb=GaussianNB(var_smoothing=0.0005336699231206307)
modelnb.fit(X_train,y_train)
y2_modelnb = modelnb.predict(X_test)
accuracy_score(y_test, y2_modelnb)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Confusion matrix and per-class report for the Naive Bayes predictions.
cf = confusion_matrix(y_test, y2_modelnb)
print(classification_report(y_test, y2_modelnb))

# Each heat-map cell shows its name, its absolute count and its share of all
# test rows, in the flattened order of `cf`.
group_names = ["True Negative","False Positive","False Negative","True Positive"]
flat_cells = cf.flatten()
group_counts = [f"{value:0.0f}" for value in flat_cells]
group_percentages = [f"{value:.2%}" for value in flat_cells/np.sum(cf)]
labels = np.asarray(
    [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)
sns.heatmap(cf, annot=labels, fmt="", cmap=palette)

In [None]:
# One-row summary of the Naive Bayes scores for the comparison table at the end.
perf_nb = pd.DataFrame(
    {
        'Train_Score': modelnb.score(X_train, y_train),
        "Test_Score": modelnb.score(X_test, y_test),
        "Precision_Score": precision_score(y_test, y2_modelnb),
        "Recall_Score": recall_score(y_test, y2_modelnb),
        "F1_Score": f1_score(y_test, y2_modelnb),
    },
    index=["Naives Bayes"],
)

**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Features and target taken from the un-resampled table.
X = data_log.drop('y', axis=1).copy()
y = data_log['y'].copy()

# Split BEFORE oversampling so that duplicated minority rows cannot appear in
# both the training and the test set. This matters especially for a
# nearest-neighbour model, which would simply look the copies up. Only the
# training fold is balanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,stratify=y)
X_train, y_train = ros.fit_resample(X_train, y_train)
# Restore column names in case the resampler handed back a plain array.
X_train = pd.DataFrame(X_train, columns=X.columns)

#from sklearn.model_selection import GridSearchCV

#gridSearchParameters = {'n_neighbors' : [i for i in range(3,10,2)],
#                        'weights' : ['uniform', 'distance'],
#                        'metric' : ['euclidean','manhattan','minkowski','hamming']
#                        }

#grid = GridSearchCV(KNeighborsClassifier(), gridSearchParameters, cv=7)
#grid.fit(X_train,y_train)
#grid.best_params_

modelknn=KNeighborsClassifier(metric= 'hamming', n_neighbors= 3, weights= 'distance')
modelknn.fit(X_train,y_train)
y2_modelknn = modelknn.predict(X_test)
accuracy_score(y_test, y2_modelknn)

In [None]:
# Confusion matrix and per-class report for the KNN predictions.
cf = confusion_matrix(y_test, y2_modelknn)
print(classification_report(y_test, y2_modelknn))

# Cell annotation: name, absolute count and share of all test rows.
group_names = ["True Negative","False Positive","False Negative","True Positive"]
flat_cells = cf.flatten()
group_counts = [f"{value:0.0f}" for value in flat_cells]
group_percentages = [f"{value:.2%}" for value in flat_cells/np.sum(cf)]
labels = np.asarray(
    [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)
sns.heatmap(cf, annot=labels, fmt="", cmap=palette)

# One-row summary of the KNN scores for the comparison table at the end.
perf_knn = pd.DataFrame(
    {
        'Train_Score': modelknn.score(X_train, y_train),
        "Test_Score": modelknn.score(X_test, y_test),
        "Precision_Score": precision_score(y_test, y2_modelknn),
        "Recall_Score": recall_score(y_test, y2_modelknn),
        "F1_Score": f1_score(y_test, y2_modelknn),
    },
    index=["KNN"],
)

**LOGISTIC REGRESSION**

In [None]:
# Rows per class of the target before any resampling.
data_log.loc[:, 'y'].value_counts()

In [None]:
# Class shares of the target, derived from boolean masks instead of filtered frames.
count_no_risk = int((data_log['y'] == 0).sum())
count_risk = int((data_log['y'] == 1).sum())
total_rows = count_no_risk + count_risk

pct_of_no_risk = count_no_risk / total_rows
print("percentage of no risk", pct_of_no_risk*100)
pct_of_risk = count_risk / total_rows
print("percentage of risk", pct_of_risk*100)

In [None]:
# Work on a copy so that `data_log` itself stays untouched.
data_final = data_log.copy()

# X: every column except the target; y: the target kept as a one-column frame.
X = data_final.drop(columns='y')
y = data_final[['y']]

In [None]:
#OVERSAMPLING FOR IMBALANCED DATA
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 

# sampling_strategy=1 resamples the minority class up to a 1:1 ratio.
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
X, y = ros.fit_resample(X, y)

# Count the class labels themselves. Iterating a one-column DataFrame yields
# its column name, so Counter(y) would report {'y': 1} instead of the class
# sizes. np.ravel works for both a DataFrame and a plain array.
print('Resampled dataset shape %s' % Counter(np.ravel(y).tolist()))

In [None]:
# Column order expected for the resampled feature matrix.
logit_columns = ['totChol', 'sysBP', 'BMI', 'heartRate', 'glucose', 'male', 'age',
       'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp',
       'diabetes', 'education_1', 'education_2', 'education_3', 'education_4',
       'cigsPerDay']

# Rebuild X as a labelled frame and cast every column to 'category'.
X = pd.DataFrame(X, columns=logit_columns)
X[logit_columns] = X[logit_columns].astype('category')

# Stratified 75/25 split of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
pd.DataFrame(y).value_counts()

In [None]:
# Separate feature/target pair for the logistic model, taken from `data_final`.
X_for_logistic = data_final.drop(columns='y')
y_for_logistic = data_final[['y']]

# Balance the classes, then rebuild a labelled frame in the expected column order.
X_for_logistic, y_for_logistic = ros.fit_resample(X_for_logistic, y_for_logistic)
logistic_columns = ['totChol', 'sysBP', 'BMI', 'heartRate', 'glucose', 'male', 'age',
       'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp',
       'diabetes', 'education_1', 'education_2', 'education_3', 'education_4',
       'cigsPerDay']
X_for_logistic = pd.DataFrame(X_for_logistic)
X_for_logistic.columns = logistic_columns
X_for_logistic[logistic_columns] = X_for_logistic[logistic_columns].astype('category')

# Stratified 75/25 split kept under the *_lg names.
X_train_lg, X_test_lg, y_train_lg, y_test_lg = train_test_split(
    X_for_logistic, y_for_logistic, test_size=0.25, random_state=42, stratify=y_for_logistic
)

In [None]:
# Training fold repackaged as `os_data_X` / `os_data_y` for the statsmodels
# cells below; the target column is explicitly named 'y'.
columns = X_train.columns
os_data_X = pd.DataFrame(X_train, columns=columns)

os_data_y = pd.DataFrame(y_train)
os_data_y.columns = ['y']

# Class sizes inside the training fold.
os_data_y.value_counts()

In [None]:
# Size and class balance of the oversampled training fold. The target is the
# 10-year CHD risk flag (0 = no risk, 1 = risk), so the messages use the same
# wording as the "percentage of no risk / risk" prints earlier in the
# notebook instead of the unrelated "subscription" labels.
n_rows = len(os_data_X)
n_no_risk = len(os_data_y[os_data_y['y']==0])
n_risk = len(os_data_y[os_data_y['y']==1])

print("length of oversampled data is ",n_rows)
print("Number of no risk in oversampled data",n_no_risk)
print("Number of risk",n_risk)
print("Proportion of no risk data in oversampled data is ",n_no_risk/n_rows)
print("Proportion of risk data in oversampled data is ",n_risk/n_rows)

In [None]:
#from sklearn.model_selection import GridSearchCV
#grid= dict(solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#                      C= [0.001,0.01,0.1,1,10,100,1000],
#                      penalty= ['none', 'l1', 'l2', 'elasticnet'])# l1 lasso l2 ridge
#logreg=LogisticRegression()
#logreg_cv=GridSearchCV(logreg,grid,cv=10)
#logreg_cv.fit(X_train_lg,y_train_lg)
#print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
#print("accuracy :",logreg_cv.best_score_)

#By the GridSearchCV method, the best hyperparameters were found and they were used in the logistic model.
data_final_vars=data_final.columns.values.tolist()
# Plain name lists; kept under their own names so the X / y data objects of
# the previous cells are not overwritten by lists.
target_cols=['y']
feature_cols=[i for i in data_final_vars if i not in target_cols]
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C= 0.1, penalty= 'l1', solver= 'saga')
# n_features_to_select must be passed by keyword: newer scikit-learn releases
# no longer accept it positionally.
# NOTE(review): the feature lists in this notebook name 17 columns, so asking
# for 20 eliminates nothing - confirm the intended number.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

In [None]:
# Logit on all features of the training fold; the summary lists the
# coefficient p-values used for the elimination in the next cell.
X = os_data_X
y = os_data_y['y']
import statsmodels.api as sm

endog = np.asarray(y)
exog = X.astype(float)
logit_model = sm.Logit(endog, exog)
result = logit_model.fit()
print(result.summary2())

In [None]:
#Since there are variables that have p-values greater than 0.05, there should be an elimination among independent variables.
eliminated_cols = ['BPMeds','diabetes','sysBP','totChol','currentSmoker','prevalentStroke','heartRate','glucose']
X = os_data_X.drop(columns=eliminated_cols).copy()
y = os_data_y['y']

# Refit the logit on the reduced feature set and print its summary.
endog = np.asarray(y)
exog = X.astype(float)
logit_model = sm.Logit(endog, exog)
result = logit_model.fit()
print(result.summary2())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Start from the un-resampled table and split BEFORE oversampling, so that
# copies of a minority row cannot end up in both the training and the test
# set. Only the training fold is balanced.
lr_features = data_log.drop(columns='y')
lr_target = data_log['y']
X_train, X_test, y_train, y_test = train_test_split(lr_features, lr_target, test_size=0.25, random_state=42,stratify=lr_target)
X_train, y_train = ros.fit_resample(X_train, y_train)
# Restore column names in case the resampler handed back a plain array.
X_train = pd.DataFrame(X_train, columns=lr_features.columns)

# NOTE(review): the model is fitted on all features, not on the reduced set
# selected from the p-values above - confirm that this is intended.
logreg = LogisticRegression(C= 0.1, penalty= 'l1', solver= 'saga')
# The target is passed as a 1-d vector, not as a one-column frame.
logreg.fit(X_train, np.ravel(y_train))
y_pred = logreg.predict(X_test)
from sklearn.metrics import confusion_matrix
cf = confusion_matrix(y_test, y_pred)
print(cf)

In [None]:
from sklearn.metrics import classification_report

# Keep the report as a dict in `l_cr` and print the text version.
l_cr = classification_report(y_test, y_pred, output_dict=True)
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# One-row summary of the logistic-regression scores for the comparison table at the end.
perf_lr = pd.DataFrame(
    {
        'Train_Score': logreg.score(X_train, y_train),
        "Test_Score": logreg.score(X_test, y_test),
        "Precision_Score": precision_score(y_test, y_pred),
        "Recall_Score": recall_score(y_test, y_pred),
        "F1_Score": f1_score(y_test, y_pred),
    },
    index=["Logistic Regression"],
)

**DECISION TREE**

In [None]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
X = data_log.drop(columns='y') # Features
y = data_log['y'] # Target variable

# Split BEFORE oversampling so that duplicated minority rows cannot appear in
# both the training and the test set. Only the training fold is balanced;
# the test fold keeps the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,stratify=y)
X_train, y_train = ros.fit_resample(X_train, y_train)
# Restore column names in case the resampler handed back a plain array.
X_train = pd.DataFrame(X_train, columns=X.columns)

In [None]:
#param_dict={"criterion" : ['gini', 'entropy'], "max_depth":range(1,10),"min_samples_split":range(1,10),"min_samples_leaf":range(1,5)   }
#clf_GS = GridSearchCV( DecisionTreeClassifier(),param_grid= param_dict,cv=10,verbose=1,n_jobs=-1)
#clf_GS.fit(X_train, y_train)
#clf_GS.best_params_
#clf_GS.best_estimator_

# Create Decision Tree classifer object
# Only the tuned, non-default settings are passed. The pasted estimator repr
# also listed `min_impurity_split` and `presort`, which no longer exist in
# current scikit-learn and make the constructor raise a TypeError.
clf = DecisionTreeClassifier(criterion='gini', max_depth=9, min_samples_split=3)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

dt_cr=classification_report(y_test, y_pred,output_dict=True)

In [None]:
# One-row summary of the decision-tree scores for the comparison table at the end.
perf_dt = pd.DataFrame(
    {
        'Train_Score': clf.score(X_train, y_train),
        "Test_Score": clf.score(X_test, y_test),
        "Precision_Score": precision_score(y_test, y_pred),
        "Recall_Score": recall_score(y_test, y_pred),
        "F1_Score": f1_score(y_test, y_pred),
    },
    index=["Decision Tree"],
)

# Confusion matrix annotated with name, absolute count and share per cell.
cf = confusion_matrix(y_test, y_pred)
group_names = ["True Negative","False Positive","False Negative","True Positive"]
flat_cells = cf.flatten()
group_counts = [f"{value:0.0f}" for value in flat_cells]
group_percentages = [f"{value:.2%}" for value in flat_cells/np.sum(cf)]
labels = np.asarray(
    [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)
sns.heatmap(cf, annot=labels, fmt="", cmap=palette)
print(classification_report(y_test, y_pred))

**RANDOM FOREST**

In [None]:
# Features and target taken from the un-resampled table.
X = data_log.drop('y', axis=1)
y = data_log['y'].copy()

# Split BEFORE oversampling so that duplicated minority rows cannot appear in
# both the training and the test set. Only the training fold is balanced;
# the test fold keeps the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,stratify=y)
X_train, y_train = ros.fit_resample(X_train, y_train)
# Restore column names in case the resampler handed back a plain array.
X_train = pd.DataFrame(X_train, columns=X.columns)

In [None]:
#from sklearn.model_selection import RandomizedSearchCV

#number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

#number of features at every split
#max_features = ['auto', 'sqrt']

#max depth
#max_depth = [int(x) for x in np.linspace(100, 500, num = 11)] max_depth.append(None)

#create random grid
#random_grid = { 'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth }

#Random search of parameters
#rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

#Fit the model
#rfc_random.fit(X_train, y_train)

#print results
#print(rfc_random.best_params_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline

rfc = RandomForestClassifier(n_estimators= 200, max_features= 'sqrt', max_depth= None)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)

# Cross-validate on the un-resampled table with the oversampler inside the
# pipeline, so each fold is balanced on its training part only. Scoring the
# forest on data that was oversampled beforehand would place copies of the
# same row in both the training and the validation part of a fold.
cv_features = data_log.drop(columns='y')
cv_target = data_log['y']
rfc_cv_score = cross_val_score(make_pipeline(ros, rfc), cv_features, cv_target, cv=10, scoring='roc_auc')

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

In [None]:
# Importances are ordered like the columns the forest was trained on, so the
# bar labels must come from the training frame. `data_log.columns` still
# contains the target `y`, which shifts the labels of every feature listed
# after it.
sorted_idx = rfc.feature_importances_.argsort()
rf_feature_names = X_train.columns
plt.barh(rf_feature_names[sorted_idx], rfc.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

The five most important features are, in order, age, body mass index, total cholesterol, systolic blood pressure, and glucose.

In [None]:
# Confusion matrix of the random-forest predictions, annotated with name,
# absolute count and share per cell.
cf = confusion_matrix(y_test, rfc_predict)
group_names = ["True Negative","False Positive","False Negative","True Positive"]
flat_cells = cf.flatten()
group_counts = [f"{value:0.0f}" for value in flat_cells]
group_percentages = [f"{value:.2%}" for value in flat_cells/np.sum(cf)]
labels = np.asarray(
    [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)
sns.heatmap(cf, annot=labels, fmt="", cmap=palette)

# One-row summary of the random-forest scores for the comparison table at the end.
perf_rf = pd.DataFrame(
    {
        'Train_Score': rfc.score(X_train, y_train),
        "Test_Score": rfc.score(X_test, y_test),
        "Precision_Score": precision_score(y_test, rfc_predict),
        "Recall_Score": recall_score(y_test, rfc_predict),
        "F1_Score": f1_score(y_test, rfc_predict),
    },
    index=["Random Forest"],
)

**SUPPORT VECTOR MACHINE**

In [None]:
from sklearn.svm import SVC

# Features and target taken from the un-resampled table.
X = data_log.drop('y', axis=1).copy()
y = data_log['y'].copy()

# Split BEFORE oversampling so that duplicated minority rows cannot appear in
# both the training and the test set. Only the training fold is balanced;
# the test fold keeps the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,stratify=y)
X_train, y_train = ros.fit_resample(X_train, y_train)
# Restore column names in case the resampler handed back a plain array.
X_train = pd.DataFrame(X_train, columns=X.columns)

# defining parameter range
#param_grid = {'C': [0.1, 1, 10, 100, 1000], 
#              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#              'kernel': ['rbf']} 
  
#grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
#grid.fit(X_train, y_train)

svclassifier = SVC(kernel='rbf',C=100,gamma=1)
svclassifier.fit(X_train, y_train)

In [None]:
# Predictions of the fitted SVM on the held-out fold.
y_pred = svclassifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
cf = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

# One-row summary of the SVM scores for the comparison table at the end.
perf_svm = pd.DataFrame(
    {
        'Train_Score': svclassifier.score(X_train, y_train),
        "Test_Score": svclassifier.score(X_test, y_test),
        "Precision_Score": precision_score(y_test, y_pred),
        "Recall_Score": recall_score(y_test, y_pred),
        "F1_Score": f1_score(y_test, y_pred),
    },
    index=["SVM"],
)

# Confusion matrix annotated with name, absolute count and share per cell.
group_names = ["True Negative","False Positive","False Negative","True Positive"]
flat_cells = cf.flatten()
group_counts = [f"{value:0.0f}" for value in flat_cells]
group_percentages = [f"{value:.2%}" for value in flat_cells/np.sum(cf)]
labels = np.asarray(
    [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)
sns.heatmap(cf, annot=labels, fmt="", cmap=palette)

In [None]:
# Stack the one-row score frames of all six models into one comparison table.
model_scores = [perf_nb, perf_lr, perf_dt, perf_rf, perf_knn, perf_svm]
A = pd.concat(model_scores)
A

As the table above shows, the most accurate algorithm is Random Forest, while the least accurate one is Naïve Bayes.

At first glance, the classification report and confusion matrix of the Random Forest may suggest overfitting, but the train and test scores in the table above show that there is no overfitting problem.


### 