Import libraries 

In [None]:
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, RidgeClassifier, Lasso, LassoCV, LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV, RandomizedSearchCV, \
    RepeatedStratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.formula.api import ols
import statsmodels.api as sm

In [None]:
from scipy.stats import chi2_contingency, spearmanr
from warnings import simplefilter

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)

# Read the CSV into a DataFrame and show the dtype pandas inferred per column.
DATA_PATH = "../input/heart-disease-prediction-using-logistic-regression/framingham.csv"
data = pd.read_csv(DATA_PATH)
print(data.dtypes)


In [None]:
# Store the coded (non-continuous) columns with pandas' categorical dtype;
# every other column keeps the dtype it was read with.
categorical_columns = [
    'male',
    'education',
    'currentSmoker',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
]
for column in categorical_columns:
    data[column] = data[column].astype('category')



In [None]:
# Print the number of missing values per column, one line per column.
columns_to_check = [
    'male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
    'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD',
]
for column in columns_to_check:
    missing_count = data[column].isnull().sum()
    print(f'number of missing values in {column}', missing_count)
########################################################################################################

In [None]:
# Split into train and test sets.
# `pop` removes the target column from `data` in place and returns it, so
# `data` holds only the predictors from here on.
ydata = data.pop('TenYearCHD')
# Seeds Python's `random` module; the split itself is seeded by `random_state`.
random.seed(123)
# Hold out 30% of the rows: train/test are predictor frames, y/ty their targets.
train, test, y, ty = train_test_split(
    data,
    ydata,
    test_size=0.3,
    random_state=123,
)

In [None]:

# Plot the distribution of every predictor in the training set, then the target.
simplefilter(action='ignore', category=FutureWarning)


def _show_counts(frame, column, title):
    """Draw and display a bar chart of the value counts of one column."""
    sns.countplot(x=column, data=frame)
    plt.title(title)
    plt.show()


def _show_histogram(frame, column, **hist_options):
    """Draw and display a 25-bin histogram of one column on its own figure."""
    fig, axs = plt.subplots(1, 1, tight_layout=True)
    axs.hist(frame[column], bins=25, **hist_options)
    plt.title('Distribution of ' + column)
    plt.show()


_show_counts(train, 'male', "Distribution of Male")

# Age is shown twice: as a box plot and as a density-normalised histogram.
sns.boxplot(x=train['age'])
plt.title('Distribution of age')
plt.show()
_show_histogram(train, 'age', density=True)

# Remaining categorical predictors, each with its own title.
count_plots = [
    ('education', "Distribution of Education"),
    ('currentSmoker', "Distribution of currentSmoker"),
    ('BPMeds', "Distribution of BPMeds"),
    ('prevalentStroke', "Distribution of prevalentStroke"),
    ('prevalentHyp', "Distribution of prevalentHyp"),
    ('diabetes', "Distribution of diabetes"),
]
for column, title in count_plots:
    _show_counts(train, column, title)

# Remaining continuous predictors as raw-count histograms.
for column in ['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']:
    _show_histogram(train, column)

# Attach the target to the frame so it can be plotted the same way.
train['TenYearCHD'] = y
_show_counts(train, 'TenYearCHD', "Distribution of TenYearCHD")


In [None]:
# Take the target column back out of the predictor frame.
train = train.drop(columns=['TenYearCHD'])

In [None]:
# Impute missing values in the training set.

# Fill gaps in the education column only, using level 1. The fill must be
# taken from that column: assigning a whole filled DataFrame to a single
# column is not a column-wise imputation.
train['education'] = train['education'].fillna(1)

# KNN-impute each continuous column independently (the imputer is refitted
# on one column at a time). Results are written back in place, so the
# columns of `train` keep their original order.
imputer = KNNImputer(n_neighbors=2)
for column in ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']:
    # fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column.
    train[column] = imputer.fit_transform(train[[column]]).ravel()



In [None]:
# EDA
# Chi-square test of independence between each categorical predictor and the
# target, judged at the 5% significance level.
print("relationships between independent variables and dependent varaible")
print("chisqaure tests for dependent variables")

alpha = 0.05
categorical_predictors = [
    'education',
    'male',
    'currentSmoker',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
]
for predictor in categorical_predictors:
    # Cross-tabulate predictor levels against the outcome and test the table.
    contingency = pd.crosstab(train[predictor], y)
    print(contingency)
    stat, p, dof, expected = chi2_contingency(contingency)
    verdict = 'are  related' if p <= alpha else 'are not related'
    print(f'{predictor} and TenYearCHD {verdict}')

In [None]:

# Compare each continuous predictor between the two outcome classes.
print("For continous variables")

# The target is attached temporarily so seaborn can group by it.
train['TenYearCHD'] = y
sns.catplot(x='TenYearCHD', y='cigsPerDay', hue='TenYearCHD', kind='box', data=train)

# cigsPerDay additionally gets one histogram per outcome class.
outcome = train['TenYearCHD']
zeroClass = train[outcome == 0]
oneClass = train[outcome == 1]
for subset, label in ((zeroClass, 'zero'), (oneClass, 'one')):
    fig, axs = plt.subplots(1, 1, tight_layout=True)
    axs.hist(subset['cigsPerDay'], bins=25)
    plt.title(f'Distribution of cigsPerDay and TenYearCHD is {label} class')

# Box plots for the remaining continuous predictors.
for column in ('totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose'):
    sns.catplot(x='TenYearCHD', y=column, hue='TenYearCHD', kind='box', data=train)

# Detach the target again, then display every figure created above.
train = train.drop(columns='TenYearCHD')
plt.show()

# Scatter plot of totChol against sysBP with a fitted straight line in red.
pressure = train['sysBP']
cholesterol = train['totChol']
plt.scatter(pressure, cholesterol)
plt.title('Scatter plot between sysBP and totChol')
slope, intercept = np.polyfit(pressure, cholesterol, 1)
plt.plot(pressure, slope * pressure + intercept, color='red')
plt.show()


In [None]:


##########################################################################################################
# Remove outliers from the training data.
#
# The filters are applied to `train` (with `y` kept in step) because the
# train/test split has already been made: filtering `data` at this point
# would not change the rows that the models are fitted on.


def _inside_whiskers(values):
    """Return a boolean mask of the values strictly inside the 1.5*IQR whiskers."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lowest_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Missing values compare as False on both sides, so they are masked out too.
    return (values < upper_whisker) & (values > lowest_whisker)


# Filters are applied one after another, so each column's quartiles are
# computed on the rows that survived the previous filters.
for column in ['sysBP', 'diaBP', 'age', 'totChol']:
    train = train[_inside_whiskers(train[column])]

# Glucose uses a fixed upper cut-off instead of the IQR rule.
train = train[train['glucose'] < 200]

for column in ['cigsPerDay', 'heartRate']:
    train = train[_inside_whiskers(train[column])]

# Keep the target aligned with the rows that remain in `train`.
y = y.loc[train.index]


In [None]:

# Distribution of the dependent variable before any resampling.
train['TenYearCHD'] = y
sns.countplot(x='TenYearCHD', data=train)
plt.title('Counts of TenYearCHD before subsampling')
plt.show()

# The dependent variable is unbalanced, so build an undersampled and an
# oversampled version of the training set.
# Class sizes are looked up by label rather than by the order in which
# value_counts() happens to return them.
class_counts = train['TenYearCHD'].value_counts()
class_0_count = class_counts.loc[0]
class_1_count = class_counts.loc[1]
class_0 = train[train['TenYearCHD'] == 0]
class_1 = train[train['TenYearCHD'] == 1]

print("class 0 distribution", class_0.shape)
print("class 1 distribution", class_1.shape)

print(train.shape)

# Undersampling: shrink class 0 to the size of class 1 (without replacement).
# A fixed random_state makes the drawn rows reproducible between runs.
class_0_under = class_0.sample(n=class_1_count, replace=False, random_state=123)
underSample = pd.concat([class_1, class_0_under], axis=0)

sns.countplot(x='TenYearCHD', data=underSample)
plt.title('TenYearCHD after undersampling')
plt.show()

# Oversampling: grow class 1 to the size of class 0 by drawing with replacement.
class_1_over = class_1.sample(n=class_0_count, replace=True, random_state=123)
overSample = pd.concat([class_0, class_1_over], axis=0)
sns.countplot(x='TenYearCHD', data=overSample)
plt.title('TenYearCHD after oversampling')
plt.show()


In [None]:

# Continue with the oversampled training set (swap in `underSample` to use
# the undersampled one instead) and split it back into target and predictors.
y = overSample['TenYearCHD']
train = overSample.drop(columns=['TenYearCHD'])


In [None]:

# Feature engineering: replace the two blood-pressure columns with their
# difference, pulsePressure = sysBP - diaBP, in both the train and test sets.
def _with_pulse_pressure(frame):
    """Return `frame` with a pulsePressure column appended and sysBP/diaBP removed."""
    pulse = frame['sysBP'] - frame['diaBP']
    return frame.assign(pulsePressure=pulse).drop(columns=['sysBP', 'diaBP'])


train = _with_pulse_pressure(train)
test = _with_pulse_pressure(test)




In [None]:

# Show how many training rows fall into each class of the target.
print("distribution of TenYearCHD in train set")
target_counts = y.value_counts()
print(target_counts)

# Summary statistics of the training predictors.
summary = train.describe()
print(summary)




In [None]:

# Check the correlation among the independent variables.
# Only the quantitative columns are correlated: they are selected explicitly
# so the categorical columns are left out regardless of how `corr()` treats
# non-numeric dtypes.
numeric_train = train.select_dtypes(include='number')
corMat = numeric_train.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(15, 10), facecolor='w')
sns.heatmap(corMat, xticklabels=corMat.columns, yticklabels=corMat.columns, annot=True)
plt.title("Correlation Matrix", size=20)
plt.show()


In [None]:

# Chi-square tests of independence between pairs of categorical predictors,
# each judged at the 5% significance level.
alpha = 0.05
categorical_pairs = [
    ('male', 'BPMeds'),
    ('male', 'prevalentHyp'),
    ('male', 'prevalentStroke'),
    ('male', 'diabetes'),
    ('BPMeds', 'prevalentHyp'),
    ('BPMeds', 'prevalentStroke'),
    ('BPMeds', 'diabetes'),
    ('prevalentHyp', 'diabetes'),
    ('prevalentHyp', 'prevalentStroke'),
]
for first, second in categorical_pairs:
    # Cross-tabulate the two predictors and test the resulting table.
    contingency = pd.crosstab(train[first], train[second])
    print(contingency)
    stat, p, dof, expected = chi2_contingency(contingency)
    if p <= alpha:
        print(f'{first} and {second} are related')
    else:
        print(f'{first} and {second} are not related')

# Pairwise relationships between the continuous predictors.
def _scatter_with_trend(x_name, y_name):
    """Scatter two `train` columns and overlay a fitted straight line in red."""
    xs = train[x_name]
    ys = train[y_name]
    plt.scatter(xs, ys)
    plt.title(f'Scatter plot between {x_name} and {y_name}')
    slope, intercept = np.polyfit(xs, ys, 1)
    plt.plot(xs, slope * xs + intercept, color='red')
    plt.show()


# Relationship between age and the other variables.
_scatter_with_trend('age', 'cigsPerDay')

# Chi-square test on the age x cigsPerDay cross-tabulation.
contingency = pd.crosstab(train['age'], train['cigsPerDay'])
print(contingency)
stat, p, dof, expected = chi2_contingency(contingency)
alpha = 0.05
if p <= alpha:
    print('age and cigsPerDay are related')
else:
    print('age and cigsPerDay are not related')

# ANOVA (type 2) of age across the diabetes groups.
model = ols('age ~ C(diabetes)', data=train).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Remaining pairs, in order: age, cigsPerDay, totChol, pulsePressure and
# heartRate, each against the variables that follow it.
trend_pairs = [
    ('age', 'pulsePressure'),
    ('age', 'heartRate'),
    ('age', 'glucose'),
    ('cigsPerDay', 'totChol'),
    ('cigsPerDay', 'pulsePressure'),
    ('cigsPerDay', 'heartRate'),
    ('cigsPerDay', 'glucose'),
    ('totChol', 'pulsePressure'),
    ('totChol', 'heartRate'),
    ('totChol', 'glucose'),
    ('pulsePressure', 'heartRate'),
    ('pulsePressure', 'glucose'),
    ('heartRate', 'glucose'),
]
for x_name, y_name in trend_pairs:
    _scatter_with_trend(x_name, y_name)

# Remaining missing values per column in the training set.
print('missinh')
print(train.isnull().sum())

In [None]:
# Drop rows that still contain a missing value. The target rides along as a
# temporary 'y' column so it loses exactly the same rows, and is then popped
# back out of the frame.
train = train.assign(y=y).dropna()
y = train.pop('y')

# Same treatment for the test set and its target.
test = test.assign(y=ty).dropna()
ty = test.pop('y')

In [None]:
# Missing values left per column in the training set.
train.isnull().sum()

In [None]:
# Missing values left per column in the test set.
test.isnull().sum()

In [None]:
########################################  Modeling ################################################
#ridge regression
from random import random

from sklearn.linear_model import LogisticRegression

# Ridge classifier baseline.
# The classifier that gets fitted is the one carrying the intended settings
# (alpha and random_state), not a separate default instance.
rc = RidgeClassifier(alpha=1, random_state=123)
rc.fit(train, y)

# Give the test frame exactly the column order used for fitting, so every
# feature lines up with the one the model saw under the same name.
test = test[train.columns]

score = rc.score(test, ty)
print("Accuracy of ridge", score)

In [None]:

import pprint
#using cross validation
#cv=cross_val_score(rc,train,y,cv=10)
#print("Average accuracy of cv is",cv.mean())

#plt.errorbar(train, y, yerr=0.8, fmt='o');

#lasso regression

#logistic regression
#model_logistic=sm.Logit(train,y)
#resultLogistic=model_logistic.fit()
# Logistic regression (liblinear solver).
logistic = LogisticRegression(random_state=1, solver='liblinear', max_iter=1000)
model_logistic = logistic.fit(train, y)
predicted_logistic = model_logistic.predict(test)
logistic_accuracy = model_logistic.score(test, ty)
print("Accuracy of logistic regression is", logistic_accuracy)

# k-nearest neighbours with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)
model_knn = knn.fit(train, y)
predicted_knn = model_knn.predict(test)
knn_accuracy = model_knn.score(test, ty)
print("Accuracy of knn is", knn_accuracy)

# Random forest with 1000 trees and a fixed seed.
rf = RandomForestClassifier(n_estimators=1000, random_state=123)
model_rf = rf.fit(train, y)
predicted_randomforest = model_rf.predict(test)
rf_accuracy = metrics.accuracy_score(ty, predicted_randomforest)
print("Accuracy of rf is", rf_accuracy)

# Parameter tuning for the random forest.
# Number of trees
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split ('auto' is not accepted by
# current scikit-learn releases; for classifiers it meant the same as 'sqrt')
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree; None leaves the depth unlimited
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)

# The randomized search itself is kept disabled.
# NOTE(review): as written it tunes a RandomForestRegressor and fits on the
# test split; switch to RandomForestClassifier and (train, y) before enabling.
#rfb = RandomForestRegressor()
#rf_random = RandomizedSearchCV(estimator = rfb, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(test, ty)
#print(rf_random.best_params_)

# Random forest with the chosen hyper-parameters, seeded for reproducibility.
rfb = RandomForestClassifier(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    bootstrap=True,
    random_state=123,
)
model_rf = rfb.fit(train, y)
# Predict with the tuned forest that was just fitted.
predicted_randomforestb = rfb.predict(test)
print("Accuracy of rf with best parameters is", metrics.accuracy_score(ty, predicted_randomforestb))

# Decision tree limited to depth 3, splitting on entropy.
dt = DecisionTreeClassifier(criterion="entropy", max_depth=3)
model_dt = dt.fit(train, y)
predicted_dt = model_dt.predict(test)
dt_accuracy = metrics.accuracy_score(ty, predicted_dt)
print("Accuracy of dt", dt_accuracy)

# Gradient boosting with default settings; NumPy's global random generator
# is seeded before the model is fitted.
np.random.seed(123)
gb = GradientBoostingClassifier()
model_gb = gb.fit(train, y)
predicted_gb = model_gb.predict(test)
gb_accuracy = metrics.accuracy_score(ty, predicted_gb)
print("Accuracy of gb", gb_accuracy)


# Variable selection: score every predictor against the target with the
# chi-square statistic (k='all' keeps all of them, so this only ranks).
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(train, y)
# One row per predictor: its name and its chi-square score.
featureScores = pd.DataFrame({'Specs': train.columns, 'Score': fit.scores_})
print(featureScores.nlargest(13, 'Score'))  # up to the 13 highest-scoring features

# Keep the selected predictors, in the same order for both sets.
selected_features = ['pulsePressure', 'age', 'glucose', 'cigsPerDay', 'totChol', 'prevalentHyp']
train = train[selected_features]
test = test[selected_features]

# Column maxima of the numeric predictors (a maximum is not defined for the
# unordered categorical column).
print(train.max(numeric_only=True))

# Ridge classifier refitted on the reduced feature set; the fitted instance
# is the one carrying the intended settings.
rc = RidgeClassifier(alpha=1, random_state=123)
rc.fit(train, y)
score = rc.score(test, ty)
print("Accuracy of ridge", score)

# Random forest (the `rf` estimator configured earlier) refitted on the
# reduced feature set.
model_rf = rf.fit(train, y)
predicted_randomforest = rf.predict(test)
print("Accuracy of rf with best features is", metrics.accuracy_score(ty, predicted_randomforest))


