I have implemented Logistic Regression, SVM and QDA on the dataset. 

# importing data

In [None]:
import pandas as pd 

In [None]:
# Read the Framingham heart-disease CSV from the notebook's input directory
# into the DataFrame used by every later cell.
csv_path = '../input/logistic-regression-heart-disease-prediction/framingham_heart_disease.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first few rows of the loaded DataFrame.
data.head()

In [None]:
# Show the column labels of the DataFrame.
data.columns

In [None]:
# Print the (rows, columns) dimensions of the DataFrame.
print(data.shape)

### a summary of statistics pertaining to the DataFrame columns

In [None]:
# Descriptive statistics for the DataFrame columns.
data.describe()

### checking for null values

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Total number of missing values in the data.
# `missing_data` holds the per-column counts; their sum is the overall total.

missing_data = data.isna().sum()
print(missing_data.sum())

In [None]:
#total percentage of missing data
#
# Both figures are computed directly from `data` rather than from the
# `missing_data` variable: that name is rebound to a summary DataFrame further
# down the notebook, so re-running this cell afterwards would otherwise yield a
# per-column Series instead of a single number.

n_rows = data.shape[0]

# Number of missing cells expressed relative to the number of rows.
total_percentage = (data.isnull().sum().sum()/n_rows) * 100
print(f'The total percentage of missing data is {round(total_percentage,2)}%')

# Share of rows that contain at least one missing value, i.e. the rows that a
# row-wise dropna removes. A row with several missing cells counts once here.
rows_with_missing = data.isnull().any(axis=1).sum()
print(f'Rows with at least one missing value: {round(rows_with_missing/n_rows*100,2)}%')

In [None]:
# Missing data per column: absolute count and percentage of rows, each sorted
# from most to least affected. Only columns that have gaps are kept.

null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent_total = (null_mask.sum() / null_mask.count()).sort_values(ascending=False) * 100

missing = pd.concat([total, percent_total], axis=1, keys=["Total", "Percentage"])
has_gaps = missing['Total'] > 0
missing_data = missing[has_gaps]
missing_data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.figure(figsize=(9,6))
sns.set(style="whitegrid")
sns.barplot(x=missing_data.index, y=missing_data['Percentage'], data = missing_data)
plt.title('Percentage of missing data by feature')
plt.xlabel('Features', fontsize=14)
plt.ylabel('Percentage', fontsize=14)
plt.show()

At 9.15%, the blood glucose feature has the highest percentage of missing data. The other features have very few missing entries.
Since the number of missing values amounts to only 15.22% of the number of rows, at most that share of the rows is incomplete, so we can drop these rows without losing a lot of data.

## Drop missing entries

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
# Preview the first few rows again, now that incomplete rows have been dropped.
data.head()

In [None]:
# Dimensions of the DataFrame after dropping incomplete rows.
print(data.shape)

In [None]:
# Re-count missing values per column to confirm none remain after the drop.
data.isnull().sum()

## data distribution

In [None]:
def draw_histograms(dataframe, features, rows, cols, figsize=(20, 20), bins=20):
    """Plot one histogram per feature on a ``rows`` x ``cols`` subplot grid.

    Args:
        dataframe: Table indexable by feature label (e.g. a pandas DataFrame).
        features: Iterable of column labels to plot, in subplot order.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        figsize: Figure size in inches, forwarded to ``plt.figure``.
        bins: Number of histogram bins used for every feature.

    Raises:
        ValueError: If the grid has fewer cells than there are features.
    """
    features = list(features)
    # Fail before any figure is created instead of part-way through plotting.
    if len(features) > rows * cols:
        raise ValueError(
            f"{len(features)} features do not fit in a {rows}x{cols} grid"
        )

    fig = plt.figure(figsize=figsize)
    for position, feature in enumerate(features, start=1):
        ax = fig.add_subplot(rows, cols, position)
        dataframe[feature].hist(bins=bins, ax=ax, facecolor='green')
        # f-string so that non-string column labels also work in the title.
        ax.set_title(f"{feature} Distribution", color='DarkRed')

    fig.tight_layout()
    plt.show()
        
# Histogram of every column, laid out on a 6 x 3 grid.
draw_histograms(dataframe=data, features=data.columns, rows=6, cols=3)

The data for prevalent stroke, diabetes, and blood pressure medication are poorly balanced.

In [None]:
#TenYearCHD distribution

import seaborn as sn

# A bare expression is only displayed when it is the last statement of a cell,
# so the class counts are printed explicitly to make them visible.
print(data.TenYearCHD.value_counts())
sn.countplot(x='TenYearCHD',data=data)
# Render the figure explicitly, as the other plotting cells do.
plt.show()

There are 3179 patients with no heart disease and 572 patients at risk of heart disease.
The data is not properly balanced, as the number of people without the disease greatly exceeds the number of people with the disease. The ratio is about 1:5.57.

In [None]:
#Number of people who have disease vs age

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

positive_cases = data[data['TenYearCHD'] == 1]
plt.figure(figsize=(15,6))
sns.countplot(x='age',data = positive_cases, hue = 'TenYearCHD', palette='husl')
plt.show()

The people with the highest risk of developing CHD are between the ages of 51 and 63.
The number of sick people generally increases with age.

# Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [None]:
#define the features
# Every column except the last is a predictor; the last column is the target
# (presumably TenYearCHD - confirm against data.columns).
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

# Seed both the forest and Boruta so the selection is reproducible between
# runs; 5 is the same seed value this notebook uses for its train/test split.
forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced', random_state=5)

# define Boruta feature selection method
# NOTE(review): with n_estimators='auto' Boruta chooses the number of trees
# itself, so the 1000 configured on the forest is presumably overridden.
feat_selector = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=5)

# find all relevant features
feat_selector.fit(X, y)

In [None]:
# Names of the features that Boruta marked as relevant (its `support_` mask),
# taken from the predictor columns (all columns except the last).
feature_names = data.columns[:-1]
most_important = feature_names[feat_selector.support_].tolist()
most_important

Age and systolic blood pressure are selected as the most important features for predicting the ten-year risk of developing CHD.

In [None]:
# select the top 10 features
# Take exactly the 10 best-ranked features. A plain `ranking_ < 10` threshold
# would keep ranks 1-9 only, and an unpredictable number of features, because
# Boruta ranks can tie (every confirmed feature shares rank 1).
feature_ranks = pd.Series(feat_selector.ranking_, index=data.columns[:-1])
top_features = feature_ranks.nsmallest(10).index.tolist()
top_features

In [None]:
# select the top 6 features
# Take exactly the 6 best-ranked features. A plain `ranking_ < 6` threshold
# would keep ranks 1-5 only, and an unpredictable number of features, because
# Boruta ranks can tie (every confirmed feature shares rank 1).
feature_ranks = pd.Series(feat_selector.ranking_, index=data.columns[:-1])
top_features = feature_ranks.nsmallest(6).index.tolist()
top_features

In [None]:
# select the top 5 features
# Take exactly the 5 best-ranked features. A plain `ranking_ < 5` threshold
# would keep ranks 1-4 only, and an unpredictable number of features, because
# Boruta ranks can tie (every confirmed feature shares rank 1).
feature_ranks = pd.Series(feat_selector.ranking_, index=data.columns[:-1])
top_features = feature_ranks.nsmallest(5).index.tolist()
top_features

### Dividing data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The target classes are imbalanced, so
# the split is stratified on y to keep the same class ratio in both parts.
# NOTE(review): X still contains every predictor column; the Boruta selection
# computed earlier is not applied here - confirm whether that is intended.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=.20,random_state=5,stratify=y)

# Scikit Learn - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features before fitting: the raw columns mix binary flags
# with measurements such as age and blood pressure, and the default solver
# tends to hit its iteration limit on unscaled inputs. The scaler is part of
# the pipeline, so it is fitted on the training split only.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg.fit(x_train,y_train)

# Class predictions for the held-out test split.
y_pred=logreg.predict(x_test)

In [None]:
#### model accuracy
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded and would only work as a side effect of
# other sklearn imports having run earlier.
import sklearn.metrics

# Fraction of test rows whose predicted class matches the true class.
logistic_accuracy = sklearn.metrics.accuracy_score(y_test,y_pred)
print(logistic_accuracy)

In [None]:
# Predicted class probabilities for the test data: one column for class 0
# (no coronary heart disease) and one for class 1 (coronary heart disease),
# as used with the default classification threshold of 0.5.

prob_columns = ['Prob of no heart disease (0)','Prob of Heart Disease (1)']
y_pred_prob = logreg.predict_proba(x_test)
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=prob_columns)
y_pred_prob_df.head()

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
#grid search for optimum parameters
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]

# The RBF kernel is distance based, so features are standardised first.
# Keeping the scaler inside the pipeline means it is re-fitted on the training
# part of every cross-validation fold rather than on the whole training set.
svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf', probability=True))

# make_pipeline names the step after the lowercased class ('svc'), hence the
# 'svc__' prefix on the hyper-parameter names.
param_grid = {'svc__C': Cs, 'svc__gamma' : gammas}

# 10-fold search over the grid; n_jobs=-1 runs the candidate fits in parallel.
svm_clf = GridSearchCV(svm_pipeline, param_grid, cv=10, n_jobs=-1)

In [None]:
# train the model: run the cross-validated grid search on the training split
svm_clf.fit(x_train,y_train)
# show the hyper-parameter combination that scored best in the search
svm_clf.best_params_ 

In [None]:
# predictions of the fitted grid-search model for the held-out test split
svm_predict = svm_clf.predict(x_test)

In [None]:
# Accuracy of the SVM predictions on the test split.

from sklearn.metrics import accuracy_score

svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predict)
print(svm_accuracy)

# Quadratic Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Quadratic Discriminant Analysis classifier with default settings.
clf = QuadraticDiscriminantAnalysis()

In [None]:
# Fit the QDA classifier on the training split.
clf.fit(x_train,y_train)

In [None]:
# QDA class predictions for the held-out test split.
QDA_predict = clf.predict(x_test)

In [None]:
# Accuracy of the QDA predictions on the test split.
QDA_accuracy = accuracy_score(y_true=y_test, y_pred=QDA_predict)
print(QDA_accuracy)

## Model Comparison

In [None]:
# Collect the test accuracy of each model in one table:
# one row per model, a single 'Accuracy' column.
model_names = [
    "Logistic regression",
    "Support vector machine",
    "Quadratic Discriminant Analysis",
]
model_accuracies = [logistic_accuracy, svm_accuracy, QDA_accuracy]
comparison = pd.DataFrame({'Accuracy': model_accuracies}, index=model_names)


In [None]:
# Display the model comparison table.
comparison

In [None]:
# Bar chart comparing the models, one subplot per metric column of `comparison`.

# Subplot titles come straight from the metric columns, so they cannot drift
# out of sync with the table (a hand-kept list would raise IndexError as soon
# as a metric column is added without updating it).
titles = list(comparison.columns)

# Two subplots per row. The grid keeps at least two rows so the layout for up
# to four metrics is unchanged, and gains rows when more metrics are present.
grid_cols = 2
grid_rows = max(2, -(-len(titles) // grid_cols))

fig = plt.gcf()
fig.set_size_inches(15, 15)
for position, metric in enumerate(titles, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.barplot(x=comparison.index, y=comparison[metric])
    plt.xticks(fontsize=9)
    plt.title(metric)
plt.show()