In [None]:
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier

# Starting the analysis
1. We start by taking a look at the data to see what insights it offers.
2. Then we check which model achieves the best score.

In [None]:
# Load the heart-disease table, split off the label column and plot feature correlations.

df = pd.read_csv('../input/heart-disease-uci/heart.csv')

# pop() removes 'target' from df in place, so from here on df holds only the feature columns.
y = df.pop('target')

# Pairwise correlation between the remaining columns, drawn as an annotated heatmap.
correlation = df.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(correlation, annot=True, ax=ax)
plt.show()

In [None]:
# Summary statistics for the columns left in df (the 'target' column was popped out earlier).
df.describe()


In [None]:
# Preview the first rows of the feature table.
df.head()


After reading the data, let's see how many male and female individuals are in the dataset.

In [None]:
# Count how many rows fall into each distinct value of the 'sex' column.
sex = df['sex']
classes = np.unique(sex)  # distinct codes, in sorted order

# One count per code, in the same order as `classes`.
val = [np.count_nonzero(sex == code) for code in classes]

# Zero-initialised male/female counters kept at notebook scope.
mal = 0
fem = 0

With the counts computed, we can now plot them.

In [None]:
# Bar chart of the per-code counts stored in `val`.
# NOTE(review): the fixed labels assume the sorted codes are (0, 1) = (female, male) - confirm against the dataset docs.
fig, ax = plt.subplots()
ax.bar(['female', 'male'], val, label='individuals')
ax.set_title('Genders in the dataset')
ax.legend()
plt.show()

This raises another question: what is the distribution of males and females among individuals with heart disease? Does it follow the same distribution as the dataset as a whole?

In [None]:
# Count individuals with heart disease (target == 1) per gender.
# The counts are recomputed from scratch with boolean masks, so re-running this cell
# gives the same numbers instead of accumulating into counters left over from an
# earlier cell, and the result does not depend on the frame having a 0..n-1 index.
has_disease = y == 1
is_male = df['sex'] == 1

mal = int((has_disease & is_male).sum())
fem = int((has_disease & ~is_male).sum())  # every non-1 code is counted as female, as before

plt.title('Individuals with heart disease by gender')
plt.bar(['female', 'male'], [fem, mal])
plt.show()

Now we are going to look at age. Is age related to heart disease? If yes, how strongly? If not, why?

In [None]:
# Split the ages by disease status with boolean masks instead of a row-by-row loop.
# This avoids chained label lookups (df['age'][r], y[r]), which only work while the
# frame keeps its default 0..n-1 index.
has_disease = y == 1
age_with = df.loc[has_disease, 'age'].tolist()
age_without = df.loc[~has_disease, 'age'].tolist()

print('The oldest person to have a heart disease has the age of '+str(max(age_with)))
print('The youngest person to have a heart disease has the age of '+str(min(age_with)))

print('The oldest person to do not have a heart disease has the age of '+str(max(age_without)))
print('The youngest person to do not have a heart disease has the age of '+str(min(age_without)))

# Two bar() calls on the same axes: the mean pair and the median pair each get their
# own colour, and the four string labels become four separate categories.
plt.figure(figsize=(12,6))
plt.bar(['Avg age with heart disease','Avg age without heart disease'],[np.mean(age_with),np.mean(age_without)])
plt.bar(['Median age with heart disease','Median age  without heart disease'],[np.median(age_with),np.median(age_without)])
plt.ylabel('Age')
plt.xticks(rotation = 45)
plt.show()

# Now analysing the models 

* We are going to use different classifier models, including:
    
    * Gradient Boosting 
    * Multi Layer Perceptron
    * Random Forest Classifier 
    * Naive Bayes
    * KNeighbors
    * AdaBoost 
    * Gaussian Process
    * Decision Tree

In [None]:
# Build the feature matrix and hold out 20% of the rows for evaluation.
# random_state pins the shuffle so every model below is compared on the same,
# reproducible split; stratify=y keeps the class proportions equal in both parts.
X = df.values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Gradient boosting with default hyper-parameters; seeded so the score is repeatable.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# score() reports mean accuracy on the held-out rows.
print(str(model.score(X_test, y_test)) + ' Score of Gradient Boosting')


In [None]:
# AdaBoost with default hyper-parameters; seeded so repeated runs give the same score.
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of AdaBoost')

In [None]:
# Multi-layer perceptron with logistic activations and a generous iteration cap.
# The weight initialisation is random, so the seed is fixed to make the score repeatable.
model = MLPClassifier(activation='logistic', max_iter=10000, random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of Multi Layer Perceptron')

In [None]:
# k-nearest-neighbours classifier (k=15), evaluated on the held-out split.
model = KNeighborsClassifier(n_neighbors=15)
model.fit(X_train, y_train)
knn_accuracy = model.score(X_test, y_test)
print(f'{knn_accuracy} Score of KNeighbors')

In [None]:
# Random forest with default hyper-parameters; bootstrapping and feature sampling are
# random, so the seed is fixed to make the score repeatable.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of Random Forest')

In [None]:
# Single decision tree with default hyper-parameters; seeded so that ties between
# equally good splits are broken the same way on every run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of Decision Tree')

In [None]:
# Gaussian naive Bayes baseline, evaluated on the held-out split.
model = GaussianNB()
model.fit(X_train, y_train)
nb_accuracy = model.score(X_test, y_test)
print(f'{nb_accuracy} Score of Naive Bayes using gaussian')

In [None]:
# Gaussian-process classifier with its default kernel, evaluated on the held-out split.
model = GaussianProcessClassifier()
model.fit(X_train, y_train)
gp_accuracy = model.score(X_test, y_test)
print(f'{gp_accuracy} Score of Gaussian Process')

# Now with Normalized data.


In [None]:
from sklearn import preprocessing

# Rescale every feature column to zero mean and unit variance.
# preprocessing.normalize() rescales each *row* to unit length, which mixes features
# measured in unrelated units; per-feature scaling is what distance- and gradient-based
# models need. The scaler is fitted on the training rows only and then applied to the
# test rows, so no information from the held-out data leaks into preprocessing.
# NOTE: X_train / X_test are overwritten, so the un-scaled model cells must not be
# re-run after this one without redoing the split.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Gradient boosting on the rescaled features; seeded so the score is repeatable.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# A space was missing between the score and the label in the printed message.
print(str(model.score(X_test, y_test)) + ' Score of Gradient Boosting (Normalized)')

In [None]:
# AdaBoost on the rescaled features; seeded so repeated runs give the same score.
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of AdaBoost(Normalized)')

In [None]:
# Multi-layer perceptron on the rescaled features; the random weight initialisation
# is seeded so the score is repeatable.
model = MLPClassifier(activation='logistic', max_iter=10000, random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of Multi Layer Perceptron(Normalized)')

In [None]:
# k-nearest-neighbours classifier (k=15) on the rescaled features.
model = KNeighborsClassifier(n_neighbors=15)
model.fit(X_train, y_train)

# A space was missing between the score and the label in the printed message.
print(str(model.score(X_test, y_test)) + ' Score of KNeighbors (Normalized)')

In [None]:
# Random forest on the rescaled features; seeded so the score is repeatable.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# A space was missing between the score and the label in the printed message.
print(str(model.score(X_test, y_test)) + ' Score of Random Forest (Normalized)')

In [None]:
# Single decision tree on the rescaled features; seeded so that ties between equally
# good splits are broken the same way on every run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
print(str(model.score(X_test, y_test)) + ' Score of Decision Tree (Normalized)')

In [None]:
# Gaussian naive Bayes on the rescaled features, evaluated on the held-out split.
model = GaussianNB()
model.fit(X_train, y_train)
nb_accuracy = model.score(X_test, y_test)
print(f'{nb_accuracy} Score of Naive Bayes using gaussian(Normalized)')

In [None]:
# Gaussian-process classifier on the rescaled features, evaluated on the held-out split.
model = GaussianProcessClassifier()
model.fit(X_train, y_train)
gp_accuracy = model.score(X_test, y_test)
print(f'{gp_accuracy} Score of Gaussian Process (Normalized)')

# Conclusions 
* The best results came from the following algorithms:
    * Naive Bayes
    * Random Forest 
    * MLP (Multi Layer Perceptron)
 
As we could see, the average age of the individuals in the dataset is around 55.
The youngest person with the disease is 29 years old and the oldest is 77.
We can see that a lot of assumptions can be made with this dataset, but more data is needed to build better classification models. The person's weight and height, along with other personal health information, might improve the models. A lot of work can still be done, and this is a starter notebook, so feel free to leave a comment on mistakes or possible improvements. If you enjoyed the notebook, please give it an upvote. Thank you for your attention.