In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from pprint import pprint

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the heart-attack CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')

## About this dataset:

* Age : Age of the patient  
* Sex : Sex of the patient  (1 = male, 0 = female)
* exang: exercise induced angina (1 = yes; 0 = no)  
* ca: number of major vessels (0-3)  
* cp : chest pain type  
    - Value 1: typical angina
    - Value 2: atypical angina
    - Value 3: non-anginal pain
    - Value 4: asymptomatic
* trtbps : resting blood pressure (in mm Hg)  
* chol : cholesterol in mg/dl fetched via BMI sensor  
* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)  
* rest_ecg : resting electrocardiographic results  

     - Value 0: normal  
     - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)  
     - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria  
* thalach : maximum heart rate achieved  
* target : 0= less chance of heart attack 1= more chance of heart attack  

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Pairwise correlation between all columns except the 'output' target.
feature_corr = data.drop(columns='output').corr()
sns.heatmap(feature_corr)
plt.show()

## Checking unique data from this dataset
***
### I want to check that the data description above is correct, because some columns are named differently than in the description

In [None]:
# Map every column name to the array of distinct values found in that column.
unique_data = {column: data[column].unique() for column in data.columns}

pprint(unique_data)
print(f'data.shape={data.shape}')

### And so, the conclusions that can be made on the results.
***
#### As you can see, caa has values of 0-4, which does not correspond to the description, which says that it is 0-3.
#### Maybe caa is not ca, but there is no second column.
#### Also in the description there are no oldpeak, slp and thall.
#### I did find descriptions, here is what they are:  
* slp - the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
* oldpeak - ST depression induced by exercise relative to rest
***
#### What is thall? Honestly, I could not figure it out and find information to answer this question. I think knowledgeable people can help answer this question in the comments.
#### I would be grateful if you could tell me about the meaning of this field.

# Construction of graphs to prove some hypotheses
***
1. Male/female over the middle age of this sample are more prone to heart attacks than those under this age.
2. People with asymptomatic pain are less likely to have a heart attack.
3. People with more than 2 major vessels are more likely to have a heart attack.
4. Exercise-induced angina strongly influences the presence of a heart attack.
***
**These are my 4 hypotheses that I made on the availability of the data that was provided in the dataset.  
I will try to prove or disprove these 4 hypotheses using graphs.   
Let's see if my conclusions are accurate or not.**

## The first hypothesis

Why do I use age for counting?
Since I need a column to count on when plotting, I use this feature. In general it would be better to use some identifier, but none exists, so I will use the age field.

In [None]:
# "Middle age" is the mean age of the sample. Both comparisons are strict, so a
# row whose age equals the mean exactly lands in neither the over nor the under group.
mean_age = data['age'].mean()
is_male = data['sex'] == 1
is_female = data['sex'] == 0
over_mean = data['age'] > mean_age
under_mean = data['age'] < mean_age


def _count_by_output(mask):
    """Count the rows selected by ``mask`` per 'output' value.

    The count is stored in the 'age' column of the returned frame, next to
    the 'output' column produced by ``reset_index()``.
    """
    return data[mask].groupby('output')['age'].count().reset_index()


male_over_middle_age = _count_by_output(is_male & over_mean)
male_under_middle_age = _count_by_output(is_male & under_mean)
female_over_middle_age = _count_by_output(is_female & over_mean)
female_under_middle_age = _count_by_output(is_female & under_mean)

#### As you can see, my first hypothesis was immediately defeated; these graphs show that people younger than the average age of this sample are more likely to have a heart attack. 
***
The claim that females are less susceptible to heart attack also turned out to be false: in this sample the number of females with a heart attack is only ~22% lower than the number of males. Perhaps with a larger sample we would have seen a much bigger difference than this one. Note also that the ratio of females to males in this sample is not equal: there are more males, specifically 207 males versus 96 females.

In [None]:
# Split the sample by sex (0 = female, 1 = male) and count the rows per group.
female_rows = data[data['sex'] == 0]
male_rows = data[data['sex'] == 1]
f_count = female_rows['output'].count()
m_count = male_rows['output'].count()

# Number of rows in each group with output == 1.
f_attack_count = (female_rows['output'] == 1).sum()
m_attack_count = (male_rows['output'] == 1).sum()

print('Number of females and males\n')
print(f'\tFemale: {f_count} \n\tMale: {m_count}\n\n')
print('Number of females and males with heart attack and their percentage\n')
print(f'\tFemale: {f_attack_count} \n\tMale: {m_attack_count}')
# This figure is how many percent fewer females than males have output == 1,
# measured on absolute counts.
print(f'\tPercentage: {round(100 - f_attack_count * 100 / m_attack_count, 3)}')

# The two groups differ in size, so absolute counts alone cannot show which sex
# is more affected; also report the share of output == 1 within each sex.
f_attack_share = round(f_attack_count * 100 / f_count, 3)
m_attack_share = round(m_attack_count * 100 / m_count, 3)
print('\nShare of each sex with heart attack (output == 1)\n')
print(f'\tFemale: {f_attack_share}% \n\tMale: {m_attack_share}%')

In [None]:
# Each frame stores a per-'output' row count in its 'age' column (it comes from
# groupby('output')['age'].count()), so the bar height is a number of patients.
# seaborn would label the y-axis "age", which is misleading; it is overridden below.
age_panels = [
    ('Male over middle age', male_over_middle_age),
    ('Male under middle age', male_under_middle_age),
    ('Female over middle age', female_over_middle_age),
    ('Female under middle age', female_under_middle_age),
]

plt.figure(figsize=(15, 12))
for position, (panel_title, counts) in enumerate(age_panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    sns.barplot(x='output', y='age', hue='output', data=counts)
    plt.ylabel('Number of patients')
plt.show()

## The second hypothesis

In [None]:
# Row counts per chest-pain type ('cp'), computed separately for output == 0
# and output == 1; the count ends up in the 'output' column of each frame.
cp_not_attack, cp_with_attack = (
    data[data['output'] == flag].groupby('cp')['output'].count().reset_index()
    for flag in (0, 1)
)

#### It may be noted that the second hypothesis was correct: people with normal angina were more prone to heart attack than those who were asymptomatic, which agrees with other sources. The graph also shows that non-anginal pain is more likely to lead to a heart attack than regular angina pectoris.

In [None]:
# The 'output' column of both frames holds row counts (groupby('cp')['output'].count()),
# so the y-axis is relabelled as a patient count instead of seaborn's default "output".
cp_panels = [
    ('Types of pain without a heart attack', cp_not_attack),
    ('Types of pain with a heart attack', cp_with_attack),
]

# Only two panels are drawn, so use a single row; a 2x2 grid would leave the
# lower half of the figure empty. Each panel keeps the same size as before.
plt.figure(figsize=(15, 6))
for position, (panel_title, counts) in enumerate(cp_panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    sns.barplot(x='cp', y='output', hue='cp', data=counts)
    plt.ylabel('Number of patients')
plt.show()

## The third hypothesis

In [None]:
# Split the rows by target value first, then count rows per 'caa' value in each part.
rows_without_attack = data[data['output'] == 0]
rows_with_attack = data[data['output'] == 1]
caa_not_attack = rows_without_attack.groupby('caa')['output'].count().reset_index()
caa_with_attack = rows_with_attack.groupby('caa')['output'].count().reset_index()

#### You can see that my third hypothesis turned out to be false, probably because the more major vessels there are, the less likely a heart attack is, since the blood flow will be much better or something like that.
***
P.S. I may well have made medical mistakes in my reasoning, as I am not very good in the field of medicine. If you know the correct explanation, I would be very glad if you pointed out the mistake in my judgement.


In [None]:
# The 'output' column of both frames holds row counts (groupby('caa')['output'].count()),
# so the y-axis is relabelled as a patient count instead of seaborn's default "output".
caa_panels = [
    ('Number of dilated vessels without heart attack', caa_not_attack),
    ('Number of dilated vessels with heart attack', caa_with_attack),
]

# Only two panels are drawn, so use a single row; a 2x2 grid would leave the
# lower half of the figure empty. Each panel keeps the same size as before.
plt.figure(figsize=(15, 6))
for position, (panel_title, counts) in enumerate(caa_panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    sns.barplot(x='caa', y='output', hue='caa', data=counts)
    plt.ylabel('Number of patients')
plt.show()

## The fourth hypothesis 

In [None]:
# Row counts per 'exng' value, keyed by the target value they were computed for.
exng_counts_by_output = {
    flag: data[data['output'] == flag].groupby('exng')['output'].count().reset_index()
    for flag in (0, 1)
}
exng_not_attack = exng_counts_by_output[0]
exng_with_attack = exng_counts_by_output[1]

#### The fourth hypothesis was rejected by the graphs below, you can see that angina caused by exercise does not affect the heart attack much more than angina obtained in another way, it may have been caused at the genetic level. The data don't provide that.


In [None]:
# The 'output' column of both frames holds row counts (groupby('exng')['output'].count()),
# so the y-axis is relabelled as a patient count instead of seaborn's default "output".
exng_panels = [
    ('Stenocardia without a heart attack', exng_not_attack),
    ('stenocardia with a heart attack', exng_with_attack),
]

# Only two panels are drawn, so use a single row; a 2x2 grid would leave the
# lower half of the figure empty. Each panel keeps the same size as before.
plt.figure(figsize=(15, 6))
for position, (panel_title, counts) in enumerate(exng_panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    sns.barplot(x='exng', y='output', hue='exng', data=counts)
    plt.ylabel('Number of patients')
plt.show()

## Model building without hyperparameter selection and removal of unnecessary data.

In [None]:
# Fix the seed so the split, and therefore the accuracies reported below, are the
# same on every run. Stratify on the 'output' target so the train and test parts
# keep the same class balance.
data_train, data_test = train_test_split(
    data,
    test_size=0.3,
    random_state=0,
    stratify=data['output'],
)

In [None]:
# Show the (rows, columns) shape of the train and test parts.
data_train.shape, data_test.shape

In [None]:
# The features are fed in unscaled, and the default limit of 100 iterations can
# stop the solver before it converges (sklearn then emits a ConvergenceWarning).
# Give it more room. random_state stays fixed for reproducibility.
lr = LogisticRegression(random_state=0, max_iter=1000)
rfc = RandomForestClassifier(random_state=0)

In [None]:
# Features are every column but the last; the target is the last column.
# The target is selected with iloc[:, -1] (a 1-D Series) rather than iloc[:, -1:]
# (a one-column DataFrame), because sklearn expects a 1-D y and warns otherwise.
X_train = data_train.iloc[:, :-1]
y_train = data_train.iloc[:, -1]

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)

In [None]:
# Same column layout as for training: all columns but the last are features,
# the last column is the target (taken as a 1-D Series).
X_test = data_test.iloc[:, :-1]
y_test = data_test.iloc[:, -1]

lr_pred = lr.predict(X_test)
rfc_pred = rfc.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
print(f'accuracy LogReg: {accuracy_score(y_test, lr_pred)}')
print(f'accuracy RandFors: {accuracy_score(y_test, rfc_pred)}')

### This is not the final version of my notebook; I will finish it and train improved models.

# Thanks for reading. Don't forget to upvote the work if you thought my approach was appropriate. Good luck kaggling! 