In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Step1: Necessary imports
import pandas as pd
import numpy as np
import matplotlib as mpl
import scipy as scipy
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from pandas import DataFrame
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Step2: Loading the data
#Change the csv path below to analyse a different dataset
#NOTE(review): no dtype= argument is passed to read_csv, so column types are inferred by pandas
data = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')
#Show the (rows, columns) shape of the loaded frame
data.shape

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
data.info()

# Heart Attack Analysis

* Examines the average and makes predictions on various factors.
* Examines factors 
* Creates 3-D Visualizations. 
* Shows the correlation between factors with heatmaps.
* Is optimized to work with large datasets


In one of my previous studies of cardiovascular diseases, I found that heart failure is mainly driven by sugar and fat intake, and is less dependent on habits such as drinking or smoking than you might imagine, with exercise playing a role in keeping the heart healthy. You can also speak to your doctor or, like me, to a friend who is a surgeon, and you will hear the same: with aging, the body's ability to recover from cholesterol build-up declines. So what can we do to keep the heart healthy?

* Avoid heavily processed foods.
* Avoid fried and fat foods.
* Replace sugar with more natural glucose sources like honey.
* Exercise often.
* Drinking and smoking make the heart muscle weaker, so take note.


The general conclusion of this research is that the risk group is:

* males at age of 58 
* resting blood pressure 120 (in mm Hg)
* cholesterol of 197 mg/dl
* having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* maximum heart rate achieved:162
 

# Making Predictions
* Building a machine learning model and making predictions
* Evaluating the model

***Part1: Building the model***

In [None]:
#machine learning and making predictions
# Local import so the intermediate tables below are actually rendered:
# a bare expression is only shown when it is the last line of a cell.
from IPython.display import display

#Display columns
display(data.columns)
#drop rows that contain empty values
data = data.dropna(axis=0)
#selecting prediction target
y = data.output
#Choosing features and storing them in X
data_features = ['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall']
X = data[data_features]
#Describe data in X
display(X.describe())
#Show the first rows of X
display(X.head())

# Define model. Specify a number for random_state to ensure same results each run
data_model = DecisionTreeRegressor(random_state=1)

# Fit model
data_model.fit(X, y)

#Printing Predictions
# NOTE: these are in-sample predictions (the same rows were used for fitting),
# so they say nothing about how the model generalises; see the validation cell.
print("Making predictions for the following 5 Patients:")
print(X.head(5))
print("The predictions are")
print(data_model.predict(X.head(5)))

***Part2 Evaluating the model***

In [None]:
#Evaluating and validating the model
# Hold out part of the rows for validation (split is reproducible via random_state)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define model. random_state is fixed so the reported error is the same on every run
# (an unseeded tree can pick different splits between runs).
data_model = DecisionTreeRegressor(random_state=1)
# Fit model on the training portion only
data_model.fit(train_X, train_y)

# get predictions on the held-out validation data and report the mean absolute error
val_predictions = data_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

In [None]:
#underfitting and overfitting
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation MAE.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (seeded with
    ``random_state=0``) is fitted on ``train_X``/``train_y``; the mean absolute
    error of its predictions for ``val_X`` against ``val_y`` is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
# compare MAE with differing values of max_leaf_nodes
# The MAE is a float, so it is formatted with %.4f; %d would drop the
# fractional part and make every configuration look identical.
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.4f" %(max_leaf_nodes, my_mae))

***Random Forest***

In [None]:
# Fit a seeded random forest on the training split and report its validation MAE.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
data_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, data_preds)
print(forest_mae)

***Standard deviation***

In [None]:
# Per-column population standard deviation (ddof=0, matching np.std's default).
# The axis is spelled out because np.std(DataFrame) forwards axis=None to
# DataFrame.std, which pandas has deprecated in favour of an explicit axis.
std = data.std(axis=0, ddof=0)
print(std)




***Coefficient of Variation***

In [None]:
# Per-column coefficient of variation: population std (ddof=0) divided by the mean.
# Both reductions use an explicit axis=0: with recent pandas, np.mean(DataFrame)
# collapses the whole frame to one scalar, which would divide every column's
# std by the same global mean.
cv = data.std(axis=0, ddof=0) / data.mean(axis=0)
print(cv)

***Variance***

In [None]:
# Per-column population variance (ddof=0, matching np.var's default).
# The axis is explicit because np.var(DataFrame) forwards axis=None to
# DataFrame.var, which pandas has deprecated in favour of an explicit axis.
var_full = data.var(axis=0, ddof=0)
print(var_full)

# Describing the data 

In [None]:
#Step3: Describing the data - per-column summary statistics of the frame
data.describe()

In [None]:
#Step3: Describing the data - finding the mode [most frequent value] of each column
#(the result can have several rows when a column has tied modes)
data.mode()

In [None]:
# Pie chart with one slice per `sex` value, sized by the `age` column.
# NOTE(review): slices are weighted by summed age rather than by patient count -
# confirm this is the intended quantity.
fig = (
    px.pie(data, values='age', names='sex')
    .update_traces(textposition='inside')
    .update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
)
fig.show()

# 3-D Visualization

In [None]:
# Row window [start, end) used to slice the dataframe in the plotting cells below
start = 0
end = 800

In [None]:


# 3-D scatter of age / sex / cp for the rows in [start, end);
# marker size follows `chol`, marker colour and hover text follow `output`.
window = data.iloc[start:end]

marker_style = dict(
    sizemode='diameter',
    sizeref=10,
    size=window['chol'],
    color=window['output'],
    colorscale='Viridis',
    colorbar_title='Output<br>',
    line_color='rgb(140, 140, 170)',
)
scatter = go.Scatter3d(
    x=window['age'],
    y=window['sex'],
    z=window['cp'],
    text=window['output'],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=scatter)
fig.update_layout(
    height=800,
    width=800,
    title='3-D Graph - X-Age,Y-Sex,Z-Cp,Size-Chol,Color-output',
)
fig.show()


In [None]:
# 3-D scatter of trtbps / restecg / fbs for the rows in [start, end);
# marker size follows `chol`, marker colour and hover text follow `output`.
window = data.iloc[start:end]

marker_style = dict(
    sizemode='diameter',
    sizeref=10,
    size=window['chol'],
    color=window['output'],
    colorscale='Viridis',
    colorbar_title='Output<br>',
    line_color='rgb(140, 140, 170)',
)
scatter = go.Scatter3d(
    x=window['trtbps'],
    y=window['restecg'],
    z=window['fbs'],
    text=window['output'],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=scatter)
fig.update_layout(
    height=800,
    width=800,
    title='3-D Graph - X-trtbps,Y-restecg,Z-fbs,Size-Chol,Color-output',
)
fig.show()

# Correlation Influence Heatmaps

In [None]:
# Pairwise correlation matrix of all columns
corr = data.corr()

# 8x8 inch canvas for the heatmap
plt.figure(figsize=(8, 8))

# Diverging palette centred on 0, spanning the full correlation range [-1, 1]
heatmap_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=heatmap_palette, square=True)

# Tilt the column labels so they do not overlap
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right');

In [None]:
# High-contrast lower-triangle heatmap with seaborn
sns.set_theme(style="dark")

# Correlation matrix of all columns
corr = data.corr()

# Boolean mask that hides the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Figure and axes for the plot
f, ax = plt.subplots(figsize=(8, 8))

# Custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# The colour scale is clipped to [-0.1, 0.1], so anything beyond that saturates
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-0.1,
    vmax=0.1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

In [None]:
# Annotated lower-triangle correlation heatmap built with plotly's figure factory.
corr = data.corr()

# Hide the upper triangle (diagonal included).
# The builtin `bool` is used as dtype: the `np.bool` alias no longer exists in
# current NumPy releases and raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))
corr = corr.mask(mask)
fig = ff.create_annotated_heatmap(
    z=corr.to_numpy().round(2),
    x=list(corr.index.values),
    y=list(corr.columns.values),       
    xgap=3, ygap=3,
    zmin=-1, zmax=1,
    colorscale='fall',
    colorbar_thickness=30,
    colorbar_ticklen=3,
)
# `title_font` replaces the deprecated `titlefont` layout attribute.
fig.update_layout(title_text='Correlation Matrix (impact relationship with numbers)',
                  title_x=0.5,
                  title_font={'size': 20},
                  width=800, height=800,
                  xaxis_showgrid=False,
                  xaxis={'side': 'bottom'},
                  yaxis_showgrid=False,
                  yaxis_autorange='reversed',                   
                  paper_bgcolor=None,
                  template="simple_white"
                  )
fig.show()


# Jointplots correlating to and predicting heart attack

In [None]:
# Regression jointplot of `output` against `age`.
# The figure size is passed as `height`; seaborn no longer accepts `size` here.
sns.jointplot(x="age", y="output", data=data, kind='reg', fit_reg=True, height=12)
plt.show()

In [None]:
# Regression jointplot of `output` against `caa`.
# The figure size is passed as `height`; seaborn no longer accepts `size` here.
sns.jointplot(x="caa", y="output", data=data, kind='reg', fit_reg=True, height=12)
plt.show()

In [None]:
# Regression jointplot of `output` against `slp`.
# The figure size is passed as `height`; seaborn no longer accepts `size` here.
sns.jointplot(x="slp", y="output", data=data, kind='reg', fit_reg=True, height=12)
plt.show()

In [None]:
# Regression jointplot of `output` against `oldpeak`.
# The figure size is passed as `height`; seaborn no longer accepts `size` here.
sns.jointplot(x="oldpeak", y="output", data=data, kind='reg', fit_reg=True, height=12)
plt.show()

In [None]:
# Regression jointplot of `output` against `exng`.
# The figure size is passed as `height`; seaborn no longer accepts `size` here.
sns.jointplot(x="exng", y="output", data=data, kind='reg', fit_reg=True, height=12)
plt.show()

#  PAIRPLOTS and Predictions

In [None]:
# Pairwise scatter matrix coloured by the `output` column.
# `hue` is required for the palette to take effect: seaborn ignores `palette`
# when no hue variable is given.
sns.pairplot(data, hue='output', palette='bright');

In [None]:
# Pairwise regression plots for every column pair, with the fitted line drawn in red.
sns.set_theme(style="ticks", color_codes=True)
red_fit_line = {'line_kws': {'color': 'red'}}
g = sns.pairplot(data, kind="reg", plot_kws=red_fit_line)
plt.show()

# QuickDA Analysis

In [None]:
pip install quickda

In [None]:
# Importing libraries
from quickda.explore_data import *
from quickda.clean_data import *
from quickda.explore_numeric import *
from quickda.explore_categoric import *
from quickda.explore_numeric_categoric import *
from quickda.explore_time_series import *


In [None]:
# Numeric exploration of the full frame; `eda_num` presumably comes from the
# quickda star-imports in the previous cell - verify which module provides it.
eda_num(data)

#  Linear Regression Table

In [None]:
# Ordinary least squares fit of `age` on `output`, followed by the regression table.
model = smf.ols(formula='age ~ output', data=data)
results = model.fit()
summary_table = results.summary()
print(summary_table)

In [None]:
# Ordinary least squares fit of `sex` on `output`, followed by the regression table.
model = smf.ols(formula='sex ~ output', data=data)
results = model.fit()
summary_table = results.summary()
print(summary_table)

In [None]:
# Ordinary least squares fit of `chol` on `output`, followed by the regression table.
model = smf.ols(formula='chol ~ output', data=data)
results = model.fit()
summary_table = results.summary()
print(summary_table)

In [None]:
from IPython.display import display
import pandas_profiling

# Build the automated profiling report for the whole frame and render it inline.
report = pandas_profiling.ProfileReport(data)
display(report)