In [None]:
#Project Specific Libraries are imported (2)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import statistics
from sklearn.model_selection import train_test_split, KFold



In [None]:
# Load the heart attack prediction dataset into a DataFrame. (3)
# Every later cell works from this frame.
# Preview the first 10 rows, then print the (rows, columns) shape. (4)
df = pd.read_csv(
    '../input/heart-attack-analysis-prediction-dataset/heart.csv'
)
display(df.head(10))
print("\n\n")
print(df.shape)
     

In [None]:
# Build the model inputs from the raw frame. (1)
#   * X: the predictor columns -- numeric ones standardised, categorical ones
#     one-hot encoded. The target column is NOT part of X.
#   * y: the target column 'output' that the supervised model learns to predict.

features_num = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

features_cat = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']

scaler = StandardScaler()
# The encoder is left at its default (sparse) output and densified below with
# .toarray(). The `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so neither spelling works on every
# version; this form does.
ohe = OneHotEncoder()

# NOTE(review): both transformers are fitted on the full dataset before the
# split below, so statistics of the test rows leak into the preprocessing --
# consider fitting on the training rows only.
scaled_columns = scaler.fit_transform(df[features_num])
encoded_columns = ohe.fit_transform(df[features_cat]).toarray()

# Numeric block first, one-hot block second, side by side.
X = np.concatenate([scaled_columns, encoded_columns], axis=1)
y = df['output']

# Hold out 25% of the rows as the test set and train on the remaining 75%.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)

In [None]:
# Show the true 'output' labels of the held-out test rows.
# These come from y_test: X (and therefore X_test) is assembled only from the
# scaled/encoded predictor columns and never contains the target, so no column
# of X_test can be used as 'output'.
pff = pd.DataFrame({'output': np.asarray(y_test)})
display(pff)

In [None]:
# Look at a single column of the dataframe (here the 'output' column).
# Bracket access and attribute access return the same column.
print(df['output'])  # bracket access
print(df.output)  # attribute access
print(df['output'].value_counts())  # number of rows per distinct value

In [None]:
# Count the missing (NaN) entries in each column; a result of all zeros means
# no column has gaps. (5,12,13,14)
df.isna().sum()

In [None]:
df.info() # prints a summary of the dataframe (columns, non-null counts, dtypes) (5,12,13,14)


In [None]:
# Box plots (seaborn) of age against two categorical columns, used to spot
# outliers. (6,7)
# One figure with a 1-row, 2-column grid of axes.
fig, (subplot1, subplot2) = plt.subplots(nrows=1, ncols=2)

# Both plots take their data from df and put 'age' on the y axis; seaborn
# draws one box per distinct value of the x column.
sns.boxplot(data=df, x='cp', y='age', ax=subplot1)  # chest pain type
sns.boxplot(data=df, x='exng', y='age', ax=subplot2)  # exercise induced angina

plt.show()

In [None]:
# The exercise-induced-angina / age box plot showed outliers among people
# older than 70; list those rows here. (6,7)
df[df['age'].gt(70)]


In [None]:
# Check the column data types first, then build the correlation matrix of the
# numeric (int64 / float64) columns. The matrix shows how strongly each pair
# of columns moves together, which hints at the more influential ones. (8)

print(df.dtypes)
print("\n\n")
TrainingDataset_NumericalOnly = df.select_dtypes(include=['int64', 'float64'])
corr_mat = TrainingDataset_NumericalOnly.corr()

corr_mat.head()  # first 5 rows of the matrix

In [None]:
# Heat map of the correlation matrix: a visual version of corr_mat in which
# the colour of each cell encodes how strongly two columns are correlated. (9)
# RdBu is a diverging colormap running from red (low) through white to blue
# (high). Correlations lie in [-1, 1], so the colour scale is pinned to that
# full range; this keeps the neutral midpoint at zero correlation instead of
# letting the lower bound float with the data.

plt.figure(figsize=(17, 10))
sns.heatmap(
    corr_mat,
    cmap=plt.cm.RdBu,
    vmin=-1,
    vmax=1,
    linewidths=0.1,  # documented seaborn keyword for the cell border width
    linecolor='white',
    square=True,
    annot=True,  # write the correlation value inside each cell
)

In [None]:
# Pair plot: pairwise relationships between the numeric columns. (10)
# seaborn lays out a grid of axes in which every column is used once on the
# x axis and once on the y axis. The diagonal cells are different: they show
# the univariate (marginal) distribution of that column.
# Points are coloured by the value of 'output'; the legend shows which colour
# belongs to which value.
#
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) would only add an empty figure to the
# output. Use pairplot's `height` / `aspect` arguments to resize the grid.

sns.pairplot(TrainingDataset_NumericalOnly, hue='output')


In [None]:
# Count plot: number of rows for each value of 'sex', with every bar split by
# the value of 'output' (heart attack likelihood). The legend shows which
# colour belongs to which 'output' value. (11)
sns.countplot(data=df, x='sex', hue='output')
# Replace the numeric tick labels 0 / 1 with readable names.
plt.xticks([0, 1], ["female", "male"])




In [None]:
# Supervised model: logistic regression predicting the 'output' column
# (heart attack likelihood), evaluated with 5-fold cross validation.
#
# The rows are divided into 5 folds. Each fold is used once as the test part
# while the model is fitted on the other 4, which yields 5 accuracy scores.
# Their mean and standard deviation summarise the model. The predictions made
# for each test fold are collected so that, at the end, conv_ptt holds one
# out-of-fold prediction for every row of df.
#
# NOTE(review): this cell fits on the raw columns of df; the scaled / one-hot
# encoded matrix built in the earlier preprocessing cell is not used here.

X = df.iloc[:, :-1]  # every column except the last one
y = df.iloc[:, -1]   # the last column is the target
k = 5
# Shuffle before folding (fixed seed for reproducibility) so the folds do not
# depend on the order of the rows in the CSV file.
kf = KFold(n_splits=k, shuffle=True, random_state=4)
model = LogisticRegression(solver='liblinear')

acc_score = []
pred_val_stack = []
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so X and y are both indexed by position.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Keep this fold's predictions, labelled with the original row index.
    pred_val_stack.append(pd.DataFrame({'output': pred_values}, index=X_test.index))

    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

# Out-of-fold predictions for all rows, back in the original row order.
conv_ptt = pd.concat(pred_val_stack).sort_index()
display(conv_ptt.head(10))

avg_acc_score = sum(acc_score) / len(acc_score)
std_div = statistics.stdev(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))
print('Standard Deviation : {}'.format(std_div))



In [None]:
# Write the predictions frame to CSV. The file is overwritten rather than
# appended to, so re-running this cell does not add a second header line and
# duplicate rows to an existing file.
conv_ptt.to_csv('heart_out_prediction.csv', header=True)


In [None]:
# Save the predictions frame (conv_ptt) as an Excel workbook.
conv_ptt.to_excel(excel_writer='output.xlsx')



