In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Important Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" axes style, then matplotlib's "fivethirtyeight" style sheet.
# NOTE(review): the style sheet is applied last, so it presumably overrides any
# overlapping settings from "whitegrid" — confirm this is the intended look.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

## Import Dataset

In [None]:
# Load the heart-disease CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')
df.head()

## Describing the data

In [None]:
# Show floats with two decimals. The full option key is spelled out because
# shorthand keys are resolved by pattern matching and can break if pandas adds
# another option with a similar name.
pd.set_option("display.float_format", "{:.2f}".format)
# Summary statistics for every numeric column.
df.describe()

## Shape of data

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Gathering information about the data

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

### Bar Plot of Target Values

In [None]:
# Bar chart of how many rows fall into each target class.
df["target"].value_counts().plot.bar(color=["red", "green"])

### Bar Plot of exercise induced angina

In [None]:
# Bar chart of how many rows fall into each exang value.
df["exang"].value_counts().plot.bar(color=["blue", "orange"])

# Data Cleaning

### Total percentage of missing data

In [None]:
# Number of missing values in each column.
missing_values_count = df.isnull().sum()

# Total number of cells (rows * columns). `df.size` replaces `np.product(df.shape)`:
# `np.product` is deprecated and no longer exists in NumPy 2.0.
total_cells = df.size

total_missing = missing_values_count.sum()

# Share of all cells that are missing, as a percentage.
percentage_missing = (total_missing/total_cells)*100
print(percentage_missing)

In [None]:
# Per-column percentage of missing values, as a two-column table.
NAN = pd.DataFrame(
    {
        'column_name': list(df.columns),
        'percentage': [df[name].isnull().mean() * 100 for name in df.columns],
    }
)
NAN

## Unique values in our dataset

In [None]:
# Print the distinct values of every column, one column per line.
for column_name in df:
    distinct_values = df[column_name].unique()
    print(distinct_values)

# Data Visualisation

In [None]:
# Sort columns into "categorical" (at most 10 distinct values) and "continuous"
# (more than 10), printing each column's distinct values along the way.
categorical_val = []
continous_val = []
for column in df.columns:
    distinct = df[column].unique()
    print(f"{column} : {distinct}")
    bucket = categorical_val if len(distinct) <= 10 else continous_val
    bucket.append(column)

In [None]:
def draw_histograms(dataframe, features, rows, cols):
    """Draw one 20-bin histogram per feature on a rows x cols subplot grid.

    Parameters
    ----------
    dataframe : frame whose columns are plotted.
    features : iterable of column names; each gets its own subplot, in order.
    rows, cols : grid dimensions passed to ``fig.add_subplot``.
    """
    fig = plt.figure(figsize=(20, 20))
    # Subplot positions are 1-based, so start the counter at 1.
    for position, feature in enumerate(features, start=1):
        axis = fig.add_subplot(rows, cols, position)
        dataframe[feature].hist(bins=20, ax=axis, facecolor='blue')
        axis.set_title(feature + " Distribution", color='Red')

    fig.tight_layout()
    plt.show()


draw_histograms(df, df.columns, 6, 3)

In [None]:
# One subplot per categorical column, with the two target classes overlaid.
plt.figure(figsize=(15, 15))

for i, column in enumerate(categorical_val, 1):
    plt.subplot(3, 3, i)
    for target_value, colour, legend_label in (
        (0, 'blue', 'Heart Disease = NO'),
        (1, 'red', 'Heart Disease = YES'),
    ):
        subset = df.loc[df["target"] == target_value, column]
        subset.hist(bins=40, color=colour, label=legend_label, alpha=1, width=0.2)
    plt.legend()
    plt.xlabel(column)

In [None]:
# One subplot per continuous column, with the two target classes overlaid.
plt.figure(figsize=(15, 15))

for i, column in enumerate(continous_val, 1):
    plt.subplot(3, 2, i)
    for target_value, colour, legend_label in (
        (0, 'blue', 'Have Heart Disease = NO'),
        (1, 'red', 'Have Heart Disease = YES'),
    ):
        subset = df.loc[df["target"] == target_value, column]
        subset.hist(bins=35, color=colour, label=legend_label, alpha=1)
    plt.legend()
    plt.xlabel(column)

## First, analyse the target variable

In [None]:
# Target column; reused as the y-axis of the per-feature bar plots that follow.
y = df["target"]

# Pass the series by keyword: in seaborn >= 0.12 the only positional argument
# is `data`, so `sns.countplot(y)` would no longer be read as the x variable.
sns.countplot(x=y)


# Number of rows in each target class.
target_temp = df.target.value_counts()

print(target_temp)

In [None]:
# Use the actual row count instead of a hard-coded 303 so the percentages stay
# correct if the dataset changes.
total_patients = len(y)
print("Percentage of patients with heart problems: "+str((y == 1).sum()*100/total_patients))
# The y == 0 share is the group WITHOUT heart problems; the label now says so.
print("Percentage of patients without heart problems: "+str((y == 0).sum()*100/total_patients))

In [None]:
# `plt.subplots` returns (figure, axes); unpack both and draw the pie on that
# axes explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
df['target'].value_counts().plot.pie(autopct='%1.1f%%', shadow=True, ax=ax)
ax.set_title("Heart Diseases %")
plt.show()

## We'll analyse 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca' and 'thal' features

### 'Sex' feature

In [None]:
print(df["sex"].unique())

# Mean target value per sex. x/y are passed by keyword because seaborn >= 0.12
# rejects positional x, y arguments.
sns.barplot(x=df["sex"], y=y)

#### We observe that females are more likely to have heart problems than males in this dataset

## 'Chest Pain Type' feature

In [None]:
print(df["cp"].unique())
# Mean target value per chest-pain type. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["cp"], y=y)

#### From the bar graph above, we observe that patients with chest pain type '0' (typical angina) are much less likely to have heart problems

## FBS feature

In [None]:
print(df["fbs"].describe())
print(df["fbs"].unique())
# Mean target value per fbs value. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["fbs"], y=y)

#### Nothing to analyse from this graph

## restecg feature

In [None]:
print(df["restecg"].unique())
# Mean target value per restecg value. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["restecg"], y=y)

#### We observe that people with restecg '0' or '1' are much more likely to have heart disease than those with restecg '2'

## 'exang' feature

In [None]:
print(df["exang"].unique())
# Mean target value per exang value. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["exang"], y=y)

#### People with exang = 1 (i.e. exercise-induced angina) are much less likely to have heart problems

## Slope feature

In [None]:
print(df["slope"].unique())
# Mean target value per slope value. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["slope"], y=y)

#### We observe that patients with slope '2' are much more likely to have heart problems than those with slope '0' or '1'

## 'ca' feature

In [None]:
# Distinct values present in the 'ca' column.
df["ca"].unique()

In [None]:
# Row count per 'ca' value. The series is passed as `x=` because seaborn >= 0.12
# treats the only positional argument as `data`.
sns.countplot(x=df["ca"])

In [None]:
# Mean target value per 'ca' value. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["ca"], y=y)

## 'thal' feature

In [None]:
# Distinct values present in the 'thal' column.
print(df["thal"].unique())

In [None]:
# Mean target value per 'thal' value. Keyword x/y is required by seaborn >= 0.12.
sns.barplot(x=df["thal"], y=y)

In [None]:
# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram
# and a KDE overlay is the equivalent replacement.
sns.histplot(x=df["thal"], kde=True, stat="density")

In [None]:
# Cholesterol histogram with the two target classes stacked on top of each other.
sns.histplot(df, x="chol", multiple="stack", hue="target")

In [None]:
# Cholesterol histogram per target class, each with a KDE curve.
sns.histplot(df, x="chol", kde=True, hue='target')

In [None]:
# `relplot` is figure-level and creates its own figure, so a preceding
# `plt.figure(...)` only leaves an empty extra figure. The size is set through
# height/aspect instead (8 in tall, 8 * 1.25 = 10 in wide).
sns.relplot(x='trestbps', y='chol', hue='target', data=df, height=8, aspect=1.25)

In [None]:
# Age vs. maximum heart rate, coloured by target class.
plt.figure(figsize=(10, 8))

with_disease = df.target == 1
without_disease = df.target == 0

# Drawn in this order so the legend entries below line up with the two scatters.
plt.scatter(df.age[with_disease], df.thalach[with_disease], color="red")
plt.scatter(df.age[without_disease], df.thalach[without_disease], c="blue")

plt.title("Heart Disease in function of Age and Max Heart Rate")
plt.xlabel("Age")
plt.ylabel("Max Heart Rate")
plt.legend(["Disease", "No Disease"]);

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
# No manual y-limit adjustment is applied: widening the limits by half a cell
# was only needed for matplotlib 3.1.1, which cut the first and last heatmap
# rows in half; on other versions it just adds empty half-rows.
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu",
                 ax=ax);

In [None]:
# Correlation of every feature column with the target, shown as a bar chart.
feature_target_corr = df.drop(columns='target').corrwith(df.target)
feature_target_corr.plot(kind='bar',
                         grid=True,
                         figsize=(10, 8),
                         title="Correlation with target",
                         color="red")

In [None]:
# Exclude the label from the columns to one-hot encode. The list is rebuilt
# rather than mutated with `.remove('target')`, which raises ValueError when
# this cell is run a second time.
categorical_val = [name for name in categorical_val if name != 'target']
dataset = pd.get_dummies(df, columns = categorical_val)

from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the training features; consider
# fitting it on the training split only.
s_sc = StandardScaler()
col_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    train : if truthy, report on the training split; otherwise report on the
        test split. Any falsy value selects the test split, so values such as
        ``None`` no longer result in silently printing nothing.
    """
    if train:
        split_name, features, labels = "Train", X_train, y_train
    else:
        split_name, features, labels = "Test", X_test, y_test

    pred = clf.predict(features)
    # output_dict=True makes the report tabular so it can be shown as a DataFrame.
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(labels, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(labels, pred)}\n")

# Machine Learning Model

## Train Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every encoded/scaled column except the label; the label is 'target'.
X = dataset.drop(columns='target')
y = dataset['target']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# every score computed from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression using the liblinear solver, fitted on the training split.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [None]:
# Report on the training split first, then on the test split.
for use_train_split in (True, False):
    print_score(model, X_train, y_train, X_test, y_test, train=use_train_split)

In [None]:
# Accuracy on each split, as a percentage.
train_score = accuracy_score(y_train, model.predict(X_train)) * 100
test_score = accuracy_score(y_test, model.predict(X_test)) * 100

# One-row summary table for this model.
results_df = pd.DataFrame(
    {
        'Model': ["Logistic Regression"],
        'Training Accuracy %': [train_score],
        'Testing Accuracy %': [test_score],
    }
)
results_df

In [None]:
# Predict on the test split and print the plain-text classification report.
y_pred1 = model.predict(X_test)
print(classification_report(y_test, y_pred1))

In [None]:
# Predictions for the first ten training rows.
model.predict(X_train.iloc[:10])

In [None]:
# Predictions for the first ten test rows.
model.predict(X_test.iloc[:10])

## Save the model

In [None]:
import joblib

In [None]:
# Serialise the fitted model to 'final_model.sav' (a relative path, so it is
# written to the current working directory).
heart_pred = 'final_model.sav'
joblib.dump(model, heart_pred)

# Thank You 