In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease CSV from the Kaggle input mount and preview its first rows.
data = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")
data.head()

# **Data Description and information**

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# Number of missing values in each column
data.isna().sum()

# **Data Analysis**

**Feature Selection**

1. Univariate Selection — Statistical tests can be used to select the features that have the strongest relationship with the target variable. The scikit-learn library provides the SelectKBest class, which can be combined with a suite of different statistical tests to select a specific number of features; here we use the chi-squared test.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Features are the first 13 columns; the target is the last column.
X = data.iloc[:, :13]
y = data.iloc[:, -1]
# Score every feature against the target with the chi-squared test.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
# Build a two-column table: feature name next to its score.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
# Show the 12 highest-scoring features.
print(featureScores.nlargest(12, 'Score'))

**2.** **Feature Importance** — You can obtain the importance of each feature of your dataset from the feature-importance property of a fitted model.  
Feature importance gives you a score for every feature of your data; the higher the score, the more important or relevant the feature is to the target variable.  
We will use the Extra Trees Classifier to extract the top features for the dataset.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensembles are randomized; a fixed random_state makes the
# importances (and the bar chart below) identical on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(X,y)
print(model.feature_importances_)
# Plot the importances as a horizontal bar chart, labelled by feature name.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(13).plot(kind='barh')
plt.show()

**3. Correlation Matrix with Heatmap** — Correlation indicates how the features are related to each other or to the target variable.  
The correlation may be positive (an increase in the value of the feature increases the value of the target variable) or negative (an increase in the value of the feature decreases the value of the target variable).  
A heatmap makes it easy to identify which features are most related to the target variable; we will plot the heatmap of correlated features using the seaborn library.

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
corr_matrix = data.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap="magma")

# **Data Visualization**

Seaborn

In [None]:
# Global seaborn look for the plots that follow: 'darkgrid' style with the 'Set2' palette.
sns.set_style('darkgrid')
sns.set_palette('Set2')

In [None]:
# Plot-only copy: the `sex` relabelling below is applied to this copy, not to `data`.
data1 = data.copy()
def chng(sex):
    """Return 'female' when `sex` equals 0 and 'male' for any other value."""
    return 'female' if sex == 0 else 'male'
# Replace the numeric sex codes on the plotting copy with readable labels.
data1['sex'] = data1['sex'].map(chng)
def chng2(prob):
    """Return 'Heart Disease' when `prob` equals 0 and 'No Heart Disease' for any other value."""
    return 'Heart Disease' if prob == 0 else 'No Heart Disease'
# NOTE: this writes the text labels into `data` (not `data1`). Later cells rely on
# it: the classification tree uses column 13 of `data` as its labels, and the
# one-hot encoding of `data` is what produces the 'target_No Heart Disease' column.
data['target'] = data1['target'].map(chng2)

In [None]:
# Countplot
# Relabel the target only while it still holds numeric codes. chng2 maps every
# value other than 0 to 'No Heart Disease', so applying it again to the text
# labels (e.g. when this cell is re-run) would overwrite every row.
if pd.api.types.is_numeric_dtype(data1['target']):
    data1['target'] = data1['target'].apply(chng2)
# Number of rows per sex, split by target label.
sns.countplot(data= data1, x='sex',hue='target')
plt.title('Gender v/s target\n')

In [None]:
# Number of rows per chest-pain code (`cp`), split by target.
ax = sns.countplot(x='cp', hue='target', data=data1)
ax.set_title('Chest Pain Type v/s target\n')

In [None]:
# Number of rows per sex, split by `thal` code.
ax = sns.countplot(x='sex', hue='thal', data=data1)
ax.set_title('Gender v/s Thalassemia\n')

In [None]:
# Number of rows per `slope` code, split by target.
ax = sns.countplot(x='slope', hue='target', data=data1)
ax.set_title('Slope v/s Target\n')

In [None]:
# Number of rows per `exang` code, split by `thal` code.
ax = sns.countplot(x='exang', hue='thal', data=data1)
ax.set_title('exang v/s Thalassemia\n')

In [None]:
# Boxplot
# Distribution of `age` for each value of `target`.
sns.boxplot(data=data1,x='target',y='age')

In [None]:
# Distribution of `age` per `ca` value, split by target, drawn on a 14x8 canvas.
fig, ax = plt.subplots(figsize=(14, 8))
sns.violinplot(x='ca', y='age', hue='target', data=data1, ax=ax)

In [None]:
# Distribution of `thalach` per chest-pain code (`cp`), split by target.
sns.boxplot(data=data1,x='cp',y='thalach',hue='target')

In [None]:
# Distribution of `trestbps` per `fbs` value, split by target, drawn on a 10x7 canvas.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(x='fbs', y='trestbps', hue='target', data=data1, ax=ax)

In [None]:
# Distribution of `oldpeak` per `exang` value, split by target, drawn on a 10x7 canvas.
fig, ax = plt.subplots(figsize=(10, 7))
sns.violinplot(x='exang', y='oldpeak', hue='target', data=data1, ax=ax)

In [None]:
# Distribution of `thalach` per `slope` value, split by target, drawn on a 10x7 canvas.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(x='slope', y='thalach', hue='target', data=data1, ax=ax)

In [None]:
# Distribution of `oldpeak` per `thal` value, split by target.
sns.violinplot(data=data1,x='thal',y='oldpeak',hue='target')

In [None]:
# Distribution of `thalach` for each value of `target`.
sns.violinplot(data=data1,x='target',y='thalach')

In [None]:
# PairPlot
# Pairwise relationships between the columns of `data`, coloured by chest-pain code (`cp`).
sns.pairplot(data,hue='cp')

# **Classification Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
# Features are the first 13 columns; the target is the 14th column.
X = data.iloc[:,0:13] # Features
y = data.iloc[:,13] # Target variable
# Hold out 30% of the rows for testing; random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
# Create Decision Tree classifier object.
# Tree fitting is randomized, so without a fixed random_state the reported
# accuracy can change from run to run.
clf = DecisionTreeClassifier(random_state=1)
# Train Decision Tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
# Model accuracy: share of test rows predicted correctly
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Create Decision Tree classifier object: entropy criterion, depth limited to 3.
# random_state is fixed so the fitted tree and its accuracy are reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
# Train Decision Tree classifier
clf = clf.fit(X_train,y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# **Data Pre-processing**  

We have 4 Categorical columns as seen in Data Description using pandas profiling:  
1. cp — chest_pain_type  
2. restecg — rest_ecg_type  
3. slope — st_slope_type  
4. thal — thalassemia_type  

In [None]:
# Give the columns descriptive names; the new names are assigned by position.
descriptive_names = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg_type',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope_type',
    'num_major_vessels',
    'thalassemia_type',
    'target',
]
data.columns = descriptive_names
data.columns

In [None]:
# Generating categorical columns values
# Integer code -> text label, per categorical column.
category_labels = {
    #cp - chest_pain_type
    'chest_pain_type': {
        0: 'asymptomatic',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'typical angina',
    },
    #restecg - rest_ecg_type
    'rest_ecg_type': {
        0: 'left ventricular hypertrophy',
        1: 'normal',
        2: 'ST-T wave abnormality',
    },
    #slope - st_slope_type
    'st_slope_type': {
        0: 'downsloping',
        1: 'flat',
        2: 'upsloping',
    },
    #thal - thalassemia_type
    'thalassemia_type': {
        0: 'nothing',
        1: 'fixed defect',
        2: 'normal',
        3: 'reversable defect',
    },
}

# Build each relabelled column as a whole and assign it back. Writing strings into
# an integer column slice by slice through .loc depends on implicit upcasting,
# which recent pandas versions deprecate. Values without an entry in the table
# (including labels already converted by a previous run) are left unchanged.
for column, labels in category_labels.items():
    data[column] = data[column].map(lambda code, labels=labels: labels.get(code, code))

In [None]:
#One Hot Encoding
# Full encoding with no level dropped (drop_first=False); the resulting column names are displayed.
dummy = pd.get_dummies(data, drop_first=False)
dummy.columns

In [None]:
# Keep the 'fixed defect' indicator from a full encoding, since it is re-attached
# to the reduced encoding in a later cell. It is taken from a fresh encoding of
# `data` rather than from `dummy`: `dummy` is overwritten just below, so reading
# it here would raise KeyError when this cell is run a second time.
data_temp = pd.get_dummies(data, drop_first=False)['thalassemia_type_fixed defect']
# Reduced encoding: the first level of every categorical column is dropped.
dummy = pd.get_dummies(data, drop_first=True)
dummy.head()

In [None]:
# Re-attach the saved indicator column to the encoded frame, then remove the
# 'thalassemia_type_nothing' column.
frames = [dummy, data_temp]
result = pd.concat(frames, axis=1).drop(columns='thalassemia_type_nothing')
# Keep an untouched copy for further analysis.
resultc = result.copy()

# **Logistic Regression**  

1. Gather columns
2. Splitting Data  
3. Normalization  
4. Fitting into Model  
5. Prediction  
6. Model Evaluation

In [None]:
#Gather columns
# The label is the 'target_No Heart Disease' indicator; every other column is a feature.
target_column = 'target_No Heart Disease'
y = result[target_column]
X = result.drop(columns=target_column)

In [None]:
#Splitting Data
from sklearn.model_selection import train_test_split
# 80% of the rows for training, 20% for testing; random_state fixes the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
#Normalization
# Min-max scale every feature using statistics taken from the TRAINING split only,
# and apply that same transform to the test split. Scaling the test split with its
# own min/max would put the two splits on different scales and let test-set
# information shape the preprocessing.
# Cast to float first so indicator columns stored as booleans can be subtracted.
X_train = X_train.astype(float)
X_test = X_test.astype(float)
feature_min = X_train.min(axis=0)
# A constant training column has range 0; dividing by 1 instead avoids NaN/inf.
feature_range = (X_train.max(axis=0) - feature_min).replace(0, 1)
X_train = (X_train - feature_min) / feature_range
# Test values can fall slightly outside [0, 1] when they exceed the training range.
X_test = (X_test - feature_min) / feature_range

In [None]:
#Fitting into Model
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the normalized training split.
logre = LogisticRegression()
logre.fit(X_train,y_train)

In [None]:
#Prediction
y_pred = logre.predict(X_test)
# Side-by-side table of the true and predicted labels for the test rows.
# NOTE: `result` is rebound here and no longer refers to the encoded frame
# (a copy of that frame was saved earlier as `resultc`).
actual = list(y_test)
predcition = list(y_pred)
dic = {'Actual': actual, 'Prediction': predcition}
result = pd.DataFrame(dic)
import plotly.graph_objects as go

# Overlay true labels (markers + lines) and predictions (markers) by test-row position.
positions = np.arange(0, len(y_test))
fig = go.Figure()
fig.add_trace(go.Scatter(x=positions, y=y_test, mode='markers+lines', name='Test'))
fig.add_trace(go.Scatter(x=positions, y=y_pred, mode='markers', name='Pred'))

In [None]:
#Model Evaluation
from sklearn.metrics import accuracy_score
# Accuracy of the predictions against the true test labels.
print(accuracy_score(y_test,y_pred))

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the test predictions.
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Compute the confusion matrix once and show it both as text and as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
print(cm)
sns.heatmap(cm, annot=True)

In [None]:
#ROC Score
import sklearn
import sklearn.metrics  # import the submodule explicitly instead of relying on an earlier cell having loaded it
# ROC AUC measures how well the model ranks samples, so it needs continuous
# scores. Hard class predictions evaluate a single threshold only.
# Column 1 of predict_proba is the probability of logre.classes_[1], the larger
# class label, which roc_auc_score treats as the positive class.
y_score = logre.predict_proba(X_test)[:, 1]
sklearn.metrics.roc_auc_score(y_test, y_score)

In [None]:
# Collect the true and predicted test labels into one frame for export.
final_data = {}
final_data['Actual Value'] = y_test
final_data['Predicted Value'] = y_pred
submission = pd.DataFrame(final_data)

In [None]:
# Write the actual/predicted table to 'submission_lr.csv' in the current directory, without the index column.
submission.to_csv('submission_lr.csv', index =False)