# Data Exploration and Visualization

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = (os.path.join(dirname, filename) for filename in filenames)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas.plotting._matplotlib import scatter_matrix as sm
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [None]:
# Read the heart-disease CSV from the Kaggle input directory into a data frame
heart_csv_path = "../input/heart-disease-uci/heart.csv"
heart_df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first rows of the data frame (head() returns 5 rows by default)
heart_df.head()

In [None]:
# Show the frame dimensions as a (rows, columns) tuple
heart_df.shape

In [None]:
# From the above output, we know that there are 303 records(rows) and 14 attributes/features(columns)

In [None]:
# Alternatively, display the frame itself: in notebook output a truncated frame
# is followed by its row and column counts
heart_df

In [None]:
# We can see that some rows are masked which prevents the full view of the data.

In [None]:
# Check for duplicate rows and remove them.
# drop_duplicates() returns a NEW frame and leaves heart_df untouched, so the
# result has to be assigned back; otherwise the shape below can never change.
n_duplicate_rows = int(heart_df.duplicated().sum())
print("duplicate rows found:", n_duplicate_rows)
heart_df = heart_df.drop_duplicates()
heart_df.shape

In [None]:
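# Compare this shape with the one shown earlier: an unchanged row count means no duplicate rows were removed. Note that drop_duplicates() returns a new frame, so duplicates are only removed from heart_df when the result is assigned back.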

In [None]:
# Count the missing (null) values in each column
heart_df.isnull().sum()

In [None]:
# Print a summary of the dataset: per-column non-null counts and data types,
# plus the index range and memory usage
heart_df.info()

In [None]:
# Since the data types are int64 and float64, we presume there is no
# non-numerical data in the dataset. isna()/isnull() only count missing values,
# so to actually confirm this we list the columns whose dtype is not numeric;
# an empty list means every column is numeric.
heart_df.select_dtypes(exclude='number').columns.tolist()

In [None]:
# Summary statistics per column: count, mean, standard deviation, min, the
# 25%/50%/75% quartiles and max (the IQR is the 75% value minus the 25% value)
heart_df.describe()

In [None]:
# Give the abbreviated columns more descriptive names; columns not listed in
# the mapping keep their original name
column_renames = {
    'cp': 'chest_pain',
    'trestbps': 'resting_blood_pressure',
    'chol': 'cholestrol',
    'fbs': 'fasting_blood_sugar',
    'restecg': 'rest_ECG',
    'thal': 'thallium_stress',
    'target': 'is_heart_disease',
}
heart_df = heart_df.rename(columns=column_renames)

In [None]:
# Count the distinct values in each column; a small count suggests the column
# holds categorical codes rather than a continuous measurement
heart_df.nunique()

In [None]:
# Print the frequency of every distinct value, one column at a time
for column_name, column_values in heart_df.items():
    print("----", column_name, "---")
    print(column_values.value_counts())

In [None]:
# From the above output, we found that some columns contain categorical data.
# NOTE(review): no encoding step is performed in this cell; it only previews
# the frame. The columns are already numeric (int64/float64 per info() above).
heart_df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = heart_df.corr()
fig, ax = plt.subplots(figsize=(15, 7))
ax = sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", linewidths=.5, ax=ax)

In [None]:
# From the heatmap above, chest_pain, thalach and slope appear to have the strongest positive correlation with the target (is_heart_disease).

In [None]:
# Box plots of every column on one shared axis, with the column labels rotated
# so they do not overlap
fig, ax = plt.subplots(figsize=(15, 7))
heart_df.boxplot(ax=ax, rot=45)
plt.show()

In [None]:
# Out of 14 features, 7 features seem to have outliers in the data set.
# cholestrol contains values that are much higher than those of the other features.

# Data pre-processing - outlier cleaning

In [None]:
# Closer look at the cholestrol outliers on a dedicated box plot
fig, ax = plt.subplots(figsize=(6, 8))
heart_df['cholestrol'].plot.box(ax=ax)
ax.set_title('Box plot summary of cholestrol')
ax.set_yticks(range(200, 600, 50))
ax.grid()
plt.show()

In [None]:
# Use the 1st and 99th percentiles of cholestrol as the outlier cut-offs
chol_values = heart_df['cholestrol']
upper_limit_chol = chol_values.quantile(0.99)
lower_limit_chol = chol_values.quantile(0.01)
print("upper_limit_chol", upper_limit_chol)
print("lower_limit_chol", lower_limit_chol)

In [None]:
# Keep only the rows whose cholestrol lies within the limits (both inclusive)
chol_in_range = heart_df['cholestrol'].between(lower_limit_chol, upper_limit_chol)
heart_df = heart_df[chol_in_range]
heart_df.describe()

In [None]:
# Box plot of cholestrol again, now on the trimmed data
fig, ax = plt.subplots(figsize=(6, 8))
heart_df['cholestrol'].plot.box(ax=ax)
ax.set_title('Box plot summary of cholestrol')
ax.set_yticks(range(200, 600, 50))
ax.grid()
plt.show()

In [None]:
# cholestrol is handled; next attribute: inspect thalach before trimming
fig, ax = plt.subplots(figsize=(6, 8))
heart_df['thalach'].plot.box(ax=ax)
ax.set_title('Box plot summary of thalach')
ax.grid()
plt.show()

In [None]:
# Use the 1st and 99th percentiles of thalach as the outlier cut-offs
thalach_values = heart_df['thalach']
upper_limit_thalach = thalach_values.quantile(0.99)
lower_limit_thalach = thalach_values.quantile(0.01)
print("upper_limit_thalach", upper_limit_thalach)
print("lower_limit_thalach", lower_limit_thalach)

In [None]:
# Keep only the rows whose thalach lies within the limits (both inclusive)
thalach_in_range = heart_df['thalach'].between(lower_limit_thalach, upper_limit_thalach)
heart_df = heart_df[thalach_in_range]
heart_df.describe()

In [None]:
# Box plot of thalach again, now on the trimmed data
fig, ax = plt.subplots(figsize=(6, 8))
heart_df['thalach'].plot.box(ax=ax)
ax.set_title('Box plot summary of thalach')
ax.grid()
plt.show()

In [None]:
# thalach is trimmed too; next attribute: inspect resting_blood_pressure
# before trimming
fig, ax = plt.subplots(figsize=(6, 8))
heart_df['resting_blood_pressure'].plot.box(ax=ax)
ax.set_title('Box plot summary of resting_blood_pressure')
ax.grid()
plt.show()

In [None]:
# Use the 1st and 99th percentiles of resting_blood_pressure as the outlier cut-offs
rbp_values = heart_df['resting_blood_pressure']
upper_limit_RBP = rbp_values.quantile(0.99)
lower_limit_RBP = rbp_values.quantile(0.01)
print("upper_limit_RBP", upper_limit_RBP)
print("lower_limit_RBP", lower_limit_RBP)

In [None]:
# Keep only the rows whose resting_blood_pressure lies within the limits
# (both inclusive)
rbp_in_range = heart_df['resting_blood_pressure'].between(lower_limit_RBP, upper_limit_RBP)
heart_df = heart_df[rbp_in_range]
heart_df.describe()

In [None]:
# Box plot of resting_blood_pressure again, now on the trimmed data
fig, ax = plt.subplots(figsize=(6, 8))
heart_df['resting_blood_pressure'].plot.box(ax=ax)
ax.set_title('Box plot summary of Resting_Blood_Pressure')
ax.grid()
plt.show()

In [None]:
# Now the data looks more clean! Let's check for skewness!

In [None]:
# Estimate the distribution of each column with one density plot per column,
# arranged in a 7x2 grid with independent axes
heart_df.plot.density(
    subplots=True,
    layout=(7, 2),
    sharex=False,
    sharey=False,
    fontsize=15,
    figsize=(20, 20),
)
density_fig = plt.gcf()
density_fig.suptitle("PDF", y=1.00, fontweight='bold', fontsize=20)
density_fig.subplots_adjust(hspace=1.5, wspace=0.5)
plt.show()

In [None]:
# Histogram of each column's values, used to judge skewness.
# The frame is plotted directly: calling .diff() first would histogram the
# differences between consecutive rows, which says nothing about how each
# feature is distributed.
heart_df.hist(color="k", alpha=0.5, layout=(7,2),sharex=False,sharey=False, figsize=(15,15));
plt.suptitle("Histogram", y=1.00, fontweight='bold', fontsize=20)
plt.subplots_adjust(hspace=0.5,wspace=0.5)
plt.show()

In [None]:
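# Skipping standardization and normalization at this stage; the outliers have already been treated
# and the data is in a range that is easy to handle. Note that trimming outliers does not put the
# features on a common scale, so distance-based models (e.g. KNN) can still benefit from scaling.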

# Feature Selection

### We have both categorical and numerical input features, and the output is categorical. So let's apply the chi2 (for the categorical inputs) and ANOVA (for the numerical inputs) feature selection techniques to select the top 10 features. We will use the top 10 features obtained from these selection techniques as inputs to the models.

In [None]:
# Preview the frame as it stands before the feature-selection cells
heart_df.head()

In [None]:

## ***************Next Steps: perform Feature Selection*************** ##
# Columns treated as categorical inputs; the remaining non-target columns are
# treated as numerical inputs
categorical_columns = ['sex', 'chest_pain', 'fasting_blood_sugar', 'rest_ECG',
                       'exang', 'slope', 'ca', 'thallium_stress']
X_cat = heart_df[categorical_columns]
X_num = heart_df.drop(columns=categorical_columns + ['is_heart_disease'])

# The target is taken from the last column of the frame
y = heart_df.iloc[:, -1]
X_cat

In [None]:
# Display the numerical input variables (categorical columns and target removed)
X_num

In [None]:
## *************** Method 1a:Feature Selection - chi2 *************** ##
from sklearn.feature_selection import SelectKBest, chi2

# Score every categorical feature against the target with the chi2 statistic.
# k='all' keeps all features, so this step only ranks them; nothing is dropped.
BestFeatures = SelectKBest(score_func=chi2, k='all')
fit = BestFeatures.fit(X_cat, y)

# One single-column frame for the feature names, one for their chi2 scores
chi2_df_columns = pd.DataFrame(X_cat.columns)
chi2_df_scores = pd.DataFrame(fit.scores_)

# Put names and scores side by side and label the two columns
chi2_feature_Scores = pd.concat((chi2_df_columns, chi2_df_scores), axis=1)
chi2_feature_Scores.columns = ['Ranked Attributes/Features', 'Score']

print('##TOP chi2 FEATURES')
chi2_feature_Scores  # a higher score means a more important feature

In [None]:
## *************** Method 1b:Feature Selection - ANOVA *************** ##
from sklearn.feature_selection import f_classif

# Score every numerical feature against the target with the ANOVA F-measure.
# k='all' keeps all features, so this step only ranks them; nothing is dropped.
BestFeatures = SelectKBest(score_func=f_classif, k='all')
fit = BestFeatures.fit(X_num, y)

# One single-column frame for the feature names, one for their F-scores
anova_df_columns = pd.DataFrame(X_num.columns)
anova_df_scores = pd.DataFrame(fit.scores_)

# Put names and scores side by side and label the two columns
anova_feature_Scores = pd.concat((anova_df_columns, anova_df_scores), axis=1)
anova_feature_Scores.columns = ['Ranked Attributes/Features', 'Score']

print('##TOP Anova FEATURES')
anova_feature_Scores  # a higher score means a more important feature

In [None]:
# Stack the chi2 and ANOVA score tables into a single table.
# ignore_index=True gives every row a unique label (both inputs are numbered
# from 0, so a plain concat would repeat labels), and sorting by Score makes
# the displayed table actually appear in score order.
# NOTE(review): chi2 statistics and ANOVA F-values are on different scales, so
# ranking them in one list is a heuristic -- confirm this is intended.
frames = [chi2_feature_Scores, anova_feature_Scores]
result = (
    pd.concat(frames, ignore_index=True)
    .sort_values('Score', ascending=False)
    .reset_index(drop=True)
)
result

In [None]:
# Select the 10 rows with the largest Score from the combined chi2/ANOVA table
chi2_anova_top_10 = result.nlargest(n=10, columns='Score')
print('##TOP 10 FEATURES - chi2_anova_top_10')
chi2_anova_top_10

### Model - RandomForest

In [None]:
# Feature columns fed to the models, and the target column to predict
selected_features = [
    'oldpeak', 'ca', 'thalach', 'chest_pain', 'exang',
    'age', 'slope', 'sex', 'thallium_stress', 'resting_blood_pressure',
]
X = heart_df[selected_features]
Y = heart_df['is_heart_disease']
X

In [None]:
# Display the target variable (the is_heart_disease column)
Y

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import (
    accuracy_score, confusion_matrix, mean_squared_error, classification_report,
    f1_score, precision_score, recall_score, roc_curve, roc_auc_score,
)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
train_X, test_X, train_y, test_y = train_test_split(X, Y,test_size=0.2,random_state=100)

# Seed the forest as well: without random_state every run grows different
# trees, so the reported scores would not be reproducible.
model_RF = RandomForestClassifier(n_estimators=30, random_state=100)
model_RF.fit(train_X, train_y)
prediction_RF = model_RF.predict(test_X)

#Score calculation for RandomForest
acc_RF = accuracy_score(test_y,prediction_RF)
trainscore_RF =  model_RF.score(train_X,train_y)
testscore_RF =  model_RF.score(test_X,test_y)
conf_matrix_RF = confusion_matrix(test_y,prediction_RF)

print("Training Accuracy: ", trainscore_RF*100,'\n')
print("Test Accuracy: ", testscore_RF*100,'\n')
print("Classification report: \n", classification_report(test_y,prediction_RF))
print('Confusion Matrix : \n', conf_matrix_RF)

# drawing confusion matrix (rows are the true labels, columns the predictions).
# `center` expects a numeric colormap midpoint, so the former center=True
# (silently treated as 1) has been dropped.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix_RF, annot=True, fmt="d", cmap="RdYlBu", ax=ax)
ax.set(title='Confusion matrix - RandomForest', xlabel='Predicted label', ylabel='True label')
plt.show()

### Model - KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# KNN classifies by distance, so features with a large numeric range (e.g.
# resting_blood_pressure) would dominate 0/1 features such as sex. The pipeline
# rescales every feature to [0, 1] first; the scaler is fitted on the training
# rows only and then applied unchanged whenever the model predicts or scores.
model_KNN = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors = 5))
model_KNN.fit(train_X,train_y)
prediction_KNN = model_KNN.predict(test_X)  # Make predictions

#Score calculation for KNN
acc_KNN = accuracy_score(test_y,prediction_KNN)
trainscore_KNN =  model_KNN.score(train_X,train_y)
testscore_KNN =  model_KNN.score(test_X,test_y)
conf_matrix_KNN = confusion_matrix(test_y,prediction_KNN)

print("Training Accuracy: ", trainscore_KNN*100,'\n')
print("Test Accuracy: ", testscore_KNN*100,'\n')
print("Classification report: \n", classification_report(test_y,prediction_KNN))
print('Confusion Matrix : \n', conf_matrix_KNN)

# drawing confusion matrix (rows are the true labels, columns the predictions).
# `center` expects a numeric colormap midpoint, so the former center=True
# (silently treated as 1) has been dropped.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix_KNN, annot=True, fmt="d", cmap="RdYlBu", ax=ax)
ax.set(title='Confusion matrix - KNN', xlabel='Predicted label', ylabel='True label')
plt.show()