In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import seaborn as sns 

# PART 1 : DATA PRE-PROCESSING 

## 1.1. Check the data

In [None]:
# Show every column when a frame is displayed
pd.set_option("display.max_columns", None)

# Read the heart-disease table from the Kaggle input directory
data = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")

# Preview the first ten rows
data.head(10)

In [None]:
# (rows, columns) of the raw table
data.shape

In [None]:
# Column names, dtypes and non-null counts
data.info()

In [None]:
# Descriptive statistics per column (pandas restricts this to numeric columns by default)
data.describe()

In [None]:
# Number of missing values in each column
data.isnull().sum()

In [None]:
# NOTE: `isna` is an alias of `isnull`, so this repeats the per-column missing-value count
data.isna().sum()

In [None]:
# Report, for every column, whether all of its values are distinct
column_names = data.columns
for name in column_names:
    print(f'{name} is unique: {data[name].is_unique}')

## 1.2. Data preprocessing : standardization and encoding

In [None]:
# Columns handled as categorical by the preprocessing step
categorical_features = [
    'sex', 'exang', 'ca', 'cp',
    'fbs', 'restecg', 'slope', 'thal',
]
# Columns handled as quantitative by the preprocessing step
quantitative_features = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
]
# Every feature column: categorical ones first, then quantitative ones
features = [*categorical_features, *quantitative_features]


# Pre-processing function: standardize quantitative columns, encode categorical ones, split off the target
def datapreprocessing(data, categorical_features, quantitative_features):
    """Scale and encode ``data`` in place, then separate features from the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the listed feature columns and a ``'target'`` column.
        The listed columns are overwritten in place, so pass a copy if the
        original values are still needed.
    categorical_features : list of str
        Columns replaced by the integer codes produced by ``LabelEncoder``.
    quantitative_features : list of str
        Columns replaced by the output of ``StandardScaler``.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is ``data`` without the ``'target'`` column and
        ``Y`` is the ``'target'`` column.
    """
    # Feature scaling: a fresh scaler is fitted per column, on every row of `data`.
    # NOTE(review): if the frame is split into train/test afterwards, the held-out
    # rows have already contributed to the scaling statistics.
    for column in quantitative_features:
        data[column] = StandardScaler().fit_transform(data[[column]])

    # Encoding categorical features: a fresh encoder is fitted per column
    for column in categorical_features:
        data[column] = LabelEncoder().fit_transform(data[column])

    feature_frame = data.drop(columns=['target'])
    target_column = data['target']
    return feature_frame, target_column


# Work on a copy: datapreprocessing overwrites columns of the frame it receives,
# so `data` keeps its raw values while `dataset` holds the scaled/encoded ones
dataset = data.copy()
X, Y = datapreprocessing(dataset, categorical_features, quantitative_features) 

# PART 2 : EXPLORATORY DATA ANALYSIS

### Data distribution

In [None]:
# Each axes-level plot (countplot / barplot) gets its own figure; without that they
# are all drawn onto the same current axes and overwrite one another.
# Quantitative columns are read from the raw `data` frame so the axes show the
# original values rather than the standardized ones stored in `X`.

# Output: class balance of the target
# NOTE(review): the "M=1 , B=0" wording looks inherited from another dataset;
# confirm how the two target classes should be named here.
fig, ax = plt.subplots()
sns.countplot(x=Y, ax=ax)
ax.set_title('Risks of heart disease (M=1 , B=0)')

# Age
fig, ax = plt.subplots()
sns.countplot(x=data["age"], ax=ax)
ax.set_title("Age distribution", fontsize=20)
ax.set_xlabel("AGE")

# Chest pain
# rename_axis / reset_index(name=...) pin the column names to "index" and "cp";
# a bare reset_index() names them differently depending on the pandas version.
chest_pain_count = X["cp"].value_counts().rename_axis("index").reset_index(name="cp")
fig, ax = plt.subplots()
sns.barplot(x=chest_pain_count["index"], y=chest_pain_count["cp"], ax=ax)
ax.set_title("TYPE OF CHEST PAIN WITH NUMBER OF PATIENTS", fontsize=20)
ax.set_xlabel("CHEST PAIN TYPE")
ax.set_ylabel("NUMBER OF PATIENTS")

# displot is figure-level: it opens its own figure, which becomes the current one,
# so the plt.title / plt.xlabel calls that follow apply to it.
sns.displot(data["trestbps"])
plt.title("DISTRIBUTION OF BLOOD PRESSURE AMONG PATIENTS", fontsize=18)
plt.xlabel("BLOOD PRESSURE")

# Make single-letter colour codes such as "y" resolve to the seaborn palette
sns.set_color_codes()
sns.displot(data["chol"], color="y")
plt.title("DISTRIBUTION OF CHOLESTROL LEVEL AMONG PATIENTS", fontsize=18)
plt.xlabel("CHOLESTROL LEVEL")

sns.displot(data["thalach"], color="blue")
plt.title("DISTRIBUTION OF HEART RATE AMONG PATIENTS", fontsize=18)
plt.xlabel("HEART RATE");

### Bivariate analysis

In [None]:
# Categorical data: one count plot per categorical feature, bars split by target class.
# sns.countplot is the axes-level function and draws into the `ax` it is given;
# sns.catplot is figure-level, does not draw into a supplied axes and opens a new
# figure per call, which leaves this grid empty.
n_cols = 2
n_rows = max(1, -(-len(categorical_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(8, 2.5 * n_rows), squeeze=False)
axes = axes.ravel()
for ax, feature in zip(axes, categorical_features):
    sns.countplot(ax=ax, x=feature, data=dataset, hue="target")
    ax.set_title(feature)
# Hide the panels left over when the grid has more cells than features
for ax in axes[len(categorical_features):]:
    ax.set_visible(False)
fig.tight_layout()

In [None]:
# Split the feature frame into two based on diagnosis
dataM = X[Y == 1]
dataB = X[Y == 0]

# Quantitative data: one stacked histogram per quantitative feature.
# The grid is sized from the feature list so that no feature is silently dropped
# (a fixed 2x2 grid only has room for four of them).
# NOTE(review): the 'M' / 'B' legend labels look inherited from another dataset;
# confirm how the two target classes should be named here.
n_bins = 50
n_cols = 2
n_rows = max(1, -(-len(quantitative_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 4 * n_rows), squeeze=False)
axes = axes.ravel()
for ax, feature in zip(axes, quantitative_features):
    # A bin count plus an explicit range gives exactly n_bins equal-width bins over
    # the feature's full span, shared by both groups
    ax.hist(
        [dataM[feature], dataB[feature]],
        bins=n_bins,
        range=(X[feature].min(), X[feature].max()),
        alpha=0.5,
        stacked=True,
        label=['M', 'B'],
        color=['r', 'g'],
    )
    ax.legend(loc='upper right')
    ax.set_title(feature)
# Hide the panels left over when the grid has more cells than features
for ax in axes[len(quantitative_features):]:
    ax.set_visible(False)
fig.tight_layout()

# PART 3 : FEATURE IMPORTANCE

In [None]:
# Hold out 20% of the rows for validation and keep the remaining 80% for training
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    shuffle=True,
    random_state=1,
)


from sklearn.ensemble import ExtraTreesClassifier

def feature_importance_plotting(x_train, y_train, n_estimators=10, random_state=0):
    """Fit an extra-trees forest, print the feature ranking and plot it as a bar chart.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; its column names label the bars.
    y_train : array-like
        Training labels passed to ``ExtraTreesClassifier.fit``.
    n_estimators : int, default 10
        Number of trees in the forest.
    random_state : int, default 0
        Seed handed to the forest.

    Returns
    -------
    None
        The ranking is printed and the chart is drawn on a new figure.
    """
    # Build a forest and compute the impurity-based feature importances
    model = ExtraTreesClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(x_train, y_train)
    importances = model.feature_importances_
    # Spread of each feature's importance across the individual trees (error bars)
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    # Feature positions ordered from most to least important
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))

    # Plot the impurity-based feature importances of the forest.
    # A new figure is always created: plt.figure(1, ...) would re-activate an
    # existing figure 1, ignore figsize and draw on top of its content.
    n_features = x_train.shape[1]
    fig, ax = plt.subplots(figsize=(14, 13))
    ax.set_title("Feature importances")
    ax.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
    ax.set_xticks(range(n_features))
    ax.set_xticklabels(x_train.columns[indices], rotation=90)
    ax.set_xlim([-1, n_features])


# Rank the features using the training split only
feature_importance_plotting(X_train, Y_train)