# 1)Importing the required libraries



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline

# 2)Loading the datasets

In [None]:
# Read the heart-attack CSV into a DataFrame.
DATA_PATH = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded dataset
dataset.head()

In [None]:
# Column names, dtypes and non-null counts
dataset.info()

In [None]:
# Summary statistics of the dataset's columns
dataset.describe()

### **Variable Description**

#### **Age** : Age of the patient

#### **Sex** : Sex of the patient

#### **Exang** : exercise induced angina (1 = yes; 0 = no)

#### **Ca** : number of major vessels (0-3)

#### **Cp** : Chest pain type

* Value 1: typical angina
* Value 2: atypical angina
* Value 3: non-anginal pain
* Value 4: asymptomatic

#### **trtbps** : resting blood pressure (in mm Hg)

#### **Chol** : cholesterol in mg/dl fetched via BMI sensor

#### **Fbs** : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

#### **Rest_ecg** : resting electrocardiographic results

* Value 0: normal
* Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

#### **Thalach** : maximum heart rate achieved

#### **Target** : 0= less chance of heart attack 1= more chance of heart attack

# 3)Exploratory Data Analysis

## >Univariate Analysis

In [None]:
# Age distribution of the patients whose output is non-zero.
# Rows are selected with a boolean mask rather than by multiplying age by
# output and dropping zeros: the multiplication trick would silently discard
# any genuine zero value and throws away the column name.
high_risk_age = dataset.loc[dataset['output'] != 0, 'age']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(x=high_risk_age, binwidth=5, kde=True, ax=ax)
ax.set_xlabel('AGE')
ax.set_ylabel('Frequency')
ax.set_title("Age v/s Number of heart attacks")
# We observe that people in the age of 40-60 got more heart attacks compared to others

In [None]:
# Age histogram split by the output class, with a KDE curve per class.
fig = plt.figure(figsize=(10, 6))
sns.histplot(x="age", hue="output", data=dataset, kde=True)

In [None]:
# Number of patients with a non-zero output, per sex.
# The bars used to be labelled with white text placed in figure coordinates,
# which only reads correctly if a tall enough bar happens to sit behind it.
# The categories are now labelled on the axis itself. The 0 -> "FeMales",
# 1 -> "Males" mapping follows the left/right placement of the original text.
sex_labels = dataset.loc[dataset['output'] != 0, 'sex'].map({0: 'FeMales', 1: 'Males'})

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=sex_labels, order=['FeMales', 'Males'], color='r', ax=ax)
ax.set_xlabel('sex')
ax.set_ylabel('Count')

In [None]:
# Chest-pain type counts among the patients whose output is non-zero.
cp_high_risk = dataset.loc[dataset['output'] != 0, 'cp']
plt.title("Chest Pain type v/s # heart attacks")
plt.ylabel("Chest Pain Type")
sns.histplot(y=cp_high_risk)
# We observe that people who got chest pain of type 3(value 2) are most likely to suffer heart attack

In [None]:
# Histograms of the continuous measurements for patients whose output is
# non-zero, one panel per column. The four panels differ only in column,
# colour and labels, so they are described as data and drawn in one loop.
high_risk = dataset[dataset['output'] != 0]

# (column, bar colour, panel title, x-axis label)
panels = [
    ('trtbps', 'r', 'resting blood pressure level v/s #heart attacks', 'trtbps'),
    ('chol', 'orange', 'cholestoral level v/s #heart attacks', 'cholestoral'),
    ('thalachh', 'pink', 'maximum heart rate achieved v/s #heart attacks', 'thalachh'),
    ('oldpeak', 'green', 'Previous peak v/s #heart attacks', 'oldpeak'),
]

fig, axs = plt.subplots(ncols=len(panels), figsize=(25, 5))
for ax, (column, colour, title, xlabel) in zip(axs, panels):
    sns.histplot(x=high_risk[column], ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')

In [None]:
# Pie charts of the categorical symptoms for patients whose output is non-zero.
# The six panels share the same recipe, so they are described as data.
high_risk = dataset[dataset['output'] != 0]

# (column, panel title, True when the column is a 0/1 flag shown as NO/YES)
pie_specs = [
    ('fbs', "fasting blood sugar > 120", True),
    ('restecg', "resting electrocardiographic level", False),
    ('exng', "exercise induced angina", True),
    ('slp', "Slope", False),
    ('caa', "number of major vessels", False),
    ('thall', "Thal rate", False),
]

fig, axs = plt.subplots(2, 3, figsize=(10, 6))
# The target marks a higher chance of a heart attack, not a death, so the
# title says so (and spells "patients" correctly).
fig.suptitle("VARIOUS SYMPTOMS OBSERVED IN PATIENTS WITH HIGHER CHANCE OF HEART ATTACK", fontsize=14)

for ax, (column, title, is_flag) in zip(axs.flat, pie_specs):
    counts = high_risk[column].value_counts()
    if is_flag:
        # value_counts() orders by frequency, so positional labels would be
        # swapped whenever 1 is more common than 0. Pin the order to 0, 1
        # before attaching the NO/YES labels.
        counts = counts.reindex([0, 1], fill_value=0)
        counts.plot.pie(labels=['NO', 'YES'], ax=ax)
        ax.legend()
    else:
        counts.plot.pie(ax=ax)
    ax.set_title(title)


## Multivariate Analysis

In [None]:
# Continuous columns plus the target, kept together for the pair plots.
pairplot_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak', 'output']
df = dataset[pairplot_columns]

In [None]:
# Pairwise relationships of the continuous columns, coloured by output
sns.pairplot(data=df, hue='output')

In [None]:
# Same pair plot, drawn as KDE contours instead of scatter points
sns.pairplot(data=df, kind='kde', hue='output')

In [None]:
# Pair plot over every column of the full dataset, coloured by output
sns.pairplot(data=dataset, hue='output')

In [None]:
# Correlation matrix of all columns, annotated with the coefficient values.
# `color` is not a heatmap option; the colour scheme is chosen with `cmap`.
# A diverging map centred on zero keeps positive and negative correlations
# visually distinct.
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(dataset.corr(), cmap='coolwarm', center=0, annot=True, ax=ax)

#### Observations

1) The following symptoms are more likely to appear in patients who suffered a heart attack

*  Chestpain type 3(Non anginal chest pains)
*  number of major vessels 0
*  Higher Thall rate
*  People with exercise induced angina
*  People with higher old peak value

2)Correlations

*   OldPeak and slp show a strongly negative correlation
*   Slp, thalachh and cp show a strongly positive correlation

# 4)Preprocessing Data

## Handling the missing data

In [None]:
# Count the null entries in every column
missing_values_count = dataset.isna().sum()
missing_values_count
# No missing values observed

## One hot encoding,Scaling and Splitting the data

In [None]:
# Categorical columns (one-hot encoded below): cp, restecg, exng, slp, caa, thall
# Continuous columns (standardised below): age, trtbps, chol, thalachh, oldpeak

In [None]:
# Preview the first rows again before encoding
dataset.head()

In [None]:
# One-hot encode the categorical columns; the other columns pass through.
categorical_cols = ['cp', 'restecg', 'exng', 'slp', 'caa', 'thall']
df = pd.get_dummies(data=dataset, columns=categorical_cols)
df

In [None]:
# Separate the target column from the feature matrix
y = df['output']
X = df.drop(columns='output')

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the continuous columns. The scaler is fitted on the training
# rows only and then applied to the test rows, so no test statistics leak
# into training.
numeric_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# Work on explicit copies: the frames returned by train_test_split are slices
# of X, and assigning into them can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Fit and transform with the same input type (DataFrames). Fitting on a bare
# ndarray and transforming a DataFrame makes scikit-learn warn about
# mismatched feature names.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# 5)Performing machine learning algorithms on our data

## a)Logistic Regression
<hr>
<table>
  <tr>
    <th>Train Accuracy</th>
    <th>Test Accuracy</th>
    <th>Mean(10-fold CV score)</th>
    <th>Standard Deviation(10-fold CV Score)</th>
    <th>F1 Score 0</th>
     <th>F1 Score 1</th>
  </tr>
  <tr>
    <td>86.78%</td>
    <td>90.16%</td>
    <td>86.00%</td>
    <td>5.27%</td>
    <td>0.90</td>
    <td>0.91</td>
  </tr>
</table>


**Model**

In [None]:
# Baseline logistic regression with default regularisation.
lr_clf = LogisticRegression(random_state=0, max_iter=1000)
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
print("Train Accuracy : {:.2f}%".format(accuracy_score(y_train, lr_clf.predict(X_train)) * 100))
print("Test Accuracy  : {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
# get_params has to be called; printing the bare attribute only shows the
# bound-method repr instead of the parameter dictionary.
print(lr_clf.get_params())

**Exhaustive Grid Search**

In [None]:
# Exhaustive search over logistic-regression hyperparameters.
# The grid is split by penalty because not every solver supports every
# penalty, and l1_ratio only has an effect with 'elasticnet'. A single
# cross-product grid spends most fits on combinations that either raise
# (scored as NaN) or repeat the same model for each l1_ratio value.
C_values = [100, 10, 1.0, 0.1, 0.01, 0.0001]
param = [
    # l2 is supported by all four solvers
    {'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
     'C': C_values},
    # l1 is only supported by liblinear and saga
    {'penalty': ['l1'],
     'solver': ['liblinear', 'saga'],
     'C': C_values},
    # elasticnet is only supported by saga and is the only user of l1_ratio
    {'penalty': ['elasticnet'],
     'solver': ['saga'],
     'C': C_values,
     'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
]
grid_clf = GridSearchCV(lr_clf, param_grid=param, scoring='accuracy', cv=10)
grid_clf.fit(X_train, y_train)
grid_clf.best_params_

**Training the model with best params**

In [None]:
# Logistic regression refitted with the selected hyperparameters.
# The saga solver shuffles the data, so random_state is fixed (matching the
# baseline model) to keep the reported scores reproducible across runs.
clf = LogisticRegression(
    C=1.0,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.2,
    max_iter=1000,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

**Confusion Matrix**

In [None]:
# Confusion matrix of the test predictions, followed by the test accuracy
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)
# False negatives = False positives

**Train/Test Accuracy**

In [None]:
# Accuracy of the tuned logistic regression on the train and test splits
train_accuracy = accuracy_score(y_train, clf.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy * 100:.2f}%")
print(f"Test Accuracy  : {test_accuracy * 100:.2f}%")

**Classification Report**

In [None]:
# Per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

**K-fold Cross Validation**

In [None]:
# 10-fold cross-validated accuracy of the tuned model on the training split
accuracies = cross_val_score(clf, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")

## b)K-NN
<hr>
<table>
  <tr>
    <th>Train Accuracy</th>
    <th>Test Accuracy</th>
    <th>Mean(10-fold CV score)</th>
    <th>Standard Deviation(10-fold CV Score)</th>
    <th>F1 Score 0</th>
     <th>F1 Score 1</th>
  </tr>
  <tr>
    <td>88.43%</td>
    <td>90.16%</td>
    <td>84.00%</td>
    <td>6.48%</td>
    <td>0.89</td>
    <td>0.91</td>
  </tr>
</table>

**Model**

In [None]:
# Baseline k-nearest-neighbours classifier (k=5, Euclidean distance).
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Call get_params so the cell shows the parameter dictionary rather than
# the bound-method repr.
classifier.get_params()

**Exhaustive Grid Search**

In [None]:
# Exhaustive search over k-NN hyperparameters.
# Minkowski distance with p=1 is Manhattan and with p=2 is Euclidean, and p
# is ignored by the named 'euclidean'/'manhattan' metrics. Listing all three
# metrics alongside p evaluated each of the two distances three times.
# Sweeping p alone covers the same models with a third of the fits.
param = [
    {
        'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski'],
        'p': [1, 2],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    },
]
grid_clf = GridSearchCV(classifier, param_grid=param, scoring='accuracy', cv=10)
grid_clf.fit(X_train, y_train)
grid_clf.best_params_

In [None]:
# k-NN refitted with the selected hyperparameters
knn_params = {
    'n_neighbors': 5,
    'metric': 'manhattan',
    'weights': 'uniform',
    'p': 1,
    'algorithm': 'auto',
}
classifier = KNeighborsClassifier(**knn_params)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

**Accuracy**

In [None]:
# Accuracy of the tuned k-NN model on the train and test splits
train_accuracy = accuracy_score(y_train, classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy * 100:.2f}%")
print(f"Test Accuracy  : {test_accuracy * 100:.2f}%")

**Confusion Matrix**

In [None]:
# Confusion matrix of the test predictions, followed by the test accuracy
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)
# False negatives < False positives

**Classification Report**

In [None]:
# Per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

**K fold Cross Validation**

In [None]:
# 10-fold cross-validated accuracy of the tuned model on the training split
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")

## c)SVM
<hr>
<table>
  <tr>
    <th>Train Accuracy</th>
    <th>Test Accuracy</th>
    <th>Mean(10-fold CV score)</th>
    <th>Standard Deviation(10-fold CV Score)</th>
    <th>F1 Score 0</th>
     <th>F1 Score 1</th>
  </tr>
  <tr>
    <td>93.80%</td>
    <td>90.16%</td>
    <td>80.15%</td>
    <td>5.43%</td>
    <td>0.90</td>
    <td>0.91</td>
  </tr>
</table>


**Model**

In [None]:
# Baseline support-vector classifier with default hyperparameters.
classifier = SVC(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Call get_params so the cell shows the parameter dictionary rather than
# the bound-method repr.
classifier.get_params()

**Exhaustive Grid Search**

In [None]:
# Exhaustive search over SVM hyperparameters.
# gamma is the coefficient of the poly/rbf/sigmoid kernels and has no effect
# on the linear kernel, so sweeping it there refits the same model once per
# gamma value. The linear kernel therefore gets its own grid without gamma.
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param = [
    {
        'kernel': ['linear'],
        'C': C_values,
    },
    {
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'C': C_values,
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale', 'auto'],
    },
]
grid_clf = GridSearchCV(classifier, param_grid=param, scoring='accuracy', cv=10)
grid_clf.fit(X_train, y_train)
grid_clf.best_params_

**Training the model with best params**

In [None]:
# SVM refitted with hand-entered hyperparameters.
# NOTE(review): C=1.445 is not one of the values in the grid search above -
# confirm where it came from.
svm_params = {'kernel': 'rbf', 'gamma': 'scale', 'C': 1.445, 'random_state': 0}
classifier = SVC(**svm_params)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix of the test predictions, followed by the test accuracy
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy of the tuned SVM on the train and test splits
train_accuracy = accuracy_score(y_train, classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy * 100:.2f}%")
print(f"Test Accuracy  : {test_accuracy * 100:.2f}%")

In [None]:
# Per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# 10-fold cross-validated accuracy of the tuned model on the training split
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")

## d)Decision Tree
<hr>
<table>
  <tr>
    <th>Train Accuracy</th>
    <th>Test Accuracy</th>
    <th>Mean(10-fold CV score)</th>
    <th>Standard Deviation(10-fold CV Score)</th>
    <th>F1 Score 0</th>
     <th>F1 Score 1</th>
  </tr>
  <tr>
    <td>89.67%</td>
    <td>85.25%</td>
    <td>83.47%</td>
    <td>5.88%</td>
    <td>0.85</td>
    <td>0.86</td>
  </tr>
</table>


**Model**

In [None]:
# Baseline decision tree using the entropy split criterion.
# The import stays in this cell because later decision-tree cells rely on it.
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Call get_params so the cell shows the parameter dictionary rather than
# the bound-method repr.
classifier.get_params()

**Grid Search CV**

In [None]:
# Exhaustive search over decision-tree hyperparameters.
# 'auto' has been dropped from max_features: for classifiers it was an alias
# of 'sqrt' (a duplicate candidate), and it was removed in scikit-learn 1.3,
# where every fit using it fails parameter validation.
param = [
    {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [3, None],
        'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 'sqrt', 'log2', None],
        'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    },
]
grid_clf = GridSearchCV(classifier, param_grid=param, scoring='accuracy', cv=10)
grid_clf.fit(X_train, y_train)
grid_clf.best_params_

**Fitting the model**

In [None]:
# Decision tree refitted with hand-entered hyperparameters.
# NOTE(review): max_depth=10 is not one of the values in the grid search
# above - confirm where it came from.
tree_params = {
    'criterion': 'gini',
    'max_depth': 10,
    'max_features': 5,
    'min_samples_leaf': 3,
    'splitter': 'best',
    'random_state': 0,
}
classifier = DecisionTreeClassifier(**tree_params)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Accuracy of the tuned decision tree on the train and test splits
train_accuracy = accuracy_score(y_train, classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy * 100:.2f}%")
print(f"Test Accuracy  : {test_accuracy * 100:.2f}%")

**Confusion Matrix**

In [None]:
# Confusion matrix of the test predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

**Classification Report**

In [None]:
# Per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

**K-fold Cross Validation**

In [None]:
# 10-fold cross-validated accuracy of the tuned model on the training split
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")

## e)Random Forest
<hr>
<table>
  <tr>
    <th>Train Accuracy</th>
    <th>Test Accuracy</th>
    <th>Mean(10-fold CV score)</th>
    <th>Standard Deviation(10-fold CV Score)</th>
    <th>F1 Score 0</th>
     <th>F1 Score 1</th>
  </tr>
  <tr>
    <td>92.15%</td>
    <td>88.52%</td>
    <td>81.00%</td>
    <td>2.45%</td>
    <td>0.88</td>
    <td>0.89</td>
  </tr>
</table>


**Model**

In [None]:
# Baseline random forest: 10 entropy trees, each trained on the full
# training set (bootstrap disabled).
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0, bootstrap=False)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Call get_params so the cell shows the parameter dictionary rather than
# the bound-method repr.
classifier.get_params()

**Random Hyperparameter Grid**

In [None]:
# Search space for the randomized random-forest hyperparameter search.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, so it is replaced by 'log2' to keep two distinct options.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

**Random Search Training**

In [None]:
# Randomized search: 100 sampled configurations, 3-fold CV, all cores.
# The base estimator is seeded like the other random forests in this
# notebook, so the search scores are reproducible; random_state on the search
# itself only fixes which configurations are sampled.
rf = RandomForestClassifier(random_state=0)
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100, cv=3,
                               verbose=2, random_state=42,
                               n_jobs=-1)
# Fit once: a second identical fit() call only repeated all 300 fits.
rf_random.fit(X_train, y_train)

In [None]:
# Random forest refitted with hand-entered hyperparameters
forest_params = {
    'bootstrap': True,
    'max_depth': 10,
    'n_estimators': 400,
    'criterion': 'entropy',
    'random_state': 0,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 2,
}
classifier = RandomForestClassifier(**forest_params)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

**Train,Test Accuracy**

In [None]:
# Accuracy of the tuned random forest on the train and test splits
train_accuracy = accuracy_score(y_train, classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy * 100:.2f}%")
print(f"Test Accuracy  : {test_accuracy * 100:.2f}%")

**Confusion Matrix**

In [None]:
# Confusion matrix of the test predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

**Classification Report**

In [None]:
# Per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

**K Fold Cross Validation**

In [None]:
# 10-fold cross-validated accuracy of the tuned model on the training split.
# cv is 10 rather than 3 so the figures match the "10-fold CV" columns of
# the result tables and are comparable with the other models.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean() * 100))
print("Standard Deviation: {:.2f} %".format(accuracies.std() * 100))

# Results And Observations

#### Results
<table>
<tr>
    <th>MODEL</th>
    <th>Train Accuracy</th>
    <th>Test Accuracy</th>
    <th>Mean(10-fold CV score)</th>
    <th>Standard Deviation(10-fold CV Score)</th>
    <th>F1 Score 0</th>
     <th>F1 Score 1</th>
</tr>

<tr>
     <th scope="row">Logistic Regression</th>
    <td>86.78%</td>
    <td>90.16%</td>
    <td>86.00%</td>
    <td>5.27%</td>
    <td>0.90</td>
    <td>0.91</td>
</tr>

<tr>
    <th scope="row" > KNN </th>
    <td>88.43%</td>
    <td>90.16%</td>
    <td>84.00%</td>
    <td>6.48%</td>
    <td>0.89</td>
    <td>0.91</td>
</tr>

<tr>
    <th scope="row" > SVM </th>
    <td>93.80%</td>
    <td>90.16%</td>
    <td>80.15%</td>
    <td>5.43%</td>
    <td>0.90</td>
    <td>0.91</td>
</tr>
<tr>
    <th scope="row" > Decision Tree </th>
    <td>89.67%</td>
    <td>85.25%</td>
    <td>83.47%</td>
    <td>5.88%</td>
    <td>0.85</td>
    <td>0.86</td>
</tr>

<tr>
    <th scope="row" > Random Forest</th>
    <td>92.15%</td>
    <td>88.52%</td>
    <td>81.00%</td>
    <td>2.45%</td>
    <td>0.88</td>
    <td>0.89</td>
</tr>

</table>
