# Dr. House in the House


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and preview its first rows.
HEART_CSV = r'/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(HEART_CSV)
df.head()

### **Variable Description**

#### **Age** : Age of the patient

#### **Sex** : Sex of the patient

#### **Exang** : exercise induced angina (1 = yes; 0 = no)

#### **Ca** : number of major vessels (0-3)

#### **Cp** : Chest pain type

* Value 1: typical angina
* Value 2: atypical angina
* Value 3: non-anginal pain
* Value 4: asymptomatic

#### **trtbps** : resting blood pressure (in mm Hg)

#### **Chol** : cholesterol in mg/dl fetched via BMI sensor

#### **Fbs** : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

#### **Rest_ecg** : resting electrocardiographic results

* Value 0: normal
* Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

#### **Thalach** : maximum heart rate achieved

#### **Target** : 0= less chance of heart attack 1= more chance of heart attack

In [None]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

#### Who doesn't love a complete dataset!
I hope I don't mess this up.

In [None]:
# Count the rows that are exact copies of an earlier row and print the total.
duplicate_rows = df.duplicated().sum()
print(duplicate_rows)

#### Let's drop the duplicates and move on

In [None]:
# Remove the duplicated rows in place, then print how many duplicates remain.
df.drop_duplicates(inplace=True)
remaining_duplicates = df.duplicated().sum()
print(remaining_duplicates)

#### Let's pair these columns up and pick the really fancy ones


In [None]:
# Switch to seaborn's default theme, then draw pairwise plots of all columns
# with the points coloured by the `output` column.
sns.set_theme()
pair_grid = sns.pairplot(data=df, hue='output')
plt.legend()

#### That was a lot, but now let's make a heatmap of the correlation matrix and put two and two together to get this rolling.

In [None]:
# Annotated heatmap of the pairwise column correlations on a 13x13 inch figure.
correlations = df.corr()
plt.figure(figsize=(13, 13))
# NOTE(review): heatmap picks its colours through `cmap`; `color='red'` is not one
# of its own parameters and is presumably just forwarded to matplotlib - confirm intent.
sns.heatmap(correlations, color='red', annot=True)

#### Looks like the variables **slp, thalachh, cp and oldpeak** show relatively strong positive/negative correlation. Let's take a closer look ~~with Seth Meyers~~ 

In [None]:
# 2x2 grid of plots, each splitting one feature by the `output` class.
# (The unused `widths` list from the earlier version of this cell was dropped.)
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

# Top-left: density of `slp`, one curve per output class.
sns.kdeplot(ax=axes[0, 0], x='slp', hue='output', data=df)

# Top-right: one `thalachh` bar per output class; the legend is placed in the
# centre of the axes.
g = sns.barplot(ax=axes[0, 1], y='thalachh', x='output', hue='output', data=df)
g.legend(loc='center')

# Bottom-left: number of rows per `cp` value, split by output class.
sns.countplot(ax=axes[1, 0], x='cp', hue='output', data=df)

# Bottom-right: every individual `oldpeak` value, grouped by output class.
sns.swarmplot(ax=axes[1, 1], y='oldpeak', x='output', hue='output', data=df)

### Quick Observations
#### 1. **thalachh** values tend to be **higher** for patients who are more **likely** to have heart attacks.
#### 2. **Non anginal chest pains** are relatively **high** for patients who are **likely** to have a heart attack
#### 3. The **oldpeak** distributions for the two groups of patients really **complement** each other.

#### Before we use a classification model on our features and output we need to do feature scaling to minimize any potential bias that can affect our model.

![](https://datascience.foundation/img/pdf_images/Equation4_Gradient-descent-update-rule.png)

#### I don't mean to scare you, but the 'x' in the equation means that gradient descent (the optimization technique this update rule describes), which is used by many popular classification algorithms (including the one I am planning to use), is greatly influenced by the scale of our feature variables. As you can see from our dataset, trtbps, chol and thalachh have values in the hundreds, while cp, slp and thall, just to name a few, are single digits. For this reason we have to scale their values. We can do this by either normalizing or standardizing them. I've chosen to standardize them, but I'll hopefully update this notebook in the coming days to show you how the results may vary if we normalize them instead.

# Logistic Regression Model

In [None]:
# Separate the predictors from the `output` label and hold out 20% of the rows
# for testing (fixed seed so the split is reproducible).
features = df.drop(columns='output')
output = df['output']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    output,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same scaling to
# both splits so no test-set statistics are used for fitting.
Scaleme = StandardScaler().fit(X_train)
X_train = Scaleme.transform(X_train)
X_test = Scaleme.transform(X_test)

# Train the logistic regression and predict the held-out rows.
# `fit` returns the estimator itself, so `model` and `Classifier` are the same object.
Classifier = LogisticRegression(random_state=45)
model = Classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [None]:
# Share of test rows the logistic regression labelled correctly, as a percentage.
logreg_accuracy = accuracy_score(y_test, y_pred.round()) * 100
print("The accuracy of the Logistic Regression model is : ", logreg_accuracy, "%")
def cmcrcheck(X_test,y_test,y_pred,model):
    """Report how a fitted binary classifier performed on the test split.

    Prints the classification report, draws the confusion matrix as an
    annotated heatmap and plots the ROC curve of ``model``.

    Args:
        X_test: Test features; passed with ``model`` to the ROC plot.
        y_test: True labels of the test rows (classes 0 and 1).
        y_pred: Labels predicted for ``X_test``.
        model: Fitted estimator whose ROC curve is drawn.

    Returns:
        None. The results are printed and shown as figures.
    """
    # Imported here so the function resolves the ROC helper itself at call time.
    from sklearn.metrics import RocCurveDisplay

    print(classification_report(y_test, y_pred))

    # Pin the label order so the matrix is always 2x2 and matches the frame's
    # index/columns, even if one class is missing from the predictions.
    classes = [0, 1]
    cm = confusion_matrix(y_test, y_pred, labels=classes)
    cmdf = pd.DataFrame(cm, index=classes, columns=classes)

    fig, axes = plt.subplots(figsize=(5, 5))
    g = sns.heatmap(cmdf, annot=True, cmap='Greens', fmt='.0f', ax=axes, cbar=False)
    g.set_xlabel('Predicted Value')
    g.set_ylabel('True Value')

    # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer its replacement and keep the old helper only for releases that
    # predate RocCurveDisplay.from_estimator.
    if hasattr(RocCurveDisplay, 'from_estimator'):
        RocCurveDisplay.from_estimator(model, X_test, y_test)
    else:
        plot_roc_curve(model, X_test, y_test)
    plt.show()
    
    
# Report, confusion matrix and ROC curve for the logistic regression model.
cmcrcheck(X_test=X_test, y_test=y_test, y_pred=y_pred, model=model)

    

# KNN Classifier

#### Also dubbed the lazy algorithm for its lack of emphasis on training. I can relate to this algorithm on a spiritual level lol. 

#### The KNeighborsClassifier simply takes the number of neighbours you want it to check and classifies new data based on those neighbours. It does so by calculating the Euclidean distance (straight-line distance) between the new data point and the training points to find the closest ones, then takes a majority vote among them to classify the new data.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit a 7-nearest-neighbours classifier on the scaled training rows and predict
# the held-out rows. `fit` returns the estimator, so the calls can be chained.
model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Raw confusion matrix, printed before the plotted version below.
knn_confusion = confusion_matrix(y_test, y_pred)
print(knn_confusion)

# Share of test rows KNN labelled correctly, as a percentage.
knn_accuracy = accuracy_score(y_test, y_pred.round()) * 100
print("The accuracy of KNN is : ", knn_accuracy, "%")

# Same report, confusion-matrix heatmap and ROC curve as for the first model.
cmcrcheck(X_test, y_test, y_pred, model)


##### We notice a smoother curve the second time, although it covers roughly the same area as our Logistic Regression model. The confusion matrix, however, clearly indicates why this model has a higher accuracy score than the first.

In [None]:
# Save the most recent predictions in `y_pred` to submission.csv, one row per test sample.
# `Id` is the row index of the test matrix once wrapped in a DataFrame.
# The frame is named `test_rows` so the builtin `id()` is no longer shadowed.
test_rows = pd.DataFrame(X_test)
output = pd.DataFrame({'Id': test_rows.index,
                       'Output': y_pred})
output.to_csv('submission.csv', index=False)

#### Any and all feedback is appreciated. Hope this was informative. Thanks for reading homie!