# **Imports and Reading Dataset**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the heart-attack CSV from the Kaggle input directory and preview the first rows.
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show pandas' descriptive summary of the dataset's columns.
df.describe()

In [None]:
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Visualise where (if anywhere) the dataset has missing values: every cell of the
# heatmap corresponds to one entry of df, coloured by its null flag.
null_mask = df.isnull()
heatmap_opts = dict(yticklabels=False, cbar=False, cmap='PuBu_r')

plt.figure(figsize=(22, 10))
plt.xticks(size=20, color='grey')
plt.tick_params(size=12, color='grey')
plt.title('Finding Null Values Using Heatmap\n', color='grey', size=30)
sns.heatmap(null_mask, **heatmap_opts)

# **Data Analysis**

**Feature Selection**

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Features are the first 13 columns; the target is the last column.
data = df.copy()
X = data.iloc[:, :13]
y = data.iloc[:, -1]

# Score every feature against the target with the chi-squared statistic.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# Put feature names and their scores side by side for easier reading.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

# Show the 12 highest-scoring features.
print(featureScores.nlargest(12, 'Score'))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees is a randomized ensemble: without a fixed seed the importances
# (and the bar chart below) change on every run. Seed 0 matches the seed used
# for the train/test split elsewhere in this notebook.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, y)
print(model.feature_importances_)

# Label each importance with its feature name and plot them as horizontal bars.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(13).plot(kind='barh')
plt.show()

In [None]:
# Correlation between every pair of columns, annotated to two decimals.
corr_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap="magma")

In [None]:
# Print each column name followed by its number of distinct values.
for col_name, col_values in df.items():
    print(col_name, len(col_values.unique()))

# **Data Visualization**

In [None]:
# Global seaborn look for the plots drawn after this cell:
# dark grid background and the 'Set2' colour palette.
sns.set_style('darkgrid')
sns.set_palette('Set2')

In [None]:
# Work on a copy so the label conversions applied to df2 do not modify df.
df2 = df.copy()
def chng(sex):
    """Return 'female' when ``sex`` equals 0, otherwise 'male'."""
    return 'female' if sex == 0 else 'male'
# Replace the numeric sex codes with readable labels, element by element.
df2['sex'] = df2['sex'].map(chng)
def chng2(prob):
    """Return 'Heart Disease' when ``prob`` equals 0, otherwise 'No Heart Disease'.

    Not idempotent: an already-converted label is not equal to 0, so passing
    one back in always yields 'No Heart Disease'.

    NOTE(review): this treats output == 0 as the diseased class - confirm
    against the dataset's documentation of the 'output' column.
    """
    return 'Heart Disease' if prob == 0 else 'No Heart Disease'
# Replace the numeric target codes with readable labels, element by element.
df2['output'] = df2['output'].map(chng2)

In [None]:
# df2['output'] already holds text labels from when df2 was built. Applying
# chng2 again would turn every label into 'No Heart Disease' (no label equals 0),
# collapsing the hue to a single class, so the column is used as-is.
sns.countplot(data=df2, x='sex', hue='output')
plt.title('Gender v/s target\n')

In [None]:
# Count of rows per sex, split by the 'thall' category.
ax = sns.countplot(x='sex', hue='thall', data=df2)
ax.set_title('Gender v/s Thalassemia\n')
print('Thalassemia (thal-uh-SEE-me-uh) is an inherited blood disorder that causes your body to have less hemoglobin than normal. Hemoglobin enables red blood cells to carry oxygen')

In [None]:
# Count of rows per 'slp' value, split by the target label.
ax = sns.countplot(x='slp', hue='output', data=df2)
ax.set_title('Slope v/s Target\n')

In [None]:
# Count of rows per 'exng' value, split by the 'thall' category.
ax = sns.countplot(x='exng', hue='thall', data=df2)
ax.set_title('exang v/s Thalassemia\n')

In [None]:
# Age distribution of the rows with output == 0 (the group this notebook labels
# as heart diseased). sns.distplot is deprecated, so the count histogram is drawn
# with sns.histplot, which also plots counts without a KDE by default.
plt.figure(figsize=(16, 7))
sns.histplot(df.loc[df['output'] == 0, 'age'], bins=50)
plt.title('Age of Heart Diseased Patients\n')

In [None]:
# Cholesterol distribution of the rows with output == 0 (the group this notebook
# labels as heart diseased). sns.distplot is deprecated, so the count histogram
# is drawn with sns.histplot, which also plots counts without a KDE by default.
plt.figure(figsize=(16, 7))
sns.histplot(df.loc[df['output'] == 0, 'chol'], bins=40)
plt.title('Chol of Heart Diseased Patients\n')

**Joint plots in seaborn help us understand the trend between two features**

In [None]:
# Columns available in df:
# age, sex, cp, trtbps, chol, fbs, restecg, thalachh, exng, oldpeak, slp, caa, thall, output

# Keep only the rows with output == 0, which this notebook treats as the
# heart-disease group; all columns are retained.
df3 = df.loc[df['output'] == 0, df.columns]
pal = sns.light_palette("blue", as_cmap=True)

print('Age vs trestbps(Heart Diseased Patinets)')
# Hexbin joint distribution of age against resting blood pressure.
sns.jointplot(x='age', y='trtbps', data=df3, kind='hex', cmap='Reds')

In [None]:
# Kernel-density joint plot of cholesterol against age for the df3 subset.
kde_opts = dict(kind='kde', cmap='PuBu')
sns.jointplot(x='chol', y='age', data=df3, **kde_opts)

In [None]:
# Residual joint plot of resting blood pressure against cholesterol for the df3 subset.
sns.jointplot(x='chol', y='trtbps', data=df3, kind='resid')

In [None]:
# Age distribution for each target label.
sns.boxplot(x='output', y='age', data=df2)


In [None]:
# 'thalachh' distribution per 'cp' value, split by target label.
sns.boxplot(x='cp', y='thalachh', hue='output', data=df2)

In [None]:
# Pairwise relationships between all columns, coloured by 'cp'.
sns.pairplot(data=df, hue='cp')

# **Logistic Regression**

In [None]:
# Target is the 'output' column; every other column is a feature.
y = df['output']
X = df.drop(columns='output')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

**Normalization**

In [None]:
# Min-max scale every feature using statistics from the TRAINING split only.
# Scaling the test split with its own min/max would put the two splits on
# different scales and leak test-set information into preprocessing.
# DataFrame.min()/max() reduce per column; np.min/np.max on a DataFrame reduce
# to a single scalar in recent pandas versions, which breaks the `.values` call.
train_min = X_train.min()
train_range = X_train.max() - train_min
# A constant column has range 0; divide by 1 instead so it maps to 0 rather than NaN.
train_range = train_range.replace(0, 1)

# Test values outside the training range will fall outside [0, 1]; that is expected.
X_test = (X_test - train_min) / train_range
X_train = (X_train - train_min) / train_range

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier on the training split
# (fit returns the estimator itself) and display it.
logre = LogisticRegression().fit(X_train, y_train)
logre

In [None]:
# Predict labels for the held-out split.
y_pred = logre.predict(X_test)

# Collect true and predicted labels side by side in a small table.
actual = list(y_test)
predcition = list(y_pred)
dic = dict(Actual=actual, Prediction=predcition)
result = pd.DataFrame(dic)

import plotly.graph_objects as go

# Overlay true labels (markers + lines) and predictions (markers) by sample position.
sample_positions = np.arange(0, len(y_test))
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=sample_positions, y=y_test, mode='markers+lines', name='Test')
)
fig.add_trace(
    go.Scatter(x=sample_positions, y=y_pred, mode='markers', name='Pred')
)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics on the test split.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix once, then show it as text and as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
print(cm)
sns.heatmap(cm, annot=True)

**ROC Curve**

In [None]:
from sklearn.metrics import roc_curve

# A ROC curve needs a continuous score so the decision threshold can be swept.
# Hard 0/1 predictions give only one operating point (a three-point "curve"),
# so use the predicted probability of the positive class instead. Column 1 of
# predict_proba is class 1, given the 0/1 labels sorted in logre.classes_.
y_score = logre.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Heart disease classifier')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)