In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in walk order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from IPython.display import display

# Both CSV files live in the same Kaggle dataset folder; keep the path in one place.
DATA_DIR = "/kaggle/input/heart-attack-analysis-prediction-dataset"

df_o2Saturation = pd.read_csv(f"{DATA_DIR}/o2Saturation.csv")
df_heart = pd.read_csv(f"{DATA_DIR}/heart.csv")

# A cell only auto-renders its final expression, so a bare
# `df_o2Saturation.head()` in the middle of the cell would be silently dropped.
# Display both previews explicitly instead.
display(df_o2Saturation.head())
display(df_heart.head())

In [None]:
# Only the last bare expression of a cell is echoed, so print both column
# lists explicitly; otherwise the o2Saturation columns are never shown.
print("o2Saturation columns:", df_o2Saturation.columns.tolist())
print("heart columns:", df_heart.columns.tolist())


In [None]:

# Only the last bare expression of a cell is echoed, so print both shapes
# explicitly; otherwise the o2Saturation shape is never shown.
print("o2Saturation shape:", df_o2Saturation.shape)
print("heart shape:", df_heart.shape)


In [None]:
# Number of missing values in each column of the heart dataset.
df_heart.isna().sum()

In [None]:
import seaborn as sns
sns.set_style('darkgrid') # set grid for all graphs
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from collections import Counter

fig = plt.figure()
ax = fig.add_subplot()
ax.set(title="Distribution of Taget Variable")
df_heart.output.value_counts().plot(kind="pie", autopct = "%.2f")
plt.show()

In [None]:
# Report how many distinct values each column of the heart dataset holds.
for column, n_unique in df_heart.nunique().items():
    print(f"{column} has {n_unique} unique value")

In [None]:
# Split the columns by cardinality: more than 5 distinct values is treated as
# numerical, 5 or fewer as categorical. Column order is preserved.
unique_counts = df_heart.nunique()
num_cols = unique_counts[unique_counts > 5].index.tolist()
cat_cols = unique_counts[unique_counts <= 5].index.tolist()
print("Numerical columns ", num_cols , " Categorical columns ", cat_cols)

In [None]:
# Summary statistics of the heart dataset, shown as the cell output.
df_heart.describe()


In [None]:
# Summary statistics restricted to the columns classified as numerical.
df_heart.loc[:, num_cols].describe()

In [None]:
# Summary statistics restricted to the columns classified as categorical.
df_heart.loc[:, cat_cols].describe()

In [None]:
# Pairwise relationships between the numerical columns, coloured by the
# `output` class; corner=True draws only the lower triangle of the grid.
sns.pairplot(
    data=df_heart,
    vars=num_cols,
    hue='output',
    corner=True,
)
plt.show()

In [None]:
# Distribution of `thalachh` split by the `output` class, with a KDE overlay.
sns.histplot(df_heart, x='thalachh', hue='output', kde=True)
plt.show()

In [None]:
# Divide each selected column by its own maximum so the largest value in each
# becomes 1. This uses the vectorized pandas `.max()` instead of the Python
# builtin `max()`, which iterates a Series element by element and does not
# skip NaN.
# NOTE(review): the maxima are taken over the full dataset before the
# train/test split, so test-set statistics leak into the scaling; `cp` also
# looks like a categorical code rather than a magnitude. Confirm both are
# intended.
scale_cols = ['age', 'cp', 'trtbps', 'chol', 'thalachh']
df_heart[scale_cols] = df_heart[scale_cols] / df_heart[scale_cols].max()

In [None]:
# Re-inspect the summary statistics after the max-scaling cell above.
df_heart.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the `output` target, then hold out 20% of the
# rows for testing. The split is stratified on the target and seeded.
features = df_heart.drop(columns=['output'])
target = df_heart['output']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
    stratify=target,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Baseline model: logistic regression with default hyper-parameters.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# `Y_pred` (capital Y) holds this baseline model's test predictions.
Y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, Y_pred)
# Convert to percent BEFORE rounding: `round(acc, 2) * 100` can print float
# artifacts such as 56.99999999999999.
print('Accuracy is', round(acc * 100), '%')

# Mid-cell expressions are not echoed by Jupyter, so print the matrix.
print(confusion_matrix(y_test, Y_pred))

# Decision scores for the test set, reusing the model fitted above instead of
# fitting it a second time on the same data.
y_score = clf.decision_function(X_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Randomized search over the regularisation strength C and the penalty type
# for a saga-solved logistic regression.
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=2000,
                              random_state=0)
distributions = dict(C=uniform(loc=0, scale=4),
                     penalty=['l2', 'l1'])
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(X_train, y_train)
print(search.best_params_)

# Predictions of the tuned model on the held-out test set.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
# Convert to percent BEFORE rounding to avoid float artifacts in the output.
print('Accuracy is', round(acc * 100), '%')

# Mid-cell expressions are not echoed by Jupyter, so print the matrix.
print(confusion_matrix(y_test, y_pred))

precision = precision_score(y_test, y_pred, average='binary')
print('Precision: %.3f' % precision)
# Recall must use this cell's `y_pred`. The previous code passed `Y_pred`,
# the predictions of the earlier untuned model, so the reported recall
# belonged to a different classifier.
recall = recall_score(y_test, y_pred, average='binary')
print('Recall: %.3f' % recall)
print(pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True))

In [None]:
from sklearn import tree

# Fit a decision tree on the training split. The pasted `>>>` doctest prompts
# were removed so the cell is plain Python.
X, y = X_train, y_train
# Seeded for reproducible trees, matching the cross-validation cell below.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, y)

In [None]:
# Draw the fitted tree on a figure large enough to read, label the split
# nodes with the training column names, and end with plt.show() so the cell
# does not echo the list of annotation objects returned by plot_tree.
fig, ax = plt.subplots(figsize=(20, 10))
tree.plot_tree(clf, feature_names=list(X_train.columns), filled=True, ax=ax)
plt.show()

In [None]:
import graphviz

# Export the fitted tree as DOT text (out_file=None is expected to return the
# source as a string), wrap it in a graphviz Source and render it under the
# name "Heart".
dot_data = tree.export_graphviz(decision_tree=clf, out_file=None)
graph = graphviz.Source(source=dot_data)
graph.render(filename="Heart")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# 10-fold cross-validation scores of a seeded decision tree on the training
# split; the array of per-fold scores is the cell output.
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from scipy.stats import uniform
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn import tree

# Train a decision tree on the training split and evaluate it on the test set.
# Seeded so the reported metrics are reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred))

# Mid-cell expressions are not echoed by Jupyter, so print the matrix.
print(confusion_matrix(y_test, y_pred))

precision = precision_score(y_test, y_pred, average='binary')
print('Precision: %.3f' % precision)
recall = recall_score(y_test, y_pred, average='binary')
print('Recall: %.3f' % recall)
# Last expression: rendered as the cell output.
pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)


In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k=10) trained on the training split.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy and the confusion matrix were previously computed as bare mid-cell
# expressions and therefore never shown; print them.
print('Accuracy: %.3f' % metrics.accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

precision = precision_score(y_test, y_pred, average='binary')
print('Precision: %.3f' % precision)
recall = recall_score(y_test, y_pred, average='binary')
print('Recall: %.3f' % recall)
# Last expression: rendered as the cell output.
pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import metrics

# Shallow, seeded random forest trained on the heart-data training split.
# The synthetic `make_classification` dataset that used to be generated here
# was never used (the model was always fitted on X_train / y_train), so that
# dead call has been dropped.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# Predict on the held-out test set and report the accuracy.
y_pred = clf.predict(X_test)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))



In [None]:
from sklearn import metrics
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Random forest with default hyper-parameters, seeded so that both the
# cross-validation scores and the test metrics are reproducible.
model = RandomForestClassifier(random_state=0)

# Repeated stratified 10-fold cross-validation (3 repeats) on the training split.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Fit on the full training split and evaluate on the held-out test set.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy and the confusion matrix were previously computed as bare mid-cell
# expressions and therefore never shown; print them.
print('Test accuracy: %.3f' % metrics.accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

precision = precision_score(y_test, y_pred, average='binary')
print('Precision: %.3f' % precision)
recall = recall_score(y_test, y_pred, average='binary')
print('Recall: %.3f' % recall)
# Last expression: rendered as the cell output.
pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)
