In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the heart.csv input file; kept in one variable so it is easy to repoint.
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
dataset = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) to check types
# and completeness before any analysis.
dataset.info()

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
dataset.isna().sum()

In [None]:
# Pairwise correlation between all columns (pandas' default method), drawn as
# an annotated heatmap on a wide canvas so the cell values stay readable.
corr_matrix = dataset.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Mean cholesterol for each distinct age; as_index=False keeps 'age' as an
# ordinary column instead of moving it into the index.
age_chol_df = dataset[['age', 'chol']].groupby('age', as_index=False).mean()
# Render the table with a red background gradient.
age_chol_df.style.background_gradient(cmap='Reds')

In [None]:
# Every column except the target ('output') is treated as an input feature.
numerical_features = [col for col in dataset.columns if col != 'output']

In [None]:
# Show the feature list built above (all columns other than 'output').
numerical_features

In [None]:
# Print the number of distinct values of every feature; these counts are what
# the discrete/continuous split is based on.
# (The original also ran a groupby per feature whose result was never used.)
for feat in numerical_features:
    print(feat,',', 'Unique :',len(dataset[feat].unique()))
   

In [None]:
# A feature counts as discrete when it has fewer than 10 distinct values
# (a NaN, if present, counts as one value, matching len(unique())).
discrete_feat = [
    col
    for col in numerical_features
    if dataset[col].nunique(dropna=False) < 10
]
discrete_feat
    

In [None]:
# Whatever was not classified as discrete is handled as continuous; the
# original column order is preserved.
_discrete_lookup = set(discrete_feat)
continuous_feat = [col for col in numerical_features if col not in _discrete_lookup]
continuous_feat

In [None]:
# For each discrete feature, print its number of distinct values followed by
# the count of non-null entries of that feature within each 'output' class.
# NOTE(review): the per-class count is not broken down by feature value, so it
# will look the same for every feature unless some contain nulls - confirm
# whether a per-value breakdown was intended.
for feat in discrete_feat:
    n_levels = len(dataset[feat].unique())
    rows_per_class = dataset.groupby('output')[feat].count()
    print(feat, ',', 'Unique :', n_levels, rows_per_class)

In [None]:
# Distribution of every discrete feature, one histogram per figure.
# Styling is global matplotlib/seaborn state, so it is configured once before
# any figure is created instead of being re-applied on every iteration.
sns.set_color_codes()
plt.style.use("ggplot")

for feat in discrete_feat:
    plt.figure(figsize=(10,7))
    # Read straight from `dataset`: plotting does not modify the frame, so the
    # per-iteration full copy the loop used to make is unnecessary.
    sns.histplot(dataset[feat],color='red')
    plt.xlabel(feat,fontsize=12)
    plt.ylabel("COUNT",fontsize=12)
    plt.show()

In [None]:
# Histogram of every continuous feature, split by the 'sex' column.
# The style is set once, before the figures exist, so each figure is created
# with it (it used to be re-applied after every plt.figure call).
plt.style.use("ggplot")

for feat in continuous_feat:
    plt.figure(figsize=(10,7))
    # `data=` already supplies the frame, so columns are referenced by name;
    # the former one-element inner loop over the hue list is flattened.
    sns.histplot(data=dataset, x=feat, hue='sex')
    plt.xlabel(feat)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Histogram of every continuous feature, split by the target column 'output'.
# The style is set once, before the figures exist, so each figure is created
# with it (it used to be re-applied after every plt.figure call).
plt.style.use("ggplot")

for feat in continuous_feat:
    plt.figure(figsize=(10,7))
    # `data=` already supplies the frame, so columns are referenced by name;
    # the former one-element inner loop over the hue list is flattened.
    sns.histplot(data=dataset, x=feat, hue='output')
    plt.xlabel(feat)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Histogram of every continuous feature, split by the 'cp' column.
# The style is set once, before the figures exist, so each figure is created
# with it (it used to be re-applied after every plt.figure call).
plt.style.use("ggplot")

for feat in continuous_feat:
    plt.figure(figsize=(10,7))
    # `data=` already supplies the frame, so columns are referenced by name;
    # the former one-element inner loop over the hue list is flattened.
    sns.histplot(data=dataset, x=feat, hue='cp')
    plt.xlabel(feat)
    plt.ylabel('Count')
    plt.show()

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings and the
# warnings scikit-learn emits when a cross-validation fit fails - consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

import xgboost 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [None]:
# Separate the target column from the predictors.
target_col = 'output'
y = dataset[target_col]
X = dataset.drop(columns=target_col)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Pipelines

In [None]:
# Candidate pipelines for the PCA experiment: each one min-max scales the
# features, applies PCA() with its default settings, then fits one classifier.
# The name of the last step is what gets reported when the pipelines are
# scored, so the step names are left untouched.

# Seed for the randomized estimators so repeated runs give the same scores.
PIPELINE_SEED = 2

pipe_LogReg = Pipeline([('scaler',MinMaxScaler()),
                        ('pca1',PCA()),
                        ('LogisticRegression',LogisticRegression(C=2))])

pipe_knn = Pipeline([('scaler',MinMaxScaler()),
                     ('pca1',PCA()),
                     ('KNeighborsClassifier',KNeighborsClassifier())])

pipe_Svm = Pipeline([('scaler',MinMaxScaler()),
                     ('pca1',PCA()),
                     ('SVC',SVC())])

pipe_RForest = Pipeline([('scaler',MinMaxScaler()),
                         ('pca1',PCA()),
                         ('RandomForestClassifier',RandomForestClassifier(random_state=PIPELINE_SEED))])

pipe_Ada = Pipeline([('scaler',MinMaxScaler()),
                     ('pca1',PCA()),
                     ('AdaBoostClassifier',AdaBoostClassifier(random_state=PIPELINE_SEED))])

# AdaBoost over random forests.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn and the old spelling was
# later removed; the first positional slot is the base estimator either way.
# `algorithm='SAMME.R'` is no longer passed explicitly: it was the default in
# the releases that accept it, and newer releases reject the value.
pipe_Ada_RF = Pipeline([('scaler',MinMaxScaler()),
                        ('pca1',PCA()),
                        ('AdaBoostClassifier_rf',AdaBoostClassifier(
                            RandomForestClassifier(n_estimators=100, random_state=PIPELINE_SEED),
                            learning_rate=.1,
                            random_state=PIPELINE_SEED))])

In [None]:
# PCA-experiment pipelines, in the order they will be fitted and scored.
pipelines1 = [
    pipe_LogReg,
    pipe_knn,
    pipe_Svm,
    pipe_RForest,
    pipe_Ada,
    pipe_Ada_RF,
]

In [None]:
# Train every pipeline in the list on the training split.
for candidate in pipelines1:
    candidate.fit(X_train, y_train)

In [None]:
# Score each fitted pipeline on the held-out split, collecting names and
# accuracies in two parallel lists for later display.
model_name1 = []
scores1 = []

for model in pipelines1:
    # The classifier is the last step of every pipeline; indexing from the end
    # stays correct if preprocessing steps are added or removed.
    step_name = model.steps[-1][0]
    pred = model.predict(X_test)
    score = accuracy_score(y_test, pred)
    model_name1.append(step_name)
    scores1.append(score)
    print(step_name.upper(), score)
    print('\n')

In [None]:
# Show the collected test accuracies, in the same order as pipelines1.
scores1

In [None]:
# Bar chart of test accuracy per model for the MinMaxScaler + PCA pipelines.
# Axis labels and a title are set so the figure can be read on its own.
plt.figure(figsize = (17,9))
ax = sns.barplot(x = model_name1, y = scores1)
ax.set(xlabel='Model',
       ylabel='Test accuracy',
       title='Test accuracy by model (MinMaxScaler + PCA pipelines)');

In [None]:
# Second experiment: standardize the features and fit each classifier
# directly, with no PCA step.
# These assignments rebind the pipe_* names used for the PCA experiment; the
# earlier fitted pipelines remain reachable through the pipelines1 list.

# Seed for the randomized estimators so repeated runs give the same scores.
PIPELINE_SEED = 2

pipe_LogReg = Pipeline([('scaler',StandardScaler()),
                        ('LogisticRegression',LogisticRegression())])

pipe_knn = Pipeline([('scaler',StandardScaler()),
                     ('KNeighborsClassifier',KNeighborsClassifier())])

pipe_Svm = Pipeline([('scaler',StandardScaler()),
                     ('SVC',SVC(C=2))])

pipe_RForest = Pipeline([('scaler',StandardScaler()),
                         ('RandomForestClassifier',RandomForestClassifier(random_state=PIPELINE_SEED))])

pipe_Ada = Pipeline([('scaler',StandardScaler()),
                     ('AdaBoostClassifier',AdaBoostClassifier(random_state=PIPELINE_SEED))])

In [None]:
# StandardScaler-only pipelines, in the order they will be fitted and scored.
pipelines2 = [
    pipe_LogReg,
    pipe_knn,
    pipe_Svm,
    pipe_RForest,
    pipe_Ada,
]

In [None]:
# Train every pipeline of the second experiment on the training split.
for candidate in pipelines2:
    candidate.fit(X_train, y_train)

In [None]:
# Score each fitted pipeline of the second experiment on the held-out split,
# collecting names and accuracies in two parallel lists for later display.
model_name2 = []
scores2 = []

for i, model in enumerate(pipelines2):
    # The classifier is the last step of every pipeline; indexing from the end
    # stays correct if preprocessing steps are added or removed.
    step_name = model.steps[-1][0]
    pred2 = model.predict(X_test)
    score = accuracy_score(y_test, pred2)
    model_name2.append(step_name)
    scores2.append(score)
    print(i, step_name.upper(), ':', score)
    print('\n')

In [None]:
# Bar chart of test accuracy per model for the StandardScaler pipelines.
# Axis labels and a title are set so the figure can be read on its own.
plt.figure(figsize = (15,8))
ax = sns.barplot(x = model_name2, y = scores2)
ax.set(xlabel='Model',
       ylabel='Test accuracy',
       title='Test accuracy by model (StandardScaler pipelines)');

# RandomizedSearchCV

In [None]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base estimator for the hyperparameter search. A fixed random_state makes
# every candidate forest deterministic, so differences between candidates
# come from the hyperparameters rather than from random tree construction.
rf_classifier = RandomForestClassifier(random_state=2)

In [None]:
# Search space sampled by the randomized search over the random forest.
params = {
    # 100, 200, ..., 1000 trees.
    'n_estimators': list(range(100, 1001, 100)),
    'criterion': ["gini", "entropy"],
    # 1, 12, 23, ..., 100 levels.
    'max_depth': list(range(1, 101, 11)),
    # An integer min_samples_split must be at least 2; the former value 1 made
    # every candidate that drew it fail to fit.
    'min_samples_split': [2, 3, 4, 6, 8, 10, 13],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
    # "auto" is dropped: for classifiers it meant the same as "sqrt", and
    # newer scikit-learn releases no longer accept it.
    'max_features': ["sqrt", "log2"],
    # n_jobs and verbose are execution settings, not hyperparameters, so they
    # are no longer part of the search space: as single-value entries they only
    # leaked into best_params_ and made each forest log every tree it built.
}

In [None]:
# Randomized search: 100 sampled parameter settings, each evaluated with
# 5-fold cross-validation, using 4 parallel workers.
# random_state fixes which settings are sampled, so the search can be
# reproduced from run to run.
rand_srchCV = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=params,
    n_iter=100,
    n_jobs=4,
    cv=5,
    verbose=2,
    random_state=2,
)

In [None]:
# Run the search on the training split only; the test split is not used here.
rand_srchCV.fit(X_train,y_train)

In [None]:
# Cross-validated score of the best setting found by the search.
rand_srchCV.best_score_

In [None]:
# Parameter setting that achieved the best cross-validated score.
rand_srchCV.best_params_

In [None]:
# Predict on the held-out split with the search object; with the default
# refit behaviour this uses the best estimator refitted on the training data.
rf_pred = rand_srchCV.predict(X_test)

In [None]:
# Test-set accuracy of the tuned random forest.
rf_accuracy = accuracy_score(y_test, rf_pred)
print('accuracy :', rf_accuracy)

# KNN Classifier

In [None]:
# KNN-specific names for the train/test features. These are plain references
# to the same objects, not copies.
X_train_knn, X_test_knn = X_train, X_test

In [None]:
# Standardizer used to scale the features before the KNN experiments.
scaler = StandardScaler()


In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to the test split. Calling fit_transform on the test
# data would refit the scaler on it, putting train and test on different
# scales and leaking test statistics into preprocessing.
# Both results are derived from the unscaled X_train / X_test, so re-running
# this cell does not scale already-scaled data.
X_train_knn = scaler.fit_transform(X_train)
X_test_knn = scaler.transform(X_test)

In [None]:
# Sweep n_neighbors from 1 to 39 and record, for each value, the fraction of
# test rows that are misclassified.
# NOTE(review): the error is measured on the test split, so choosing k from
# this curve tunes on test data - consider a validation split instead.
error_rate = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_knn, y_train)
    knn_pred = knn.predict(X_test_knn)
    error_rate.append((knn_pred != y_test).mean())
    

In [None]:
# Show the misclassification rate recorded for each n_neighbors value tried.
error_rate

In [None]:
# Error rate recorded for each neighbourhood size tried in the sweep.
# The x values are derived from error_rate so the two cannot drift apart;
# this assumes the sweep started at k=1.
k_values = range(1, len(error_rate) + 1)
plt.figure(figsize=(15,9))
plt.plot(k_values, error_rate, marker='o')
# Label the axis with the quantity being varied rather than the code
# expression that generated it.
plt.xlabel('n_neighbors (k)')
plt.ylabel('errorRate')
plt.title('KNN error rate vs. n_neighbors');

In [None]:
# Final KNN model with 5 neighbours, trained on the scaled training split and
# used to predict the scaled test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_knn, y_train)
knn_pred2 = knn.predict(X_test_knn)

In [None]:
# Test-set accuracy of the 5-neighbour KNN model.
knn_accuracy = accuracy_score(y_test, knn_pred2)
print('accuracy :', knn_accuracy)