In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2,f_classif


import warnings

# ignore *all* warnings
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'pandas'

In [None]:
data=pd.read_csv('/content/heart.csv')
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [None]:
train=data.iloc[:800,:]
test_df=data.iloc[800:,0:12]

*Data Cleaning**

In [None]:
cat_cols=train.select_dtypes(include=['object']).columns
cat_cols
num_cols=train.select_dtypes(include=['number']).columns
num_cols

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease'],
      dtype='object')

In [None]:
for i in cat_cols:
  print(train[i].value_counts())
  print()
  print()

Sex
M    640
F    160
Name: count, dtype: int64


ChestPainType
ASY    444
NAP    167
ATA    151
TA      38
Name: count, dtype: int64


RestingECG
Normal    489
ST        176
LVH       135
Name: count, dtype: int64


ExerciseAngina
N    464
Y    336
Name: count, dtype: int64


ST_Slope
Flat    404
Up      341
Down     55
Name: count, dtype: int64




EDA

In [None]:
# for i in num_cols:
#   plt.figure(figsize=(6,4))
#   sns.histplot(train[i],kde=True)
#   plt.show()

In [None]:
# for i in cat_cols:
#   plt.figure(figsize=(6,4))
#   sns.countplot(data=train,x=i,hue='HeartDisease')
#   plt.show()

In [None]:
# for i in num_cols:
#   plt.figure(figsize=(6,4))
#   sns.boxplot(data=train,x=i)
#   plt.show()

DATA PREPROCESSING

In [None]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier

for col in train.select_dtypes(include="object").columns:
    train[col] = train[col].str.lower()
for col in test_df.select_dtypes(include="object").columns:
    test_df[col] = test_df[col].str.lower()










In [None]:
# RestingECG categories
restingecg_order = ["normal", "st", "lvh"]

# ST_Slope categories
st_slope_order = ["up", "flat", "down"]
ExerciseAngina_order=['n','y']
#Function Transformer
def df_lowercase(df):
    df = df.copy()

    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.lower()
    return df

lowercase_transformer = FunctionTransformer(df_lowercase, validate=False)

In [None]:
#Make ColumnsTransformer
num_cols = ['Age','RestingBP','Cholesterol','MaxHR']
cat_cols = ['Sex','ChestPainType']
cat_cols2 = ['RestingECG','ExerciseAngina','ST_Slope']

preprocessor = ColumnTransformer(
    transformers=[

        ("num", StandardScaler(), num_cols),
        ("cat_cols", OneHotEncoder(drop='first'), cat_cols),
        ("catcols2", OrdinalEncoder(categories=[restingecg_order,ExerciseAngina_order, st_slope_order]), cat_cols2)
    ],remainder='passthrough'
)


In [None]:
# make model for decison tree
model = Pipeline(steps=[
    ("lowercase", lowercase_transformer),
    ("preprocessor", preprocessor),
    ('dt',DecisionTreeClassifier(max_depth=4,min_samples_split=50))
])
x_train,x_test,y_train,y_test=train_test_split(data.drop('HeartDisease',axis=1),data['HeartDisease'],test_size=0.2)

model.fit(x_train,y_train)
pred1=model.predict(x_test)

print(accuracy_score(y_test,pred1))
accuracy_score(y_train,model.predict(x_train))

0.8369565217391305


0.8651226158038147

In [None]:

#make model for logistic regression
model2 = Pipeline(steps=[
    ("lowercase", lowercase_transformer),
    ("preprocessor", preprocessor),
    ('lr',LogisticRegression())
])
x_train,x_test,y_train,y_test=train_test_split(data.drop('HeartDisease',axis=1),data['HeartDisease'],test_size=0.2)
model2.fit(x_train,y_train)
pred1=model2.predict(x_test)
base_tree=DecisionTreeClassifier(max_depth=4,min_samples_split=50)

print(accuracy_score(y_test,pred1))
accuracy_score(y_train,model2.predict(x_train))

0.8315217391304348


0.8610354223433242

In [None]:
#using same pipeline for more models with together
kernels = ["linear", "poly", "rbf", "sigmoid"]
models = {
    "DecisionTree": DecisionTreeClassifier(max_depth=4, min_samples_split=50),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    **{f"SVM_{k}": SVC(kernel=k) for k in kernels},
    "Gradient Boosting":GradientBoostingClassifier(
    n_estimators=300,      # number of trees (boosting stages)
    learning_rate=0.03,     # step size shrinkage
    max_depth=5,
    min_samples_split=50,
    min_impurity_decrease=0.04,
    subsample=0.7,
    min_samples_leaf=10,
    n_iter_no_change=5


),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=50),
    'xgb':XGBClassifier(
    n_estimators=250,      # number of trees
    learning_rate=0.05,    # step size shrinkage
    max_depth=3,           # depth of each tree
    subsample=0.6,         # fraction of rows per tree
    colsample_bytree=0.8,  # fraction of features per tree
    reg_lambda=1,          # L2 regularization
    reg_alpha=0,           # L1 regularization
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
),
    'bagging':BaggingClassifier(
    estimator=base_tree,     # base model
    n_estimators=301,        # number of trees
    max_samples=0.6,         # fraction of samples per estimator
    max_features=0.6,        # fraction of features per estimator
    bootstrap=True,          # sample with replacement

    n_jobs=-1
)

}



for name, estimator in models.items():
    pipe = Pipeline(steps=[
        ("lowercase", lowercase_transformer),
        ("preprocessor", preprocessor),
        ("selection",SelectKBest(score_func=f_classif, k=9)),
        ("model", estimator)
    ])

    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)

    print(f"{name} Train Accuracy: {accuracy_score(y_train, pipe.predict(x_train)):.4f}")
    print(f"{name} Test  Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("="*50)
a=pd.DataFrame([[40,'m','ata',140,289,0,'normal',172,'n',0,'up']],columns=x_train.columns)
a
model.predict(a)

DecisionTree Train Accuracy: 0.8583
DecisionTree Test  Accuracy: 0.7609
LogisticRegression Train Accuracy: 0.8583
LogisticRegression Test  Accuracy: 0.8098
KNN Train Accuracy: 0.8910
KNN Test  Accuracy: 0.8207
SVM_linear Train Accuracy: 0.8692
SVM_linear Test  Accuracy: 0.8261
SVM_poly Train Accuracy: 0.8828
SVM_poly Test  Accuracy: 0.7989
SVM_rbf Train Accuracy: 0.8924
SVM_rbf Test  Accuracy: 0.8315
SVM_sigmoid Train Accuracy: 0.7193
SVM_sigmoid Test  Accuracy: 0.6793
Gradient Boosting Train Accuracy: 0.9264
Gradient Boosting Test  Accuracy: 0.8478
Random Forest Train Accuracy: 0.8896
Random Forest Test  Accuracy: 0.8261
xgb Train Accuracy: 0.9319
xgb Test  Accuracy: 0.8587
bagging Train Accuracy: 0.8787
bagging Test  Accuracy: 0.8152


array([0])

In [None]:
Data=data.copy()
col=data.select_dtypes(include=['object']).columns
for i in col:
  data[i]=data[i].str.lower()
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(
    n_estimators=50,      # number of boosting stages
    learning_rate=0.1,     # step size shrinkage
    max_depth=3,           # max depth of individual trees

)
model3=Pipeline(steps=[
    ("lowercase", lowercase_transformer),
    ("preprocessor", preprocessor),
     ("selection",SelectKBest(score_func=f_classif, k=9)),
    ('gbc',gbc)
])
model3.fit(x_train,y_train)
pred1=model3.predict(x_test)
print('gradient boosting classifier')
print(accuracy_score(y_test,pred1))
print(accuracy_score(y_train,model3.predict(x_train)))
a = pd.DataFrame([[40,'M','Ata',140,289,0,'Normal',172,'N',0,'Up']], columns=x_train.columns)
model3.predict(a)

gradient boosting classifier
0.842391304347826
0.9291553133514986


array([0])

In [None]:
#use xtreme gradient bossting classifier
xgb=XGBClassifier(
    n_estimators=151,      # number of trees
    learning_rate=0.1,    # step size shrinkage
    max_depth=3,           # depth of each tree
    subsample=0.6,         # fraction of rows per tree
    colsample_bytree=0.8,  # fraction of features per tree
    reg_lambda=1,          # L2 regularization
    reg_alpha=1,           # L1 regularization
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

x_train,x_test,y_train,y_test=train_test_split(data.drop('HeartDisease',axis=1),data['HeartDisease'],test_size=0.2)

final=Pipeline(steps=[
    ("lowercase", lowercase_transformer),
    ("preprocessor", preprocessor),
    ("selection",SelectKBest(score_func=f_classif, k=9)),
    ('xgb',xgb)
])
final.fit(x_train,y_train)
prediction=final.predict(x_test)
print(accuracy_score(y_test,prediction))
print(accuracy_score(y_train,final.predict(x_train)))



0.8804347826086957
0.9237057220708447


In [None]:
#use voting classifier
voting_clf = VotingClassifier(
    estimators=[
      #  ('dt', DecisionTreeClassifier(max_depth=3, min_samples_split=50)),
        ('lr', LogisticRegression(max_iter=1000)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
       ('xgb',XGBClassifier(
    n_estimators=151,      # number of trees
    learning_rate=0.1,    # step size shrinkage
    max_depth=3,           # depth of each tree
    subsample=0.6,         # fraction of rows per tree
    colsample_bytree=0.8,  # fraction of features per tree
    reg_lambda=1,          # L2 regularization
    reg_alpha=1,           # L1 regularization
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)),
       ('bg',GradientBoostingClassifier(
    n_estimators=300,      # number of trees (boosting stages)
    learning_rate=0.03,     # step size shrinkage
    max_depth=5,
    min_samples_split=50,
    min_impurity_decrease=0.04,
    subsample=0.7,
    min_samples_leaf=10,
    n_iter_no_change=5


)),
       ('bagging',BaggingClassifier(
    estimator=base_tree,     # base model
    n_estimators=301,        # number of trees
    max_samples=0.6,         # fraction of samples per estimator
    max_features=0.6,        # fraction of features per estimator
    bootstrap=True,          # sample with replacement

    n_jobs=-1
)),
       ('rf',RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=50)),

       ('svm_rbf',SVC(kernel='rbf')),
      #  ('svm_linear',SVC(kernel='linear')),
    ],
    voting='hard'   # 'hard' ya 'soft' try kar sakte ho
)

In [None]:
voting_clf

In [None]:
final2=Pipeline(steps=[
    ("lowercase", lowercase_transformer),
    ("preprocessor", preprocessor),
    ("selection",SelectKBest(score_func=f_classif, k=9)),
    ('voting',voting_clf)
])
final2.fit(x_train,y_train)
prediction=final2.predict(x_test)
print(accuracy_score(y_test,prediction))
print(accuracy_score(y_train,final2.predict(x_train)))

0.9021739130434783
0.8896457765667575


In [None]:
# import joblib
# joblib.dump(final, "model.pkl")
# expected_columns = x_train.columns.tolist()
# joblib.dump(expected_columns, "columns.pkl")

In [None]:
x_train.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope'],
      dtype='object')