# Data exploration

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,KFold,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,mean_squared_error,confusion_matrix,roc_curve,classification_report,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.feature_selection import RFECV, RFE,SelectFromModel
from sklearn.svm import SVC
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [2]:
# Load the raw dataset into a DataFrame.
# NOTE(review): absolute, machine-specific Windows path - this cell only runs on the author's machine.
data = pd.read_csv(
    filepath_or_buffer=r"C:\Users\User\Desktop\New folder (2)\cardio_train.csv"
)

In [28]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,age,bmi,ap_hi,ap_lo,gender,above normal gluc,well above normal gluc,above normal chol,well above normal chol,smoke,alco,active,cardio
0,50,21.96712,110,80,1,0,0,0,0,0,0,1,0
1,55,34.927679,140,90,0,0,0,0,1,0,0,1,1
2,51,23.507805,130,70,0,0,0,0,1,0,0,0,1
3,48,28.710479,150,100,1,0,0,0,0,0,0,1,1
4,60,29.384676,120,80,0,1,0,1,0,0,0,0,0


In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Count rows that exactly repeat an earlier row.
data.duplicated(keep="first").sum()

In [3]:
# Remove the "id" column from the working frame.
data = data.drop(columns=["id"])

In [None]:
# Preview the first five rows after dropping "id".
data.head(n=5)

In [4]:
# Divide age by 365 and truncate to an integer (presumably days -> whole years; confirm
# against the dataset description).
# NOTE(review): not idempotent - running this cell twice divides by 365 again.
data["age"] = data["age"].div(365).to_numpy().astype(int)
data["age"]

0        50
1        55
2        51
3        48
4        47
         ..
69995    52
69996    61
69997    52
69998    61
69999    56
Name: age, Length: 70000, dtype: int32

In [5]:
# Derive body-mass index: weight divided by the square of (height / 100).
data["bmi"] = data["weight"].div(data["height"].div(100).pow(2))
data.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50,2,168,62.0,110,80,1,1,0,0,1,0,21.96712
1,55,1,156,85.0,140,90,3,1,0,0,1,1,34.927679
2,51,1,165,64.0,130,70,3,1,0,0,0,1,23.507805
3,48,2,169,82.0,150,100,1,1,0,0,1,1,28.710479
4,47,1,156,56.0,100,60,1,1,0,0,0,0,23.011177


# Exploratory Data Analysis (EDA)

In [None]:
# Box plots of the four continuous features, before outlier removal.
data.boxplot(column=["age", "bmi", "ap_hi", "ap_lo"])

In [6]:
# Continuous columns used for the IQR-based outlier screen.
data_num = data.loc[:, ["age", "bmi", "ap_hi", "ap_lo"]]

In [7]:
# Per-column first and third quartiles and the interquartile range between them.
Q1, Q3 = data_num.quantile(q=0.25), data_num.quantile(q=0.75)
IQR = Q3 - Q1
IQR

age      10.000000
bmi       6.347107
ap_hi    20.000000
ap_lo    10.000000
dtype: float64

In [8]:
# Flag a row when any continuous feature lies more than 1.5 * IQR outside [Q1, Q3],
# then keep only the unflagged rows.
Outliers = (
    data_num.lt(Q1 - 1.5 * IQR) | data_num.gt(Q3 + 1.5 * IQR)
).any(axis="columns")
data = data.loc[~Outliers]

In [9]:
# Renumber rows 0..n-1 after filtering; the later column-wise pd.concat with the
# scaled features aligns on this index.
data=data.reset_index(drop=True)

In [None]:
# Box plots of the four continuous features, after outlier removal.
data.boxplot(column=["age", "bmi", "ap_hi", "ap_lo"])

# Data Preprocessing

In [10]:
# Re-code gender as consecutive integers starting at 0.
le = LabelEncoder()
data["gender"] = le.fit_transform(y=data["gender"])

In [11]:
# One-hot encode the glucose and cholesterol levels (gluc_* columns first, then cholesterol_*).
data = pd.get_dummies(data, columns=["gluc", "cholesterol"])

In [12]:
# Give the level-2 / level-3 dummy columns descriptive names.
data = data.rename(
    columns={
        "gluc_2": "above normal gluc",
        "gluc_3": "well above normal gluc",
        "cholesterol_2": "above normal chol",
        "cholesterol_3": "well above normal chol",
    }
)

In [13]:
# Run each renamed dummy column through the label encoder so it holds integer codes.
for dummy_col in (
    "above normal gluc",
    "well above normal gluc",
    "above normal chol",
    "well above normal chol",
):
    data[dummy_col] = le.fit_transform(data[dummy_col])

In [None]:
# Preview the encoded frame.
data.head(n=5)

In [14]:
# Drop the level-1 dummies; the remaining dummy columns encode levels 2 and 3.
data = data.drop(columns=["cholesterol_1", "gluc_1"])

In [15]:
# Fix the column order: four continuous features, eight binary features, then the target.
# Later cells slice by position, so this order matters.
data = data.reindex(
    columns=[
        "age", "bmi", "ap_hi", "ap_lo",
        "gender",
        "above normal gluc", "well above normal gluc",
        "above normal chol", "well above normal chol",
        "smoke", "alco", "active",
        "cardio",
    ]
)

In [16]:
# Features are the first 12 columns; the target is the 13th column (position 12).
X, Y = data.iloc[:, 0:12], data.iloc[:, 12]

In [None]:
# One histogram per feature on a 3x4 grid.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
axes = axes.flatten()
for ax, column in zip(axes, X.columns):
    ax.hist(X[column], bins=20, color='skyblue', edgecolor='black')
    ax.set(title=f'Histogram of {column}', xlabel=column, ylabel='Frequency')
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of every feature against the target on a 3x4 grid.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
axes = axes.flatten()
for ax, column in zip(axes, X.columns):
    ax.scatter(X[column], data["cardio"], color='skyblue', edgecolor='black', alpha=0.7)
    ax.set(title=f'Scatter of {column} vs cardio', xlabel=column, ylabel="Cardio")
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Pearson correlation heat map; the upper triangle is masked so each pair appears once.
corr_matrix = data.corr(method='pearson')
fig = plt.figure(figsize=(20,20))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', mask=np.triu(corr_matrix), cbar=False)
plt.show()

In [17]:
# Standardise the four continuous features (the first four columns of X).
# NOTE(review): the scaler is fitted on the whole dataset here, before the train/test
# split further down, so test-set statistics influence the scaling.
Scalar = StandardScaler()
pre_scal = X.iloc[:, :4]
post_scal = Scalar.fit(pre_scal).transform(pre_scal)

In [18]:
# Wrap the scaled array back into a labelled DataFrame.
X_New = pd.DataFrame(data=post_scal, columns=["age", "bmi", "ap_hi", "ap_lo"])

In [19]:
# Display the standardised continuous features.
X_New

Unnamed: 0,age,bmi,ap_hi,ap_lo
0,-0.427260,-1.150143,-1.147411,-0.220180
1,0.313890,1.791382,0.950959,1.083017
2,-0.279030,-0.800469,0.251502,-1.523377
3,-0.723719,0.380328,1.650415,2.386213
4,1.055039,0.533344,-0.447954,-0.220180
...,...,...,...,...
62740,0.017430,-0.765601,0.251502,1.083017
62741,0.610349,0.533362,1.650415,-0.220180
62742,-0.130800,-0.024350,-0.447954,-0.220180
62743,1.203269,0.014644,0.601231,-0.220180


In [20]:
# Rebuild the feature matrix: scaled continuous columns followed by the eight
# unscaled binary columns (positions 4..11 of data).
X = pd.concat(objs=[X_New, data.iloc[:, 4:12]], axis="columns")
X

Unnamed: 0,age,bmi,ap_hi,ap_lo,gender,above normal gluc,well above normal gluc,above normal chol,well above normal chol,smoke,alco,active
0,-0.427260,-1.150143,-1.147411,-0.220180,1,0,0,0,0,0,0,1
1,0.313890,1.791382,0.950959,1.083017,0,0,0,0,1,0,0,1
2,-0.279030,-0.800469,0.251502,-1.523377,0,0,0,0,1,0,0,0
3,-0.723719,0.380328,1.650415,2.386213,1,0,0,0,0,0,0,1
4,1.055039,0.533344,-0.447954,-0.220180,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
62740,0.017430,-0.765601,0.251502,1.083017,0,0,0,0,0,0,0,1
62741,0.610349,0.533362,1.650415,-0.220180,0,0,0,0,0,0,0,1
62742,-0.130800,-0.024350,-0.447954,-0.220180,1,0,0,0,0,1,0,1
62743,1.203269,0.014644,0.601231,-0.220180,0,1,0,0,0,0,0,0


In [21]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Hyperparameter Optimization

In [None]:
# 10-fold cross-validation splitter without shuffling (shuffle=False is the default).
kf = KFold(n_splits=10, shuffle=False)

In [None]:
# Candidate hyper-parameter grids, one dict per estimator family.
params = {"max_iter": list(range(10, 201, 10))}        # 10, 20, ..., 200
params1 = {"n_neighbors": list(range(10, 101, 10))}    # 10, 20, ..., 100
params2 = {"n_estimators": list(range(100, 1001, 100))}  # 100, 200, ..., 1000
params3 = {"C": list(range(1, 11))}                    # 1, 2, ..., 10

In [None]:
# Estimator handed to the searches below. The alternatives are kept disabled:
model = LogisticRegression()
# model1 = KNeighborsClassifier()
# model2 = RandomForestClassifier()
# model3 = SVC()


In [None]:
# Exhaustive and randomised searches over the same grid, both using the KFold splitter.
gsearch = GridSearchCV(estimator=model, param_grid=params, cv=kf)
rserach = RandomizedSearchCV(estimator=model, param_distributions=params, cv=kf)

In [None]:
# Run both searches on the training split and report the best setting each one found.
result1 = gsearch.fit(X_Train, Y_Train)
result2 = rserach.fit(X_Train, Y_Train)
for search_result in (result1, result2):
    print(search_result.best_params_)

# Model Fitting

In [22]:
# Individual classifiers evaluated in this notebook.
LR = LogisticRegression(max_iter=20)
KN = KNeighborsClassifier(n_neighbors=80)
RF = RandomForestClassifier(n_estimators=100)
# probability=True lets the SVC expose predict_proba.
Svc = SVC(kernel="rbf", C=1, probability=True)

# (name, estimator) pairs consumed by the voting ensemble.
T1 = ("knn", KN)
T2 = ("lgr", LR)
T3 = ("svm", Svc)

# The ensemble must be defined here: later cells call VC.fit(...) and
# joblib.dump(VC, ...). With this line commented out, those cells raise
# NameError on a fresh kernel.
VC = VotingClassifier(estimators=[T1, T2, T3], voting="hard")

In [None]:
# Fit the voting ensemble on the training split.
VC.fit(X=X_Train, y=Y_Train)

In [23]:
# Fit each stand-alone classifier on the training split. The SVC is fitted last as a
# bare expression so its repr remains the cell output.
for estimator in (LR, KN, RF):
    estimator.fit(X_Train, Y_Train)
Svc.fit(X_Train, Y_Train)

# Model Evaluation

In [24]:
# Predict on the held-out split with each model, then print accuracies in the order
# LR, KNN, random forest, SVC.
Y_Pred1 = LR.predict(X_Test)
Y_Pred2 = KN.predict(X_Test)
Y_Pred3 = RF.predict(X_Test)
Y_Pred4 = Svc.predict(X_Test)
for predicted in (Y_Pred1, Y_Pred2, Y_Pred3, Y_Pred4):
    print(accuracy_score(Y_Test, predicted))

0.7209339389592796
0.7260339469280421
0.6785401227189417
0.7277073870427923


In [None]:
# Side-by-side view of the true labels and the SVC predictions (first 50 test rows).
comparison_df = pd.DataFrame(data={'Actual': Y_Test, 'Predicted': Y_Pred4})
comparison_df.head(n=50)

In [None]:
# Persist the voting ensemble to disk (path is relative to the working directory).
joblib.dump(value=VC, filename='VC_Model.pkl')

In [None]:
# Confusion matrix for the KNN predictions (rows: actual, columns: predicted).
confusion_matrix(y_true=Y_Test, y_pred=Y_Pred2)

In [None]:
# Heat map of the KNN confusion matrix with raw counts annotated.
heatmap_ax = sns.heatmap(confusion_matrix(Y_Test, Y_Pred2), annot=True, fmt="g")
heatmap_ax.set_xlabel("predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the KNN predictions.
print(classification_report(y_true=Y_Test, y_pred=Y_Pred2))

In [None]:
# Class-probability estimates from KNN; column 1 is used below as the ROC score.
y_pred_probs = KN.predict_proba(X=X_Test)

In [None]:
# ROC operating points for KNN; the thresholds (third return value) are discarded.
fpr, tpr, _ = roc_curve(y_true=Y_Test, y_score=y_pred_probs[:, 1])

In [None]:
# Plot true-positive rate against false-positive rate.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_title("ROC curve")
plt.show()

In [None]:
# Area under the KNN ROC curve.
roc_auc_score(y_true=Y_Test, y_score=y_pred_probs[:, 1])

In [None]:
# Linear-kernel SVC used as the base estimator for RFE below (C=1.0 is the default).
# Note: this rebinds `model`, which held the LogisticRegression used by the searches.
model = SVC(C=1.0, kernel="linear")

In [None]:
# NOTE(review): the variable is called l1_model, but the penalty configured is 'l2'.
l1_model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    C=0.1,
    max_iter=130,
)
l2 = KNeighborsClassifier(n_neighbors=80)

In [None]:
# Recursive feature elimination down to 8 features, removing one per iteration
# (step=1 is the default).
rfe = RFE(estimator=model, n_features_to_select=8, step=1)

In [None]:
# Run the elimination on the training split.
result = rfe.fit(X=X_Train, y=Y_Train)

In [None]:
# Boolean mask over the feature columns: True where RFE kept the feature.
result.support_

In [None]:
# Names of the columns RFE kept.
selected = X.loc[:, result.support_].columns
print(selected)

In [None]:
# Restrict the feature matrix to the selected columns.
# Note: this overwrites the 12-column X built earlier.
X = X.loc[:, selected]
X.head(n=5)

In [None]:
# Redo the 80/20 split on the reduced feature set, using the same seed as the first split.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# NOTE(review): the variable is called l1_model, but the penalty configured is 'l2'.
l1_model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    C=0.1,
    max_iter=130,
)

In [None]:
Svc=SVC(kernel="linear",C=3,probability=True)

In [None]:
lr=Svc.fit(X_Train,Y_Train)

In [None]:
# Test-set accuracy of the model fitted on the reduced feature set.
pr = lr.predict(X=X_Test)
print(accuracy_score(y_true=Y_Test, y_pred=pr))

In [None]:
# Heat map of the confusion matrix for the reduced-feature model.
heatmap_ax = sns.heatmap(confusion_matrix(Y_Test, pr), annot=True, fmt="g")
heatmap_ax.set_xlabel("predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

In [29]:
def Std_Scalar(pre_scale, scaler=None):
    """Standardise new samples with an already-fitted scaler.

    The earlier version fitted a brand-new ``StandardScaler`` on the data passed
    in. For a single row that makes each feature's mean equal to its own value,
    so every output is 0.0 whatever the user typed. New samples must instead be
    transformed with the statistics learned from the training features.

    Parameters
    ----------
    pre_scale : array-like of shape (n_samples, n_features)
        Raw values in the same column order the scaler was fitted on.
    scaler : object with a ``transform`` method, optional
        Fitted scaler to apply. Defaults to the notebook-level ``Scalar``, which
        is fitted on the age, bmi, ap_hi and ap_lo columns.

    Returns
    -------
    The result of ``scaler.transform(pre_scale)`` (an ndarray for a default
    ``StandardScaler``).

    Raises
    ------
    NameError
        If ``scaler`` is omitted and the cell that creates ``Scalar`` has not run.
    """
    if scaler is None:
        # Resolved at call time so the function cell can be defined in any order.
        scaler = Scalar
    return scaler.transform(pre_scale)

In [32]:
# Collect the patient's raw measurements interactively. input() returns strings,
# so the values need converting before any numeric use.
age=input("Age")
gender=input("gender")
height=input("Height")
weight=input("weight")
sys_blood=input("Systolic blood pressure")
# This line used to be `height=input("gender")`, which re-asked for gender and
# overwrote the height just entered. Diastolic pressure is the remaining continuous
# model feature (ap_lo), so that is what is read here.
dia_blood=input("Diastolic blood pressure")

hello


In [31]:
# Score one hand-written sample: 4 continuous values followed by 8 binary flags,
# in the same column order as the training features.
test_val = np.array([51, 23.507805130, 130, 70, 0, 0, 0, 0, 1, 0, 0, 0])
X_user = test_val.reshape(1, -1)

# The continuous part (first four columns) goes through the scaling helper.
prescl = X_user[:, :4].copy()
mod = Std_Scalar(prescl)

# Re-attach the binary flags as integers to form the full 12-feature row.
prol = np.hstack((mod, X_user[:, 4:].astype(int)))

prediction = Svc.predict(prol)
print("Predicted value:", prediction[0])

Predicted value: 1


