In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Read the cardio dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
csv_path = "D:\\Workshops\\Machine Learning for Data Science & Artifcial Intelligence With Python\\data\\cardio.CSV"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# (rows, columns) of the raw dataset.
data.shape

# Missing Values & Duplicates

In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
data.duplicated().sum()

In [None]:
# Box plots of the continuous columns, drawn on one shared axis to spot outliers.
data.loc[:, ["age", "height", "weight", "ap_hi", "ap_lo"]].boxplot()

# Some Feature Engineering

In [None]:
# Divide `age` by 365 and truncate to an integer
# (presumably days -> completed years; confirm against the dataset description).
# Not idempotent: re-running this cell divides the column again.
data["age"] = data["age"].div(365).to_numpy().astype(int)

In [None]:
# Body-mass index: weight / (height / 100)^2.
# Assumes weight is in kg and height in cm; confirm against the dataset description.
data["bmi"] = data["weight"].div(data["height"].div(100).pow(2))

In [None]:
# Check the converted `age` and the new `bmi` column.
data.head()

In [None]:
# height and weight are now summarised by bmi, so remove them (mutates `data`).
data.drop(columns=["height", "weight"], inplace=True)

In [None]:
# Confirm height and weight are gone.
data.head()

In [None]:
# Box plots of the four continuous features after feature engineering.
data.loc[:, ["age", "ap_hi", "ap_lo", "bmi"]].boxplot()

# Outlier Handling

In [None]:
# Continuous columns that the IQR outlier rule is applied to.
data_num = data.loc[:, ["age", "ap_hi", "ap_lo", "bmi"]]

In [None]:
# Per-column first and third quartiles and the interquartile range between them.
Q1, Q3 = (data_num.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(IQR)

In [None]:
# Boolean mask: True for rows where at least one continuous column lies
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (the usual box-plot whisker rule).
out_rows = (data_num.lt(Q1 - 1.5 * IQR) | data_num.gt(Q3 + 1.5 * IQR)).any(axis=1)
out_rows

In [None]:
# Keep only the rows not flagged as outliers.
# The explicit .copy() makes `data` an independent frame: later cells mutate it
# in place (drop/reset_index with inplace=True, column assignment), which would
# otherwise risk SettingWithCopyWarning on a boolean-filtered selection.
# The original row labels are kept, so the index now has gaps.
data = data[~out_rows].copy()
data.head()

In [None]:
# Re-draw the box plots to check the effect of the outlier filter.
data.loc[:, ["age", "ap_hi", "ap_lo", "bmi"]].boxplot()

# Descriptive Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the continuous features.
data.loc[:, ["age", "ap_hi", "ap_lo", "bmi"]].describe()

In [None]:
# Number of rows per value of the target column `cardio`.
data["cardio"].value_counts()

In [None]:
# Bar chart of the target class balance.
# The column is passed by keyword (x=..., data=...), matching the other seaborn
# calls in this notebook. Newer seaborn releases treat the first positional
# argument as `data`, not `x`, so the positional form warns or draws a different plot.
sns.countplot(x="cardio", data=data)
plt.show()

In [None]:
# Row counts per age value, split by the `cardio` target, on a wide canvas.
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(data=data, x="age", hue="cardio", ax=ax)
plt.show()

In [None]:
# Distribution of bmi within each value of the `cardio` target.
sns.boxplot(data=data, x="cardio", y="bmi")
plt.show()

In [None]:
# Pairwise correlation of the continuous features, with the colour scale
# pinned to the full [-1, 1] range.
numeric_corr = data[["age", "ap_hi", "ap_lo", "bmi"]].corr()
sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True)
plt.show()

In [None]:
# Scatter-matrix of the continuous features (no class colouring).
sns.pairplot(data.loc[:, ["age", "ap_hi", "ap_lo", "bmi"]])
plt.show()

In [None]:
# Plotting frame: the continuous features plus a human-readable target label.
# .copy() makes `df` an independent frame, so adding a column to it does not
# raise SettingWithCopyWarning and never touches `data`.
df = data[["age", "ap_hi", "ap_lo", "bmi"]].copy()
# Vectorised mapping of the target: 1 -> "Yes", anything else -> "No".
df["cardio"] = np.where(data["cardio"] == 1, "Yes", "No")

In [None]:
# Check the plotting frame and its Yes/No label column.
df.head()

In [None]:
# Scatter-matrix of the continuous features, coloured by the Yes/No cardio label.
sns.pairplot(data=df, hue="cardio")
plt.show()

# Machine Learning

In [None]:
# Look at the cleaned frame before preparing it for modelling.
data.head()

In [None]:
# Remove the `id` column in place.
# Not idempotent: re-running raises KeyError once `id` is gone.
data.drop(columns="id", inplace=True)

In [None]:
# Fix the column order: continuous features first, then the categorical and
# binary ones, with the target `cardio` last. Later cells index by position
# (iloc[:, :10], iloc[:, 10], x[:, :4]) and depend on this order.
column_order = ["age", "ap_hi", "ap_lo", "bmi", "gender", "cholesterol", "gluc", "smoke", "alco", "active", "cardio"]
data = data.reindex(columns=column_order)

In [None]:
# Confirm the new column order.
data.head()

In [None]:
# Distinct values of `gender` and how often each occurs.
data["gender"].value_counts()

In [None]:
# Distinct values of `cholesterol` and how often each occurs.
data["cholesterol"].value_counts()

In [None]:
# Distinct values of `gluc` and how often each occurs.
data["gluc"].value_counts()

In [None]:
# Distinct values of `smoke` and how often each occurs.
data["smoke"].value_counts()

In [None]:
# Distinct values of `alco` and how often each occurs.
data["alco"].value_counts()

In [None]:
# Distinct values of `active` and how often each occurs.
data["active"].value_counts()

In [None]:
# Class balance of the target `cardio` after cleaning.
data["cardio"].value_counts()

# Label Encoding

In [None]:
# Encoder instance; the next cell uses it to recode the `gender` column.
le=LabelEncoder()
le

In [None]:
# Recode `gender` to consecutive integers starting at 0
# (LabelEncoder numbers the sorted distinct values).
le.fit(data["gender"])
data["gender"] = le.transform(data["gender"])

In [None]:
# Check the recoded `gender` column.
data.head()

# One Hot Encoding

In [None]:
# Encoder instance, refitted below for `cholesterol` and then for `gluc`.
ohe=OneHotEncoder()
ohe

In [None]:
# One-hot encode `cholesterol` and drop the first dummy column as the baseline.
# The two hard-coded column names assume exactly three distinct levels.
# The resulting frame gets a fresh 0..n-1 index, not the row labels of `data`.
chol_column = data["cholesterol"].to_numpy().reshape(-1, 1)
ohot_encoded1 = ohe.fit_transform(chol_column).toarray()
ohot_encoded1 = ohot_encoded1[:, 1:].astype(int)
df_ohot1 = pd.DataFrame(data=ohot_encoded1, columns=["Above Nor Chol", "Well Above Nor Chol"])

In [None]:
# Check the cholesterol dummy columns.
df_ohot1.head()

In [None]:
# One-hot encode `gluc` and drop the first dummy column as the baseline.
# The two hard-coded column names assume exactly three distinct levels.
# The resulting frame gets a fresh 0..n-1 index, not the row labels of `data`.
gluc_column = data["gluc"].to_numpy().reshape(-1, 1)
ohot_encoded2 = ohe.fit_transform(gluc_column).toarray()
ohot_encoded2 = ohot_encoded2[:, 1:].astype(int)
df_ohot2 = pd.DataFrame(data=ohot_encoded2, columns=["Above Nor Gluc", "Well Above Nor Gluc"])

In [None]:
# Check the glucose dummy columns.
df_ohot2.head()

In [None]:
# Replace the gappy row labels left by the outlier filter with 0..n-1 so that
# `data` lines up row-for-row with df_ohot1/df_ohot2 (built from plain arrays)
# when they are concatenated column-wise. The old labels move into a new
# "index" column, which the next cell removes.
data.reset_index(inplace=True)

In [None]:
# Discard the "index" column that reset_index() created from the old row labels.
data.drop(columns="index", inplace=True)

In [None]:
# Feature frame: the first ten columns of `data` (everything except the target)
# side by side with the cholesterol and glucose dummy columns.
# Column-wise concat aligns on the row index.
feature_frames = [data.iloc[:, :10], df_ohot1, df_ohot2]
x = pd.concat(feature_frames, axis="columns")

In [None]:
# Check the combined feature frame.
x.head()

In [None]:
# The original categorical columns are now represented by their dummies; remove them.
x.drop(columns=["cholesterol", "gluc"], inplace=True)

In [None]:
# Convert the feature frame to a plain NumPy array (rebinds `x`; the column names are lost).
x = x.to_numpy()

In [None]:
# Target vector: the column at position 10 of `data` (`cardio` after the earlier reindex).
y = data.iloc[:, 10].to_numpy()

# Standardizing Numerical Features

In [None]:
# Scaler instance used for the continuous feature columns.
sc=StandardScaler()

In [None]:
x[:,:4]=sc.fit_transform(x[:,:4])

# Train Test Splitting

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

# Modeling

In [None]:
# Four different base learners whose predictions are combined by majority vote.
model1 = KNeighborsClassifier(n_neighbors=50)
model2 = LogisticRegression()
model3 = SVC(kernel="rbf", C=1)
# Seeded so the forest, and with it the reported metrics, is the same on every
# run, in line with the seeded train/test split.
model4 = RandomForestClassifier(n_estimators=500, random_state=0)

# (name, estimator) pairs in the form VotingClassifier expects.
T1 = ("knn", model1)
T2 = ("lgr", model2)
T3 = ("svm", model3)
T4 = ("rf", model4)

# voting="hard": each model casts one vote with its predicted class label.
model = VotingClassifier(estimators=[T1, T2, T3, T4], voting="hard")

In [None]:
# Train every base learner of the voting ensemble on the training split.
model.fit(x_train,y_train)

In [None]:
# Majority-vote class predictions for the held-out rows.
y_pred=model.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_test,y_pred)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,y_pred)

In [None]:
# Per-class precision, recall, F1 and support; print() keeps the text table's line breaks.
print(classification_report(y_test,y_pred))