In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,cross_validate
import warnings
# Install a global "ignore" filter so that no warnings are shown for the rest of
# the session.
# NOTE(review): this also hides any warnings the libraries used below may emit.
warnings.filterwarnings('ignore')
from sklearn.svm import SVC, LinearSVC


In [2]:
# Load the semicolon-separated CSV into a DataFrame and report its dimensions
df = pd.read_csv("cardio_train.csv", sep=";")
row_count, column_count = df.shape
print("Number of rows: %d" % row_count)
print("Number of columns: %d" % column_count)


Number of rows: 70000
Number of columns: 13


In [3]:
# Show the leading five records as plain text
print(df.head(n=5))

   id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393       2     168    62.0    110     80            1     1      0   
1   1  20228       1     156    85.0    140     90            3     1      0   
2   2  18857       1     165    64.0    130     70            3     1      0   
3   3  17623       2     169    82.0    150    100            1     1      0   
4   4  17474       1     156    56.0    100     60            1     1      0   

   alco  active  cardio  
0     0       1       0  
1     0       1       1  
2     0       0       1  
3     0       1       1  
4     0       0       0  


In [4]:
# Count the missing entries in every column
print(df.isna().sum())

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


In [5]:
# Report the dtype pandas inferred for each column
dtype_per_column = df.dtypes
print(dtype_per_column)


id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object


In [6]:
# Rescale `age` by 365.25 and keep two decimals. The raw values (e.g. 18393)
# appear to be days, so the result reads as years — TODO confirm the unit.
# NOTE: the column is overwritten in place, so re-running this cell divides again.
df['age'] = df['age'].div(365.25).round(2)

In [7]:
# Remove the `id` column from the frame in place.
# NOTE: running this cell a second time raises KeyError because `id` is already gone.
df.drop(columns=['id'], inplace=True)

In [8]:
# Display the frame after the `age` rescaling and the removal of `id`
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.36,2,168,62.0,110,80,1,1,0,0,1,0
1,55.38,1,156,85.0,140,90,3,1,0,0,1,1
2,51.63,1,165,64.0,130,70,3,1,0,0,0,1
3,48.25,2,169,82.0,150,100,1,1,0,0,1,1
4,47.84,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,52.68,2,168,76.0,120,80,1,1,1,0,1,0
69996,61.88,1,158,126.0,140,90,2,2,0,0,1,1
69997,52.20,2,183,105.0,180,90,3,1,0,1,0,1
69998,61.41,1,163,72.0,135,80,1,2,0,0,0,1


In [21]:
# Summary statistics for the `ap_hi` column.
# NOTE(review): this cell carries execution count 21 and its recorded count is
# 69925 (70000 rows minus the 75 duplicates), so the saved output was produced
# after the duplicate drop further down, not at this position in the notebook.
# The recorded min (-150) and max (16020) look like data-entry errors — confirm
# whether these rows should be filtered before modelling.
df.loc[:, 'ap_hi'].describe()

count    69925.000000
mean       128.826314
std        154.093668
min       -150.000000
25%        120.000000
50%        120.000000
75%        140.000000
max      16020.000000
Name: ap_hi, dtype: float64

In [9]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted)
df.duplicated(keep='first').sum()

75

In [10]:
# Remove repeated rows in place, retaining the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

In [11]:
# Separate the predictor columns from the target column `cardio`
X = df.drop(columns=["cardio"])
y = df["cardio"]

# Hold out 20% of the rows for testing. A fixed random_state makes the split —
# and therefore the test accuracy reported below — reproducible across kernel
# restarts; without it the rows are reshuffled differently on every run.
# 42 matches the seed already passed to SVC in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [12]:
# Build an RBF-kernel SVC and score it with 3-fold cross-validation.
# Note that the folds are drawn from the full X / y, not from the training split.
svc_model = SVC(kernel="rbf", C=100, gamma=0.00001, random_state=42)
svc_cv = cross_validate(estimator=svc_model, X=X, y=y, cv=3)
svc_cv

{'fit_time': array([68.2727983 , 79.16018033, 76.74615955]),
 'score_time': array([52.2744379 , 54.49357057, 59.08168936]),
 'test_score': array([0.73272127, 0.72936331, 0.72897718])}

In [14]:
# Fit the SVC on the training split only
svc_model.fit(X=X_train, y=y_train)


In [15]:
# Predict labels for the held-out rows.
# NOTE(review): `csv_pred` looks like a typo for `svc_pred`; the name is kept
# because the accuracy cell below reads it.
csv_pred = svc_model.predict(X=X_test)

In [17]:
# Fraction of held-out rows whose predicted label equals the true label
score = accuracy_score(y_true=y_test, y_pred=csv_pred)

In [18]:
# Display the test-set accuracy computed by accuracy_score
score

0.7358598498391133

# Plot XGBoost feature importances.
# This cell used to reference `XGB_Classifier`, a name that no cell in this
# notebook defines, so it raised NameError on a fresh "Restart & Run All".
# The classifier is now fitted here so the cell is self-contained.
# NOTE(review): the hyperparameters of the model originally plotted are not
# recorded in the notebook; library defaults plus a fixed seed are used here —
# replace them if the intended settings are known.
XGB_Classifier = XGBClassifier(random_state=42)
XGB_Classifier.fit(X_train, y_train)

# One large square axes; plot_importance draws the horizontal bars onto it
fig, axes = plt.subplots(figsize=(15, 15))
xgb.plot_importance(XGB_Classifier, ax=axes, height=0.5)
plt.show()
# Release the figure explicitly so it does not linger in memory
plt.close(fig)

In [19]:
### Serialize the fitted SVC model to classifier.pkl with pickle
import pickle

# The context manager closes the file handle even if pickle.dump raises,
# which the manual open()/close() pair did not guarantee.
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(svc_model, pickle_out)