# Import Important Package

In [1483]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics 

# Load and Prepare Data

In [1484]:
data=pd.read_csv('cardio_train.csv')
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


### *The age is given in days, I have to convert it into years*

In [1485]:
data["age"] = data["age"]/365
data["age"] = data["age"].astype("int")

### *I don't use id column so i drop it*

In [1486]:
data = data.drop('id',axis=1)
data.rename(columns={'ap_hi':'systolic_bp','ap_lo':'diastolic_bp'},inplace=True)
data.head(1)

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0


### *See if have any dublicates in data*

In [1487]:
data.duplicated().sum()

3208

### *Drop dublicates*

In [1488]:
data.drop_duplicates(inplace=True)

In [1489]:
data.shape

(66792, 12)

### *See if there is any outliers in data*

In [1490]:
outlier = ((data["systolic_bp"]>200) | (data["diastolic_bp"]>180) | (data["diastolic_bp"]<50) | (data["systolic_bp"]<=80) )
print("There is {} outlier".format(data[outlier]["cardio"].count()))

There is 1415 outlier


### *Drop outliers*

In [1491]:
# Removing  the outlier from the Dataset.
data = data[~outlier]


### *Height and weight seems uncorrelated with the cardio feature but Body Mass Index (BMI) could be more helpful*
### *i use pulse pressure to determine cardio feature to reduce numbers of feature*

In [1492]:
data["bmi"] = data["weight"]/ (data["height"]/100)**2


In [1493]:
data = data.drop(['weight','height'],axis=1)


In [1494]:
data["gender"] = data["gender"] % 2
data.head(5)


Unnamed: 0,age,gender,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50,0,110,80,1,1,0,0,1,0,21.96712
1,55,1,140,90,3,1,0,0,1,1,34.927679
2,51,1,130,70,3,1,0,0,0,1,23.507805
3,48,0,150,100,1,1,0,0,1,1,28.710479
4,47,1,100,60,1,1,0,0,0,0,23.011177


### *BMI between  18.5 and 25 , personis Normal*
### *if BMI obove 25 , person is obese*
### *if BMI less than 18.5 , person is underweight *

In [1495]:

for i,row in data.iterrows():
    if row['bmi'] <18.5 :
        data.at[i,'bmi'] = 0
    elif row['bmi'] >25 :
        data.at[i,'bmi'] = 2
    else  :
        data.at[i,'bmi'] = 1      


### *systolic blood pressure number:*

*Normal: Below 120*

*Elevated: 120-129*

*Stage 1 high blood pressure (also called hypertension): 130-139*

*Stage 2 hypertension: 140 or more*

*Hypertensive crisis: 180 or more.*

In [1496]:
for i,row in data.iterrows():
    if row['systolic_bp'] <120 :
        data.at[i,'systolic_bp'] = 0
    elif (row['systolic_bp'] >=120) and (row['systolic_bp'] <=129) :
        data.at[i,'systolic_bp'] = 1
    elif (row['systolic_bp'] >=130) and (row['systolic_bp'] <=139) :
        data.at[i,'systolic_bp'] = 2
    elif (row['systolic_bp'] >=140) and (row['systolic_bp'] <179) :
        data.at[i,'systolic_bp'] = 3
    else  :
        data.at[i,'systolic_bp'] = 4  

### *diastolic blood pressure number means:*

*Normal: Lower than 80*

*Stage 1 hypertension: 80-89*

*Stage 2 hypertension: 90 or more*

*Hypertensive crisis: 120 or more*

In [1497]:
for i,row in data.iterrows():
    if row['diastolic_bp'] <80 :
        data.at[i,'diastolic_bp'] = 0
    elif (row['diastolic_bp'] >=80) and (row['diastolic_bp'] <=90) :
        data.at[i,'diastolic_bp'] = 1
    elif (row['diastolic_bp'] >90) and (row['diastolic_bp'] <=120) :
        data.at[i,'diastolic_bp'] = 2
    else  :
        data.at[i,'diastolic_bp'] = 3  

In [1498]:
data.age.unique()

array([50, 55, 51, 48, 47, 60, 61, 54, 40, 39, 45, 58, 59, 63, 64, 53, 49,
       57, 56, 46, 43, 62, 52, 42, 44, 41, 29, 30])

In [1499]:
for i,row in data.iterrows():
    if (row['age'] >=20) and (row['age'] <=40) :
        data.at[i,'age'] = 0
    elif (row['age'] >40) and (row['age'] <=45) :
        data.at[i,'age'] = 1
    elif (row['age'] >45) and (row['age'] <=50) :
        data.at[i,'age'] = 2
    elif (row['age'] >50) and (row['age'] <=55) :
        data.at[i,'age'] = 3
    elif (row['age'] >55) and (row['age'] <=60) :
        data.at[i,'age'] = 4
    else  :
        data.at[i,'age'] = 5

### *Rearrange Columns*

In [1500]:
data =  data[ [ col for col in data.columns if col != 'cardio' ]+['cardio'] ]

In [1501]:
data.head(3)

Unnamed: 0,age,gender,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,bmi,cardio
0,2,0,0,1,1,1,0,0,1,1.0,0
1,3,1,3,1,3,1,0,0,1,2.0,1
2,3,1,2,0,3,1,0,0,0,1.0,1


# Train and Test Split

In [1502]:
def train_test_splits(df,test_size):
    if isinstance(test_size,float):
        test_size=round(test_size *len(df))
    indcies=df.index.tolist()
    test_indices=random.sample(population=indcies,k=test_size)
    test_set=df.loc[test_indices]
    train_set=df.drop(test_indices)
    return train_set,test_set

In [1503]:
random.seed(0)
train_set,test_set=train_test_splits(data,test_size=0.1)

Algorithm
![](Algorithm.png)
 

In [1504]:
def check_purity(data): 
    labels = data[:,-1]
    unique_classes = np.unique(labels)

    if len(unique_classes) == 1:
        return True
    else:
        return False

In [1505]:
check_purity(train_set.values)

False

### *Return Majority Class*

In [1506]:
def classify_data(dataset): 
    labels = dataset[:,-1]
    unique_classes, count_unique_classes = np.unique(labels, return_counts=True)
    index = count_unique_classes.argmax()
    classification = unique_classes[index]
    return classification

In [1507]:
classify_data(train_set[train_set.age<30].values)

1.0

### *Potential_splits*

In [1508]:
def get_potential_split(data):
    potential_splits = {}
    n_cols = data.shape[1]  # Number of columns
    for i_col in range(n_cols - 1): # Disregarding the last label column  
        potential_splits[i_col] = []
        values = data[:,i_col]
        unique_values = np.unique(values)   # All possible values
        for index in range(len(unique_values)):
            if index !=0 :
                current_value=unique_values[index]
                previous_value=unique_values[index-1]
                potential_splits[i_col].append((current_value+previous_value)/2)
    return potential_splits

In [1509]:
get_potential_split(train_set.values)

{0: [0.5, 1.5, 2.5, 3.5, 4.5],
 1: [0.5],
 2: [0.5, 1.5, 2.5, 3.5],
 3: [0.5, 1.5, 2.5],
 4: [1.5, 2.5],
 5: [1.5, 2.5],
 6: [0.5],
 7: [0.5],
 8: [0.5],
 9: [0.5, 1.5]}

### *Split data*

In [1510]:
def split_data(data,split_column, split_value):
    split_column_values = data[:, split_column]

    left = data[split_column_values <= split_value]
    right = data[split_column_values >  split_value]

    return left, right


In [1511]:
left,right=split_data(train_set.values,3,80)

### Lowest Overall gini

### Gini impurity 
## *$G=1-\sum^{n}_{k=1}{p_{k}^{2}}$*

In [1512]:
def calculate_gini(data): 
    labels = data[:,-1]
    _, counts = np.unique(labels, return_counts=True)

    probs = counts / counts.sum()
    gini = 1 - sum(np.square(probs))

    return gini

### Cost Function That is minimized in classification

### Overall Gini
## *$J=\frac{m_{left}}{m} G_{left}+\frac{m_{right}}{m} G_{right}$*

In [1513]:
def calculate_overall_gini(left, right): 
    total_num = len(left) + len(right)
    prob_left = len(left) / total_num
    prob_right = len(right) / total_num

    overall_gini = prob_left * calculate_gini(left) + prob_right * calculate_gini(right)

    return overall_gini 

In [1514]:
calculate_overall_gini(left,right)

0.49990733740243565

### *Find best feature and best valur to this feature to split data*

In [1515]:
def find_best_split(data, potential_splits): 
    global best_split_column, best_split_value

    min_overall_impurity = float('inf') # Store the largest overall impurity value
    for coulmn_index in potential_splits:
        for value in potential_splits[coulmn_index]:
            left,right = split_data(data,coulmn_index, value)
            overall_impurity = calculate_overall_gini(left, right)

            if overall_impurity <= min_overall_impurity:    # Find new minimised impurity
                min_overall_impurity = overall_impurity     # Replace the minimum impurity
                best_split_column = coulmn_index
                best_split_value = value
    return best_split_column, best_split_value

In [1516]:

splits=get_potential_split(train_set.values)

find_best_split(train_set.values,splits)

(2, 1.5)

## Decision Tree Algorithm

In [1517]:
def decision_tree_algorithm(data,counter=0,min_sample=10,max_depth=6):
    #data Preperation 
    if counter==0:
        global COLUMN_HEADERS
        COLUMN_HEADERS=data.columns
        data=data.values
        

    else:
        
        data=data 
    # base Algorithm ==> recursive function
    if (check_purity(data)) or (len(data)< min_sample) or(counter==max_depth) :
        classification=classify_data(data)
        return classification
        #recursive part   
    else:
        counter+=1
        #helper function 
        potential_splits=get_potential_split(data)

        split_column,split_value=find_best_split(data,potential_splits)
        left,right=split_data(data,split_column,split_value)

        #instant sub tree
        features_name=COLUMN_HEADERS[split_column]
        question ="{} <= {}".format(features_name,split_value)
        sub_tree={question:[]}


        #find answer(recuresion)
        yes_answer=decision_tree_algorithm(left,counter,min_sample,max_depth)
        no_answer=decision_tree_algorithm(right,counter,min_sample,max_depth)
        if yes_answer==no_answer:
            sub_tree=yes_answer

        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)

        return sub_tree


In [1518]:
tree=decision_tree_algorithm(train_set,min_sample=10)


## Classification

### *Classifiy Just one instance*

In [1519]:
question=list(tree.keys())[0]
question


'systolic_bp <= 1.5'

In [1520]:
def classify_instance(instance,tree):        
        question=list(tree.keys())[0]
        feature_name,comparison_operator,value = question.split()

        #ask question 
        if instance[feature_name] <= float(value):
            answer=tree[question][0]
        else:
            answer=tree[question][1]

        #base case
        if not isinstance(answer,dict):
            return answer
        else:
            residual_tree = answer
            return classify_instance(instance,residual_tree)   
                




### *Classifiy all instances from test set*

In [1521]:
def predict(test_set,tree):
    predications=list()

    for i in range(test_set.shape[0]):
        predications.append(classify_instance(test_set.iloc[i],tree))

    return predications

In [1522]:
Y_p=predict(test_set,tree)

## Evaluate Performance

In [1523]:
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0
 

In [1524]:
actual=test_set.iloc[:,-1]
accuracy_metric(actual.values,Y_p)

72.83572958091159

## Use My Model 

In [1525]:
random.seed(0)
train_df,test_df=train_test_splits(data,test_size=0.1)
Tree=decision_tree_algorithm(train_df,min_sample=10)
y_predict=predict(test_set,Tree)
actual=test_df.iloc[:,-1]
accuracy_metric(actual.values,y_predict)

72.83572958091159

# Use Sicit_Learn Model To compare score

In [1526]:
col= data.shape[1]
X= data.iloc[:,:col-1]
Y=data.iloc[:,col-1:col]

In [1527]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

In [1528]:
Model = DecisionTreeClassifier()
Model.fit(X_train,y_train)
y_pred = Model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 71.46426022229019


# Bagging Ensemble Learning.

In [1529]:
def Bagged_fitting(data, num_of_bagged):
        models=[]
       
        for i in range(num_of_bagged):
            sample=data.sample(n=len(data))
            model=decision_tree_algorithm(sample)
            models.append(model)
        return models   

In [1530]:
def Bagged_prdiction(test_set,models,num_of_bagged):
        pred=np.zeros(len(test_set))
        for model in models:
            pred+=predict(test_set,model)  
        return np.round(pred/num_of_bagged) 

# Use My Bagging Ensemble Model

In [1531]:
trees=Bagged(train_set,9)

In [1532]:
actual=test_set.iloc[:,-1]
YM=Bagged_prdiction(test_set,trees,9)
accuracy_metric(actual.values,YM)

71.93331293973692