In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Data Description

**There are 3 types of input features:**

- Objective: factual information;
- Examination: results of medical examination;
- Subjective: information given by the patient.


1. Age | Objective Feature | age | int (days)
2. Height | Objective Feature | height | int (cm) |
3. Weight | Objective Feature | weight | float (kg) |
4. Gender | Objective Feature | gender | categorical code |
5. Systolic blood pressure | Examination Feature | ap_hi | int |
6. Diastolic blood pressure | Examination Feature | ap_lo | int |
7. Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
8. Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
9. Smoking | Subjective Feature | smoke | binary |
10. Alcohol intake | Subjective Feature | alco | binary |
11. Physical activity | Subjective Feature | active | binary |
12. Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

<img src=https://img.webmd.com/dtmcms/live/webmd/consumer_assets/site_images/article_thumbnails/other/blood_pressure_charts/basic_blood_pressure_chart.png width="650">

Viewing the raw dataset and its information:

In [None]:
# read the raw data
# fields in this CSV are parsed with ';' as the separator
df_original = pd.read_csv('/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv', sep = ';')
# preview 7 randomly chosen rows (no seed is set, so the sample differs between runs)
df_original.sample(7)

In [None]:
# work on a copy so that df_original keeps the raw values
df = df_original.copy()
# column names, dtypes and non-null counts
df.info()

In [None]:
# there are no missing values in this dataset; now check whether any row is a full duplicate of another
df.duplicated().any()

In [None]:
# changing the age column from days into (rounded) years
# Derive the value from df_original instead of from df itself, so that re-running this
# cell cannot divide already-converted years by 365 a second time.
df['age'] = (df_original['age'] / 365).round().astype(int)

In [None]:
# summary statistics transposed to one row per column; [1:] skips the first row
df.describe().T[1:]    # excluding the id column

### **Observations from the statistics description:**
- minimum age recorded is 30 years old
- height(cm) and weight(kg) are objective features
    - height - min: 55cm
    - weight - min: 10kg
- ap_hi and ap_lo are examination features, and both contain extreme values
    - ap_hi - min: -150 and max: 16020
    - ap_lo - min: -70 and max: 11000
    
## Checking for outliers:

plotting histograms for quick view

In [None]:
# plotting histogram for quick view
figure = plt.figure(figsize=(12,10))

# one panel per column on a 2x2 grid, given as (column, number of bins)
hist_panels = [('height', 50), ('weight', 50), ('ap_hi', 30), ('ap_lo', 30)]
for panel_no, (column, n_bins) in enumerate(hist_panels, start=1):
    plt.subplot(2, 2, panel_no)
    plt.hist(df[column], bins=n_bins)
    plt.title(column)

plt.show()

# the extreme values in columns ap_hi and ap_lo stretch the x-axis range of their histograms

In [None]:
# to flag values lying more than `factor` times the Inter Quartile Range (IQR) outside the quartiles
def outliers_iqr(ys, factor=3):
    """Locate outliers in ``ys`` with an IQR fence and print a short report.

    The fence is ``[Q1 - factor*IQR, Q3 + factor*IQR]``. The default multiplier
    is 3, the value this notebook has always used; pass ``factor=1.5`` for the
    narrower conventional fence.

    Parameters
    ----------
    ys : array-like of numbers (NumPy array or pandas Series)
        Values to inspect.
    factor : int or float, default 3
        Multiple of the IQR added beyond each quartile to form the fence.

    Returns
    -------
    result : tuple of ndarray
        Output of ``np.where``; ``result[0]`` holds the *positional* indices
        of the values outside the fence.
    boundary : tuple
        ``(lower_bound, upper_bound)`` of the fence.
    """
    quartile1, quartile3 = np.percentile(ys, [25,75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (iqr*factor)
    upper_bound = quartile3 + (iqr*factor)

    print(f'Q1:{quartile1}, Q3:{quartile3}, IQR:{iqr}')
    print(f'Lower Bound:{lower_bound}, Upper Bound:{upper_bound}')

    # np.where yields positions, not index labels
    result = np.where((ys > upper_bound) | (ys < lower_bound))
    boundary = (lower_bound, upper_bound)

    print(f'Number of outliers: {len(result[0])}')

    return result, boundary

In [None]:
# positions of the rows whose height falls outside the IQR fence
height_outlier_index = list(outliers_iqr(df['height'])[0][0])
# outliers_iqr reports positions (np.where), hence position-based iloc
df_height_outlier = df.iloc[height_outlier_index,:]
df_height_outlier

In [None]:
# positions of the rows whose weight falls outside the IQR fence
weight_outlier_index = list(outliers_iqr(df['weight'])[0][0])
# select those rows by position for inspection
df_weight_outlier = df.iloc[weight_outlier_index,:]
df_weight_outlier

In [None]:
# positions of the rows whose ap_hi (systolic) reading falls outside the IQR fence
ap_hi_outlier_index = list(outliers_iqr(df['ap_hi'])[0][0])
# select those rows by position for inspection
df_aphi_outlier = df.iloc[ap_hi_outlier_index,:]
df_aphi_outlier

In [None]:
# positions of the rows whose ap_lo (diastolic) reading falls outside the IQR fence
ap_lo_outlier_index = list(outliers_iqr(df['ap_lo'])[0][0])
# select those rows by position for inspection
df_aplo_outlier = df.iloc[ap_lo_outlier_index,:]
df_aplo_outlier

## 2 different ways in handling outliers

1. replace with nearest boundary values
2. remove outliers

### 1.  replace with nearest boundary values -- creating df1

In [None]:
def replace_outliers_boundary(dataframe, column):
    """Clip the outliers of one column to the nearest IQR-fence boundary, in place.

    Values below the lower bound returned by ``outliers_iqr`` are set to that
    lower bound; values above the upper bound are set to the upper bound.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; it is mutated directly and nothing is returned.
    column : str
        Name of the column to clip.
    """
    # Call outliers_iqr once: every call recomputes the quartiles and prints its
    # report, so fetching each bound separately doubled both the work and the output.
    _, (lower_bound, upper_bound) = outliers_iqr(dataframe[column])

    dataframe.loc[(dataframe[column] < lower_bound), column] = lower_bound
    dataframe.loc[(dataframe[column] > upper_bound), column] = upper_bound

In [None]:
# clip each continuous column to its own IQR fence; df itself is modified
for bounded_column in ('height', 'weight', 'ap_hi', 'ap_lo'):
    replace_outliers_boundary(df, bounded_column)

In [None]:
# df1: the frame as it stands in df at this point, without the id column
df1 = df.drop(columns = ['id'])
df1

### 2. remove outliers -- creating df2

In [None]:
# pool the outlier positions found for each of the four columns
all_outlier_index = height_outlier_index + weight_outlier_index + ap_hi_outlier_index + ap_lo_outlier_index
# a row may be an outlier in more than one column; going through a set removes the repeats
all_outlier_index = list(set(all_outlier_index))
# number of distinct rows flagged
print(len(all_outlier_index))

In [None]:
# df2: drop every flagged row, renumber the remaining rows from 0, and leave out the id column
df2 = (
    df.drop(index = all_outlier_index)
      .reset_index(drop=True)
      .drop(columns=['id'])
)
df2

## Data Understanding

First, checking on the proportion of our target:

In [None]:
# checking on the proportion of our target

x = range(2)

# value_counts() orders by frequency while unique() orders by first appearance, so pairing
# the two can attach a bar to the wrong class label. Sort the proportions by class label
# and take the tick labels from that same Series instead.
plt.subplot(1,2,1)
df1_target_share = df1['cardio'].value_counts(normalize=True).sort_index()
bar1 = plt.bar(x, df1_target_share)
plt.xticks(x, df1_target_share.index)
plt.title('df1 target\'s proportion')

plt.subplot(1,2,2)
df2_target_share = df2['cardio'].value_counts(normalize=True).sort_index()
bar1 = plt.bar(x, df2_target_share)
plt.xticks(x, df2_target_share.index)
plt.title('df2 target\'s proportion')

plt.tight_layout()

plt.show()

### understand categorical variables: the proportion of each category

In [None]:
def get_cat_value_count(dataframe,column_list):
    """Print the proportion of each category for every listed column.

    For each column name the output is: the name, a row of '*' of the same
    length, the normalised value counts of that column, and a blank line.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns.
    column_list : iterable of str
        Names of the columns to summarise.
    """
    for column_name in column_list:
        print(column_name)
        print(len(column_name) * '*')
        shares = dataframe[column_name].value_counts(normalize=True)
        print(shares, end='\n\n')

# columns treated as categorical in this notebook (the target 'cardio' included)
column_cat_list = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

In [None]:
# for df1
# print the category proportions of every column named in column_cat_list
get_cat_value_count(df1, column_cat_list)

In [None]:
# for df2
# same report as for df1, on the frame with the outlier rows removed
get_cat_value_count(df2, column_cat_list)

### understand continuous variable distribution

for df1:

In [None]:
# understand continuous variable distribution
# for df1

figure3 = plt.figure(figsize = (12,8))

# one panel per continuous column on a 2x3 grid, given as (column, number of bins)
df1_panels = [('age', 15), ('height', 25), ('weight', 25), ('ap_lo', 15), ('ap_hi', 15)]
for panel_no, (column, n_bins) in enumerate(df1_panels, start=1):
    plt.subplot(2,3,panel_no)
    # sns.distplot is deprecated; histplot draws the plain count histogram
    # that distplot(kde=False) produced
    sns.histplot(df1[column], bins=n_bins, color="red")

plt.tight_layout()

plt.show()

for df2:

In [None]:
# for df2
figure4 = plt.figure(figsize = (12,8))

# one panel per continuous column on a 2x3 grid, given as (column, number of bins)
df2_panels = [('age', 15), ('height', 25), ('weight', 25), ('ap_lo', 15), ('ap_hi', 15)]
for panel_no, (column, n_bins) in enumerate(df2_panels, start=1):
    plt.subplot(2,3,panel_no)
    # sns.distplot is deprecated; histplot draws the plain count histogram
    # that distplot(kde=False) produced
    sns.histplot(df2[column], bins=n_bins, color="red")

plt.tight_layout()

plt.show()

Looking at the correlation heatmap, our target (cardio) appears to correlate most strongly with age, weight, ap_hi, ap_lo, and cholesterol.

In [None]:
# pairwise correlations of every column except id
correlations = df.drop(columns=['id']).corr()
sns.heatmap(correlations, annot=True, cmap='YlOrBr')
# enlarge the figure the heatmap was drawn on
fig = plt.gcf()
fig.set_size_inches(10,8)

## Data Preprocessing
### for df1

In [None]:
# to combine weight and height into 1 feature
# height is recorded in cm, so it is divided by 100 to get metres before squaring
df1['bmi'] = df1['weight'].div(df1['height'].div(100).pow(2)).round(2)
df1

Taking the BMI ranges according to cdc.gov:

- Below 18.5: Underweight --> 1
- 18.5 - 24.9: Normal --> 2
- 25.0 - 29.9: Overweight --> 3
- 30 and above: Obese --> 4

In [None]:
# binning the bmi feature
# np.select takes the first matching condition, so each later test only needs its upper edge;
# float codes and a NaN default mirror what row-wise assignment into a new column produces
df1_bmi = df1['bmi']
df1['bmi_cat'] = np.select(
    [df1_bmi < 18.5, df1_bmi < 25, df1_bmi < 30, df1_bmi >= 30],
    [1.0, 2.0, 3.0, 4.0],
    default=np.nan,
)

In [None]:
# cast the bin codes to plain integers, then count the rows in each BMI category
df1['bmi_cat'] = df1['bmi_cat'].astype('int')
df1['bmi_cat'].value_counts()

We will also group the blood pressure range accordingly:

Systolic Blood Pressure (ap_hi), Diastolic Blood Pressure (ap_lo)
- ap_hi < 130 mmHg and ap_lo < 90 mmHg: Normal --> 1
- ap_hi >= 130 mmHg or ap_lo >= 90 mmHg: Hypertension --> 2

In [None]:
# binning the bp (ap_hi & ap_lo) features
# normal (1): BOTH readings are below their thresholds. This mask previously used `|`, which
# also matched rows with only one low reading and relied on the next line to overwrite them.
df1.loc[(df1['ap_hi'] < 130) & (df1['ap_lo'] < 90), 'blood_pressure_cat'] = 1
# hypertension (2): EITHER reading is at or above its threshold
df1.loc[(df1['ap_hi'] >= 130) | (df1['ap_lo'] >= 90), 'blood_pressure_cat'] = 2

In [None]:
# cast the category codes to plain integers, then count the rows in each blood-pressure category
df1['blood_pressure_cat'] = df1['blood_pressure_cat'].astype('int')
df1['blood_pressure_cat'].value_counts()

In [None]:
# the 'age' column will be binned as well; look at its distribution first
df1['age'].hist()

In [None]:
# binning the age feature
# The codes overwrite the 'age' column itself, one range at a time. This works only because
# the codes (1-4) are smaller than every range's lower edge, so a row that has already been
# recoded can never match a later condition.
# NOTE(review): an age below 30 would be left as raw years -- confirm none exist.
df1.loc[(df1['age'] >= 30) & (df1['age'] < 40), 'age'] = 1
df1.loc[(df1['age'] >= 40) & (df1['age'] < 50), 'age'] = 2
df1.loc[(df1['age'] >= 50) & (df1['age'] < 60), 'age'] = 3
df1.loc[df1['age'] >= 60, 'age'] = 4

In [None]:
# dropping the columns we do not need:
# the raw measurements that bmi_cat and blood_pressure_cat were derived from
columns_remove = ['height','weight','ap_hi','ap_lo','bmi']
df1_new = df1.drop(columns=columns_remove)
df1_new

### for df2

In [None]:
# body-mass index: weight (kg) over squared height (cm converted to m), rounded to 2 decimals
df2['bmi'] = df2['weight'].div(df2['height'].div(100).pow(2)).round(2)
# binning the bmi feature
# np.select takes the first matching condition, so each later test only needs its upper edge
df2_bmi = df2['bmi']
df2['bmi_cat'] = np.select(
    [df2_bmi < 18.5, df2_bmi < 25, df2_bmi < 30, df2_bmi >= 30],
    [1.0, 2.0, 3.0, 4.0],
    default=np.nan,
)

# integer codes, then the number of rows per BMI category
df2['bmi_cat'] = df2['bmi_cat'].astype('int')
df2['bmi_cat'].value_counts()

In [None]:
# binning the bp (ap_hi & ap_lo) features
# normal (1): BOTH readings are below their thresholds. This mask previously used `|`, which
# also matched rows with only one low reading and relied on the next line to overwrite them.
df2.loc[(df2['ap_hi'] < 130) & (df2['ap_lo'] < 90), 'blood_pressure_cat'] = 1
# hypertension (2): EITHER reading is at or above its threshold
df2.loc[(df2['ap_hi'] >= 130) | (df2['ap_lo'] >= 90), 'blood_pressure_cat'] = 2

# integer codes, then the number of rows per blood-pressure category
df2['blood_pressure_cat'] = df2['blood_pressure_cat'].astype('int')
df2['blood_pressure_cat'].value_counts()

In [None]:
# binning the age feature
# The codes overwrite the 'age' column itself, one range at a time. This works only because
# the codes (1-4) are smaller than every range's lower edge, so a row that has already been
# recoded can never match a later condition.
# NOTE(review): an age below 30 would be left as raw years -- confirm none exist.
df2.loc[(df2['age'] >= 30) & (df2['age'] < 40), 'age'] = 1
df2.loc[(df2['age'] >= 40) & (df2['age'] < 50), 'age'] = 2
df2.loc[(df2['age'] >= 50) & (df2['age'] < 60), 'age'] = 3
df2.loc[df2['age'] >= 60, 'age'] = 4

In [None]:
# drop the raw measurements that bmi_cat and blood_pressure_cat were derived from
columns_remove = ['height','weight','ap_hi','ap_lo','bmi']
df2_new = df2.drop(columns=columns_remove)
df2_new

## Logistic Regression model

### based on df1 -- outliers are replaced with boundary values

In [None]:
# features: every column except the target, as a NumPy array; target: the 'cardio' column
X1_data = df1_new.drop(columns=['cardio']).to_numpy()
y1_data = df1_new['cardio']

# splitting the data into training and testing sets
# 25% of the rows are held out for testing; random_state fixes the shuffle
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_data, y1_data, test_size = 0.25, random_state = 2)
for split_part in (X1_train, y1_train, X1_test, y1_test):
    print(split_part.shape)

In [None]:
# fitting df1 data to the model
# the constructor is called without arguments, i.e. with scikit-learn's default settings
logreg1 = LogisticRegression()
logreg1.fit(X1_train, y1_train)

# fitted intercept and per-feature coefficients
print(logreg1.intercept_)
print(logreg1.coef_)

# predicted class for every row of the test split
y1_pred = logreg1.predict(X1_test)

In [None]:
# creating a confusion matrix
# rows: true labels (y1_test); columns: labels predicted by the model (y1_pred)
pd.crosstab(y1_test, 
            y1_pred, 
            rownames=['Actual'], 
            colnames=['Predicted'])

In [None]:
# share of test rows whose predicted label equals the true label
print("Accuracy:", metrics.accuracy_score(y1_test, y1_pred))

### based on df2 -- outliers are removed

In [None]:
# features: every column except the target, as a NumPy array; target: the 'cardio' column
X2_data = df2_new.drop(columns=['cardio']).to_numpy()
y2_data = df2_new['cardio']

# splitting the data into training and testing sets
# 25% of the rows are held out for testing; random_state fixes the shuffle
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_data, y2_data, test_size = 0.25, random_state = 2)
for split_part in (X2_train, y2_train, X2_test, y2_test):
    print(split_part.shape)

In [None]:
# fitting df2 data to the model
# the constructor is called without arguments, i.e. with scikit-learn's default settings
logreg2 = LogisticRegression()
logreg2.fit(X2_train, y2_train)

# fitted intercept and per-feature coefficients
print(logreg2.intercept_)
print(logreg2.coef_)

# predicted class for every row of the test split
y2_pred = logreg2.predict(X2_test)

In [None]:
# creating a confusion matrix
# rows: true labels (y2_test); columns: labels predicted by the model (y2_pred)
pd.crosstab(y2_test, 
            y2_pred, 
            rownames=['Actual'], 
            colnames=['Predicted'])

In [None]:
# share of test rows whose predicted label equals the true label
print("Accuracy:", metrics.accuracy_score(y2_test, y2_pred))

The outliers of the datasets are processed differently:
 - df1: outliers are replaced with the nearest boundary value, i.e. Q1 - 3*IQR or Q3 + 3*IQR
 - df2: outliers are removed
 
The model accuracies for the two datasets are similar: 71.7% for df1's model and 72.1% for df2's model.

**Actionables:**
 - to explore different type of classification models
 - to explore other methods of processing features with extreme values, as I realised that the lower boundary value for weight is 14kg, which is quite impossible as the recorded minimum age in the dataset is 30 years old.
 - to investigate the negative values in the ap_hi and ap_lo columns -- is there a pattern to this error, or could we simply replace them with their absolute values?