In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the semicolon-delimited cardio CSV into the working DataFrame
df = pd.read_csv('cardio_train.csv', delimiter=';')

In [None]:
# Derive age in years from the 'age' column
# (assumes 'age' is recorded in days, given the division by 365 -- TODO confirm)
df['age_years'] = df['age'].div(365)

In [None]:
# Display DataFrame information: df.info() prints a per-column summary
# (column names, non-null counts, dtypes) plus memory usage.
df.info()

In [None]:
# Describe the DataFrame: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns, shown as the cell's output.
df.describe()

In [None]:
# Number of missing values in each column (isna is the same check as isnull)
df.isna().sum()

# Visualize Gender vs Cardio Counts

This section visualizes the relationship between gender and cardio outcomes. The grouped bar chart shows, for each gender (1 = female, 2 = male), the number of records with each cardio outcome (0 = no disease, 1 = disease).

In [None]:
# Cross-tabulate record counts: one row per gender, one column per cardio outcome
gender_cardio_counts = (
    df.groupby(['gender', 'cardio'])
    .size()
    .unstack(fill_value=0)
)

# Draw side-by-side bars, then decorate through the Axes that pandas returns
ax = gender_cardio_counts.plot(kind='bar', stacked=False)
ax.set_title('Counts of Cardio Outcomes by Gender')
ax.set_xlabel('Gender (1 = Female, 2 = Male)')
ax.set_ylabel('Count')
# Replace the numeric gender codes on the x axis with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Female', 'Male'], rotation=0)
ax.legend(title='Cardio Outcome', labels=['No Disease (0)', 'Disease (1)'])
plt.tight_layout()
plt.show()

In [None]:
# Calculate counts and percentage of cardiovascular disease (cardio == 1) by gender.
# The variable names still say "infection" so that any later cell referring to them
# keeps working; the printed labels use "disease", which is what cardio == 1 denotes
# in this notebook's plots.
gender_counts = df['gender'].value_counts().sort_index()

# Align the positive counts to every gender present in the data, so a gender with
# no cardio == 1 rows reports 0 (and 0%) instead of silently becoming NaN.
cardio_infection_counts = (
    df.loc[df['cardio'] == 1, 'gender']
    .value_counts()
    .reindex(gender_counts.index, fill_value=0)
    .sort_index()
)
infection_percentage = (cardio_infection_counts / gender_counts * 100).round(2)

print("Gender counts:")
print(gender_counts)
print("\nCardio disease counts (cardio=1):")
print(cardio_infection_counts)
print("\nPercentage with cardio disease by gender:")
print(infection_percentage)

In [None]:
# Distribution of systolic pressure (ap_hi) per gender, drawn on an explicit Axes
hi_fig, hi_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='gender', y='ap_hi', ax=hi_ax)
hi_ax.set_title('Box Plot of Systolic Blood Pressure by Gender')
hi_ax.set_xlabel('Gender (1 = Female, 2 = Male)')
hi_ax.set_ylabel('Systolic Blood Pressure (ap_hi)')
# Swap the numeric gender codes for readable category names
hi_ax.set_xticks([0, 1])
hi_ax.set_xticklabels(['Female', 'Male'])
hi_fig.tight_layout()
plt.show()

In [None]:
# Box plot of diastolic blood pressure (ap_lo) by gender.
# The y-axis label and title name the diastolic reading, matching the ap_lo
# column that is actually plotted.
plt.figure(figsize=(8, 6))
sns.boxplot(x='gender', y='ap_lo', data=df)
plt.xlabel('Gender (1 = Female, 2 = Male)')
plt.ylabel('Diastolic Blood Pressure (ap_lo)')
plt.title('Box Plot of Diastolic Blood Pressure by Gender')
# Replace the numeric gender codes with readable category names
plt.xticks([0, 1], ['Female', 'Male'])
plt.tight_layout()
plt.show()

In [None]:
# Records whose systolic reading (ap_hi) lies strictly between 200 and 300
df.loc[df['ap_hi'].gt(200) & df['ap_hi'].lt(300)]

In [None]:
# Collect every record with a systolic reading (ap_hi) above 200,
# report how many there are, then render the rows themselves
high_ap_hi = df.loc[df['ap_hi'].gt(200)]
print(f"Number of records with ap_hi > 200: {len(high_ap_hi)}")
display(high_ap_hi)

In [None]:
# Collect every record with a negative systolic reading (ap_hi < 0),
# report how many there are, then render the rows themselves
low_ap_hi = df.loc[df['ap_hi'].lt(0)]
print(f"Number of records with ap_hi < 0: {len(low_ap_hi)}")
display(low_ap_hi)

In [None]:
# Collect every record with a negative diastolic reading (ap_lo < 0),
# report how many there are, then render the rows themselves
low_ap_lo = df.loc[df['ap_lo'].lt(0)]
print(f"Number of records with ap_lo < 0: {len(low_ap_lo)}")
display(low_ap_lo)

In [None]:
# Collect every record with a diastolic reading (ap_lo) above 200,
# report how many there are, then render the rows themselves
high_ap_lo = df.loc[df['ap_lo'].gt(200)]
print(f"Number of records with ap_lo > 200: {len(high_ap_lo)}")
display(high_ap_lo)

In [None]:
# Histogram of the ap_lo values held in high_ap_lo, with a title and axis labels
# so the figure can be read on its own; plt.show() also keeps the raw return
# value of plt.hist from being echoed as the cell's output.
plt.figure(figsize=(8, 6))
plt.hist(high_ap_lo['ap_lo'], bins=30, edgecolor='black')
plt.xlabel('Diastolic Blood Pressure (ap_lo)')
plt.ylabel('Count')
plt.title('Histogram of High Diastolic Blood Pressure Values (ap_lo)')
plt.tight_layout()
plt.show()

In [None]:
# Plausible blood-pressure bounds used to validate readings
# (SBP = systolic: 60-240, DBP = diastolic: 40-130)
MIN_SYSTOLIC, MAX_SYSTOLIC = 60, 240
MIN_DIASTOLIC, MAX_DIASTOLIC = 40, 130

In [None]:
# CORRECT A COMMON DATA ENTRY ERROR: diastolic values recorded ten times too large
# (e.g. 800 where 80 was meant -- an extra trailing zero / dropped decimal point).
# A value is treated as this kind of error when it lies between 400 and 1300
# (inclusive) AND is an exact multiple of 100.
suspicious_dia_range = (400, 1300)

# Create a boolean mask for rows with this error
dia_error_mask = (
    (df['ap_lo'] >= suspicious_dia_range[0])
    & (df['ap_lo'] <= suspicious_dia_range[1])
    & (df['ap_lo'] % 100 == 0)
)

# Correct those specific rows by dividing the diastolic value by 10.
# Floor division is exact here because every flagged value is a multiple of 100,
# and it returns integers for an integer column, so writing the result back does
# not push float values into 'ap_lo' the way true division would.
df.loc[dia_error_mask, 'ap_lo'] = df.loc[dia_error_mask, 'ap_lo'] // 10

print(f"Corrected {dia_error_mask.sum()} diastolic values by dividing by 10.")

In [None]:
# Boolean mask: True where both readings fall inside their plausible ranges
# (bounds are inclusive at both ends)
valid_data_mask = (
    df['ap_hi'].between(MIN_SYSTOLIC, MAX_SYSTOLIC)
    & df['ap_lo'].between(MIN_DIASTOLIC, MAX_DIASTOLIC)
)

In [None]:
# Keep only the rows flagged valid; copy so later edits do not touch df
cleaned_df = df.loc[valid_data_mask].copy()

In [None]:
# Summary statistics of the cleaned dataset, shown as the cell's output
cleaned_df.describe()

In [None]:
# Rows rejected by the validity mask (the complement of cleaned_df),
# followed by a tally of kept versus removed records
final_invalid_df = df.loc[~valid_data_mask]
print(f"Final dataset has {len(cleaned_df)} valid rows.")
print(f"Removed {len(final_invalid_df)} invalid records that could not be corrected.")

In [None]:
# Per-outcome (cardio 0 / 1) descriptive statistics for both pressure columns
bp_by_cardio = cleaned_df.groupby('cardio')[['ap_hi', 'ap_lo']]
summary = bp_by_cardio.describe(percentiles=[0.25, 0.75])

# Keep only the range and quartile statistics for every pressure column
wanted_stats = ['min', '25%', '75%', 'max']
summary_stats = summary.loc[:, pd.IndexSlice[:, wanted_stats]]
print(summary_stats)

In [None]:
# Side-by-side box plots of ap_hi and ap_lo split by cardio outcome (0 and 1)
bp_fig, (hi_panel, lo_panel) = plt.subplots(1, 2, figsize=(14, 6))

# (axes, column, y-axis label, title) for each panel, left to right
bp_panels = [
    (hi_panel, 'ap_hi', 'Systolic Blood Pressure (ap_hi)', 'Box Plot of ap_hi by Cardio Outcome'),
    (lo_panel, 'ap_lo', 'Diastolic Blood Pressure (ap_lo)', 'Box Plot of ap_lo by Cardio Outcome'),
]

for panel_ax, bp_column, y_label, panel_title in bp_panels:
    sns.boxplot(data=cleaned_df, x='cardio', y=bp_column, ax=panel_ax)
    panel_ax.set_xlabel('Cardio (0 = No Disease, 1 = Disease)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)

bp_fig.tight_layout()
plt.show()

In [None]:
# Baseline: number of records per cardio outcome across the whole cleaned dataset.
# No ap_hi filter is applied here, so the printed header says so.
cardio_counts = cleaned_df['cardio'].value_counts().sort_index()
print("Counts of cardio outcomes in the cleaned dataset:")
print(cardio_counts)

In [None]:
# Count of cardio outcomes when ap_hi > 120.
# The filter runs on cleaned_df rather than the raw df, so readings outside the
# plausible limits (already excluded by the validity mask) are not counted as
# "high" blood pressure.
# NOTE: this rebinds high_ap_hi, which an earlier cell used for the ap_hi > 200 rows.
high_ap_hi = cleaned_df[cleaned_df['ap_hi'] > 120]
cardio_counts = high_ap_hi['cardio'].value_counts().sort_index()
print("Counts of cardio outcomes when ap_hi > 120:")
print(cardio_counts)