In [98]:
import pandas as pd
import numpy as np

# Uploadind .cvs file
df = pd.read_csv("data/covid19_patient_symptoms_diagnosis.csv")
rows, columns = df.shape
print(f"COVID-19 dataset has {rows} and {columns} columns.")


COVID-19 dataset has 5000 and 18 columns.


In [103]:
# Basic First Look at Data
print("--- First 5 Rows of the Dataset ---")
print(df.head())
print('\n')
print("--- Last 5 Rows of the Dataset ---")
print(df.tail())
print("\nData types:")
print(df.dtypes)


--- First 5 Rows of the Dataset ---
   patient_id  age  gender  fever  dry_cough  sore_throat  fatigue  headache  \
0           1   52    Male      1          0            1        1         0   
1           2   15    Male      0          0            0        1         1   
2           3   72    Male      1          0            1        0         0   
3           4   61  Female      0          0            1        1         1   
4           5   21  Female      1          1            0        0         0   

   shortness_of_breath  loss_of_smell  loss_of_taste  oxygen_level  \
0                    0              0              0            98   
1                    0              0              0            85   
2                    1              0              1            99   
3                    0              1              1            86   
4                    1              0              1            90   

   body_temperature comorbidity  travel_history  contact_with_

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   patient_id            5000 non-null   int64  
 1   age                   5000 non-null   int64  
 2   gender                5000 non-null   object 
 3   fever                 5000 non-null   int64  
 4   dry_cough             5000 non-null   int64  
 5   sore_throat           5000 non-null   int64  
 6   fatigue               5000 non-null   int64  
 7   headache              5000 non-null   int64  
 8   shortness_of_breath   5000 non-null   int64  
 9   loss_of_smell         5000 non-null   int64  
 10  loss_of_taste         5000 non-null   int64  
 11  oxygen_level          5000 non-null   int64  
 12  body_temperature      5000 non-null   float64
 13  comorbidity           2275 non-null   object 
 14  travel_history        5000 non-null   int64  
 15  contact_with_patient 

In [33]:
# Check if there are any duplicate Rows
print("Total Duplicate Rows : ", df.duplicated().sum())

Total Duplicate Rows :  0


In [107]:
# Handling Missing Values  
print("List of Columns and their Missing Value Count")
print(df.isnull().sum())
print('\n')
print('In This Data set, the only Column that contains missing values is Comorbidity')
print('\n')
# Find the Values withing the Comorbidty Column
print('Unique values in comorbidity column: ', df['comorbidity'].unique())
# Comorbiditiy Value Count & Percentage
counts = df['comorbidity'].value_counts(dropna=False)
percent = df['comorbidity'].value_counts(normalize=True, dropna=False) * 100
print(pd.DataFrame({'Count': counts, 'Percent': percent.round(2)}))


List of Columns and their Missing Value Count
patient_id                 0
age                        0
gender                     0
fever                      0
dry_cough                  0
sore_throat                0
fatigue                    0
headache                   0
shortness_of_breath        0
loss_of_smell              0
loss_of_taste              0
oxygen_level               0
body_temperature           0
comorbidity             2725
travel_history             0
contact_with_patient       0
chest_pain                 0
covid_result               0
dtype: int64


In This Data set, the only Column that contains missing values is Comorbidity


Unique values in comorbidity column:  ['Diabetes' nan 'Asthma' 'Heart Disease']
               Count  Percent
comorbidity                  
NaN             2725    54.50
Diabetes        1001    20.02
Heart Disease    792    15.84
Asthma           482     9.64


In [68]:
# Handling Missing Values
print('Converting comorbidity values from NaN to None')
# Create new dataframe
df_cleaned = df.copy()
df_cleaned['comorbidity'] = df_cleaned['comorbidity'].fillna("None")
# Comorbiditiy Value Count & Percentage
counts = df_cleaned['comorbidity'].value_counts(dropna=False)
percent = df_cleaned['comorbidity'].value_counts(normalize=True, dropna=False) * 100
print(pd.DataFrame({'Count': counts, 'Percent': percent.round(2)}))

Converting comorbidity values from NaN to None
               Count  Percent
comorbidity                  
None            2725    54.50
Diabetes        1001    20.02
Heart Disease    792    15.84
Asthma           482     9.64


In [69]:
# Handling Missing Values 
# Checking Results 
print(f"Missing values in original (df): {df['comorbidity'].isnull().sum()}")
print(f"Missing values in new (df_cleaned): {df_cleaned['comorbidity'].isnull().sum()}")

Missing values in original (df): 2725
Missing values in new (df_cleaned): 0


In [114]:
# Detailed Statistics for Age
print("Detailed Statistics for Age")
print(df_cleaned['age'].describe(), '\n')
print(f"Missing values in Age: {df['age'].isnull().sum()}")

Detailed Statistics for Age
count    5000.000000
mean       44.380600
std        25.551632
min         1.000000
25%        22.000000
50%        44.000000
75%        66.000000
max        89.000000
Name: age, dtype: float64 

Missing values in Age: 0


In [109]:
# Handling Outliers for Age
Q1 = df_cleaned['age'].quantile(0.25)
Q3 = df_cleaned['age'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df_cleaned[(df_cleaned['age'] < lower_bound) | (df_cleaned['age'] > upper_bound)]

print(f"Any age below {lower_bound} or above {upper_bound} is a statistical outlier.")
# Print out Outlier Age Analysis
print('-' * 30)
print(" AGE OUTLIER ANALYSIS ")
print('-' * 30)
print(f"Quartile 1 (25%): {Q1:>10}")
print(f"Quartile 3 (75%): {Q3:>10}")
print(f"IQR:              {IQR:>10}")
print("-" * 30)
print(f"Lower Bound:      {lower_bound:>10}")
print(f"Upper Bound:      {upper_bound:>10}")
print("-" * 30)
print(f"Outliers Found:   {len(outliers):>10}")
print("-" * 30)

Any age below -44.0 or above 132.0 is a statistical outlier.
------------------------------
 AGE OUTLIER ANALYSIS 
------------------------------
Quartile 1 (25%):       22.0
Quartile 3 (75%):       66.0
IQR:                    44.0
------------------------------
Lower Bound:           -44.0
Upper Bound:           132.0
------------------------------
Outliers Found:            0
------------------------------


In [128]:
# Age Analysis using Gender Grouping
df_cleaned.groupby('gender')['age'].agg(['mean', 'median', 'min', 'max', 'count'])

Unnamed: 0_level_0,mean,median,min,max,count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,44.638425,45.0,1,89,2514
Male,44.119871,44.0,1,89,2486


In [140]:
# Age Group Aggregation
bins = [0, 18, 35, 50, 60, 80, 100]
labels = ['Minors (0-17)', 'Young Adult (18-34)', 'Adult (35-49)', 'Middle Aged (50-64)',
          'Senior (65-79)', 'Elderly (80+)']

df_cleaned['age_categories'] = pd.cut(df_cleaned['age'], bins=bins, labels=labels, right=False)
print("First 5 rows by Age Category")
print(df_cleaned[['age','age_categories']].head())

category_counts = df_cleaned['age_categories'].value_counts().sort_index()
total_patients = category_counts.sum()



First 5 rows by Age Category
   age       age_categories
0   52  Middle Aged (50-64)
1   15        Minors (0-17)
2   72       Senior (65-79)
3   61       Senior (65-79)
4   21  Young Adult (18-34)


In [145]:
print('---Patient Total per Age Category ---')
print(category_counts)
print(f"Sum of All Age Categories: {total_patients}")
print('\n')
print('---Patient Total per Age Category by Gender Count ---')

---Patient Total per Age Category ---
age_categories
Minors (0-17)           975
Young Adult (18-34)     976
Adult (35-49)           861
Middle Aged (50-64)     560
Senior (65-79)         1084
Elderly (80+)           544
Name: count, dtype: int64
Sum of All Age Categories: 5000


---Patient Total per Age Category by Gender Count ---


In [148]:
gender_within_age = pd.crosstab(df['age_categories'], df['gender'], normalize='index') * 100
age_within_gender = pd.crosstab(df['age_categories'], df['gender'], normalize='columns') * 100

print("--- Gender Split WITHIN each Age Category (%) ---")
print(gender_within_age.map('{:.2f}%'.format))

print("\n--- Age Distribution WITHIN each Gender (%) ---")
print(age_within_gender.map('{:.2f}%'.format))


--- Gender Split WITHIN each Age Category (%) ---
gender               Female    Male
age_categories                     
Minors (0-17)        48.62%  51.38%
Young Adult (18-34)  50.92%  49.08%
Adult (35-49)        52.03%  47.97%
Middle Aged (50-64)  47.68%  52.32%
Senior (65-79)       51.38%  48.62%
Elderly (80+)        49.82%  50.18%

--- Age Distribution WITHIN each Gender (%) ---
gender               Female    Male
age_categories                     
Minors (0-17)        18.85%  20.15%
Young Adult (18-34)  19.77%  19.27%
Adult (35-49)        17.82%  16.61%
Middle Aged (50-64)  10.62%  11.79%
Senior (65-79)       22.16%  21.20%
Elderly (80+)        10.78%  10.98%


In [134]:
# COVID Positive cases
positive_stats = df.groupby('age_categories', observed=False)['covid_result'].mean()*100
cleaned_stats = positive_stats.map('{:.2f}%'.format)
print(cleaned_stats)

age_categories
Minors (0-17)          51.38%
Young Adult (18-34)    52.15%
Adult (35-49)          50.06%
Middle Aged (50-64)    53.39%
Senior (65-79)         53.23%
Elderly (80+)          52.02%
Name: covid_result, dtype: object


In [135]:
# Symptom Count by Age Group
symptoms = ['fever', 'dry_cough', 'sore_throat', 'fatigue','headache', 'shortness_of_breath',
            'loss_of_smell', 'loss_of_taste','chest_pain' ]
df['symptom_count'] = df[symptoms].sum(axis = 1)

severity_group = df.groupby('age_categories', observed = False)['symptom_count'].mean().sort_values()
print(severity_group)

age_categories
Minors (0-17)          3.793846
Young Adult (18-34)    3.790984
Adult (35-49)          3.725900
Middle Aged (50-64)    3.792857
Senior (65-79)         3.690037
Elderly (80+)          3.783088
Name: symptom_count, dtype: float64


In [None]:
# Check the first 5 rows
df.head()
# Print Total Number of Records
total_records = len(df)
print("Total Records in Data Frame", total_records)

#Checking Number of Males 
male_count = (df['gender'] == 'Male').sum()
print("Total Males:", male_count)

#Checking Number of Females
female_count = (df['gender'] == 'Female').sum()
print("Total Females:", female_count)

# Checking both male and female records 
gender_summary = df['gender'].value_counts()
print(gender_summary)

# Checking Mean age of entire dataset
mean_age = df['age'].mean()
print("Mean Age of Entire Dataset:", mean_age)
rounded_age = round(mean_age)
print("Rounded Age of Entire Dataset:", rounded_age)

# Using Pandas and Describe() on Entire Dataset Age
print(df['age'].describe())

In [23]:
# Create Male only Dataframe
male_df = df [df['gender'] == 'Male'].copy()
male_df.head()
total_male_records = len(male_df)
print("Male Records Total: ", total_male_records)

print(male_df.isnull().sum())


Male Records Total:  2486
patient_id                 0
age                        0
gender                     0
fever                      0
dry_cough                  0
sore_throat                0
fatigue                    0
headache                   0
shortness_of_breath        0
loss_of_smell              0
loss_of_taste              0
oxygen_level               0
body_temperature           0
comorbidity             1343
travel_history             0
contact_with_patient       0
chest_pain                 0
covid_result               0
dtype: int64
