In [1]:
# packages & libraries

import pandas as pd

In [2]:
# create dataframe from input csv path

def create_df(input_csv_path: str) -> pd.DataFrame:
    """
    Load the CSV file located at ``input_csv_path`` into a DataFrame,
    using the default ``pd.read_csv`` parsing options.

    """

    return pd.read_csv(input_csv_path)

In [3]:
# calculate number of rows

def calculate_n_rows(df: pd.DataFrame) -> int:
    """
    Return how many rows the DataFrame holds.

    """

    # the row index has exactly one entry per row
    return len(df.index)

In [4]:
# calculate number of columns

def calculate_n_cols(df: pd.DataFrame) -> int:
    """
    Return how many columns the DataFrame holds.

    """

    # the column index has exactly one entry per column
    return len(df.columns)

In [5]:
def calculate_df_completeness(df: pd.DataFrame) -> float:
    """
    Calculate the completeness of the whole DataFrame.

    Completeness is the fraction of cells that are not missing
    (as detected by ``DataFrame.isna``), rounded to 2 decimals,
    where 1.00 is 100% complete.

    Returns NaN when the DataFrame has no cells (no rows and/or
    no columns), because the ratio is undefined in that case.

    """

    n_cells = df.size

    # an empty frame would make the division below 0/0
    if n_cells == 0:
        return float("nan")

    n_missing_values = df.isna().sum().sum()

    df_completeness = round((n_cells - n_missing_values) / n_cells, 2)

    # the arithmetic above yields a numpy scalar; hand back a built-in
    # float so the result matches the declared return type
    return float(df_completeness)

In [6]:
def calculate_col_completeness(df: pd.DataFrame, n_rows: int) -> pd.Series:
    """
    Calculate the completeness of each column of the DataFrame,
    rounded to 2 decimals, where 1.00 is 100% complete.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose missing values (as detected by ``DataFrame.isna``)
        are counted per column.
    n_rows : int
        Denominator of the ratio; expected to be the number of rows
        of ``df``.

    Returns
    -------
    pd.Series
        One value per column, indexed by column name, computed as
        ``(n_rows - missing) / n_rows``.

    """

    col_missing_values = df.isna().sum()

    # arithmetic on a Series is element-wise: one ratio per column
    col_completeness = ((n_rows - col_missing_values) / n_rows).round(2)

    return col_completeness

In [7]:
# Calculate the summary statistics below

In [8]:
# define input CSV path

# relative path, so it is resolved against the current working directory
input_csv_path = "./input-csv/canada/canada.csv"

In [9]:
df = create_df(input_csv_path)

# row count
n_rows = calculate_n_rows(df)
print('The number of rows is: ', n_rows, '\n', sep='')

# column count
n_cols = calculate_n_cols(df)
print('The number of columns is: ', n_cols, '\n', sep='')

# share of non-missing cells across the whole dataset
# (1.00 means 100% complete)
df_completeness = calculate_df_completeness(df)
print('The dataset completeness is: ', df_completeness, '\n', sep='')

# share of non-missing cells per column
# (1.00 means 100% complete)
col_completeness = calculate_col_completeness(df, n_rows)
print('The completeness for each column is:\n', col_completeness, sep='')

The number of rows is: 40208

The number of columns is: 5

The dataset completeness is: 0.99

The completeness for each column is:
REF_DATE              1.00
GEO                   1.00
Age group             1.00
Pregnancy outcomes    1.00
VALUE                 0.97
dtype: float64
