In [5]:
# packages & libraries

import pandas as pd

In [8]:
# create dataframe from input csv path

def create_df(input_csv_path: str) -> pd.DataFrame:
    """
    Load the CSV file at ``input_csv_path`` into a DataFrame.

    The file is parsed by ``pd.read_csv`` with its default settings;
    the resulting DataFrame is returned unchanged.

    """

    return pd.read_csv(input_csv_path)

In [15]:
# calculate number of rows

def calculate_n_rows(df: pd.DataFrame) -> int:
    """
    Return how many rows the DataFrame has.

    The count is the length of the DataFrame's row index.

    """

    return len(df.index)

In [19]:
# calculate number of columns

def calculate_n_cols(df: pd.DataFrame) -> int:
    """
    Return how many columns the DataFrame has.

    The count is the length of the DataFrame's column index.

    """

    return len(df.columns)

In [46]:
def calculate_df_completeness(df: pd.DataFrame) -> float:
    """
    Calculate the completeness of the whole DataFrame.

    Completeness is the fraction of cells that are not missing
    (as reported by ``DataFrame.isna``), rounded to 2 decimal
    places with ``round`` (round to nearest, not round up).
    A value of 1.00 means the DataFrame has no missing values.

    Returns NaN when the DataFrame has no cells at all (no rows
    or no columns), because the fraction is undefined in that case.

    """

    n_cells = df.size

    # An empty DataFrame would otherwise lead to a 0/0 division.
    if n_cells == 0:
        return float("nan")

    n_missing_values = df.isna().sum().sum()

    # float() turns the numpy scalar into the plain float promised
    # by the return annotation.
    df_completeness = float(round((n_cells - n_missing_values) / n_cells, 2))

    return df_completeness

In [52]:
def calculate_col_completeness(df: pd.DataFrame, n_rows: int) -> pd.Series:
    """
    Calculate the completeness of each column of the DataFrame.

    For every column, completeness is
    ``(n_rows - missing values in the column) / n_rows``, rounded
    to 2 decimal places (round to nearest, not round up).
    A value of 1.00 means the column has no missing values.

    ``n_rows`` is used as the denominator as given; it is not
    checked against the DataFrame, so the caller is expected to
    pass the DataFrame's actual number of rows.

    Returns a Series indexed by column name, one value per column.

    """

    # Series of missing-value counts, one entry per column.
    col_missing_values = df.isna().sum()

    col_completeness = ((n_rows - col_missing_values) / n_rows).round(2)

    return col_completeness

In [None]:
# Calculate the summary statistics below

In [13]:
# Path of the input CSV file to summarise. The leading "./" makes it
# relative to the working directory the notebook kernel runs in.

input_csv_path = "./input-csv/uk.csv"

In [63]:
# load the input CSV into a DataFrame
df = create_df(input_csv_path)

# dataset dimensions
n_rows = calculate_n_rows(df)
print(f'The number of rows is: {n_rows}\n')

n_cols = calculate_n_cols(df)
print(f'The number of columns is: {n_cols}\n')

# overall completeness of the dataset; 1.00 means 100% complete
df_completeness = calculate_df_completeness(df)
print(f'The dataset completeness is: {df_completeness}\n')

# completeness of every individual column; 1.00 means 100% complete
col_completeness = calculate_col_completeness(df, n_rows)
print(f'The completeness for each column is:\n{col_completeness}')

The number of rows is: 962760

The number of columns is: 10

The dataset completeness is: 0.98

The completeness for each column is:
year             1.00
RGN20CD          0.89
RGN20NM          0.89
LAD20CD          1.00
LAD20NM          1.00
WD20CD           1.00
WD20NM           1.00
age_of_mother    1.00
sex              1.00
births           1.00
dtype: float64
