In [1]:
import numpy as np
import pandas as pd
import polars as pl

In [13]:
# Read 'variant_v.csv' and 'base.csv' into pandas DataFrames.
# NOTE(review): `df` is assigned again by the later `df = ingest("variant_v.csv")`
# cell, so when the notebook is run top to bottom this pandas frame is replaced there.
df = pd.read_csv('variant_v.csv')
df1 = pd.read_csv('base.csv')

In [2]:
# Read the same 'variant_v.csv' file with polars, calling pl.read_csv directly.
df_pl = pl.read_csv("variant_v.csv")

In [9]:
def ingest(path, has_header=True, columns=None):
    '''
    Reads a CSV file into a polars Dataframe.

    path: Path to a file or a file-like object.
    By file-like object, we refer to objects with a ``read()``
    method

    has_header: Indicate if the first row of dataset is a header or not.

    columns: Columns to read, forwarded unchanged to ``pl.read_csv``.
    ``None`` (the default) reads every column.

    Returns the Dataframe. Returns ``None`` (after printing a message)
    when the file does not exist or when the parsed frame has zero rows.

    e.g: ingest('./data.csv', columns=['a', 'b'])
    '''
    try:
        # Options are passed by keyword: pl.read_csv accepts only the source
        # positionally in current polars releases, so the positional form
        # raises TypeError there.
        frame = pl.read_csv(path, has_header=has_header, columns=columns)
    except FileNotFoundError:
        print("No such file or directory: " + "{}".format(path))
        return None

    # A frame without rows is reported and treated like a failed read.
    if frame.shape[0] == 0:
        print("Empty Dataframe with (0,0) entries")
        return None

    return frame
        
def metric_summary(df, path_to_save=None, columns=None):
    '''
    Summarizes metrics of a dataframe using ``df.describe()``,
    optionally restricted to some columns and optionally saved as CSV.

    Example of the underlying ``describe()`` output:

    >>> from datetime import date
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1.0, 2.8, 3.0],
    ...         "b": [4, 5, None],
    ...         "c": [True, False, True],
    ...         "d": [None, "b", "c"],
    ...         "e": ["usd", "eur", None],
    ...         "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)],
    ...     }
    ... )

    >>> df.describe()
    shape: (7, 7)
    ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
    │ describe   ┆ a        ┆ b        ┆ c        ┆ d    ┆ e    ┆ f          │
    │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---  ┆ ---        │
    │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str  ┆ str        │
    ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
    │ count      ┆ 3.0      ┆ 3.0      ┆ 3.0      ┆ 3    ┆ 3    ┆ 3          │
    │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 1    ┆ 1    ┆ 0          │
    │ mean       ┆ 2.266667 ┆ 4.5      ┆ 0.666667 ┆ null ┆ null ┆ null       │
    │ std        ┆ 1.101514 ┆ 0.707107 ┆ 0.57735  ┆ null ┆ null ┆ null       │
    │ min        ┆ 1.0      ┆ 4.0      ┆ 0.0      ┆ b    ┆ eur  ┆ 2020-01-01 │
    │ max        ┆ 3.0      ┆ 5.0      ┆ 1.0      ┆ c    ┆ usd  ┆ 2022-01-01 │
    │ median     ┆ 2.8      ┆ 4.5      ┆ 1.0      ┆ null ┆ null ┆ null       │
    └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘

    df: Dataframe to describe

    path_to_save: path to save the metric summary results in csv,txt format;
    nothing is written when it is None (the default).

    columns: Columns of dataframe to select. A list or tuple of column
    names, or a single column name; None (the default) keeps every column.
    The statistic-label column is always kept as the first column.

    Returns the summary dataframe.

    e.g: metric_summary(df,columns=['b','c','e'],path_to_save='./metric_summary.csv')

    '''
    summary = df.describe()

    # A lone column name is treated as a one-element selection.
    if isinstance(columns, str):
        columns = [columns]

    if columns is not None:
        # The label column is the first column of describe(); take it by
        # position because its name differs between polars releases
        # ('describe' in older ones, 'statistic' in newer ones).
        label_column = summary.columns[0]
        summary = summary.select([label_column, *columns])

    if path_to_save is not None:
        summary.write_csv(path_to_save)

    return summary
    
    
    

In [10]:
# Load 'variant_v.csv' through the ingest() helper with its defaults
# (has_header=True, columns=None). This rebinds `df`, which an earlier cell
# assigned from pd.read_csv. ingest() returns None if the file is missing or has no rows.
df = ingest("variant_v.csv")

In [14]:
# Summarise three columns of `df` and also write the summary to ./metric_summary.csv;
# the returned summary frame is displayed as the cell output.
metric_summary(df,columns=['income','customer_age','month'],path_to_save='./metric_summary.csv')

describe,income,customer_age,month
str,f64,f64,f64
"""count""",1000000.0,1000000.0,1000000.0
"""null_count""",0.0,0.0,0.0
"""mean""",0.578958,41.34948,3.658708
"""std""",0.288226,13.75192,2.116726
"""min""",0.1,10.0,0.0
"""max""",0.9,90.0,7.0
"""median""",0.6,50.0,4.0
