In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow
from ydata_profiling import ProfileReport
from io import StringIO

In [2]:
# Load the two CSV files into pandas DataFrames. `df` is the frame that the
# metric_summary() and schema() call cells further down operate on.
df = pd.read_csv('variant_v.csv')
df1 = pd.read_csv('base.csv')

In [139]:
def ingest_csv(path, data_type='csv', has_header=True):
    '''
    Reads a CSV file into a pandas DataFrame and returns it.

    path: Path to a file or a file-like object.
    By file-like object, we refer to objects with a ``read()``
    method

    data_type: Format of the input. Only 'csv' is supported; any other
    value raises ValueError.

    has_header: Indicate if the first row of dataset is a header or not.
    When False, the first row is kept as data and pandas assigns integer
    column labels.

    Returns the DataFrame (also when it has zero rows, after printing a
    notice), or None when ``path`` does not exist (a message is printed
    instead of raising).

    '''
    if data_type != 'csv':
        # Previously this branch hit `return df` before df was ever assigned.
        raise ValueError(
            "Unsupported data_type: {!r}; only 'csv' is supported".format(data_type)
        )

    try:
        # 'infer' is pandas' own default, so has_header=True keeps the old read behaviour.
        df = pd.read_csv(path, header='infer' if has_header else None)
    except FileNotFoundError:
        # Deliberate best-effort: report the missing file and hand back None.
        print("No such file or directory: " + "{}".format(path))
        return None

    if df.shape[0] == 0:
        print("Empty Dataframe with (0,0) entries")

    # The original never returned the frame on the csv path; callers always got None.
    return df
        
def metric_summary(df, path_to_save=None, columns=None):
    '''
    Summarizes metrics of a dataframe.

    Runs ``df.describe()`` and transposes the result, so the returned frame
    has one row per described column and one column per statistic
    (count, mean, std, min, 25%, 50%, 75%, max for numeric data).

    df: Dataframe to describe

    path_to_save: path to save the metric summary results in csv format;
    nothing is written when it is None.

    columns: Columns of dataframe to select. May be a list, tuple or other
    iterable of column labels, or a single string label. The selection is
    applied to the output of ``describe()``, so every requested column must
    be one that ``describe()`` reports (otherwise pandas raises KeyError).
    None keeps every described column.

    e.g: metric_summary(df,columns=['b','c','e'],path_to_save='./metric_summary.csv')

    Returns the transposed summary DataFrame.

    '''
    describe = df.describe()

    if columns is not None:
        if isinstance(columns, str):
            # A bare label would otherwise be iterated character by character.
            columns = [columns]
        # list(...) so tuples / Index objects select several columns instead of
        # being treated as a single (MultiIndex-style) key or silently ignored.
        describe = describe[list(columns)]

    describe = describe.T

    if path_to_save is not None:
        describe.to_csv(path_to_save)

    return describe

def schema(df, path_to_save=None, show=True):

    '''
    Function to print and save data schema in CSV or txt format.

    df: Dataframe

    path_to_save: path to save the schema results in csv format;
    one line per column, written as ``position,column,dtype`` with no
    header row. Nothing is written when it is None.

    show: To show the schema on stdout (via ``df.info()``).

    Returns None.

    '''

    # Build the rows straight from df.dtypes. The earlier approach split the
    # text printed by df.info() at fixed token offsets, which lost the last
    # column when the frame had fewer than three distinct dtypes and broke on
    # column names containing whitespace.
    rows = [
        "{},{},{}".format(position, column, dtype)
        for position, (column, dtype) in enumerate(df.dtypes.items())
    ]

    if path_to_save is not None:
        # fmt='%s' writes each prepared row verbatim, one per line.
        np.savetxt(path_to_save, rows, delimiter=",", fmt='%s')

    if show:
        df.info()
    
    

In [120]:
# Describe three columns of df; the same table is also written to ./metric_summary.csv.
metric_summary(df,columns=['income','customer_age','month'],path_to_save='./metric_summary.csv')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
income,1000000.0,0.578958,0.288226,0.1,0.3,0.6,0.8,0.9
customer_age,1000000.0,41.34948,13.75192,10.0,30.0,50.0,50.0,90.0
month,1000000.0,3.658708,2.116726,0.0,2.0,4.0,5.0,7.0


In [144]:
# Save the schema of df to abc.csv; show=1 compares equal to True, so df.info() is printed as well.
schema(df,path_to_save='abc.csv', show=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 34 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000