In [1]:
import pandas as pd
import numpy as np

import os
import sys

# Put the parent directory on the import path (once) so that the `src`
# imports below can resolve — presumably `src` lives one level above the
# notebook's working directory; confirm against the repository layout.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.utils import price_formating, price_formating_factory, get_random_string
from src.toolbox import DataCleaner

# Route every float shown in a DataFrame display through the project's
# formatter (the recorded outputs render floats like "18 k€").
pd.set_option("display.float_format", price_formating_factory(suffixe="€"))


In [2]:
# Number of synthetic rows generated for the demo frame.
SIZE = 10_000

# Seed NumPy's legacy global generator so the np.random draws are repeatable.
# NOTE(review): this only covers np.random; whether get_random_string draws
# from it is not visible here — confirm if the string column must be stable.
np.random.seed(10_031_995)

In [3]:
# Build a synthetic frame mixing dtypes, cardinalities and magnitudes.
#
# The low-cardinality labels are cycled by row position so that the column
# always holds exactly SIZE entries. Repeating the list `int(SIZE / 5)` times
# gave a shorter column whenever SIZE was not a multiple of 5, which makes the
# DataFrame constructor raise on mismatched column lengths. For a SIZE that is
# a multiple of 5 the generated values and their order are unchanged.
small_cardinality_labels = ["Prénom", "Nom", "Adresse", "Numéro", "Email"]

raw_dataframe = pd.DataFrame(
    {
        # Entries are kept in this order so the np.random draws happen in the
        # same sequence as before and the seeded values stay the same.
        "Object High Cardinality": [get_random_string(5) for _ in range(SIZE)],
        # randint's upper bound is exclusive: values fall in 1..4.
        "Integer Small Magnitude": np.random.randint(1, 5, SIZE),
        "Float High Magnitude": np.random.uniform(0, 1_000_000.0, SIZE),
        "Integer High Magnitude": np.random.randint(0, 100_000_000, SIZE),
        "Object Small Cardinality": [
            small_cardinality_labels[row % len(small_cardinality_labels)]
            for row in range(SIZE)
        ],
    }
)

# Print dtypes, non-null counts and memory usage of the raw frame.
raw_dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Object High Cardinality   10000 non-null  object 
 1   Integer Small Magnitude   10000 non-null  int64  
 2   Float High Magnitude      10000 non-null  float64
 3   Integer High Magnitude    10000 non-null  int64  
 4   Object Small Cardinality  10000 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 390.8+ KB


In [4]:
# Preview the first five rows of the raw frame.
raw_dataframe.head(n=5)

Unnamed: 0,Object High Cardinality,Integer Small Magnitude,Float High Magnitude,Integer High Magnitude,Object Small Cardinality
0,odbtb,2,18 k€,32172635,Prénom
1,krtwm,4,801 k€,46603327,Nom
2,biwjg,1,701 k€,61829874,Adresse
3,fzldf,4,932 k€,82584146,Numéro
4,wsxtr,4,320 k€,42674988,Email


In [5]:
# Hand the cleaner a deep copy so `raw_dataframe` itself is left as built
# above (whether DataCleaner modifies its input is not visible here).
raw_dataframe_copy = raw_dataframe.copy(deep=True)

In [6]:
# Arguments for the individual cleaning steps, named up front so the chain
# below reads as a plain sequence of operations. The exact semantics of each
# step live in src.toolbox.DataCleaner, which is not shown in this notebook.
column_order = [
    "OBJECT_SMALL_CARDINALITY",
    "OBJECT_HIGH_CARDINALITY",
    "INTEGER_SMALL_MAGNITUDE",
    "INTEGER_HIGH_MAGNITUDE",
    "FLOAT_HIGH_MAGNITUDE",
]
renamed_columns = {
    "OBJECT_SMALL_CARDINALITY": "object_small_cardinality",
    "OBJECT_HIGH_CARDINALITY": "O_H_C",
}
new_column_conditions = [
    "INTEGER_SMALL_MAGNITUDE == 3",
    "FLOAT_HIGH_MAGNITUDE > 100_000",
]
# Keyed by column name as it stands at that point in the chain (after the
# rename and the column creation).
# NOTE(review): "stricty" looks like a typo for "strictly"; it is a runtime
# value that also appears in the recorded output, so it is kept verbatim.
value_replacements = {
    "object_small_cardinality": {"Prénom": "Pr3n0m"},
    "NEW_COLUMN": {"== 3 AND > 100K": "Equals to 3 and stricty greater than 100k"},
}

cleaner = DataCleaner(df=raw_dataframe_copy)

# The steps run in the order written; later steps refer to the column names
# produced by earlier ones (upper-cased first, then partially renamed).
cleaned_dataframe = (
    cleaner.downcast_columns()
    .uppercase_column_names()
    .reorder_columns(columns_order=column_order)
    .sort_columns(columns_to_sort=["INTEGER_SMALL_MAGNITUDE"], ascending=[True])
    .keep_rows("OBJECT_SMALL_CARDINALITY == 'Prénom'")
    .drop_rows("INTEGER_SMALL_MAGNITUDE <= 2")
    .rename_columns(column_map=renamed_columns)
    .create_column(
        new_column_name="NEW_COLUMN",
        conditions=new_column_conditions,
        logic_operator="AND",
        value_if_true="== 3 AND > 100K",
        value_if_false="False",
    )
    .replace_values(**value_replacements)
    .get_cleaned_dataframe()
)

# Display the result (pandas truncates long frames in the rendered output).
cleaned_dataframe


Unnamed: 0,object_small_cardinality,O_H_C,INTEGER_SMALL_MAGNITUDE,INTEGER_HIGH_MAGNITUDE,FLOAT_HIGH_MAGNITUDE,NEW_COLUMN
0,Pr3n0m,wxfqd,3,66148757,988 k€,Equals to 3 and stricty greater than 100k
1,Pr3n0m,kpult,3,26986344,188 k€,Equals to 3 and stricty greater than 100k
2,Pr3n0m,feacb,3,49771988,949 k€,Equals to 3 and stricty greater than 100k
3,Pr3n0m,siqqp,3,20036756,683 k€,Equals to 3 and stricty greater than 100k
4,Pr3n0m,fnapk,3,47587304,390 k€,Equals to 3 and stricty greater than 100k
...,...,...,...,...,...,...
987,Pr3n0m,apedq,4,54205391,767 k€,False
988,Pr3n0m,mpkcg,4,46210806,778 k€,False
989,Pr3n0m,trkaw,4,69657984,987 k€,False
990,Pr3n0m,cvell,4,34466754,274 k€,False
