In [2]:
import polars as pl
import polars.selectors as cs
from polars import col
import numpy as np
from time import time
from typing import List, Callable
from termcolor import colored

color_text = lambda text: f"{'\033[1;92m'}{text}{'\033[0m'}"
extra_info = lambda extra='': f" {color_text("-->")} {extra}\n{'-' * 120}\n"

#                                         Polars Creation From Lists, Dict, Realworld Datasets
```js
1. In Polars DataFrame you can set the datatype for each column manually by the 'schema={ColumnName: dtype, ColumnName1: dtype...}'.
2. But if a column in CSV is a Floating Point, then polars doesn't directly cast the Floating Point to integer.
        df = pl.read_csv("D:\\datasets\\temp.csv", schema_overrides={'Marks': pl.UInt8}, infer_schema_length=10000, ignore_errors=True)
        but it make values like 89.87 to null and 45.0 to 45
3. And if a column in CSV is an Integer, Polar can cast it to Floating Point but not the Vice Varsa.

        BUTT you should not do these CASTING INSIDE 'read_csv'. First read the CSV, then analyze it, see if the max and min val of a column can be changed to for example 'pl.Uint8' or not or maybe we need to round it first? Then with 'pl.with_columns' cast the columns you need and set the new DataFrame back to the DataFrame you wanted to change.

        You can do such casting when you create your own small DataFrame, not on the 'real datasets'.
```

```js
        1. In Pandas we say df. then all the methods, attributes are shown.. But in polars when we want to do some operations ON ALL THE COLUMNS, we say 'pl.all()'.
        2. 'pl.all()' : This is an 'expression' that represents "all columns selected" in the DataFrame. You use it when you want to apply a 'transformation or condition' across 'all columns'. Output for pl.all() = * representing All the columns are selected.
        3. 'pl.all().is_null()' is another expresson. Output : *.is_null(). It has not executed yet.
            - expression = pl.all().is_null()               Assume we have 2 DataFrame, df1, df2.
            - To apply the expression 'expression' on df1 we say : 'df1.select(expression)'. Now the 'expression' is applied on 'df1' and will show the output. We can apply this same expression on df2 as well. 
        
        4. 'df.is_duplicated()' : It works ROW WISE. If you want to check duplicate values on each column => 'df.select(pl.all().is_duplicated())'.
        5. Some methods like 'count_null()' works on each column but We should respect polars and do p.all() when we want to do some operations on each column.

        `Why Polars Uses Expressions`:
            - Efficiency: By using expressions, Polars can optimize the query plan and execute operations more efficiently, especially for large datasets.
            - Flexibility: This approach allows chaining of transformations and applying them lazily, which can be evaluated only when needed.
```

In [3]:
# from lists
info = [
    ['Maria0', 15, 16],   # every list is a ROW.
    ['Maria1', 18, 19],
    ['Maria2', 21, 22],
    ['Maria3', 24, 25]
]
#  This "schema list" is to define column names. orient means how I want my each list to be, 'row' or 'col'.
pl.DataFrame(info, schema=['Name', 'Age', 'IQ'], orient='row', strict=False) # or pl.DataFrame(info, schema={'Name': pl.String, 'Age': pl.UInt8, 'IQ': pl.UInt8}, orient='row', strict=False)

Name,Age,IQ
str,i64,i64
"""Maria0""",15,16
"""Maria1""",18,19
"""Maria2""",21,22
"""Maria3""",24,25


In [4]:
# from dictionary
info = {
    'Name' : ['Maria0', 'Maria1', 'Maria2', 'Maria3'],
    'Age' : [15, 18, 21, 24],
    'IQ' : [16, 19, 22, 25]
}
#                  The below "schema/schema_overrides dict" is for defining the datatype for the columns.
pl.DataFrame(info, schema_overrides={'Name': pl.String, 'IQ': pl.UInt8})

Name,Age,IQ
str,i64,u8
"""Maria0""",15,16
"""Maria1""",18,19
"""Maria2""",21,22
"""Maria3""",24,25


In [5]:
# from real world datasets

df = pl.read_csv("D:\\datasets\\nba.csv")
# print(df, extra_info())
# 
# df1 = df.with_columns(
#     pl.col('Number').ceil().cast(pl.UInt8),
#     pl.col('Height')
# )

print(df.head(10))

shape: (10, 9)
┌────────────────┬─────────┬────────┬──────────┬───┬────────┬────────┬────────────────┬────────────┐
│ Name           ┆ Team    ┆ Number ┆ Position ┆ … ┆ Height ┆ Weight ┆ College        ┆ Salary     │
│ ---            ┆ ---     ┆ ---    ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---            ┆ ---        │
│ str            ┆ str     ┆ f64    ┆ str      ┆   ┆ str    ┆ f64    ┆ str            ┆ f64        │
╞════════════════╪═════════╪════════╪══════════╪═══╪════════╪════════╪════════════════╪════════════╡
│ Avery Bradley  ┆ Boston  ┆ 0.0    ┆ PG       ┆ … ┆ 6-2    ┆ 180.0  ┆ Texas          ┆ 7.730337e6 │
│                ┆ Celtics ┆        ┆          ┆   ┆        ┆        ┆                ┆            │
│ Jae Crowder    ┆ Boston  ┆ 99.0   ┆ SF       ┆ … ┆ 6-6    ┆ 235.0  ┆ Marquette      ┆ 6.796117e6 │
│                ┆ Celtics ┆        ┆          ┆   ┆        ┆        ┆                ┆            │
│ John Holland   ┆ Boston  ┆ 30.0   ┆ SG       ┆ … ┆ 6-5    ┆ 205.0  ┆ Bosto

#              [rows, columns]. rows = columns = `A Single Value i.e. A Scalar Value` OR `can be a List` OR `Slice(:)`.

In [6]:
#                                            Only for DataFrames, NOT FOR LAZYFRAME

print(df[ 4, ['Team', 'Height', 'College']], extra_info("df[4, ['Team', 'Height', 'College']]"))
print(df[:4, ['Team', 'Height', 'College']], extra_info("first 4 rows but only Those 3 columns"))
print(df[::100, ['Team', 'Height', 'College']], extra_info("df[::100, ['Team', 'Height', 'College']]"))

bool_columns = np.random.choice(a=[True, False], size=(df.width,), replace=True)
print(df[:, bool_columns], extra_info(f"\n{list(zip(df.columns, bool_columns))}"))

shape: (1, 3)
┌────────────────┬────────┬─────────┐
│ Team           ┆ Height ┆ College │
│ ---            ┆ ---    ┆ ---     │
│ str            ┆ str    ┆ str     │
╞════════════════╪════════╪═════════╡
│ Boston Celtics ┆ 6-10   ┆ null    │
└────────────────┴────────┴─────────┘  [1;92m-->[0m df[4, ['Team', 'Height', 'College']]
------------------------------------------------------------------------------------------------------------------------

shape: (4, 3)
┌────────────────┬────────┬───────────────────┐
│ Team           ┆ Height ┆ College           │
│ ---            ┆ ---    ┆ ---               │
│ str            ┆ str    ┆ str               │
╞════════════════╪════════╪═══════════════════╡
│ Boston Celtics ┆ 6-2    ┆ Texas             │
│ Boston Celtics ┆ 6-6    ┆ Marquette         │
│ Boston Celtics ┆ 6-5    ┆ Boston University │
│ Boston Celtics ┆ 6-5    ┆ Georgia State     │
└────────────────┴────────┴───────────────────┘  [1;92m-->[0m first 4 rows but only Those 3 colum

#                                               `slice()` For LAZYFRAME
```js
        1. LazyFrame 'lf', where the Tabular Data has not been created until we use 'lf.collect()'. Thats why we cant use stuff like   lf[:17:5, ['name', 'city', 'age']].
        2. lf.slice(index, length) => This will only slice 'rows' and returns also lazyframe.
        3. lf.select(pl.col( *Columns Name )) => This selects specific 'columns'. Nope, cant use 'Boolean Columns' in '*Columns Name'.
```

#                                                       DataFrame Attributes

In [7]:
print(df.shape, extra_info("shape"))
print(df.columns,'  ', type(df.columns), extra_info("column names"))
print(df.dtypes, '  ', type(df.dtypes),  extra_info("dtypes"))
print(df.height, '  ', df.width, extra_info("heigh and width"))
print(df.schema, '\n', type(df.schema), extra_info("Schema(Column Names with their Datatype)"))

(458, 9)  [1;92m-->[0m shape
------------------------------------------------------------------------------------------------------------------------

['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight', 'College', 'Salary']    <class 'list'>  [1;92m-->[0m column names
------------------------------------------------------------------------------------------------------------------------

[String, String, Float64, String, Float64, String, Float64, String, Float64]    <class 'list'>  [1;92m-->[0m dtypes
------------------------------------------------------------------------------------------------------------------------

458    9  [1;92m-->[0m heigh and width
------------------------------------------------------------------------------------------------------------------------

Schema({'Name': String, 'Team': String, 'Number': Float64, 'Position': String, 'Age': Float64, 'Height': String, 'Weight': Float64, 'College': String, 'Salary': Float64}) 
 <class 'polars.

#                                                       DataFrame Methods

In [8]:
# head, tail, sample are same as series.
df_info = lambda: df.null_count().cast(pl.String).vstack( pl.DataFrame(dict(zip(df.columns, map(str, df.dtypes)))) )
#                                         Polars doesn't have pandas dataframe.info() so I made it.
print(df_info(), extra_info("info(Index, Columns, Count of Non Null Values in each column, dtype)"))
print(df.describe(), extra_info("describe")) # works on every column

shape: (2, 9)
┌────────┬────────┬─────────┬──────────┬───┬────────┬─────────┬─────────┬─────────┐
│ Name   ┆ Team   ┆ Number  ┆ Position ┆ … ┆ Height ┆ Weight  ┆ College ┆ Salary  │
│ ---    ┆ ---    ┆ ---     ┆ ---      ┆   ┆ ---    ┆ ---     ┆ ---     ┆ ---     │
│ str    ┆ str    ┆ str     ┆ str      ┆   ┆ str    ┆ str     ┆ str     ┆ str     │
╞════════╪════════╪═════════╪══════════╪═══╪════════╪═════════╪═════════╪═════════╡
│ 1      ┆ 1      ┆ 1       ┆ 1        ┆ … ┆ 1      ┆ 1       ┆ 85      ┆ 12      │
│ String ┆ String ┆ Float64 ┆ String   ┆ … ┆ String ┆ Float64 ┆ String  ┆ Float64 │
└────────┴────────┴─────────┴──────────┴───┴────────┴─────────┴─────────┴─────────┘  [1;92m-->[0m info(Index, Columns, Count of Non Null Values in each column, dtype)
------------------------------------------------------------------------------------------------------------------------

shape: (9, 10)
┌────────────┬────────────┬────────────┬───────────┬───┬────────┬────────────┬─────────┬────

#                                                 filter / select / with_columns
```js
        1. df.filter(expression to perform on the df) :
                When based on an operation we want to display 'specific columns or specific values from the dataframe OR the entire dataframe', we do filter(..) which returns the ENTIRE New DataFrame but we can select(..) specific columns to display.

        2. df.select(list of columns to display OR expression to perform) :
                When we want to 'select specific columns to display' OR we want to find the sum()/prod()/mean()/std() etc such CALCULATION on the entire or specific columns OR we want to have only the 'Boolean Mask result', not the actual output, we do select().
        
        3. df.with_columns(changes on ALL or specific columns seperated by comma) :
                Assume we have 10 columns. Now we want to change 2 or 3 columns DataType by 'casting' or change their values and AFTER the changes on specific columns we want to have the ENTIRE DATAFRAME having those changes on those specific columns and the REST UNCHANGED COLUMNS as well. For this we do df.with_columns(). We can even set the new dataframe to the old dataframe.
```

In [9]:
print(df.filter( pl.col('Name').str.contains('Avery') ), '\n')
df.filter( pl.col('Name').str.contains('Avery') ).select(['Name', 'Height']) # YOU CAN'T WRITE df.filter( pl.col('Name').str.contains('Avery').select(['Name', 'Height']) )

mask = df.select(pl.col('Name').str.contains('Avery')).to_series()
print(mask, '\n')

df.select(pl.all().count()) # won't work if you write filter instead of select

shape: (1, 9)
┌───────────────┬────────────────┬────────┬──────────┬───┬────────┬────────┬─────────┬────────────┐
│ Name          ┆ Team           ┆ Number ┆ Position ┆ … ┆ Height ┆ Weight ┆ College ┆ Salary     │
│ ---           ┆ ---            ┆ ---    ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---     ┆ ---        │
│ str           ┆ str            ┆ f64    ┆ str      ┆   ┆ str    ┆ f64    ┆ str     ┆ f64        │
╞═══════════════╪════════════════╪════════╪══════════╪═══╪════════╪════════╪═════════╪════════════╡
│ Avery Bradley ┆ Boston Celtics ┆ 0.0    ┆ PG       ┆ … ┆ 6-2    ┆ 180.0  ┆ Texas   ┆ 7.730337e6 │
└───────────────┴────────────────┴────────┴──────────┴───┴────────┴────────┴─────────┴────────────┘ 

shape: (458,)
Series: 'Name' [bool]
[
	true
	false
	false
	false
	false
	…
	false
	false
	false
	false
	null
] 



Name,Team,Number,Position,Age,Height,Weight,College,Salary
u32,u32,u32,u32,u32,u32,u32,u32,u32
457,457,457,457,457,457,457,373,446


#                                                        filter()
```js
        1. df.filter(conditions seperated by comma) : It returns the Whole New DataFrame after applying the filter on 'df'. But if you want specific columns => df.filter(conditions seperated by comma).select([column names]) or df.filter(conditions seperated by comma)[[column names]]

        2. df.lazy().filter(conditions seperated by comma) : Polars known for laziness and it gives FASTER EXECUTION reducing the unnecessery operations which occurs INTERNALLY like :

            df.filter(conditions seperated by comma) returns the whole DataFrame and then with .select([column names]), it gives us the specfic columns. BUT
            in df.lazy().filter(conditions seperated by comma).select([column names]) : polars doesn't even return the whole new dataframe when we do 'df.filter(..)' because polars see that we selected only specific columns by '.select([..])', so it directly returns only those specific columns we selected after 'df.filter(..)'.' Thats why it RUNS FASTER!
```

In [10]:
start = time()
pp = df.filter(pl.col('Number') > 50).select(['Name', 'Age'])
print(time() - start)
# pp[0, 'Name'] = 'AAAAA'
# print(pp['Name'][0], df['Name'][0]) # AAAAA, Avery Bradley. I.E. pp is not a view but A NEW DataFrame

start = time()
pp1 = df.lazy().filter(pl.col('Number') > 50).select(['Name', 'Age']).collect()
print(time() - start) # Always do lazy operation
# pp1[0, 'Name'] = 'AAAAA'
# print(pp1['Name'][0], df['Name'][0]) # AAAAA, Avery Bradley. I.E. pp1 is not a view but A NEW DataFrame

0.0050618648529052734
0.0


In [11]:
df.head(2)

Name,Team,Number,Position,Age,Height,Weight,College,Salary
str,str,f64,str,f64,str,f64,str,f64
"""Avery Bradley""","""Boston Celtics""",0.0,"""PG""",25.0,"""6-2""",180.0,"""Texas""",7730337.0
"""Jae Crowder""","""Boston Celtics""",99.0,"""SF""",25.0,"""6-6""",235.0,"""Marquette""",6796117.0


#                                   rename(), is_duplicated(), null_count(), is_null()

In [12]:
# df.columns = np.arange(10, df.width+10, dtype='u8').astype('str') # it will change the column names PERMANENTLY.
print(df.rename({'Number' : 'Marks', 'Salary' : 'Wage'}), extra_info("RENAMING SPECIFIC COLUMNS"))

print(df.is_duplicated()) # returns a Boolean Mask i.e. SERIES. True = That row is DUPLICATED.
print(df.null_count(), extra_info("count nulls on Each COLUMN.")) # count nulls on Each COLUMN.

# DataFrame doesn't have isnull(), only Series has isnull(). So with select(..) traverse EACH COLUMN/SERIES and apply isnull().
print(df.select( pl.all().is_null() ), extra_info("Boolean DataFrame after is_null() applied on each column"))
print(df.select( pl.all().is_null().sum() ), extra_info("'pl.all().is_null().sum()' applied on each column"))

print(df.select( pl.all().has_nulls() ), extra_info("Columns having at least One Null Value = True, else False"))

shape: (458, 9)
┌────────────────┬───────────┬───────┬──────────┬───┬────────┬────────┬───────────────┬────────────┐
│ Name           ┆ Team      ┆ Marks ┆ Position ┆ … ┆ Height ┆ Weight ┆ College       ┆ Wage       │
│ ---            ┆ ---       ┆ ---   ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---           ┆ ---        │
│ str            ┆ str       ┆ f64   ┆ str      ┆   ┆ str    ┆ f64    ┆ str           ┆ f64        │
╞════════════════╪═══════════╪═══════╪══════════╪═══╪════════╪════════╪═══════════════╪════════════╡
│ Avery Bradley  ┆ Boston    ┆ 0.0   ┆ PG       ┆ … ┆ 6-2    ┆ 180.0  ┆ Texas         ┆ 7.730337e6 │
│                ┆ Celtics   ┆       ┆          ┆   ┆        ┆        ┆               ┆            │
│ Jae Crowder    ┆ Boston    ┆ 99.0  ┆ SF       ┆ … ┆ 6-6    ┆ 235.0  ┆ Marquette     ┆ 6.796117e6 │
│                ┆ Celtics   ┆       ┆          ┆   ┆        ┆        ┆               ┆            │
│ John Holland   ┆ Boston    ┆ 30.0  ┆ SG       ┆ … ┆ 6-5    ┆ 205.0  ┆ Bos

#                                               drop_nulls() on DataFrame
```js
        Polars.drop_nulls() traverse through each column i.e. vertically i.e. 'ROW-wise' and while traversing vertically if polars see a column's value is Null, it delete that 'ROW'.' So at the end in the result DataFrame you won't see a single ROW which has any Null value.

        'pandas drop_na(subset, axis=1 or 0, how='any' or 'all')' has these 3 important parameters which is really really beneficial. Polars drop_nulls() equivalent to Pandas drop_na(subset, axis=0, how='any'). So I created below after the next block, 'drop_nulls(lazyframe, subset, drop='rows' or 'columns', how='any' or 'all')'.

------> Dealing with 'drop=rows':
        -------------------------
                We want to delete 'WHOLE ROW', means doesnt matter if we want to drop_nulls() based on specific columns or all the columns. E.G.

                        True      False       True        True   True                              True
                        False     False       False       False  False         =>                  False
                        True      True        True        True   True                              True
                        ---------------------------      --------------              -------------------------------------
                             pl.all().is_null()       pl.col(subset).is_null()       pl.any_horizontal( on both left df)

                                                                                     = For both left Boolean DataFrame, The Output is same AND 'Both Giving OUTPUT in A Series i.e. A SINGLE COLUMN (NOT 1D array like [True, False. True])'. We can use this SINGLE COLUMN to filter('filter works only on A SERIES/A SINGLE COLUMN BOOLEAN MASK') the ROWS we want.
                                                                                     Of course True = It has NULLS and we dont want that row. So ~pl.any_horizontal().
------> Dealing with 'drop=columns':
        ----------------------------
                We want to delete 'WHOLE COLUMN'.

                        True      False       True                                   
                        False     False       False              =>                  
                        True      False       True                              True    False   True
                        ---------------------------                    -------------------------------------
                             pl.all().is_null()                                 pl.all().is_null().any() (Result : (1, 3) DataFrame)

                                                                       = We cant use A Single Row DataFrame as A Boolean Mask. You may think to use `df[0]` to select the Single Row BUT 'if we select just A SINGLE ROW, polars still give us a DataFrame with that single row having column names above'. For this use '.row(index=0)' which 'returns a 1D tuple'. Since we want to delete `COLUMNS` i.e. FILTER 'COLUMNS', filter() wont help us (as filter() filters 'ROWS' only) BUT 'lazyframe.collect()[:, boolean mask or column names], lazyframe.select(pl.col(column names))' will.

                                                                       We can use a 1D tuple e.g. [True, False, True] to select columns like 'lazyframe.collect()[:, [True, False, True]]' BUT we dont want to do 'lazyframe.collect()' to filter specific columns, its expensive. So we need to 'find the column names' and pass it to 'lazyframe.select(pl.col(..Here..))'. 
                                                                       zip(('name', True), ('toy', False), ('born', True)) => Now we can 'find the column names'.

                        True      False                                     
                        False     False               =>                  
                        True      False                           True    False
                        ---------------------------        ------------------------------
                          pl.col(subset).is_null()         pl.all().is_null().any() (Result : (1, 3) DataFrame)

                                                           = Since we are using 'subset' means finding the columns name for [True, False] wont work as 'WE NEED THE OTHER COLUMNS also in the result Data/Lazy-Frame which is not selected in the subset'.

                                                           'bool_columns_dict' = {subset[0] : True, subset[1] : False}.
                                                           Now we traverse the LazyFrames.columns Name 'SERIALLY' and check if the current columnName is in the 'set(subset)':
                                                                        if in the set(subset) => bool_columns_dict[columnname]
                                                                        else => 'False' means this columns 'doesnt have anY null'.
                                                           By doing this we will get the 'WHOLE 1D BOOLEAN ROW' and we can use it to select columns by passing the columns in 'lazyframe.select(pl.col(..Here..))'
        
        Note :  pl.all() denoted we selected all the columns.
                pl.all().is_null().all() => The last 'all()' does 'BITWISE AND OPERATION' on 'EACH BOOLEAN COLUMN'. THE LAST 'all()' IS NOT 'pl.all()' (which selects all the columns) but 'DATA/LAZY-FRAME.all()' (which does bitwise operation on each column).

                expression = pl.all().is_null()
                pl.any_horizontal(expression) means 'HORIZONTALLY BITWISE OR OPERATION ON THE ENTIRE DATA/LAZY-FRAME'. `It doesn't mean we are selecting rows to do is_null() row by row`.
```

In [13]:
df_drop = pl.LazyFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                        "toy" : [None, 'Batmobile', 'Bullwhip'],
                        "born": [None, '1940-4-25', None]})

df_drop = df_drop.with_columns(pl.col('born').str.to_date()) # or ....to_datetime() if you also have time.
df_drop.collect()

name,toy,born
str,str,date
"""Alfred""",,
"""Batman""","""Batmobile""",1940-04-25
"""Catwoman""","""Bullwhip""",


In [14]:
def drop_nulls(lazyframe:pl.LazyFrame = None, subset:str|List[str] = None, drop:str = 'rows', how:str = 'any', outputType:str= 'lf') -> pl.LazyFrame|pl.DataFrame:
    """
        1. input_lazyframe = A LazyFrame. default = None
        2. subset = A Column Name or A List of "Column Names". default = None, means All Columns Selected.
        3. drop = 'rows' means Delete ROWS.
           drop = 'columns means Delete COLUMNS. default = 'rows'.
        4. how = 'any' or 'all'. by default how = 'any'.
        4. outputType = 'lf' = LazyFrame or
                        'df' = DataFrame.
    """
    all_column_names = np.array(lazyframe.collect_schema().names())

    if drop == 'rows':
        pl_all_or_subset = pl.all() if subset == None else pl.col(subset)
        expression: pl.Expr = pl.any_horizontal(pl_all_or_subset.is_null()) if how == 'any' else pl.all_horizontal(pl_all_or_subset.is_null())
        lazy_output = lazyframe.filter(~expression)

        return lazy_output if outputType == 'lf' else lazy_output.collect()
    
    else: # drop == 'columns'
        if subset == None or len(subset) == 0:
            expression: pl.Expr = pl.all().is_null().any() if how == 'any' else pl.all().is_null().all()
            bool_columns = lazyframe.select(expression).collect().row(0)  # (False, True, True, True)
            selected_columns = (colName for colName, true in zip(all_column_names, bool_columns) if not true)
            lazy_output = lazyframe.select(pl.col(selected_columns))

            return lazy_output if outputType == 'lf' else lazy_output.collect()
        
        else: #
            subset_set = set(subset) if subset != None else None
            expression: pl.Expr = pl.col(subset).is_null().any() if how == 'any' else pl.col(subset).is_null().all()
            bool_columns = lazyframe.select(expression).collect().row(0)  # (False, True)

            bool_columns_dict = dict(zip(subset, bool_columns))
            whole_bool_columns = (bool_columns_dict[colName] if colName in subset_set else False for colName in all_column_names)

            selected_columns = (colName for colName, true in zip(all_column_names, whole_bool_columns) if not true)
            lazy_output = lazyframe.select(pl.col(selected_columns))

            return lazy_output if outputType == 'lf' else lazy_output.collect()

In [15]:
print(df_drop.collect(), extra_info(color_text("df_drop")))

print(drop_nulls(df_drop, drop='rows', how='any', outputType='df'), extra_info("drop_nulls(drop='rows', how='any')"))
print(drop_nulls(df_drop, drop='rows', how='all', outputType='df'), extra_info("drop_nulls(drop='rows', how='all')"))

print(df_drop.collect(), extra_info(color_text("df_drop")))

print(drop_nulls(df_drop, subset=['name', 'toy'], drop='rows', how='any', outputType='df'), extra_info("drop_nulls(subset=['name', 'toy'], drop='rows', how='any')"))
print(drop_nulls(df_drop, subset=['name', 'toy'], drop='rows', how='all', outputType='df'), extra_info("drop_nulls(subset=['name', 'toy'], drop='rows', how='all')"))

df_drop = df_drop.with_columns(ALL_NULL = None) # New Column 'ALL_NULL' with all null values.
print(df_drop.collect(), extra_info(color_text("df_drop")))

print(drop_nulls(df_drop, drop='columns', how='any', outputType='df'), extra_info("drop_nulls(drop='columns', how='any')"))
print(drop_nulls(df_drop, drop='columns', how='all', outputType='df'), extra_info("drop_nulls(drop='columns', how='all')"))

print(df_drop.collect(), extra_info(color_text("df_drop")))
print(drop_nulls(df_drop, subset=['toy', 'name'], drop='columns', how='any', outputType='df'), extra_info("drop_nulls(subset=['toy', 'name'], drop='columns', how='any')"))
print(drop_nulls(df_drop, subset=['ALL_NULL', 'toy'], drop='columns', how='all', outputType='df'), extra_info("drop_nulls(subset=['ALL_NULL', 'toy'], drop='columns', how='all')"))

shape: (3, 3)
┌──────────┬───────────┬────────────┐
│ name     ┆ toy       ┆ born       │
│ ---      ┆ ---       ┆ ---        │
│ str      ┆ str       ┆ date       │
╞══════════╪═══════════╪════════════╡
│ Alfred   ┆ null      ┆ null       │
│ Batman   ┆ Batmobile ┆ 1940-04-25 │
│ Catwoman ┆ Bullwhip  ┆ null       │
└──────────┴───────────┴────────────┘  [1;92m-->[0m [1;92mdf_drop[0m
------------------------------------------------------------------------------------------------------------------------

shape: (1, 3)
┌────────┬───────────┬────────────┐
│ name   ┆ toy       ┆ born       │
│ ---    ┆ ---       ┆ ---        │
│ str    ┆ str       ┆ date       │
╞════════╪═══════════╪════════════╡
│ Batman ┆ Batmobile ┆ 1940-04-25 │
└────────┴───────────┴────────────┘  [1;92m-->[0m drop_nulls(drop='rows', how='any')
------------------------------------------------------------------------------------------------------------------------

shape: (3, 3)
┌──────────┬───────────┬──────────

#       drop_duplicates() = unique(subset, keep=`first` or `last`, maintain_order), approx_n_unique().
```js
        1. pandas drop_duplicates(subset, keep) is the same as polars 'unique(subset, keep=`first` or `last`, maintain_order)'.
        2. unique(subset=None, keep='first', maintain_order=False) by default. And by default it returns 'unique rows DataFrame'. It doesnt work on Delete Duplicate 'columns', only on 'rows'.
        3.
        'keep'='first' : Among [1, 1, 3, 2, 1] it keeps the 'first' 1 and delete its next duplicate 1s.
              ='last'  : .................................. 'last'  1 .............. previous duplicate 1s.
        
        'subset'= A Single Column Name OR LIST of columns names based on what we want to delete duplicate rows :
                Lets say among ['name', 'age', 'marks', 'city'] I want to delete those rows whose 'name' and 'marks' are duplicated, doesnt matter if their 'age' and 'city' are also same or not. So drop_duplicates(subset=['name', 'marks'])
        
        'maintain_order'= After getting the unique rows, do you want to maintain the ORIGINAL ORDER ('True') or ANY RANDOM ORDER ('False').

        Note: print(df.select(pl.all().unique())) => won't work because Each Column doesn't have Same Number of Unique Values, so can't ----- form a DatFrame with unmatched number of columns i.e. A column has 4 unique values, another column has 5 unique values, unmatched number of columns can't create a DataFrame.
```

In [16]:
lf = pl.scan_csv("D:\\datasets\\drop_duplicate.csv")
print(lf.collect(), extra_info(color_text("lf LazyFrame")))

print(lf.unique(keep='last').collect(), extra_info("unique(keep='last')"))
print(lf.unique(subset=['name', 'marks'], keep='last').collect(), extra_info("unique(subset=['name', 'marks'], keep='last')"))

# LazyFrame.approx_n_unique() ESTIMATES -UNIQUE VALUES "ON EACH COLUMN"- , -not 'unique rows count'-.
# print(lf.approx_n_unique().collect(), extra_info("lf.approx_n_unique()")) # `LazyFrame.approx_n_unique` is deprecated.
print(lf.select(pl.all().approx_n_unique()).collect(),  extra_info("Unique Number of Values on Each Column"))

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------

shape: (4, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m unique(keep='last')
---------------------------------------------------------------------------------

#                                           drop(`*Column Names`, strict), drop rows
```js
        1. '*Column Names' = 'age', 'name'
                       or  = ['age', 'name'], ultimately it will be unpacked(*).
        2. strict = throw an exception if a column name does not exist(True) OR not(False).
        3. If you want to drop 'rows', use filter() as filter()'s job is to filter/drop rows based on a or multiple conditions.

           i) But dropping a row by 'index'? In that case we need to have a 'index column'.'lazyframe.with_row_index(Name, startIndex)' add a row index as the first column in the LazyFrame.
           ii) Or you can slice(index, length). [:] --> This slice wont work because [:] works with DataFrame or Series because in LazyFrame the table data has not made till we call collect() and without table data we can't use [:]'. For 'LazyFrame its slice(index, length)'. After using slice(..) concatenate() them by 'vertically', cant use 'vstack' for Lazayframes.
```

In [17]:
print(lf.collect(), extra_info(color_text("lf LazyFrame")))
print(lf.drop('age', 'name').collect(), extra_info("drop('age', 'name') columns."))

# Drop the Index 1 and 3. slice(4, length = None) means slice from index 4 to the last column.
# solution 1
lf_temp = pl.concat([lf.slice(0, 1), lf.slice(2, 1), lf.slice(4, length=None)], how='vertical_relaxed')
print(lf_temp.collect(), extra_info("Dropped the index 1 and 3"))

# solution 2
lf1 = lf.with_row_index(name='index')
print(lf1.collect(), extra_info(color_text("lf1 LazyFrame having Index Column at very First")))
print( lf1.filter(~pl.col('index').is_in([1, 3])).collect(), extra_info("Dropped the index 1 and 3") )

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌───────┬────────┐
│ marks ┆ city   │
│ ---   ┆ ---    │
│ i64   ┆ str    │
╞═══════╪════════╡
│ 79    ┆ Dhaka  │
│ 79    ┆ Khulna │
│ 89    ┆ Dhaka  │
│ 79    ┆ Dhaka  │
│ 76    ┆ Ctg    │
└───────┴────────┘  [1;92m-->[0m drop('age', 'name') columns.
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌───────┬───────┬───────┬─────┐
│ name  ┆ marks ┆ city  ┆

#                                   sort(by, descending, nulls_last, ...) LazyFrame
```js
        by = based on what columns, e.g. ['age', 'marks']
        descending = for each column in 'by' do you want that column in descending or not, e.g. `[False, True]`.
        nulls_last = After sorting where the nulls value should be? At the 'last' or not, e.g. 'True' or 'False'
```

In [18]:
print(lf.collect(), extra_info(color_text("lf LazyFrame")))
lf.sort(by=['age', 'marks'], descending=[False, True], nulls_last=True).collect()

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------



name,marks,city,age
str,i64,str,i64
"""Akira""",89,"""Dhaka""",21
"""Maria""",79,"""Dhaka""",23
"""Maria""",79,"""Dhaka""",23
"""Maria""",79,"""Khulna""",25
"""Saria""",76,"""Ctg""",27


#                           pl.when(`condition`).then(`do this`).otherwise(`do that`)
```js
        'condition' = An Expression.
        'do this'   = An Expression or A Scaler Value.
        'do that'   = An Expression or A Scaler Value.
    
        Dont skip -- otherwise(`do that`) -- part even if you just want to do -- pl.when(`condition`).then(`do this`) -- operation. 
```

In [19]:
print(lf.collect(), extra_info(color_text("lf LazyFrame")))

condition: pl.Expr = (pl.col('marks') % 10).is_between(7, 9)
round_it: pl.Expr = ((pl.col('marks') // 10) + 1) * 10 # e.g. turn 57|58|59 to 60.
keep_it_as_it_is = pl.col('marks')

lf.with_columns(pl.when(condition).then(round_it).otherwise(keep_it_as_it_is)).collect()

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------



name,marks,city,age
str,i64,str,i64
"""Maria""",80,"""Dhaka""",23
"""Maria""",80,"""Khulna""",25
"""Akira""",90,"""Dhaka""",21
"""Maria""",80,"""Dhaka""",23
"""Saria""",76,"""Ctg""",27


#                                               group_by

```js
        Look at 'group_by object.png' which is in this 'polars' folder. After doing 'group_by(by=..)', it returns a 'LazyGroupBy' object. Assume 'group1' is the group_by object in that image. Each 'key' in 'group1' has its own 'value(LazyFrame)'.
        Now 'group1.count()' means this 'count()' will be applied on 'each value(LazyFrame)'. So doesnt matter what function you apply on '(group1)' except `map_groups()`, because that (function) will be applied on (each value(LazyFrame)).

        Usage:
        ------
        Why we create group_by object? To perform ANY OPERATION on each group inside that group_by object.
        1) Now when we perform sum(), mean(), first(), last() etc on a LazyFrame, we get 'A Single Value' for each column in that LazyFrame.
        2) But when we perform cum_sum(), is_null() etc on A LazyFrame, we get 'A Column' for each Column in that LazyFrame.
                However if you apply them on a 'LazyGroupBy' object, For Each Group(LazyFrame): You get 'A List of Values, list(A Column)' for each column, so the answer can lies on a 'Single Row' e.g. [1, 2, 3, 4].
           
           After typing 'group10.' you will see only some functions BUT with group10.agg( col(..).choose_any_function() ). And of course we cant do 'select(), with_columns(), filter()' in agg(), but you can sure select specific columns inside agg() like we do with select().

           REMEMBER : 'is_null()', 'cum_sum()' etc inside 'agg()' return 'A LIST OF VALUES'(i.e. 'A SINGLE VALUE') for each COLUMN and this is called 'AGGREGATION' since we are using 'agg()'.

           WARNING : ALWAYS USE .agg() in group_by object to select built-in functions to do AGGREGATION('A SINGLE VALUE(SCALER/LIST egal) FOR EACH COLUMN in EACH GROUP'), why?? 'group1.count()' is not explicitly saying if the count() being applied row or column wise BUT "group1.agg( col('*').count() )" explicitly saying that it is being applied column-wise. But for custom function use map_groups().
                     Be carefull when using agg(..) since agg() AGGREGATES the result into a SINGLE VALUE i.e. {A SINGLE VALUE(Scaler/List) for EACH COLUMN}. So if you want A COLUMN for EACH COLUMN, you need to use 'CUSTOM FUNCTION' i.e. map_groups().
        
        Output:
        -------
        Now after you apply functions on 'group1' you will get a 'LazyFrame' result where the 'First Column = 'keys' of groupby object 'group1' and it is in random order.
        
        Note:
        -----
        LazyFrame.map_groups(lambda column: ....) => LazyFrame is nothing but 'bunch of columns'. When we map_groups() on LazyFrame, 'first we grab a column' and 'then traverse through each value of that column manually OR can apply vectorize operation (column.is_null()..) on that column'. Similarly for rest columns.

        LazyFrameGroupByObject.map_groups(lambda group: .... ) => group_by object consists of many 'group(Talking about each LazyFrame, not the keys)'. When we map_groups() on a group_by_object, 'first we grab teh first group(LazyFrame)', 'then we can apply vectorize operation on the whole group(Lazyframe) like Lazyframe.count()' OR 'we can traverse each column MANUALLY like we do on LazyFrame.map_groups(lambda column: ....)'. Similarly for rest groups.
```

#                                                        map_groups() on group_by object.
```js
        How map_groups() works :

            1) 'splits' each group(LazyFrame) into a Accessible LazyFrame. Thats why in map_groups(lambda LazyFrame:...), we can use the LazyFrame keyword as a Real LazyFrame.
            2) Then do the 'operation(function)' we set into map_groups(..) and get 'output(Scaler value/LazyFrame(A Single Column)/LazyFrame(Multiple Columns))' for EACH group(LazyFrame).
            3) 'Merge' EACH 'output' and return them as A LazyFrame.
      
         Note: At Step 2, we 'must return the output' we want for each group(LazyFrame). Otherwise what will we MERGE at the end? YOUR 
               MUM? NO!
               Dont think about the other groups(LazyFrames), 'focus on the first group' that 'WHAT you want to RETURN' after doing the calculation on that group(LazyFrame). If its a Single Boolean Column, other groups will also return a Single Boolean Column from each, eventually all SEPERATED OUTPUT will be merged into a SINGLE OUTPUT. So if we understand what the first LazyFrame will return, we can guess what the final merged output will be. Thats why focus on the first group(LazyFrame).

                  For e.g. we want to return a LazyFrame adding a new column. (speaking for the first group(LazyFrame))

                  def add_new_column(lazyframe) -> pl.LazyFrame:
                     new_lazyframe_after_adding_a_column = lazyframe.with_columns(col('a').rank(descending=True).alias('ranking on a'))
                     return new_lazyframe_after_adding_a_column

                  result = groupby_object.map_groups(func= add_new_column, schema=None)
                                          --> map_groups(..) will also be applied on rest groups(LazyFrames), ultimately will return a big big LazyFrame MERGING THOSE NEW LazyFrames.
                  
               So again : 'focus on the first group' that 'WHAT you want to return from this group(LazyFrame)'.
                          Ohh! All the LazyFrame, Column we can access inside map_groups(..), they are all copy! Mess with them, change them, no issue.

               Even during 'agg(..)' 'focus on the first group' that 'WHAT you want to return from this group(LazyFrame)'..
```

In [29]:
names = ['Maria', 'Saria', 'Akira', 'Masha', 'Aliya', 'Alya', 'Mukail', 'Hova']
data = {                          
    'a': [5, 8, 7, 1, 6, 2, 7, 4],
    'b': [2, 1, 2, 3, 1, 3, 2, 1],
    'c': [8, 9, 0, 8, 5, 8, 3, 7],
    'd': np.random.choice(a=names, size=(8,), replace=False)}

lf2 = pl.LazyFrame(data)
group10 = lf2.group_by('b')
print(group10, extra_info("group_by object 'group10'"))

group10_appearance = lf2.sort(by='b').with_columns(b = pl.when( ~col('b').is_first_distinct() ).then(pl.lit('')).otherwise(col('b'))) .select(col('b', 'a'), col('*').exclude('b', 'a'))

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))
print(group10.agg(col('a'), col('c'), col('d')).collect(), extra_info("group10.agg(col('a'), col('c'), col('d'))"))
# Above : col('a') inside agg(..) working as an AGGREGATION, means all the values inside column 'a' will be returned as a SINGLE VALUE(here, as a List). Same for col('c') and col('d').

print(lf2.collect().partition_by(by='b'), extra_info("lf2.collect().partition_by(by='b') = Divide the DataFrame by col('b') into 'list of dataframes'"))

<polars.lazyframe.group_by.LazyGroupBy object at 0x0000023FA90C3AD0>  [1;92m-->[0m group_by object 'group10'
------------------------------------------------------------------------------------------------------------------------

shape: (8, 4)
┌─────┬─────┬─────┬────────┐
│ b   ┆ a   ┆ c   ┆ d      │
│ --- ┆ --- ┆ --- ┆ ---    │
│ str ┆ i64 ┆ i64 ┆ str    │
╞═════╪═════╪═════╪════════╡
│ 1   ┆ 8   ┆ 9   ┆ Akira  │
│     ┆ 6   ┆ 5   ┆ Hova   │
│     ┆ 4   ┆ 7   ┆ Saria  │
│ 2   ┆ 5   ┆ 8   ┆ Mukail │
│     ┆ 7   ┆ 0   ┆ Maria  │
│     ┆ 7   ┆ 3   ┆ Masha  │
│ 3   ┆ 1   ┆ 8   ┆ Alya   │
│     ┆ 2   ┆ 8   ┆ Aliya  │
└─────┴─────┴─────┴────────┘  [1;92m-->[0m [1;92mgroup10[0m_appearance
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌─────┬───────────┬───────────┬──────────────────────────────┐
│ b   ┆ a         ┆ c         ┆ d                            │
│ --- ┆ ---       ┆ ---       ┆ ---    

In [30]:
print(color_text("                Whatever you do, the FIRST COLUMN will by the group_by object's key which is col('b')"), '\n')

# We can't access the keys of this group_by object 'group10'. Alternative is unique().
print( lf2.select(col('b').unique()).collect() , extra_info("lf2.select(col('b').unique()) = unique keys of 'group10'"))

print(group10.sum().collect(), extra_info("group10.sum()"))
print(group10.agg(col('*').sum()).collect(), extra_info("group10.agg(col('*').sum()"))

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))
print(group10.agg(col('a', 'c').cum_sum()).collect(), extra_info("group10.agg(col('a', 'c').cum_sum()"))
print(group10.agg(col('a', 'c').is_null()).collect(), extra_info("group10.agg(col('a', 'c').is_null()"))

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))
print(group10.len().collect(), extra_info("group10.len() = (key = The frequency of that key in col('b'))"))

print(group10.agg(col('a', 'd').first()).collect(), extra_info("group10.agg(col('a', 'd').first())"))

[1;92m                Whatever you do, the FIRST COLUMN will by the group_by object's key which is col('b')[0m 

shape: (3, 1)
┌─────┐
│ b   │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
└─────┘  [1;92m-->[0m lf2.select(col('b').unique()) = unique keys of 'group10'
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌─────┬─────┬─────┬──────┐
│ b   ┆ a   ┆ c   ┆ d    │
│ --- ┆ --- ┆ --- ┆ ---  │
│ i64 ┆ i64 ┆ i64 ┆ str  │
╞═════╪═════╪═════╪══════╡
│ 2   ┆ 19  ┆ 11  ┆ null │
│ 3   ┆ 3   ┆ 16  ┆ null │
│ 1   ┆ 18  ┆ 21  ┆ null │
└─────┴─────┴─────┴──────┘  [1;92m-->[0m group10.sum()
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌─────┬─────┬─────┬──────┐
│ b   ┆ a   ┆ c   ┆ d    │
│ --- ┆ --- ┆ --- ┆ ---  │
│ i64 ┆ i64 ┆ i64 ┆ str  │
╞═════╪═════╪═════╪══════╡
│ 3   ┆ 3   ┆ 16  ┆ null │
│ 1   ┆ 18  ┆ 21  ┆

In [42]:
# Q : Find all the names starts with 'M' in group10 object. (In other words: Filter the column 'd' with given condition for each group)

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))

print(group10.agg( col('d').str.starts_with('M') ).collect()) # agg() useless here to find the names explictely. Reason below:
print(extra_info(f"group10.agg( col('d').str.starts_with('M') ) => starts_with() returns True/False. Since its used as AGGREGATE FUNCTION, we get list of True/False.\n{color_text("So be carefull when using agg(..) since agg() AGGREGATES the result.")}"))

# Ans = For Each Group(LazyFrame) FILTER the column 'd' which we can't do with agg() but filter(). For these we need custom fuction.
# Below Line : select('b', 'd') because if I do select('d'), only column 'd' will be in the output.
names_starts_with_M: Callable[[pl.LazyFrame], pl.LazyFrame] = lambda lazyframe: lazyframe.select('b', 'd').filter(col('d').str.starts_with('M'))
print(group10.map_groups(function=names_starts_with_M, schema=None).collect(), extra_info("group10.map_groups(function=names_starts_with_M, schema=None)"))
# Above : schema = Output Schema MANUALLY. schema=None means polars will decide what will be the OUTPUT DATATYPE for each column.

shape: (8, 4)
┌─────┬─────┬─────┬────────┐
│ b   ┆ a   ┆ c   ┆ d      │
│ --- ┆ --- ┆ --- ┆ ---    │
│ str ┆ i64 ┆ i64 ┆ str    │
╞═════╪═════╪═════╪════════╡
│ 1   ┆ 8   ┆ 9   ┆ Akira  │
│     ┆ 6   ┆ 5   ┆ Hova   │
│     ┆ 4   ┆ 7   ┆ Saria  │
│ 2   ┆ 5   ┆ 8   ┆ Mukail │
│     ┆ 7   ┆ 0   ┆ Maria  │
│     ┆ 7   ┆ 3   ┆ Masha  │
│ 3   ┆ 1   ┆ 8   ┆ Alya   │
│     ┆ 2   ┆ 8   ┆ Aliya  │
└─────┴─────┴─────┴────────┘  [1;92m-->[0m [1;92mgroup10[0m_appearance
------------------------------------------------------------------------------------------------------------------------

shape: (3, 2)
┌─────┬───────────────────────┐
│ b   ┆ d                     │
│ --- ┆ ---                   │
│ i64 ┆ list[bool]            │
╞═════╪═══════════════════════╡
│ 1   ┆ [false, false, false] │
│ 3   ┆ [false, false]        │
│ 2   ┆ [true, true, true]    │
└─────┴───────────────────────┘
 [1;92m-->[0m group10.agg( col('d').str.starts_with('M') ) => starts_with() returns True/False. Since its us

In [48]:
# Q: For each group(in group1) find the Rank based on column 'a' and create a new column('ranking on a') to set the ranking output.

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))

def set_ranking_on_a(lazyframe: pl.LazyFrame) -> pl.LazyFrame: # this lazyframe is a COPY.
    return lazyframe.with_columns(col('a').rank(descending=True).alias('ranking on a'))

print(group10.map_groups(function=set_ranking_on_a, schema=None).collect())

print('''
The 'b' column is also shown in the output because (in my opinion) polars RANDOMLY select a group to do given calculation and
RANDOMLY show them in the output WHICH IS FASTER AND EFFICIENT CALCULATION since no maintaining order is needed. So to help to
recognize which values belongs to which group, polars show the column 'b' in the output.
''')

shape: (8, 4)
┌─────┬─────┬─────┬────────┐
│ b   ┆ a   ┆ c   ┆ d      │
│ --- ┆ --- ┆ --- ┆ ---    │
│ str ┆ i64 ┆ i64 ┆ str    │
╞═════╪═════╪═════╪════════╡
│ 1   ┆ 8   ┆ 9   ┆ Akira  │
│     ┆ 6   ┆ 5   ┆ Hova   │
│     ┆ 4   ┆ 7   ┆ Saria  │
│ 2   ┆ 5   ┆ 8   ┆ Mukail │
│     ┆ 7   ┆ 0   ┆ Maria  │
│     ┆ 7   ┆ 3   ┆ Masha  │
│ 3   ┆ 1   ┆ 8   ┆ Alya   │
│     ┆ 2   ┆ 8   ┆ Aliya  │
└─────┴─────┴─────┴────────┘  [1;92m-->[0m [1;92mgroup10[0m_appearance
------------------------------------------------------------------------------------------------------------------------

shape: (8, 5)
┌─────┬─────┬─────┬────────┬──────────────┐
│ a   ┆ b   ┆ c   ┆ d      ┆ ranking on a │
│ --- ┆ --- ┆ --- ┆ ---    ┆ ---          │
│ i64 ┆ i64 ┆ i64 ┆ str    ┆ f64          │
╞═════╪═════╪═════╪════════╪══════════════╡
│ 1   ┆ 3   ┆ 8   ┆ Alya   ┆ 2.0          │
│ 2   ┆ 3   ┆ 8   ┆ Aliya  ┆ 1.0          │
│ 5   ┆ 2   ┆ 8   ┆ Mukail ┆ 3.0          │
│ 7   ┆ 2   ┆ 0   ┆ Maria  ┆ 1.5          │
│ 

```js
                                Speed comparison between pandas apply() and polars map_groups().
```

In [32]:
# names = ['Maria', 'Saria', 'Akira', 'Masha', 'Aliya', 'Alya', 'Mukail', 'Hova']
# a = list(range(10))
# data = {                          
#     'a': np.random.choice(a= a, size=(1_00_00000,), replace=True),
#     'b': np.random.choice(a= a, size=(1_00_00000,), replace=True),
#     'c': np.random.choice(a= a, size=(1_00_00000,), replace=True),
#     'd': np.random.choice(a=names, size=(1_00_00000,), replace=True) }

# df0 = pl.LazyFrame(data)
# group0 = df0.group_by('b')

# df1 = pl.DataFrame(data)
# group1 = df1.group_by('b')

# df2 = pd.DataFrame(data)
# group2 = df2.groupby('b')

In [49]:
# start = time()
# find_names_M0: Callable[[pl.LazyFrame], pl.LazyFrame] = lambda lazyframe: lazyframe.filter(col('d').str.starts_with('M')).select(col('b', 'd'))
# r0 = group0.map_groups(function= find_names_M0, schema=None).sort(by='b').collect()
# print(time() - start)

# start = time()
# find_names_M1: Callable[[pl.DataFrame], pl.DataFrame] = lambda dataframe: dataframe.filter(col('d').str.starts_with('M')).select(col('b', 'd'))
# r1 = group1.map_groups(function= find_names_M1).sort(by='b')
# print(time() - start)

# start = time()
# find_names_M2: Callable[[pd.DataFrame], pd.DataFrame] = lambda dataframe: dataframe['d'] [dataframe['d'].str.startswith('M')]
# r2 =  group2.apply(func= find_names_M2, include_groups=False)
# print(time() - start)

# Ans :
print('''
    0.7093157768249512  => LazyFrame
    0.7516193389892578  => DataFrame (polars)
    6.648420572280884   => DataFrame (pandas)
''')


    0.7093157768249512  => LazyFrame
    0.7516193389892578  => DataFrame (polars)
    6.648420572280884   => DataFrame (pandas)



In [50]:
# print(r1, extra_info("polars dataframe"))
# print(r2.droplevel(1), extra_info("pandas"))

print('''
shape: (3_751_334, 2)
┌─────┬────────┐
│ b   ┆ d      │
│ --- ┆ ---    │
│ i64 ┆ str    │
╞═════╪════════╡
│ 0   ┆ Maria  │
│ 0   ┆ Mukail │
│ 0   ┆ Mukail │
│ 0   ┆ Masha  │
│ 0   ┆ Masha  │
│ …   ┆ …      │
│ 9   ┆ Masha  │
│ 9   ┆ Mukail │
│ 9   ┆ Mukail │
│ 9   ┆ Maria  │
│ 9   ┆ Masha  │
└─────┴────────┘  --> polars dataframe
------------------------------------------------------------------------------------------------------------------------

b
0     Maria
0    Mukail
0    Mukail
0     Masha
0     Masha
      ...  
9     Masha
9    Mukail
9    Mukail
9     Maria
9     Masha
Name: d, Length: 3751334, dtype: object  --> pandas
------------------------------------------------------------------------------------------------------------------------


''')


shape: (3_751_334, 2)
┌─────┬────────┐
│ b   ┆ d      │
│ --- ┆ ---    │
│ i64 ┆ str    │
╞═════╪════════╡
│ 0   ┆ Maria  │
│ 0   ┆ Mukail │
│ 0   ┆ Mukail │
│ 0   ┆ Masha  │
│ 0   ┆ Masha  │
│ …   ┆ …      │
│ 9   ┆ Masha  │
│ 9   ┆ Mukail │
│ 9   ┆ Mukail │
│ 9   ┆ Maria  │
│ 9   ┆ Masha  │
└─────┴────────┘  --> polars dataframe
------------------------------------------------------------------------------------------------------------------------

b
0     Maria
0    Mukail
0    Mukail
0     Masha
0     Masha
      ...  
9     Masha
9    Mukail
9    Mukail
9     Maria
9     Masha
Name: d, Length: 3751334, dtype: object  --> pandas
------------------------------------------------------------------------------------------------------------------------





In [52]:
df3 = pl.scan_csv("d:\\datasets\\20 Paper Names.csv", schema={'No': pl.Int8, 'Name': pl.String})

print(df3.collect(), extra_info())

shape: (15, 2)
┌─────┬─────────────────────────────────┐
│ No  ┆ Name                            │
│ --- ┆ ---                             │
│ i8  ┆ str                             │
╞═════╪═════════════════════════════════╡
│ 1   ┆ Evaluating Deep Neural Network… │
│ 2   ┆ SkinCon: A skin disease datase… │
│ 3   ┆ Towards Transparency in Dermat… │
│ 4   ┆ SkinCAP: A Multi-modal Dermato… │
│ 5   ┆ Assessing GPT-4's Diagnostic A… │
│ …   ┆ …                               │
│ 11  ┆ Pre-trained multimodal large l… │
│ 12  ┆ Unsupervised SoftOtsuNet Augme… │
│ 13  ┆ Fostering transparent medical … │
│ 14  ┆ Fair Conformal Predictors for … │
│ 15  ┆ null                            │
└─────┴─────────────────────────────────┘  [1;92m-->[0m 
------------------------------------------------------------------------------------------------------------------------

