In [1]:
import polars as pl
import polars.selectors as cs
from polars import col
import numpy as np
from time import time
from typing import List, Callable
from termcolor import colored
import seaborn as sns

color_text = lambda text: f"{'\033[1;92m'}{text}{'\033[0m'}"
extra_info = lambda extra='': f" {color_text("-->")} {extra}\n{'-' * 120}\n"

#                                         Polars Creation From Lists, Dict, Realworld Datasets
```js
1. In Polars DataFrame you can set the datatype for each column manually by the 'schema={ColumnName: dtype, ColumnName1: dtype...}'.
2. But if a column in CSV is a Floating Point, then polars doesn't directly cast the Floating Point to integer.
        df = pl.read_csv("D:\\datasets\\temp.csv", schema_overrides={'Marks': pl.UInt8}, infer_schema_length=10000, ignore_errors=True)
        but it make values like 89.87 to null and 45.0 to 45
3. And if a column in CSV is an Integer, Polar can cast it to Floating Point but not the Vice Varsa.

        BUTT you should not do these CASTING INSIDE 'read_csv'. First read the CSV, then analyze it, see if the max and min val of a column can be changed to for example 'pl.Uint8' or not or maybe we need to round it first? Then with 'pl.with_columns' cast the columns you need and set the new DataFrame back to the DataFrame you wanted to change.

        You can do such casting when you create your own small DataFrame, not on the 'real datasets'.
```

```js
        1. In Pandas we say df. then all the methods, attributes are shown.. But in polars when we want to do some operations ON ALL THE COLUMNS, we say 'pl.all()'.
        2. 'pl.all()' : This is an 'expression' that represents "all columns selected" in the DataFrame. You use it when you want to apply a 'transformation or condition' across 'all columns'. Output for pl.all() = * representing All the columns are selected.
        3. 'pl.all().is_null()' is another expresson. Output : *.is_null(). It has not executed yet.
            - expression = pl.all().is_null()               Assume we have 2 DataFrame, df1, df2.
            - To apply the expression 'expression' on df1 we say : 'df1.select(expression)'. Now the 'expression' is applied on 'df1' and will show the output. We can apply this same expression on df2 as well. 
        
        4. 'df.is_duplicated()' : It works ROW WISE. If you want to check duplicate values on each column => 'df.select(pl.all().is_duplicated())'.
        5. Some methods like 'count_null()' works on each column but We should respect polars and do p.all() when we want to do some operations on each column.

        `Why Polars Uses Expressions`:
            - Efficiency: By using expressions, Polars can optimize the query plan and execute operations more efficiently, especially for large datasets.
            - Flexibility: This approach allows chaining of transformations and applying them lazily, which can be evaluated only when needed.
```

In [3]:
# from lists
info = [
    ['Maria0', 15, 16],   # every list is a ROW.
    ['Maria1', 18, 19],
    ['Maria2', 21, 22],
    ['Maria3', 24, 25]
]
#  This "schema list" is to define column names. orient means how I want my each list to be, 'row' or 'col'.
pl.DataFrame(info, schema=['Name', 'Age', 'IQ'], orient='row', strict=False) # or pl.DataFrame(info, schema={'Name': pl.String, 'Age': pl.UInt8, 'IQ': pl.UInt8}, orient='row', strict=False)

Name,Age,IQ
str,i64,i64
"""Maria0""",15,16
"""Maria1""",18,19
"""Maria2""",21,22
"""Maria3""",24,25


In [4]:
# from dictionary
info = {
    'Name' : ['Maria0', 'Maria1', 'Maria2', 'Maria3'],
    'Age' : [15, 18, 21, 24],
    'IQ' : [16, 19, 22, 25]
}
#                  The below "schema/schema_overrides dict" is for defining the datatype for the columns.
pl.DataFrame(info, schema_overrides={'Name': pl.String, 'IQ': pl.UInt8})

Name,Age,IQ
str,i64,u8
"""Maria0""",15,16
"""Maria1""",18,19
"""Maria2""",21,22
"""Maria3""",24,25


In [5]:
# from real world datasets

df = pl.read_csv("D:\\datasets\\nba.csv")
# print(df, extra_info())
# 
# df1 = df.with_columns(
#     pl.col('Number').ceil().cast(pl.UInt8),
#     pl.col('Height')
# )

print(df.head(10))

shape: (10, 9)
┌────────────────┬─────────┬────────┬──────────┬───┬────────┬────────┬────────────────┬────────────┐
│ Name           ┆ Team    ┆ Number ┆ Position ┆ … ┆ Height ┆ Weight ┆ College        ┆ Salary     │
│ ---            ┆ ---     ┆ ---    ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---            ┆ ---        │
│ str            ┆ str     ┆ f64    ┆ str      ┆   ┆ str    ┆ f64    ┆ str            ┆ f64        │
╞════════════════╪═════════╪════════╪══════════╪═══╪════════╪════════╪════════════════╪════════════╡
│ Avery Bradley  ┆ Boston  ┆ 0.0    ┆ PG       ┆ … ┆ 6-2    ┆ 180.0  ┆ Texas          ┆ 7.730337e6 │
│                ┆ Celtics ┆        ┆          ┆   ┆        ┆        ┆                ┆            │
│ Jae Crowder    ┆ Boston  ┆ 99.0   ┆ SF       ┆ … ┆ 6-6    ┆ 235.0  ┆ Marquette      ┆ 6.796117e6 │
│                ┆ Celtics ┆        ┆          ┆   ┆        ┆        ┆                ┆            │
│ John Holland   ┆ Boston  ┆ 30.0   ┆ SG       ┆ … ┆ 6-5    ┆ 205.0  ┆ Bosto

#              [rows, columns]. rows = columns = `A Single Value i.e. A Scalar Value` OR `can be a List` OR `Slice(:)`.

In [6]:
#                                            Only for DataFrames, NOT FOR LAZYFRAME

print(df[ 4, ['Team', 'Height', 'College']], extra_info("df[4, ['Team', 'Height', 'College']]"))
print(df[:4, ['Team', 'Height', 'College']], extra_info("first 4 rows but only Those 3 columns"))
print(df[::100, ['Team', 'Height', 'College']], extra_info("df[::100, ['Team', 'Height', 'College']]"))

bool_columns = np.random.choice(a=[True, False], size=(df.width,), replace=True)
print(df[:, bool_columns], extra_info(f"\n{list(zip(df.columns, bool_columns))}"))

shape: (1, 3)
┌────────────────┬────────┬─────────┐
│ Team           ┆ Height ┆ College │
│ ---            ┆ ---    ┆ ---     │
│ str            ┆ str    ┆ str     │
╞════════════════╪════════╪═════════╡
│ Boston Celtics ┆ 6-10   ┆ null    │
└────────────────┴────────┴─────────┘  [1;92m-->[0m df[4, ['Team', 'Height', 'College']]
------------------------------------------------------------------------------------------------------------------------

shape: (4, 3)
┌────────────────┬────────┬───────────────────┐
│ Team           ┆ Height ┆ College           │
│ ---            ┆ ---    ┆ ---               │
│ str            ┆ str    ┆ str               │
╞════════════════╪════════╪═══════════════════╡
│ Boston Celtics ┆ 6-2    ┆ Texas             │
│ Boston Celtics ┆ 6-6    ┆ Marquette         │
│ Boston Celtics ┆ 6-5    ┆ Boston University │
│ Boston Celtics ┆ 6-5    ┆ Georgia State     │
└────────────────┴────────┴───────────────────┘  [1;92m-->[0m first 4 rows but only Those 3 colum

#                                               `slice()` For LAZYFRAME
```js
        1. LazyFrame 'lf', where the Tabular Data has not been created until we use 'lf.collect()'. Thats why we cant use stuff like   lf[:17:5, ['name', 'city', 'age']].
        2. lf.slice(index, length) => This will only slice 'rows' and returns also lazyframe.
        3. lf.select(pl.col( *Columns Name )) => This selects specific 'columns'. Nope, cant use 'Boolean Columns' in '*Columns Name'.
```

#                                                       DataFrame Attributes

In [7]:
print(df.shape, extra_info("shape"))
print(df.columns,'  ', type(df.columns), extra_info("column names"))
print(df.dtypes, '  ', type(df.dtypes),  extra_info("dtypes"))
print(df.height, '  ', df.width, extra_info("heigh and width"))
print(df.schema, '\n', type(df.schema), extra_info("Schema(Column Names with their Datatype)"))

(458, 9)  [1;92m-->[0m shape
------------------------------------------------------------------------------------------------------------------------

['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight', 'College', 'Salary']    <class 'list'>  [1;92m-->[0m column names
------------------------------------------------------------------------------------------------------------------------

[String, String, Float64, String, Float64, String, Float64, String, Float64]    <class 'list'>  [1;92m-->[0m dtypes
------------------------------------------------------------------------------------------------------------------------

458    9  [1;92m-->[0m heigh and width
------------------------------------------------------------------------------------------------------------------------

Schema({'Name': String, 'Team': String, 'Number': Float64, 'Position': String, 'Age': Float64, 'Height': String, 'Weight': Float64, 'College': String, 'Salary': Float64}) 
 <class 'polars.

#                                                       DataFrame Methods

In [8]:
# head, tail, sample are same as series.
df_info = lambda: df.null_count().cast(pl.String).vstack( pl.DataFrame(dict(zip(df.columns, map(str, df.dtypes)))) )
#                                         Polars doesn't have pandas dataframe.info() so I made it.
print(df_info(), extra_info("info(Index, Columns, Count of Non Null Values in each column, dtype)"))
print(df.describe(), extra_info("describe")) # works on every column

shape: (2, 9)
┌────────┬────────┬─────────┬──────────┬───┬────────┬─────────┬─────────┬─────────┐
│ Name   ┆ Team   ┆ Number  ┆ Position ┆ … ┆ Height ┆ Weight  ┆ College ┆ Salary  │
│ ---    ┆ ---    ┆ ---     ┆ ---      ┆   ┆ ---    ┆ ---     ┆ ---     ┆ ---     │
│ str    ┆ str    ┆ str     ┆ str      ┆   ┆ str    ┆ str     ┆ str     ┆ str     │
╞════════╪════════╪═════════╪══════════╪═══╪════════╪═════════╪═════════╪═════════╡
│ 1      ┆ 1      ┆ 1       ┆ 1        ┆ … ┆ 1      ┆ 1       ┆ 85      ┆ 12      │
│ String ┆ String ┆ Float64 ┆ String   ┆ … ┆ String ┆ Float64 ┆ String  ┆ Float64 │
└────────┴────────┴─────────┴──────────┴───┴────────┴─────────┴─────────┴─────────┘  [1;92m-->[0m info(Index, Columns, Count of Non Null Values in each column, dtype)
------------------------------------------------------------------------------------------------------------------------

shape: (9, 10)
┌────────────┬────────────┬────────────┬───────────┬───┬────────┬────────────┬─────────┬────

#                                           is_in

In [3]:
import pandas as pd

data = [[1, 23], [2, 9], [4, 30], [5, 54], [6, 96], [7, 54], [8, 54]]
visits = pd.DataFrame(data, columns=['visit_id', 'customer_id']).astype({'visit_id':'Int64', 'customer_id':'Int64'})
data = [[2, 5, 310], [3, 5, 300], [9, 5, 200], [12, 1, 910], [13, 2, 970]]
transactions = pd.DataFrame(data, columns=['transaction_id', 'visit_id', 'amount']).astype({'transaction_id':'Int64', 'visit_id':'Int64', 'amount':'Int64'})

print(visits, extra_info(color_text("visits")))
print(transactions, extra_info(color_text("transactions")))

   visit_id  customer_id
0         1           23
1         2            9
2         4           30
3         5           54
4         6           96
5         7           54
6         8           54  [1;92m-->[0m [1;92mvisits[0m
------------------------------------------------------------------------------------------------------------------------

   transaction_id  visit_id  amount
0               2         5     310
1               3         5     300
2               9         5     200
3              12         1     910
4              13         2     970  [1;92m-->[0m [1;92mtransactions[0m
------------------------------------------------------------------------------------------------------------------------



In [4]:
visits, transactions = pl.LazyFrame(visits), pl.LazyFrame(transactions)
transactions_visit_id = transactions.select(col('visit_id')).collect() # without collect() is_in() below won't work

visits.filter( ~col('visit_id').is_in(transactions_visit_id) ) .collect()

s= '''
        If we did is_in() using the same lazyframe 'visits' then we would just say something like this :
            visits.filter( ~col('visit_id').is_in(col('customer_id')) )
        
        But since we want to is_in() between 2 columns which belongs to 2 DIFFERENT LAZYFRAME, we did :
            transactions_visit_id = transactions.select(col('visit_id')).collect()

            Why need to write collect()?
            ----------------------------
            visits.filter(..) is already query here. Now inside filter(..) if we say :
                col('visit_id').is_in( transactions.select(col('visit_id')) ), here transactions.select(col('visit_id')) is
                ANOTHER QUERY. So query inside query which polars doesn't allow.
'''

visit_id,customer_id
i64,i64
4,30
6,96
7,54
8,54


#                                                 top_k, bottom_k
```js
        Syntax : top_k(k='n number Largest Values to return', by='on which column we want to do this')
                 bottom_k(k, by)

        - e.g. lf8.top_k(k=5, by='Number') :
            - It wont return the Top 5 largest numbers from col('Number'). Its 'filtering' the LazyFrame lf8 and returns another lazyframe with top 5 largest numbers.
        
        - e.g. lf8.bottom_k(k=5, by='Number') :
            - 'filtering' the LazyFrame lf8 and returns another lazyframe with top 5 smallest numbers.
```

In [9]:
lf8 = df.lazy().with_columns(col('Number').cast(pl.UInt8))

print( lf8.top_k(k=5, by='Number') .collect(),
       extra_info("\nlf8.top_k(k=5, by='Number') = A LazyFrame of who has the Top 5 Maximum Numbers in col('Number')"))

print( lf8.bottom_k(k=5, by='Number') .collect(),
       extra_info("\nlf8.bottom_k(k=5, by='Number') = A LazyFrame of who has the Top 5 Minimum Numbers in col('Number')"))

shape: (5, 9)
┌───────────────┬───────────────┬────────┬──────────┬───┬────────┬────────┬───────────┬────────────┐
│ Name          ┆ Team          ┆ Number ┆ Position ┆ … ┆ Height ┆ Weight ┆ College   ┆ Salary     │
│ ---           ┆ ---           ┆ ---    ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---       ┆ ---        │
│ str           ┆ str           ┆ u8     ┆ str      ┆   ┆ str    ┆ f64    ┆ str       ┆ f64        │
╞═══════════════╪═══════════════╪════════╪══════════╪═══╪════════╪════════╪═══════════╪════════════╡
│ Jae Crowder   ┆ Boston        ┆ 99     ┆ SF       ┆ … ┆ 6-6    ┆ 235.0  ┆ Marquette ┆ 6.796117e6 │
│               ┆ Celtics       ┆        ┆          ┆   ┆        ┆        ┆           ┆            │
│ Lucas         ┆ Toronto       ┆ 92     ┆ C        ┆ … ┆ 7-0    ┆ 220.0  ┆ null      ┆ 1.842e6    │
│ Nogueira      ┆ Raptors       ┆        ┆          ┆   ┆        ┆        ┆           ┆            │
│ Amir Johnson  ┆ Boston        ┆ 90     ┆ PF       ┆ … ┆ 6-9    ┆ 240.0  ┆ n

#                                                         value_counts()
```js
        polars `LazyFrame.select(col('x').value_counts())` returns a Single Column with struct values, e.g. `{23, 5}`({value, frequency}). To seperate these values into Two Different Columns, first `collect()` and then `unnest(column_names_to_unnest)`.
```

In [10]:
lf8 = df.lazy().with_columns(col('Number').cast(pl.UInt8))

frequencies = lf8.select(col('Number').value_counts(sort=True)) # for better understanding I set sort=True.
df_frequency = frequencies.collect().unnest('Number')

print(frequencies.collect(), extra_info("frequencies"))
print(df_frequency, extra_info(f"{color_text('df_frequency')} = frequencies.collect().unnest('Number')"))

# Let's assume we set above sort=False. Now pick the Top 5 Maximum Frquencies.
print(df_frequency.top_k(k=5, by='count'), extra_info("df_frequency.top_k(k=5, by='count') = filtering df_frequency for Top 5 Maximum Counts"))

shape: (54, 1)
┌───────────┐
│ Number    │
│ ---       │
│ struct[2] │
╞═══════════╡
│ {5,23}    │
│ {3,21}    │
│ {0,20}    │
│ {1,20}    │
│ {8,19}    │
│ …         │
│ {45,1}    │
│ {37,1}    │
│ {52,1}    │
│ {88,1}    │
│ {null,1}  │
└───────────┘  [1;92m-->[0m frequencies
------------------------------------------------------------------------------------------------------------------------

shape: (54, 2)
┌────────┬───────┐
│ Number ┆ count │
│ ---    ┆ ---   │
│ u8     ┆ u32   │
╞════════╪═══════╡
│ 5      ┆ 23    │
│ 3      ┆ 21    │
│ 0      ┆ 20    │
│ 1      ┆ 20    │
│ 8      ┆ 19    │
│ …      ┆ …     │
│ 45     ┆ 1     │
│ 37     ┆ 1     │
│ 52     ┆ 1     │
│ 88     ┆ 1     │
│ null   ┆ 1     │
└────────┴───────┘  [1;92m-->[0m [1;92mdf_frequency[0m = frequencies.collect().unnest('Number')
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌────────┬───────┐
│ Number ┆ count │
│ --

#                                                 filter / select / with_columns
```js
        1. df.filter(expression to perform on the df) :
                When based on an operation we want to display 'specific columns or specific values from the dataframe OR the entire dataframe', we do filter(..) which returns the ENTIRE New DataFrame but we can select(..) specific columns to display.

        2. df.select(list of columns to display OR expression to perform) :
                When we want to 'select specific columns to display' OR we want to find the sum()/prod()/mean()/std() etc such CALCULATION on the entire or specific columns OR we want to have only the 'Boolean Mask result', not the actual output, we do select().
        
        3. df.with_columns(changes on ALL or specific columns seperated by comma) :
                Assume we have 10 columns. Now we want to change 2 or 3 columns DataType by 'casting' or change their values and AFTER the changes on specific columns we want to have the ENTIRE DATAFRAME having those changes on those specific columns and the REST UNCHANGED COLUMNS as well. For this we do df.with_columns(). We can even set the new dataframe to the old dataframe.
```

In [11]:
print(df.filter( pl.col('Name').str.contains('Avery') ), '\n')
df.filter( pl.col('Name').str.contains('Avery') ).select(['Name', 'Height']) # YOU CAN'T WRITE df.filter( pl.col('Name').str.contains('Avery').select(['Name', 'Height']) )

mask = df.select(pl.col('Name').str.contains('Avery')).to_series()
print(mask, '\n')

df.select(pl.all().count()) # won't work if you write filter instead of select

shape: (1, 9)
┌───────────────┬────────────────┬────────┬──────────┬───┬────────┬────────┬─────────┬────────────┐
│ Name          ┆ Team           ┆ Number ┆ Position ┆ … ┆ Height ┆ Weight ┆ College ┆ Salary     │
│ ---           ┆ ---            ┆ ---    ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---     ┆ ---        │
│ str           ┆ str            ┆ f64    ┆ str      ┆   ┆ str    ┆ f64    ┆ str     ┆ f64        │
╞═══════════════╪════════════════╪════════╪══════════╪═══╪════════╪════════╪═════════╪════════════╡
│ Avery Bradley ┆ Boston Celtics ┆ 0.0    ┆ PG       ┆ … ┆ 6-2    ┆ 180.0  ┆ Texas   ┆ 7.730337e6 │
└───────────────┴────────────────┴────────┴──────────┴───┴────────┴────────┴─────────┴────────────┘ 

shape: (458,)
Series: 'Name' [bool]
[
	true
	false
	false
	false
	false
	…
	false
	false
	false
	false
	null
] 



Name,Team,Number,Position,Age,Height,Weight,College,Salary
u32,u32,u32,u32,u32,u32,u32,u32,u32
457,457,457,457,457,457,457,373,446


#                                                        filter()
```js
        1. df.filter(conditions seperated by comma) : It returns the Whole New DataFrame after applying the filter on 'df'. But if you want specific columns => df.filter(conditions seperated by comma).select([column names]) or df.filter(conditions seperated by comma)[[column names]]

        2. df.lazy().filter(conditions seperated by comma) : Polars known for laziness and it gives FASTER EXECUTION reducing the unnecessery operations which occurs INTERNALLY like :

            df.filter(conditions seperated by comma) returns the whole DataFrame and then with .select([column names]), it gives us the specfic columns. BUT
            in df.lazy().filter(conditions seperated by comma).select([column names]) : polars doesn't even return the whole new dataframe when we do 'df.filter(..)' because polars see that we selected only specific columns by '.select([..])', so it directly returns only those specific columns we selected after 'df.filter(..)'.' Thats why it RUNS FASTER!
```

In [12]:
start = time()
pp = df.filter(pl.col('Number') > 50).select(['Name', 'Age'])
print(time() - start)
# pp[0, 'Name'] = 'AAAAA'
# print(pp['Name'][0], df['Name'][0]) # AAAAA, Avery Bradley. I.E. pp is not a view but A NEW DataFrame

start = time()
pp1 = df.lazy().filter(pl.col('Number') > 50).select(['Name', 'Age']).collect()
print(time() - start) # Always do lazy operation
# pp1[0, 'Name'] = 'AAAAA'
# print(pp1['Name'][0], df['Name'][0]) # AAAAA, Avery Bradley. I.E. pp1 is not a view but A NEW DataFrame

0.0039920806884765625
0.0009987354278564453


In [13]:
df.head(2)

Name,Team,Number,Position,Age,Height,Weight,College,Salary
str,str,f64,str,f64,str,f64,str,f64
"""Avery Bradley""","""Boston Celtics""",0.0,"""PG""",25.0,"""6-2""",180.0,"""Texas""",7730337.0
"""Jae Crowder""","""Boston Celtics""",99.0,"""SF""",25.0,"""6-6""",235.0,"""Marquette""",6796117.0


#                                   rename(), is_duplicated(), null_count(), is_null()

In [14]:
# df.columns = np.arange(10, df.width+10, dtype='u8').astype('str') # it will change the column names PERMANENTLY.
print(df.rename({'Number' : 'Marks', 'Salary' : 'Wage'}), extra_info("RENAMING SPECIFIC COLUMNS"))

print(df.is_duplicated()) # returns a Boolean Mask i.e. SERIES. True = That row is DUPLICATED.
print(df.null_count(), extra_info("count nulls on Each COLUMN.")) # count nulls on Each COLUMN.

# DataFrame doesn't have isnull(), only Series has isnull(). So with select(..) traverse EACH COLUMN/SERIES and apply isnull().
print(df.select( pl.all().is_null() ), extra_info("Boolean DataFrame after is_null() applied on each column"))
print(df.select( pl.all().is_null().sum() ), extra_info("'pl.all().is_null().sum()' applied on each column"))

print(df.select( pl.all().has_nulls() ), extra_info("Columns having at least One Null Value = True, else False"))

shape: (458, 9)
┌────────────────┬───────────┬───────┬──────────┬───┬────────┬────────┬───────────────┬────────────┐
│ Name           ┆ Team      ┆ Marks ┆ Position ┆ … ┆ Height ┆ Weight ┆ College       ┆ Wage       │
│ ---            ┆ ---       ┆ ---   ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---           ┆ ---        │
│ str            ┆ str       ┆ f64   ┆ str      ┆   ┆ str    ┆ f64    ┆ str           ┆ f64        │
╞════════════════╪═══════════╪═══════╪══════════╪═══╪════════╪════════╪═══════════════╪════════════╡
│ Avery Bradley  ┆ Boston    ┆ 0.0   ┆ PG       ┆ … ┆ 6-2    ┆ 180.0  ┆ Texas         ┆ 7.730337e6 │
│                ┆ Celtics   ┆       ┆          ┆   ┆        ┆        ┆               ┆            │
│ Jae Crowder    ┆ Boston    ┆ 99.0  ┆ SF       ┆ … ┆ 6-6    ┆ 235.0  ┆ Marquette     ┆ 6.796117e6 │
│                ┆ Celtics   ┆       ┆          ┆   ┆        ┆        ┆               ┆            │
│ John Holland   ┆ Boston    ┆ 30.0  ┆ SG       ┆ … ┆ 6-5    ┆ 205.0  ┆ Bos

#                                               drop_nulls() on DataFrame
```js
        Polars.drop_nulls() traverse through each column i.e. vertically i.e. 'ROW-wise' and while traversing vertically if polars see a column's value is Null, it delete that 'ROW'.' So at the end in the result DataFrame you won't see a single ROW which has any Null value.

        'pandas drop_na(subset, axis=1 or 0, how='any' or 'all')' has these 3 important parameters which is really really beneficial. Polars drop_nulls() equivalent to Pandas drop_na(subset, axis=0, how='any'). So I created below after the next block, 'drop_nulls(lazyframe, subset, drop='rows' or 'columns', how='any' or 'all')'.

------> Dealing with 'drop=rows':
        -------------------------
                We want to delete 'WHOLE ROW', means doesnt matter if we want to drop_nulls() based on specific columns or all the columns. E.G.

                        True      False       True        True   True                              True
                        False     False       False       False  False         =>                  False
                        True      True        True        True   True                              True
                        ---------------------------      --------------              -------------------------------------
                             pl.all().is_null()       pl.col(subset).is_null()       pl.any_horizontal( on both left df)

                                                                                     = For both left Boolean DataFrame, The Output is same AND 'Both Giving OUTPUT in A Series i.e. A SINGLE COLUMN (NOT 1D array like [True, False. True])'. We can use this SINGLE COLUMN to filter('filter works only on A SERIES/A SINGLE COLUMN BOOLEAN MASK') the ROWS we want.
                                                                                     Of course True = It has NULLS and we dont want that row. So ~pl.any_horizontal().
------> Dealing with 'drop=columns':
        ----------------------------
                We want to delete 'WHOLE COLUMN'.

                        True      False       True                                   
                        False     False       False              =>                  
                        True      False       True                              True    False   True
                        ---------------------------                    -------------------------------------
                             pl.all().is_null()                                 pl.all().is_null().any() (Result : (1, 3) DataFrame)

                                                                       = We cant use A Single Row DataFrame as A Boolean Mask. You may think to use `df[0]` to select the Single Row BUT 'if we select just A SINGLE ROW, polars still give us a DataFrame with that single row having column names above'. For this use '.row(index=0)' which 'returns a 1D tuple'. Since we want to delete `COLUMNS` i.e. FILTER 'COLUMNS', filter() wont help us (as filter() filters 'ROWS' only) BUT 'lazyframe.collect()[:, boolean mask or column names], lazyframe.select(pl.col(column names))' will.

                                                                       We can use a 1D tuple e.g. [True, False, True] to select columns like 'lazyframe.collect()[:, [True, False, True]]' BUT we dont want to do 'lazyframe.collect()' to filter specific columns, its expensive. So we need to 'find the column names' and pass it to 'lazyframe.select(pl.col(..Here..))'. 
                                                                       zip(('name', True), ('toy', False), ('born', True)) => Now we can 'find the column names'.

                        True      False                                     
                        False     False               =>                  
                        True      False                           True    False
                        ---------------------------        ------------------------------
                          pl.col(subset).is_null()         pl.all().is_null().any() (Result : (1, 3) DataFrame)

                                                           = Since we are using 'subset' means finding the columns name for [True, False] wont work as 'WE NEED THE OTHER COLUMNS also in the result Data/Lazy-Frame which is not selected in the subset'.

                                                           'bool_columns_dict' = {subset[0] : True, subset[1] : False}.
                                                           Now we traverse the LazyFrames.columns Name 'SERIALLY' and check if the current columnName is in the 'set(subset)':
                                                                        if in the set(subset) => bool_columns_dict[columnname]
                                                                        else => 'False' means this columns 'doesnt have anY null'.
                                                           By doing this we will get the 'WHOLE 1D BOOLEAN ROW' and we can use it to select columns by passing the columns in 'lazyframe.select(pl.col(..Here..))'
        
        Note :  pl.all() denoted we selected all the columns.
                pl.all().is_null().all() => The last 'all()' does 'BITWISE AND OPERATION' on 'EACH BOOLEAN COLUMN'. THE LAST 'all()' IS NOT 'pl.all()' (which selects all the columns) but 'DATA/LAZY-FRAME.all()' (which does bitwise operation on each column).

                expression = pl.all().is_null()
                pl.any_horizontal(expression) means 'HORIZONTALLY BITWISE OR OPERATION ON THE ENTIRE DATA/LAZY-FRAME'. `It doesn't mean we are selecting rows to do is_null() row by row`.
```

In [15]:
df_drop = pl.LazyFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                        "toy" : [None, 'Batmobile', 'Bullwhip'],
                        "born": [None, '1940-4-25', None]})

df_drop = df_drop.with_columns(pl.col('born').str.to_date()) # or ....to_datetime() if you also have time.
df_drop.collect()

name,toy,born
str,str,date
"""Alfred""",,
"""Batman""","""Batmobile""",1940-04-25
"""Catwoman""","""Bullwhip""",


In [16]:
def drop_nulls(lazyframe:pl.LazyFrame = None, subset:str|List[str] = None, drop:str = 'rows', how:str = 'any', outputType:str= 'lf') -> pl.LazyFrame|pl.DataFrame:
    """
        1. input_lazyframe = A LazyFrame. default = None
        2. subset = A Column Name or A List of "Column Names". default = None, means All Columns Selected.
        3. drop = 'rows' means Delete ROWS.
           drop = 'columns means Delete COLUMNS. default = 'rows'.
        4. how = 'any' or 'all'. by default how = 'any'.
        4. outputType = 'lf' = LazyFrame or
                        'df' = DataFrame.
    """
    all_column_names = np.array(lazyframe.collect_schema().names())

    if drop == 'rows':
        pl_all_or_subset = pl.all() if subset == None else pl.col(subset)
        expression: pl.Expr = pl.any_horizontal(pl_all_or_subset.is_null()) if how == 'any' else pl.all_horizontal(pl_all_or_subset.is_null())
        lazy_output = lazyframe.filter(~expression)

        return lazy_output if outputType == 'lf' else lazy_output.collect()
    
    else: # drop == 'columns'
        if subset == None or len(subset) == 0:
            expression: pl.Expr = pl.all().is_null().any() if how == 'any' else pl.all().is_null().all()
            bool_columns = lazyframe.select(expression).collect().row(0)  # (False, True, True, True)
            selected_columns = (colName for colName, true in zip(all_column_names, bool_columns) if not true)
            lazy_output = lazyframe.select(pl.col(selected_columns))

            return lazy_output if outputType == 'lf' else lazy_output.collect()
        
        else: #
            subset_set = set(subset) if subset != None else None
            expression: pl.Expr = pl.col(subset).is_null().any() if how == 'any' else pl.col(subset).is_null().all()
            bool_columns = lazyframe.select(expression).collect().row(0)  # (False, True)

            bool_columns_dict = dict(zip(subset, bool_columns))
            whole_bool_columns = (bool_columns_dict[colName] if colName in subset_set else False for colName in all_column_names)

            selected_columns = (colName for colName, true in zip(all_column_names, whole_bool_columns) if not true)
            lazy_output = lazyframe.select(pl.col(selected_columns))

            return lazy_output if outputType == 'lf' else lazy_output.collect()

In [17]:
print(df_drop.collect(), extra_info(color_text("df_drop")))

print(drop_nulls(df_drop, drop='rows', how='any', outputType='df'), extra_info("drop_nulls(drop='rows', how='any')"))
print(drop_nulls(df_drop, drop='rows', how='all', outputType='df'), extra_info("drop_nulls(drop='rows', how='all')"))

print(df_drop.collect(), extra_info(color_text("df_drop")))

print(drop_nulls(df_drop, subset=['name', 'toy'], drop='rows', how='any', outputType='df'), extra_info("drop_nulls(subset=['name', 'toy'], drop='rows', how='any')"))
print(drop_nulls(df_drop, subset=['name', 'toy'], drop='rows', how='all', outputType='df'), extra_info("drop_nulls(subset=['name', 'toy'], drop='rows', how='all')"))

df_drop = df_drop.with_columns(ALL_NULL = None) # New Column 'ALL_NULL' with all null values.
print(df_drop.collect(), extra_info(color_text("df_drop")))

print(drop_nulls(df_drop, drop='columns', how='any', outputType='df'), extra_info("drop_nulls(drop='columns', how='any')"))
print(drop_nulls(df_drop, drop='columns', how='all', outputType='df'), extra_info("drop_nulls(drop='columns', how='all')"))

print(df_drop.collect(), extra_info(color_text("df_drop")))
print(drop_nulls(df_drop, subset=['toy', 'name'], drop='columns', how='any', outputType='df'), extra_info("drop_nulls(subset=['toy', 'name'], drop='columns', how='any')"))
print(drop_nulls(df_drop, subset=['ALL_NULL', 'toy'], drop='columns', how='all', outputType='df'), extra_info("drop_nulls(subset=['ALL_NULL', 'toy'], drop='columns', how='all')"))

shape: (3, 3)
┌──────────┬───────────┬────────────┐
│ name     ┆ toy       ┆ born       │
│ ---      ┆ ---       ┆ ---        │
│ str      ┆ str       ┆ date       │
╞══════════╪═══════════╪════════════╡
│ Alfred   ┆ null      ┆ null       │
│ Batman   ┆ Batmobile ┆ 1940-04-25 │
│ Catwoman ┆ Bullwhip  ┆ null       │
└──────────┴───────────┴────────────┘  [1;92m-->[0m [1;92mdf_drop[0m
------------------------------------------------------------------------------------------------------------------------

shape: (1, 3)
┌────────┬───────────┬────────────┐
│ name   ┆ toy       ┆ born       │
│ ---    ┆ ---       ┆ ---        │
│ str    ┆ str       ┆ date       │
╞════════╪═══════════╪════════════╡
│ Batman ┆ Batmobile ┆ 1940-04-25 │
└────────┴───────────┴────────────┘  [1;92m-->[0m drop_nulls(drop='rows', how='any')
------------------------------------------------------------------------------------------------------------------------

shape: (3, 3)
┌──────────┬───────────┬──────────

#       drop_duplicates() = unique(subset, keep=`first` or `last`, maintain_order), approx_n_unique().
```js
        1. pandas drop_duplicates(subset, keep) is the same as polars 'unique(subset, keep=`first` or `last`, maintain_order)'.
        2. unique(subset=None, keep='first', maintain_order=False) by default. And by default it returns 'unique rows DataFrame'. It doesnt work on Delete Duplicate 'columns', only on 'rows'.
        3.
        'keep'='first' : Among [1, 1, 3, 2, 1] it keeps the 'first' 1 and delete its next duplicate 1s.
              ='last'  : .................................. 'last'  1 .............. previous duplicate 1s.
        
        'subset'= A Single Column Name OR LIST of columns names based on what we want to delete duplicate rows :
                Lets say among ['name', 'age', 'marks', 'city'] I want to delete those rows whose 'name' and 'marks' are duplicated, doesnt matter if their 'age' and 'city' are also same or not. So drop_duplicates(subset=['name', 'marks'])
        
        'maintain_order'= After getting the unique rows, do you want to maintain the ORIGINAL ORDER ('True') or ANY RANDOM ORDER ('False').

        Note: print(df.select(pl.all().unique())) => won't work because Each Column doesn't have Same Number of Unique Values, so can't ----- form a DatFrame with unmatched number of columns i.e. A column has 4 unique values, another column has 5 unique values, unmatched number of columns can't create a DataFrame.
```

In [18]:
lf = pl.scan_csv("D:\\datasets\\drop_duplicate.csv")
print(lf.collect(), extra_info(color_text("lf LazyFrame")))

print(lf.unique(keep='last').collect(), extra_info("unique(keep='last')"))
print(lf.unique(subset=['name', 'marks'], keep='last').collect(), extra_info("unique(subset=['name', 'marks'], keep='last')"))

# LazyFrame.approx_n_unique() ESTIMATES -UNIQUE VALUES "ON EACH COLUMN"- , -not 'unique rows count'-.
# print(lf.approx_n_unique().collect(), extra_info("lf.approx_n_unique()")) # `LazyFrame.approx_n_unique` is deprecated.
print(lf.select(pl.all().approx_n_unique()).collect(),  extra_info("Unique Number of Values on Each Column"))

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------

shape: (4, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m unique(keep='last')
---------------------------------------------------------------------------------

#                                           drop(`*Column Names`, strict), drop rows
```js
        1. '*Column Names' = 'age', 'name'
                       or  = ['age', 'name'], ultimately it will be unpacked(*).
        2. strict = throw an exception if a column name does not exist(True) OR not(False).
        3. If you want to drop 'rows', use filter() as filter()'s job is to filter/drop rows based on a or multiple conditions.

           i) But dropping a row by 'index'? In that case we need to have a 'index column'.'lazyframe.with_row_index(Name, startIndex)' add a row index as the first column in the LazyFrame.
           ii) Or you can slice(index, length). [:] --> This slice wont work because [:] works with DataFrame or Series because in LazyFrame the table data has not made till we call collect() and without table data we can't use [:]'. For 'LazyFrame its slice(index, length)'. After using slice(..) concatenate() them by 'vertically', cant use 'vstack' for Lazayframes.
```

In [19]:
print(lf.collect(), extra_info(color_text("lf LazyFrame")))
print(lf.drop('age', 'name').collect(), extra_info("drop('age', 'name') columns."))

# Drop the Index 1 and 3. slice(4, length = None) means slice from index 4 to the last column.
# solution 1
lf_temp = pl.concat([lf.slice(0, 1), lf.slice(2, 1), lf.slice(4, length=None)], how='vertical_relaxed')
print(lf_temp.collect(), extra_info("Dropped the index 1 and 3"))

# solution 2
lf1 = lf.with_row_index(name='index')
print(lf1.collect(), extra_info(color_text("lf1 LazyFrame having Index Column at very First")))
print( lf1.filter(~pl.col('index').is_in([1, 3])).collect(), extra_info("Dropped the index 1 and 3") )

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌───────┬────────┐
│ marks ┆ city   │
│ ---   ┆ ---    │
│ i64   ┆ str    │
╞═══════╪════════╡
│ 79    ┆ Dhaka  │
│ 79    ┆ Khulna │
│ 89    ┆ Dhaka  │
│ 79    ┆ Dhaka  │
│ 76    ┆ Ctg    │
└───────┴────────┘  [1;92m-->[0m drop('age', 'name') columns.
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌───────┬───────┬───────┬─────┐
│ name  ┆ marks ┆ city  ┆

#                                   sort(by, descending, nulls_last, ...) LazyFrame
```js
        by = based on what columns, e.g. ['age', 'marks']
        descending = for each column in 'by' do you want that column in descending or not, e.g. `[False, True]`.
        nulls_last = After sorting where the nulls value should be? At the 'last' or not, e.g. 'True' or 'False'
```

In [20]:
print(lf.collect(), extra_info(color_text("lf LazyFrame")))
lf.sort(by=['age', 'marks'], descending=[False, True], nulls_last=True).collect()

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------



name,marks,city,age
str,i64,str,i64
"""Akira""",89,"""Dhaka""",21
"""Maria""",79,"""Dhaka""",23
"""Maria""",79,"""Dhaka""",23
"""Maria""",79,"""Khulna""",25
"""Saria""",76,"""Ctg""",27


#                           pl.when(`condition`).then(`do this`).otherwise(`do that`)
```js
        'condition' = An Expression.
        'do this'   = An Expression or A Scaler Value.
        'do that'   = An Expression or A Scaler Value.
    
        Dont skip -- otherwise(`do that`) -- part even if you just want to do -- pl.when(`condition`).then(`do this`) -- operation. 
```

In [21]:
print(lf.collect(), extra_info(color_text("lf LazyFrame")))

condition: pl.Expr = (pl.col('marks') % 10).is_between(7, 9)
round_it: pl.Expr = ((pl.col('marks') // 10) + 1) * 10 # e.g. turn 57|58|59 to 60.
keep_it_as_it_is = pl.col('marks')

lf.with_columns(pl.when(condition).then(round_it).otherwise(keep_it_as_it_is)).collect()

shape: (5, 4)
┌───────┬───────┬────────┬─────┐
│ name  ┆ marks ┆ city   ┆ age │
│ ---   ┆ ---   ┆ ---    ┆ --- │
│ str   ┆ i64   ┆ str    ┆ i64 │
╞═══════╪═══════╪════════╪═════╡
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Maria ┆ 79    ┆ Khulna ┆ 25  │
│ Akira ┆ 89    ┆ Dhaka  ┆ 21  │
│ Maria ┆ 79    ┆ Dhaka  ┆ 23  │
│ Saria ┆ 76    ┆ Ctg    ┆ 27  │
└───────┴───────┴────────┴─────┘  [1;92m-->[0m [1;92mlf LazyFrame[0m
------------------------------------------------------------------------------------------------------------------------



name,marks,city,age
str,i64,str,i64
"""Maria""",80,"""Dhaka""",23
"""Maria""",80,"""Khulna""",25
"""Akira""",90,"""Dhaka""",21
"""Maria""",80,"""Dhaka""",23
"""Saria""",76,"""Ctg""",27


#                                               group_by

```js
        Look at 'group_by object.png' which is in this 'polars' folder. After doing 'group_by(by=..)', it returns a 'LazyGroupBy' object. Assume 'group1' is the group_by object in that image. Each 'key' in 'group1' has its own 'value(LazyFrame)'.
        Now 'group1.count()' means this 'count()' will be applied on 'each value(LazyFrame)'. So doesnt matter what function you apply on '(group1)' except `map_groups()`, because that (function) will be applied on (each value(LazyFrame)).

        Usage:
        ------
        Why we create group_by object? To perform ANY OPERATION on each group inside that group_by object.
        1) Now when we perform sum(), mean(), first(), last() etc on a LazyFrame, we get 'A Single Value' for each column in that LazyFrame.
        2) But when we perform cum_sum(), is_null() etc on A LazyFrame, we get 'A Column' for each Column in that LazyFrame.
                However if you apply them on a 'LazyGroupBy' object, For Each Group(LazyFrame): You get 'A List of Values, list(A Column)' for each column, so the answer can lies on a 'Single Row' e.g. [1, 2, 3, 4].
           
           After typing 'group10.' you will see only some functions BUT with group10.agg( col(..).choose_any_function() ). And of course we cant do 'select(), with_columns(), filter()' in agg(), but you can sure select specific columns inside agg() like we do with select().

           REMEMBER : 'is_null()', 'cum_sum()' etc inside 'agg()' return 'A LIST OF VALUES'(i.e. 'A SINGLE VALUE') for each COLUMN and this is called 'AGGREGATION' since we are using 'agg()'.

           WARNING : ALWAYS USE .agg() in group_by object to select built-in functions to do AGGREGATION('A SINGLE VALUE(SCALER/LIST egal) FOR EACH COLUMN in EACH GROUP'), why?? 'group1.count()' is not explicitly saying if the count() being applied row or column wise BUT "group1.agg( col('*').count() )" explicitly saying that it is being applied column-wise. But for custom function use map_groups().
                     Be carefull when using agg(..) since agg() AGGREGATES the result into a SINGLE VALUE i.e. {A SINGLE VALUE(Scaler/List) for EACH COLUMN}. So if you want A COLUMN for EACH COLUMN, you need to use 'CUSTOM FUNCTION' i.e. map_groups().
        
        Output:
        -------
        Now after you apply functions on 'group1' you will get a 'LazyFrame' result where the 'First Column = 'keys' of groupby object 'group1' and it is in random order.
        
        Note:
        -----
        LazyFrame.map_groups(lambda column: ....) => LazyFrame is nothing but 'bunch of columns'. When we map_groups() on LazyFrame, 'first we grab a column' and 'then traverse through each value of that column manually OR can apply vectorize operation (column.is_null()..) on that column'. Similarly for rest columns.

        LazyFrameGroupByObject.map_groups(lambda group: .... ) => group_by object consists of many 'group(Talking about each LazyFrame, not the keys)'. When we map_groups() on a group_by_object, 'first we grab teh first group(LazyFrame)', 'then we can apply vectorize operation on the whole group(Lazyframe) like Lazyframe.count()' OR 'we can traverse each column MANUALLY like we do on LazyFrame.map_groups(lambda column: ....)'. Similarly for rest groups.
```

#                                                        map_groups() on group_by object.
```js
        How map_groups() works :

            1) 'splits' each group(LazyFrame) into a Accessible LazyFrame. Thats why in map_groups(lambda LazyFrame:...), we can use the LazyFrame keyword as a Real LazyFrame.
            2) Then do the 'operation(function)' we set into map_groups(..) and get 'output(Scaler value/LazyFrame(A Single Column)/LazyFrame(Multiple Columns))' for EACH group(LazyFrame).
            3) 'Merge' EACH 'output' and return them as A LazyFrame.
      
         Note: At Step 2, we 'must return the output' we want for each group(LazyFrame). Otherwise what will we MERGE at the end? YOUR 
               MUM? NO!
               Dont think about the other groups(LazyFrames), 'focus on the first group' that 'WHAT you want to RETURN' after doing the calculation on that group(LazyFrame). If its a Single Boolean Column, other groups will also return a Single Boolean Column from each, eventually all SEPERATED OUTPUT will be merged into a SINGLE OUTPUT. So if we understand what the first LazyFrame will return, we can guess what the final merged output will be. Thats why focus on the first group(LazyFrame).

                  For e.g. we want to return a LazyFrame adding a new column. (speaking for the first group(LazyFrame))

                  def add_new_column(lazyframe) -> pl.LazyFrame:
                     new_lazyframe_after_adding_a_column = lazyframe.with_columns(col('a').rank(descending=True).alias('ranking on a'))
                     return new_lazyframe_after_adding_a_column

                  result = groupby_object.map_groups(func= add_new_column, schema=None)
                                          --> map_groups(..) will also be applied on rest groups(LazyFrames), ultimately will return a big big LazyFrame MERGING THOSE NEW LazyFrames.
                  
               So again : 'focus on the first group' that 'WHAT you want to return from this group(LazyFrame)'.
                          Ohh! All the LazyFrame, Column we can access inside map_groups(..), they are all copy! Mess with them, change them, no issue.

               Even during 'agg(..)' 'focus on the first group' that 'WHAT you want to return from this group(LazyFrame)'..
```

In [22]:
names = ['Maria', 'Saria', 'Akira', 'Masha', 'Aliya', 'Alya', 'Mukail', 'Hova']
data = {                          
    'a': [5, 8, 7, 1, 6, 2, 7, 4],
    'b': [2, 1, 2, 3, 1, 3, 2, 1],
    'c': [8, 9, 0, 8, 5, 8, 3, 7],
    'd': np.random.choice(a=names, size=(8,), replace=False)}

lf2 = pl.LazyFrame(data)
group10 = lf2.group_by('b')
print(group10, extra_info("group_by object 'group10'"))

group10_appearance = lf2.sort(by='b').with_columns(b = pl.when( ~col('b').is_first_distinct() ).then(pl.lit('')).otherwise(col('b'))) .select(col('b', 'a'), col('*').exclude('b', 'a'))

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))
print(group10.agg(col('a'), col('c'), col('d')).collect(), extra_info("group10.agg(col('a'), col('c'), col('d'))"))
# Above : col('a') inside agg(..) working as an AGGREGATION, means all the values inside column 'a' will be returned as a SINGLE VALUE(here, as a List). Same for col('c') and col('d').

print(lf2.collect().partition_by(by='b'), extra_info("lf2.collect().partition_by(by='b') = Divide the DataFrame by col('b') into 'list of dataframes'"))

<polars.lazyframe.group_by.LazyGroupBy object at 0x00000282515E2F00>  [1;92m-->[0m group_by object 'group10'
------------------------------------------------------------------------------------------------------------------------

shape: (8, 4)
┌─────┬─────┬─────┬────────┐
│ b   ┆ a   ┆ c   ┆ d      │
│ --- ┆ --- ┆ --- ┆ ---    │
│ str ┆ i64 ┆ i64 ┆ str    │
╞═════╪═════╪═════╪════════╡
│ 1   ┆ 8   ┆ 9   ┆ Maria  │
│     ┆ 6   ┆ 5   ┆ Akira  │
│     ┆ 4   ┆ 7   ┆ Alya   │
│ 2   ┆ 5   ┆ 8   ┆ Aliya  │
│     ┆ 7   ┆ 0   ┆ Saria  │
│     ┆ 7   ┆ 3   ┆ Masha  │
│ 3   ┆ 1   ┆ 8   ┆ Mukail │
│     ┆ 2   ┆ 8   ┆ Hova   │
└─────┴─────┴─────┴────────┘  [1;92m-->[0m [1;92mgroup10[0m_appearance
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌─────┬───────────┬───────────┬─────────────────────────────┐
│ b   ┆ a         ┆ c         ┆ d                           │
│ --- ┆ ---       ┆ ---       ┆ ---      

In [23]:
print(color_text("                Whatever you do, the FIRST COLUMN will by the group_by object's key which is col('b')"), '\n')

# We can't access the keys of this group_by object 'group10'. Alternative is unique().
print( lf2.select(col('b').unique()).collect() , extra_info("lf2.select(col('b').unique()) = unique keys of 'group10'"))

print(group10.sum().collect(), extra_info("group10.sum()"))
print(group10.agg(col('*').sum()).collect(), extra_info("group10.agg(col('*').sum()"))

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))
print(group10.agg(col('a', 'c').cum_sum()).collect(), extra_info("group10.agg(col('a', 'c').cum_sum()"))
print(group10.agg(col('a', 'c').is_null()).collect(), extra_info("group10.agg(col('a', 'c').is_null()"))

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))
print(group10.len().collect(), extra_info("group10.len() = (key = The frequency of that key in col('b'))"))

print(group10.agg(col('a', 'd').first()).collect(), extra_info("group10.agg(col('a', 'd').first())"))

[1;92m                Whatever you do, the FIRST COLUMN will by the group_by object's key which is col('b')[0m 

shape: (3, 1)
┌─────┐
│ b   │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
└─────┘  [1;92m-->[0m lf2.select(col('b').unique()) = unique keys of 'group10'
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌─────┬─────┬─────┬──────┐
│ b   ┆ a   ┆ c   ┆ d    │
│ --- ┆ --- ┆ --- ┆ ---  │
│ i64 ┆ i64 ┆ i64 ┆ str  │
╞═════╪═════╪═════╪══════╡
│ 1   ┆ 18  ┆ 21  ┆ null │
│ 3   ┆ 3   ┆ 16  ┆ null │
│ 2   ┆ 19  ┆ 11  ┆ null │
└─────┴─────┴─────┴──────┘  [1;92m-->[0m group10.sum()
------------------------------------------------------------------------------------------------------------------------

shape: (3, 4)
┌─────┬─────┬─────┬──────┐
│ b   ┆ a   ┆ c   ┆ d    │
│ --- ┆ --- ┆ --- ┆ ---  │
│ i64 ┆ i64 ┆ i64 ┆ str  │
╞═════╪═════╪═════╪══════╡
│ 1   ┆ 18  ┆ 21  ┆ null │
│ 3   ┆ 3   ┆ 16  ┆

In [24]:
# Q : Find all the names starts with 'M' in group10 object. (In other words: Filter the column 'd' with given condition for each group)

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))

print(group10.agg( col('d').str.starts_with('M') ).collect()) # agg() useless here to find the names explictely. Reason below:
print(extra_info(f"group10.agg( col('d').str.starts_with('M') ) => starts_with() returns True/False. Since its used as AGGREGATE FUNCTION, we get list of True/False.\n{color_text("So be carefull when using agg(..) since agg() AGGREGATES the result.")}"))

# Ans = For Each Group(LazyFrame) FILTER the column 'd' which we can't do with agg() but filter(). For these we need custom fuction.
# Below Line : select('b', 'd') because if I do select('d'), only column 'd' will be in the output.
names_starts_with_M: Callable[[pl.LazyFrame], pl.LazyFrame] = lambda lazyframe: lazyframe.select('b', 'd').filter(col('d').str.starts_with('M'))
print(group10.map_groups(function=names_starts_with_M, schema=None).collect(), extra_info("group10.map_groups(function=names_starts_with_M, schema=None)"))
# Above : schema = Output Schema MANUALLY. schema=None means polars will decide what will be the OUTPUT DATATYPE for each column.

shape: (8, 4)
┌─────┬─────┬─────┬────────┐
│ b   ┆ a   ┆ c   ┆ d      │
│ --- ┆ --- ┆ --- ┆ ---    │
│ str ┆ i64 ┆ i64 ┆ str    │
╞═════╪═════╪═════╪════════╡
│ 1   ┆ 8   ┆ 9   ┆ Maria  │
│     ┆ 6   ┆ 5   ┆ Akira  │
│     ┆ 4   ┆ 7   ┆ Alya   │
│ 2   ┆ 5   ┆ 8   ┆ Aliya  │
│     ┆ 7   ┆ 0   ┆ Saria  │
│     ┆ 7   ┆ 3   ┆ Masha  │
│ 3   ┆ 1   ┆ 8   ┆ Mukail │
│     ┆ 2   ┆ 8   ┆ Hova   │
└─────┴─────┴─────┴────────┘  [1;92m-->[0m [1;92mgroup10[0m_appearance
------------------------------------------------------------------------------------------------------------------------

shape: (3, 2)
┌─────┬──────────────────────┐
│ b   ┆ d                    │
│ --- ┆ ---                  │
│ i64 ┆ list[bool]           │
╞═════╪══════════════════════╡
│ 2   ┆ [false, false, true] │
│ 3   ┆ [true, false]        │
│ 1   ┆ [true, false, false] │
└─────┴──────────────────────┘
 [1;92m-->[0m group10.agg( col('d').str.starts_with('M') ) => starts_with() returns True/False. Since its used as AGG

In [25]:
# Q: For each group(in group1) find the Rank based on column 'a' and create a new column('ranking on a') to set the ranking output.

print(group10_appearance.collect(), extra_info(f"{color_text("group10")}_appearance"))

def set_ranking_on_a(lazyframe: pl.LazyFrame) -> pl.LazyFrame: # this lazyframe is a COPY.
    return lazyframe.with_columns(col('a').rank(descending=True).alias('ranking on a'))

print(group10.map_groups(function=set_ranking_on_a, schema=None).collect())

print('''
The 'b' column is also shown in the output because (in my opinion) polars RANDOMLY select a group to do given calculation and
RANDOMLY show them in the output WHICH IS FASTER AND EFFICIENT CALCULATION since no maintaining order is needed. So to help to
recognize which values belongs to which group, polars show the column 'b' in the output.
''')

shape: (8, 4)
┌─────┬─────┬─────┬────────┐
│ b   ┆ a   ┆ c   ┆ d      │
│ --- ┆ --- ┆ --- ┆ ---    │
│ str ┆ i64 ┆ i64 ┆ str    │
╞═════╪═════╪═════╪════════╡
│ 1   ┆ 8   ┆ 9   ┆ Maria  │
│     ┆ 6   ┆ 5   ┆ Akira  │
│     ┆ 4   ┆ 7   ┆ Alya   │
│ 2   ┆ 5   ┆ 8   ┆ Aliya  │
│     ┆ 7   ┆ 0   ┆ Saria  │
│     ┆ 7   ┆ 3   ┆ Masha  │
│ 3   ┆ 1   ┆ 8   ┆ Mukail │
│     ┆ 2   ┆ 8   ┆ Hova   │
└─────┴─────┴─────┴────────┘  [1;92m-->[0m [1;92mgroup10[0m_appearance
------------------------------------------------------------------------------------------------------------------------

shape: (8, 5)
┌─────┬─────┬─────┬────────┬──────────────┐
│ a   ┆ b   ┆ c   ┆ d      ┆ ranking on a │
│ --- ┆ --- ┆ --- ┆ ---    ┆ ---          │
│ i64 ┆ i64 ┆ i64 ┆ str    ┆ f64          │
╞═════╪═════╪═════╪════════╪══════════════╡
│ 5   ┆ 2   ┆ 8   ┆ Aliya  ┆ 3.0          │
│ 7   ┆ 2   ┆ 0   ┆ Saria  ┆ 1.5          │
│ 7   ┆ 2   ┆ 3   ┆ Masha  ┆ 1.5          │
│ 1   ┆ 3   ┆ 8   ┆ Mukail ┆ 2.0          │
│ 

```js
                                Speed comparison between pandas apply() and polars map_groups().
```

In [26]:
# names = ['Maria', 'Saria', 'Akira', 'Masha', 'Aliya', 'Alya', 'Mukail', 'Hova']
# a = list(range(10))
# data = {                          
#     'a': np.random.choice(a= a, size=(1_00_00000,), replace=True),
#     'b': np.random.choice(a= a, size=(1_00_00000,), replace=True),
#     'c': np.random.choice(a= a, size=(1_00_00000,), replace=True),
#     'd': np.random.choice(a=names, size=(1_00_00000,), replace=True) }

# df0 = pl.LazyFrame(data)
# group0 = df0.group_by('b')

# df1 = pl.DataFrame(data)
# group1 = df1.group_by('b')

# df2 = pd.DataFrame(data)
# group2 = df2.groupby('b')

In [27]:
# start = time()
# find_names_M0: Callable[[pl.LazyFrame], pl.LazyFrame] = lambda lazyframe: lazyframe.filter(col('d').str.starts_with('M')).select(col('b', 'd'))
# r0 = group0.map_groups(function= find_names_M0, schema=None).sort(by='b').collect()
# print(time() - start)

# start = time()
# find_names_M1: Callable[[pl.DataFrame], pl.DataFrame] = lambda dataframe: dataframe.filter(col('d').str.starts_with('M')).select(col('b', 'd'))
# r1 = group1.map_groups(function= find_names_M1).sort(by='b')
# print(time() - start)

# start = time()
# find_names_M2: Callable[[pd.DataFrame], pd.DataFrame] = lambda dataframe: dataframe['d'] [dataframe['d'].str.startswith('M')]
# r2 =  group2.apply(func= find_names_M2, include_groups=False)
# print(time() - start)

# Ans :
print('''
    0.7093157768249512  => LazyFrame
    0.7516193389892578  => DataFrame (polars)
    6.648420572280884   => DataFrame (pandas)
''')


    0.7093157768249512  => LazyFrame
    0.7516193389892578  => DataFrame (polars)
    6.648420572280884   => DataFrame (pandas)



In [28]:
# print(r1, extra_info("polars dataframe"))
# print(r2.droplevel(1), extra_info("pandas"))

print('''
shape: (3_751_334, 2)
┌─────┬────────┐
│ b   ┆ d      │
│ --- ┆ ---    │
│ i64 ┆ str    │
╞═════╪════════╡
│ 0   ┆ Maria  │
│ 0   ┆ Mukail │
│ 0   ┆ Mukail │
│ 0   ┆ Masha  │
│ 0   ┆ Masha  │
│ …   ┆ …      │
│ 9   ┆ Masha  │
│ 9   ┆ Mukail │
│ 9   ┆ Mukail │
│ 9   ┆ Maria  │
│ 9   ┆ Masha  │
└─────┴────────┘  --> polars dataframe
------------------------------------------------------------------------------------------------------------------------

b
0     Maria
0    Mukail
0    Mukail
0     Masha
0     Masha
      ...  
9     Masha
9    Mukail
9    Mukail
9     Maria
9     Masha
Name: d, Length: 3751334, dtype: object  --> pandas
------------------------------------------------------------------------------------------------------------------------


''')


shape: (3_751_334, 2)
┌─────┬────────┐
│ b   ┆ d      │
│ --- ┆ ---    │
│ i64 ┆ str    │
╞═════╪════════╡
│ 0   ┆ Maria  │
│ 0   ┆ Mukail │
│ 0   ┆ Mukail │
│ 0   ┆ Masha  │
│ 0   ┆ Masha  │
│ …   ┆ …      │
│ 9   ┆ Masha  │
│ 9   ┆ Mukail │
│ 9   ┆ Mukail │
│ 9   ┆ Maria  │
│ 9   ┆ Masha  │
└─────┴────────┘  --> polars dataframe
------------------------------------------------------------------------------------------------------------------------

b
0     Maria
0    Mukail
0    Mukail
0     Masha
0     Masha
      ...  
9     Masha
9    Mukail
9    Mukail
9     Maria
9     Masha
Name: d, Length: 3751334, dtype: object  --> pandas
------------------------------------------------------------------------------------------------------------------------





#                                                       concat
```js
- Syntax : concat('items= list of LazyFrames you want to concatenate', how) // 'how' working as 'axis'.
                                // For teaching purpose we talked below with 2 LazyFrames.
- See 'vertical stacking.png' and 'horizontal stacking.png' in this same polars folder.

- how = 'vertical_relaxed': (DATATYPE CAN BE DIFFERENT OF THE SAME COLUMNS IN BOTH LAZYFRAME)
        If both LazyFrames "doesn't have" SAME ORDER OF COLUMN NAMES (['Name', 'Age', 'ID'], ['Age', 'Name', 'ID']) OR SAME NUMBER OF COLUMM NAMES (['Name', 'Age', 'ID'], ['Name', 'Age']), then 'vertical_relaxed' won't work!

        Solution : how = 'diagonal_relaxed'. BUT if both column has SAME ORDER OF COLUMN NAMES and SAME NUMBER OF COLUMM NAMES (datatypes can be different egal), then use 'how = vertical_relaxed'.

- how = 'horizontal':
        If ANY COLUMN of both LazyFrames has the SAME NAME, polars will throw error! Polars automatically doesnt add '_x, _y' suffixes for the duplicate columns.
```

In [29]:
lf3 = pl.scan_csv(r"d:\datasets\df8.csv")
lf4 = pl.scan_csv(r"d:\datasets\df9.csv")
lf3 = lf3.with_columns(col('ID').cast(pl.Float64), col('Age').cast(pl.UInt8))

print(lf3.collect(), extra_info(color_text("lf3")))
print(lf4.collect(), extra_info(color_text("lf4")))
print(pl.concat([lf3, lf4], how='diagonal_relaxed').collect(), extra_info("pl.concat([lf3, lf4], how='diagonal_relaxed')"))
print(pl.concat([lf4, lf3], how='diagonal_relaxed').collect(), extra_info("pl.concat([lf4, lf3], how='diagonal_relaxed')"))

lf4 = lf4.rename({'Age':'Age_', 'ID':'ID_'})
print(lf3.collect(), extra_info(color_text("lf3")))
print(lf4.collect(), extra_info(color_text("lf4")))

print(pl.concat([lf3, lf4], how='horizontal').collect(), extra_info("pl.concat([lf3, lf4], how='horizontal')"))
print(pl.concat([lf4, lf3], how='horizontal').collect(), extra_info("pl.concat([lf4, lf3], how='horizontal')"))

shape: (8, 3)
┌─────┬───────┬─────┐
│ ID  ┆ Name  ┆ Age │
│ --- ┆ ---   ┆ --- │
│ f64 ┆ str   ┆ u8  │
╞═════╪═══════╪═════╡
│ 1.0 ┆ Aaria ┆ 10  │
│ 2.0 ┆ Baria ┆ 11  │
│ 3.0 ┆ Caria ┆ 12  │
│ 4.0 ┆ Daria ┆ 13  │
│ 5.0 ┆ Earia ┆ 14  │
│ 6.0 ┆ Faria ┆ 15  │
│ 7.0 ┆ Garia ┆ 16  │
│ 8.0 ┆ Haria ┆ 17  │
└─────┴───────┴─────┘  [1;92m-->[0m [1;92mlf3[0m
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌─────┬─────┐
│ Age ┆ ID  │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 20  ┆ 5   │
│ 21  ┆ 8   │
│ 22  ┆ 2   │
│ 23  ┆ 3   │
│ 24  ┆ 4   │
└─────┴─────┘  [1;92m-->[0m [1;92mlf4[0m
------------------------------------------------------------------------------------------------------------------------

shape: (13, 3)
┌─────┬───────┬─────┐
│ ID  ┆ Name  ┆ Age │
│ --- ┆ ---   ┆ --- │
│ f64 ┆ str   ┆ i64 │
╞═════╪═══════╪═════╡
│ 1.0 ┆ Aaria ┆ 10  │
│ 2.0 ┆ Baria ┆ 11  │
│ 3.0 ┆ Caria ┆ 12  │
│ 4.0 ┆ Daria

#                                                           join()
```js
        - Syntax : 'left_lazyframe'.join(other= 'right_lazyframe', on, how= 'inner', left_on, right_on, join_nulls= False)
        - Watch [it](https://www.youtube.com/live/Ssy1EfK5S-o?si=46_frNd_UpzYopVo&t=2099) from [35:00].
        - Now see 'inner join.png' and 'left join.png' in this same 'polars' folder.
            - 'outer' join is very rarely or doesnt used and 'right' join is ugly and confusing which can be done with left join. Thats why only 'inner' and 'left' join are drawn.

        - e.g. lf6.merge(other=lf7, on='id', ...) => Now the values of lf7['id'] can be IN ANY ORDER i.e. the VALUES ORDER of both columns('id') in both lazyframe don't need to be matched which we saw in the images.

        left_on, right_on : 
            - What if The Column Name is not same? Then what is the 'on='? In that case 'left_on = Column Name from the Left(1st) LazyFrame', 'right_on = Column Name from the Right(2nd) LazyFrame'.
        
        how = 'right':
            - lf6.merge(other=lf7, on='ID', how='right') => So in that case my 2nd i.e. 'right' lazyframe working as the 'first lazyframe' and 'left' lazyframe working as the 'second lazyframe', right? Pretty confusing as we have to always look at the 'right' dataframe first. Why not just do :

                lf6.merge(other=lf7, on='ID', how='left')? Because we always look at the 'left' at FIRST, then 'right'! SO WE WILL NEVER DO 'how=right'!
        
        duplicate_values_on_left_lazyframe :
            - e.g. lf6.merge(other=lf7, on='id', how='inner') => Assume lf6['id'] has DUPLICATES and lf7['id'] doesnt. Even for duplicate values in lf6['id'], we will get the corresponding values (even if its DUPLICATE) from right_lazyframe, lf7.
```

In [30]:
#                                      Straight go to the OUTPUT. Don't need to see the code.
lf6 = pl.scan_csv(r"d:\datasets\containing 2 dfs.csv") .select(col('ID_X', 'Course_X')) .rename({'ID_X' : 'ID', 'Course_X' : 'Course'})
lf7 = pl.scan_csv(r"d:\datasets\containing 2 dfs.csv") .select(col('ID_Y', 'Section', 'Course_Y')) .rename({'ID_Y' : 'ID', 'Course_Y' : 'Course'})

print(lf6.collect(), extra_info(color_text("lf6")))
print(lf7.collect(), extra_info(color_text("lf7")))
print(lf6.join(other=lf7, on='ID', how='inner').collect(), extra_info("lf6.join(other=lf7, on='ID', how='inner')"))

print(lf6.collect(), extra_info(color_text("lf6")))
print(lf7.collect(), extra_info(color_text("lf7")))
print(lf6.join(other=lf7, on='ID', how='left').collect(), extra_info("lf6.join(other=lf7, on='ID', how='left')"))

print(lf6.collect(), extra_info(color_text("lf6")))
print(lf7.collect(), extra_info(color_text("lf7")))
print(lf6.join(other=lf7, on='ID', how='right').collect(), extra_info(f"lf6.join(other=lf7, on='ID', how='right'). It's {color_text("UGLY")}!"))
print(f"lf6.join(other=lf7, on='ID', how='right') {color_text('==')} lf7.join(other=lf6, on='ID', how='left). So NEVER USE 'right'!")
print(lf7.join(other=lf6, on='ID', how='left').collect(), extra_info(f"lf7.join(other=lf6, on='ID', how='left'). It's {color_text('Beautiful')}!"))

print(lf6.collect(), extra_info(color_text("lf6")))
print(lf7.collect(), extra_info(color_text("lf7")))
print(lf6.join(other=lf7, on='ID', how='full').collect(), extra_info("lf6.join(other=lf7, on='ID', how='full') == 'Left + Right' Join"))

print(lf6.collect(), extra_info(color_text("lf6")))
print(lf7.collect(), extra_info(color_text("lf7")))
print(f"{color_text("'anti' is like SET DIFFERENCE")}, A-B, e.g. {'{1, 2, 3}'} - {'{2, 3, 4}'} = {'{1}'}. This {'{1}'} = Set A, means {color_text("in 'anti' join only LEFT LazyFrame(lf6)'s value exists!")}")
print(lf6.join(other=lf7, on='ID', how='anti').collect(), extra_info("lf6.join(other=lf7, on='ID', how='anti')"))

print(lf6.collect(), extra_info(color_text("lf6")))
print(lf7.collect(), extra_info(color_text("lf7")))
print(f"{color_text("'semi' is the OPPOSITE OF 'anti'")}. {'{1, 2, 3}'} anti {'{2, 3, 4}'} = {'{2, 3}'}. Like 'anti', here only the {color_text('LEFT LAZYFRAME is DISPLAYED')}!")
print(lf6.join(other=lf7, on='ID', how='semi').collect(), extra_info("lf6.join(other=lf7, on='ID', how='semi')"))

shape: (7, 2)
┌─────┬────────┐
│ ID  ┆ Course │
│ --- ┆ ---    │
│ i64 ┆ str    │
╞═════╪════════╡
│ 1   ┆ A      │
│ 2   ┆ B      │
│ 3   ┆ C      │
│ 4   ┆ D      │
│ 5   ┆ null   │
│ 6   ┆ F      │
│ 7   ┆ null   │
└─────┴────────┘  [1;92m-->[0m [1;92mlf6[0m
------------------------------------------------------------------------------------------------------------------------

shape: (7, 3)
┌─────┬─────────┬────────┐
│ ID  ┆ Section ┆ Course │
│ --- ┆ ---     ┆ ---    │
│ i64 ┆ str     ┆ str    │
╞═════╪═════════╪════════╡
│ 13  ┆ H       ┆ O      │
│ 10  ┆ I       ┆ P      │
│ 1   ┆ J       ┆ Q      │
│ 7   ┆ K       ┆ null   │
│ 11  ┆ L       ┆ R      │
│ 3   ┆ null    ┆ null   │
│ 5   ┆ null    ┆ S      │
└─────┴─────────┴────────┘  [1;92m-->[0m [1;92mlf7[0m
------------------------------------------------------------------------------------------------------------------------

shape: (4, 4)
┌─────┬────────┬─────────┬──────────────┐
│ ID  ┆ Course ┆ Section ┆ Course_righ

In [31]:
nov_reg = pl.scan_csv(r"d:\datasets\CampusX\reg-november.csv")
dec_reg = pl.scan_csv(r"d:\datasets\CampusX\reg-december.csv")
courses = pl.scan_csv(r"d:\datasets\CampusX\courses.csv")

print(nov_reg.head(5).collect(), extra_info(color_text("nov_reg") + ".head(5)")) # 'student_id' may contain duplicate values.
print(dec_reg.head(5).collect(), extra_info(color_text("dec_reg") + ".head(5)")) # ----------------- Same -------------------
print(courses.head(5).collect(), extra_info(color_text("courses") + ".head(5)")) # 'course_id' doesn't contain any duplicate values.
# The Question is at below blocks.

shape: (5, 2)
┌────────────┬───────────┐
│ student_id ┆ course_id │
│ ---        ┆ ---       │
│ i64        ┆ i64       │
╞════════════╪═══════════╡
│ 23         ┆ 1         │
│ 15         ┆ 5         │
│ 18         ┆ 6         │
│ 23         ┆ 4         │
│ 16         ┆ 9         │
└────────────┴───────────┘  [1;92m-->[0m [1;92mnov_reg[0m.head(5)
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌────────────┬───────────┐
│ student_id ┆ course_id │
│ ---        ┆ ---       │
│ i64        ┆ i64       │
╞════════════╪═══════════╡
│ 3          ┆ 5         │
│ 16         ┆ 7         │
│ 12         ┆ 10        │
│ 12         ┆ 1         │
│ 14         ┆ 9         │
└────────────┴───────────┘  [1;92m-->[0m [1;92mdec_reg[0m.head(5)
------------------------------------------------------------------------------------------------------------------------

shape: (5, 3)
┌───────────┬──────────────────┬──

In [32]:
# (Assume we are selling courses)   Question 1 : How much income we did by selling our courses in both nov and dec month.
#                           -------------------------------   --------------------------------
all_course_id = pl.concat( [nov_reg.select(col('course_id')), dec_reg.select(col('course_id'))], how='vertical_relaxed')
all_course_id_price = all_course_id.join( courses.select(col('course_id', 'price')), on='course_id', how='inner')

all_course_id_price.select(col('price').sum()).collect()

price
i64
154247


In [33]:
#                Question 2 : How much income we did by selling our courses in both nov and dec month SEPARATELY.


price_nov = ( nov_reg.select(col('course_id'))
              .join(courses.select(col('course_id', 'price')), on='course_id', how='inner') .select(col('price').sum()) )

price_dec = ( dec_reg.select(col('course_id'))
              .join(courses.select(col('course_id', 'price')), on='course_id', how='inner') .select(col('price').sum()) )

print(price_nov.collect(), extra_info("Total Price for November"))
print(price_dec.collect(), extra_info("Total Price for December"))

shape: (1, 1)
┌───────┐
│ price │
│ ---   │
│ i64   │
╞═══════╡
│ 89175 │
└───────┘  [1;92m-->[0m Total Price for November
------------------------------------------------------------------------------------------------------------------------

shape: (1, 1)
┌───────┐
│ price │
│ ---   │
│ i64   │
╞═══════╡
│ 65072 │
└───────┘  [1;92m-->[0m Total Price for December
------------------------------------------------------------------------------------------------------------------------



In [34]:
students = pl.scan_csv(r"d:\Datasets\CampusX\students.csv")
print(students.collect(), extra_info(color_text("students")))
print(nov_reg.head(5).collect(), extra_info(color_text("nov_reg") + ".head(5)")) # 'student_id' may contain duplicate values.
print(dec_reg.head(5).collect(), extra_info(color_text("dec_reg") + ".head(5)")) # ----------------- Same -------------------
print(courses.head(5).collect(), extra_info(color_text("courses") + ".head(5)")) # 'course_id' doesn't contain any duplicate values.
# The Q is in the below block.

shape: (25, 3)
┌────────────┬─────────────────┬─────────┐
│ student_id ┆ name            ┆ partner │
│ ---        ┆ ---             ┆ ---     │
│ i64        ┆ str             ┆ i64     │
╞════════════╪═════════════════╪═════════╡
│ 1          ┆ Kailash Harjo   ┆ 23      │
│ 2          ┆ Esha Butala     ┆ 1       │
│ 3          ┆ Parveen Bhalla  ┆ 3       │
│ 4          ┆ Marlo Dugal     ┆ 14      │
│ 5          ┆ Kusum Bahri     ┆ 6       │
│ …          ┆ …               ┆ …       │
│ 21         ┆ Seema Kota      ┆ 15      │
│ 22         ┆ Yash Sethi      ┆ 21      │
│ 23         ┆ Chhavi Lachman  ┆ 18      │
│ 24         ┆ Radhika Suri    ┆ 17      │
│ 25         ┆ Shashank D’Alia ┆ 2       │
└────────────┴─────────────────┴─────────┘  [1;92m-->[0m [1;92mstudents[0m
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌────────────┬───────────┐
│ student_id ┆ course_id │
│ ---        ┆ ---       │
│

In [35]:
#               Question 3 : Find those students info who enrolled in our courses in BOTH NOVEMBER & DECEMBER.
start = time()

all_reg_students_id = np.intersect1d( nov_reg.select(col('student_id')).collect(), dec_reg.select(col('student_id')).collect() )
ans1 = pl.LazyFrame({'student_id' : all_reg_students_id}) .join(students, on='student_id', how='left') # join() is expensive!
print(ans1.collect(), extra_info(f"sub_ans1 = {time() - start}"))

start = time()

all_reg_students_id = np.intersect1d( nov_reg.select(col('student_id')).collect(), dec_reg.select(col('student_id')).collect() )
ans2 = students.filter( col('student_id').is_in(all_reg_students_id) ) # Faster because is_in() is vectorize operation.
print(ans2.collect(), extra_info(f"sub_ans2 = {time() - start}"))

shape: (9, 3)
┌────────────┬────────────────────┬─────────┐
│ student_id ┆ name               ┆ partner │
│ ---        ┆ ---                ┆ ---     │
│ i64        ┆ str                ┆ i64     │
╞════════════╪════════════════════╪═════════╡
│ 1          ┆ Kailash Harjo      ┆ 23      │
│ 3          ┆ Parveen Bhalla     ┆ 3       │
│ 7          ┆ Tarun Thaker       ┆ 9       │
│ 11         ┆ David Mukhopadhyay ┆ 20      │
│ 16         ┆ Elias Dodiya       ┆ 25      │
│ 17         ┆ Yasmin Palan       ┆ 7       │
│ 18         ┆ Fardeen Mahabir    ┆ 13      │
│ 22         ┆ Yash Sethi         ┆ 21      │
│ 23         ┆ Chhavi Lachman     ┆ 18      │
└────────────┴────────────────────┴─────────┘  [1;92m-->[0m sub_ans1 = 0.008000850677490234
------------------------------------------------------------------------------------------------------------------------

shape: (9, 3)
┌────────────┬────────────────────┬─────────┐
│ student_id ┆ name               ┆ partner │
│ ---        ┆ ---  

In [36]:
#                               Question 4 :  From 'students' table Find Partner Name for Each Student.
_ = '''             students                                                       students
    student_id        name       "partner"                         "student_id"      name        partner
        1       Kailash Harjo       23                                  1       Kailash Harjo       23
        2         Esha Butala        1                                  2         Esha Butala        1
        3      Parveen Bhalla        3                                  3      Parveen Bhalla        3
   .......................................                        .......................................
        23     Chhavi Lachman       18                                  23     Chhavi Lachman       18
'''
# 'partner's ID is the student_id itself. See 'inner join.png' in this 'polars' folder if can't think the answer. 

print( students.join(other=students.select(col('student_id', 'name')),
                     left_on='partner', right_on='student_id', how='left')
               .select(col('name', 'name_right'))
               .collect() )

shape: (25, 2)
┌─────────────────┬────────────────────┐
│ name            ┆ name_right         │
│ ---             ┆ ---                │
│ str             ┆ str                │
╞═════════════════╪════════════════════╡
│ Kailash Harjo   ┆ Chhavi Lachman     │
│ Esha Butala     ┆ Kailash Harjo      │
│ Parveen Bhalla  ┆ Parveen Bhalla     │
│ Marlo Dugal     ┆ Pranab Natarajan   │
│ Kusum Bahri     ┆ Lakshmi Contractor │
│ …               ┆ …                  │
│ Seema Kota      ┆ Preet Sha          │
│ Yash Sethi      ┆ Seema Kota         │
│ Chhavi Lachman  ┆ Fardeen Mahabir    │
│ Radhika Suri    ┆ Yasmin Palan       │
│ Shashank D’Alia ┆ Esha Butala        │
└─────────────────┴────────────────────┘


In [37]:
print(students.head(5).collect(), extra_info(color_text("students") + ".head(5)"))
print(nov_reg.head(5).collect() , extra_info(color_text("nov_reg") + ".head(5)")) # 'student_id' may contain duplicate values.
print(dec_reg.head(5).collect() , extra_info(color_text("dec_reg") + ".head(5)")) # ----------------- Same -------------------
print(courses.head(5).collect() , extra_info(color_text("courses") + ".head(5)")) # 'course_id' doesn't contain any duplicate values.
# The Q in the below block.

shape: (5, 3)
┌────────────┬────────────────┬─────────┐
│ student_id ┆ name           ┆ partner │
│ ---        ┆ ---            ┆ ---     │
│ i64        ┆ str            ┆ i64     │
╞════════════╪════════════════╪═════════╡
│ 1          ┆ Kailash Harjo  ┆ 23      │
│ 2          ┆ Esha Butala    ┆ 1       │
│ 3          ┆ Parveen Bhalla ┆ 3       │
│ 4          ┆ Marlo Dugal    ┆ 14      │
│ 5          ┆ Kusum Bahri    ┆ 6       │
└────────────┴────────────────┴─────────┘  [1;92m-->[0m [1;92mstudents[0m.head(5)
------------------------------------------------------------------------------------------------------------------------

shape: (5, 2)
┌────────────┬───────────┐
│ student_id ┆ course_id │
│ ---        ┆ ---       │
│ i64        ┆ i64       │
╞════════════╪═══════════╡
│ 23         ┆ 1         │
│ 15         ┆ 5         │
│ 18         ┆ 6         │
│ 23         ┆ 4         │
│ 16         ┆ 9         │
└────────────┴───────────┘  [1;92m-->[0m [1;92mnov_reg[0m.head(5)
----

In [38]:
#                           Question 5 : Find TOP 3 Students who has the MOST ENROLLMENT NUMBERS.
#                             ---------------------------------  ---------------------------------
all_students_ID = pl.concat([ nov_reg.select(col('student_id')), dec_reg.select(col('student_id')) ])

top_3_student_ID = ( all_students_ID.select(col('student_id').value_counts(sort=False))
                     .collect() .unnest('student_id').top_k(k=3, by='count') # for duplicate frequencies a random one chosen.
                     .select(col('student_id')) )

students.filter(col('student_id').is_in(top_3_student_ID)) .collect()

student_id,name,partner
i64,str,i64
7,"""Tarun Thaker""",9
12,"""Radha Dutt""",19
23,"""Chhavi Lachman""",18


In [39]:
#                               Question 6 : Top 3 Students who spent most money in buying courses.

start = time()
print(  pl.concat([nov_reg, dec_reg], how='vertical_relaxed')
        .join(other= courses.select(col('course_id', 'price')), on='course_id', how='inner')
        .join(other= students.select(col('student_id', 'name')), on='student_id', how='inner')
        .group_by('student_id', 'name') .agg( col('price').sum().alias('total price') )
        .top_k(k=3, by='total price')
        .collect()  , extra_info(f"First Way = {time() - start}")  )

# -------------------------------------------------------------------------------------------------------------------------------

start = time()
top_3_student_ID = ( pl.concat([nov_reg, dec_reg], how='vertical_relaxed')
                    .join(other= courses.select(col('course_id', 'price')), on='course_id', how='inner')
                    .group_by('student_id') .agg( col('price').sum().alias('total price') )
                    .top_k(k=3, by='total price') )

their_names = top_3_student_ID.join(other= students.select(col('student_id', 'name')), on='student_id', how='left')
print(their_names.collect(), extra_info(f"Second Way = {time() - start}"))

_ = '''
        MUST OPERATION : concat() nov_reg and dec_reg vertically, THEN join() it with 'courses' (only col('course_id', 'price')).

        First Way : After 'MUST OPERATION' I also did join() it with 'students' (only col('student_id', 'name')) to get the names
                    of each student_id which is uncessery because we don't need all the names, only the top 3!

        Second Way : Here I just skipped the 'First Way' and got the result, a LazyFrame with 2 columns(student_id, total price).
                     Now since we have the top_3_student_ID, we can just do 'left join' with 'students' and get the names.
        
        As you can see the SECOND Way MORE EFFICIENT and a little faster!
'''

shape: (3, 3)
┌────────────┬──────────────────┬─────────────┐
│ student_id ┆ name             ┆ total price │
│ ---        ┆ ---              ┆ ---         │
│ i64        ┆ str              ┆ i64         │
╞════════════╪══════════════════╪═════════════╡
│ 23         ┆ Chhavi Lachman   ┆ 22594       │
│ 14         ┆ Pranab Natarajan ┆ 15096       │
│ 19         ┆ Qabeel Raman     ┆ 13498       │
└────────────┴──────────────────┴─────────────┘  [1;92m-->[0m First Way = 0.0009965896606445312
------------------------------------------------------------------------------------------------------------------------

shape: (3, 3)
┌────────────┬─────────────┬──────────────────┐
│ student_id ┆ total price ┆ name             │
│ ---        ┆ ---         ┆ ---              │
│ i64        ┆ i64         ┆ str              │
╞════════════╪═════════════╪══════════════════╡
│ 23         ┆ 22594       ┆ Chhavi Lachman   │
│ 14         ┆ 15096       ┆ Pranab Natarajan │
│ 19         ┆ 13498       ┆ Qab

In [40]:
lf10 = pl.scan_csv(r"c:\Users\user\Downloads\Literature review 20 paper for fitz patrick - Sheet1.csv")

lf10.select(col('Paper Name')) .collect().to_pandas().to_string()

'                                                                                                                                            Paper Name\n0                                           Evaluating Deep Neural Networks Trained on Clinical Images in Dermatology With the Fitzpatrick 17k Dataset\n1                                          SkinCon: A skin disease dataset densely annotated by domain experts for fine-grained debugging and analysis\n2                                                                      SkinCAP: A Multi-modal Dermatology Dataset Annotated with Rich Medical Captions\n3                                                      Assessing GPT-4’s Diagnostic Accuracy with Darker Skin Tones: Underperformance and Implications\n4                                                                      Achieving Fairness Through Channel Pruning for Dermatological Disease Diagnosis\n5                                                                                      

In [41]:
lf10 = pl.scan_csv(r"d:\Datasets\For Research Paper\Fitzpatrick17k_thesis_practice.csv").drop_nulls(subset='Name')
# lf10.select(col('Name'), col('Research_motivation').str.len_chars()).collect()
print( lf10.group_by('Name', maintain_order=True).agg(col('*').str.len_chars()) .collect() )

shape: (3, 6)
┌────────┬──────────────────┬─────────────┬──────────────────┬──────────────────┬──────────────────┐
│ Name   ┆ Research_motivat ┆ Literature_ ┆ The_necessity_of ┆ Novelty_and_main ┆ Organization_and │
│ ---    ┆ ion              ┆ review      ┆ _the_research_…  ┆ _contributions…  ┆ _structure_of_…  │
│ str    ┆ ---              ┆ ---         ┆ ---              ┆ ---              ┆ ---              │
│        ┆ list[u32]        ┆ list[u32]   ┆ list[u32]        ┆ list[u32]        ┆ list[u32]        │
╞════════╪══════════════════╪═════════════╪══════════════════╪══════════════════╪══════════════════╡
│ Sir    ┆ [403]            ┆ [null]      ┆ [null]           ┆ [null]           ┆ [null]           │
│ Habib  ┆ [513]            ┆ [null]      ┆ [null]           ┆ [null]           ┆ [null]           │
│ Shuchi ┆ [756]            ┆ [null]      ┆ [null]           ┆ [null]           ┆ [null]           │
└────────┴──────────────────┴─────────────┴──────────────────┴───────────────

#                                           clear
```js
        syntax : explode(n = 'number of rows you want to have but with NULLS')
        
        by default n = 0 but it will still copy the lazyframe's schema.
```

In [42]:
lf11 = pl.LazyFrame(
    {
        "a": [None, 2, 3, 4],
        "b": [0.5, None, 2.5, 13],
        "c": [True, True, False, None],
    }
)
print(lf11.collect(), extra_info(color_text("lf11")))

lf12 = lf11.clear(n=2)
print(lf12.collect(), extra_info("lf11.clear(n=2)"))

shape: (4, 3)
┌──────┬──────┬───────┐
│ a    ┆ b    ┆ c     │
│ ---  ┆ ---  ┆ ---   │
│ i64  ┆ f64  ┆ bool  │
╞══════╪══════╪═══════╡
│ null ┆ 0.5  ┆ true  │
│ 2    ┆ null ┆ true  │
│ 3    ┆ 2.5  ┆ false │
│ 4    ┆ 13.0 ┆ null  │
└──────┴──────┴───────┘  [1;92m-->[0m [1;92mlf11[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2, 3)
┌──────┬──────┬──────┐
│ a    ┆ b    ┆ c    │
│ ---  ┆ ---  ┆ ---  │
│ i64  ┆ f64  ┆ bool │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
│ null ┆ null ┆ null │
└──────┴──────┴──────┘  [1;92m-->[0m lf11.clear(n=2)
------------------------------------------------------------------------------------------------------------------------



#                                   explode
```js
        syntax: explode(columns='scaler' OR 'list/array') e.g. columns='numbers' or columns=['numbers', 'alphabe']

        The VALUES of the column/columns must be LIST OR ARRAY, 'NOT STRUCT' because struct is not ITERABLE.

        Output: Returns the WHOLE NEW DATAFRAME after doing explode() on any column/columns.

        How 'explode()' works on a SINGLE COLUMN :
        ------------------------------------------

                id       numbers          explode(columns='number')         id      number
                ---      ------           ------------------------>        ---     ------
                'a'      [1, 2]                                             'a'       1
                                                                            'a'       2
                As you can see explode() working as A CARTESIAN PRODUCT.
        
        How 'explode()' works on MULTIPLE COLUMNS :
        -------------------------------------------

                id       numbers       alphabe               explode(columns=['number', 'alphabe'])
                ---      ------       ---------              ------------------------------------->
                'a'      [1, 2]       [3, 4, 5]                                                                 'Errors'
                                                                                      

        Assume a = [1, 2], b = [3, 4, 5] and of course a and b on the SAME ROW in different column.

        Since len(a) != len(b), explode(columns=['numbers', 'alphabe']) wouldn’t work because its not like explode('numbers') working first and then with that resulting lazyframe explode('alphabe') works. NO! Now see the 'explode.png' in this same polars folder.
        So explode() is nothing but A CARTESIAN PRODUCT. 'a' is doing CARTESIAN PRODUCT with both 'numbers' and 'alphabe' at the same time. When 'a' saw 'numbers' doesnt have 3rd element but 'alphabe' has, so it threw ERROR!

        But explode('numbers').explode('alphabe') => works because AT FIRST explode('numbers') giving a temporary lazyframe and on that temporary lazyframe if we do explode('alphabe'), it will obviously work just like a normal explode() on a Single Column e.g. explode('numbers').
        ```

In [43]:
lf14 = pl.LazyFrame({
        "letters": ["a", "a", "b", "c"],
        "numbers": [[0, 1], [2, 3], [4, 5], [6, 7, 8]],
        "alphabe": [list('ab'), list('cd'), list('ef'), list('ghi')]})

print(lf14.collect(), extra_info(color_text("lf14")))
print(lf14.explode(columns='numbers').collect(), extra_info("lf14.explode(columns='numbers')"))

print(lf14.collect(), extra_info(color_text("lf14")))
print(lf14.explode(columns=['numbers', 'alphabe']).collect(), extra_info("lf14.explode(columns=['numbers', 'alphabe'])"))

shape: (4, 3)
┌─────────┬───────────┬─────────────────┐
│ letters ┆ numbers   ┆ alphabe         │
│ ---     ┆ ---       ┆ ---             │
│ str     ┆ list[i64] ┆ list[str]       │
╞═════════╪═══════════╪═════════════════╡
│ a       ┆ [0, 1]    ┆ ["a", "b"]      │
│ a       ┆ [2, 3]    ┆ ["c", "d"]      │
│ b       ┆ [4, 5]    ┆ ["e", "f"]      │
│ c       ┆ [6, 7, 8] ┆ ["g", "h", "i"] │
└─────────┴───────────┴─────────────────┘  [1;92m-->[0m [1;92mlf14[0m
------------------------------------------------------------------------------------------------------------------------

shape: (9, 3)
┌─────────┬─────────┬─────────────────┐
│ letters ┆ numbers ┆ alphabe         │
│ ---     ┆ ---     ┆ ---             │
│ str     ┆ i64     ┆ list[str]       │
╞═════════╪═════════╪═════════════════╡
│ a       ┆ 0       ┆ ["a", "b"]      │
│ a       ┆ 1       ┆ ["a", "b"]      │
│ a       ┆ 2       ┆ ["c", "d"]      │
│ a       ┆ 3       ┆ ["c", "d"]      │
│ b       ┆ 4       ┆ ["e", "f"]      

In [44]:
lf15 = pl.LazyFrame({
        "letters": ["a", "a", "b", "c"],
        "numbers": [[0, 1], [2, 3], [4, 5], [6, 7, 8]],
        "alphabe": [list('abc'), list('def'), list('gh'), list('ijkl')]})
print(lf15.collect(), extra_info(color_text("lf15")))

print(extra_info(f"lf14.explode(columns=['numbers', 'alphabe']) {color_text("throws error")} since " +
        f"{color_text("CORRESPONDING values")} in both 'numbers' and 'alphabe'\n{' ' * 50}doesn't have the same length always"))

print(lf15.explode(columns='numbers').explode(columns='alphabe').collect(),
      extra_info("lf15.explode(columns='numbers').explode(columns='alphabe')"))

shape: (4, 3)
┌─────────┬───────────┬───────────────────┐
│ letters ┆ numbers   ┆ alphabe           │
│ ---     ┆ ---       ┆ ---               │
│ str     ┆ list[i64] ┆ list[str]         │
╞═════════╪═══════════╪═══════════════════╡
│ a       ┆ [0, 1]    ┆ ["a", "b", "c"]   │
│ a       ┆ [2, 3]    ┆ ["d", "e", "f"]   │
│ b       ┆ [4, 5]    ┆ ["g", "h"]        │
│ c       ┆ [6, 7, 8] ┆ ["i", "j", … "l"] │
└─────────┴───────────┴───────────────────┘  [1;92m-->[0m [1;92mlf15[0m
------------------------------------------------------------------------------------------------------------------------

 [1;92m-->[0m lf14.explode(columns=['numbers', 'alphabe']) [1;92mthrows error[0m since [1;92mCORRESPONDING values[0m in both 'numbers' and 'alphabe'
                                                  doesn't have the same length always
------------------------------------------------------------------------------------------------------------------------

shape: (28, 3)
┌─────────┬───

#                                           unnest()
```js
            syntax: unnest(columns='scaler' or 'list-like')
            Output : AFter unnest() on any columns it will return THE WHOLE NEW DATAFRAME.

            Decompose 'struct' columns into separate columns for each of their fields.
            The new columns will be inserted into the DataFrame at the location of the 'struct' column.

            Note : columns = Must be column/columns that has 'struct' values.
                   And when you do pl.struct(..) you get every value having same length. So there's no Q like What if my values has different length.
```

In [45]:
lf16 = pl.LazyFrame({
                    "before": ["foo", "bar"],
                    "t_a": [1, 2],
                    "t_b": ["a", "b"],    # t_a, t_b, t_c, t_d has same column numbers. That's why pl.struct(col("^t_.$"))
                    "t_c": [True, None],  # is possible. If any one of them has different length, it would throw error.
                    "t_d": [[1, 2], [3]],
                    "after": ["baz", "womp"]}).select(
                                                        col('before'),
                                                        pl.struct(col("^t_.$")).alias("structs"),
                                                        col("after"))

print(lf16.collect(), extra_info(color_text("lf16.collect()")))
print(lf16.unnest(columns='structs').collect(), extra_info("lf16.unnest(columns='t_structs')"))

lf17 = pl.LazyFrame({
    'a' : [1, 2, 3, 2, 1, 5],
    'b' : [5, 3, 2, 5, 3, 2]})

print(lf17.collect(), extra_info(color_text("lf17.collect()")))

lf17_count = lf17.select(col("b").value_counts().alias("b_count"))

print(lf17_count.collect(), extra_info("""lf17_count = lf17.select(col("b").value_counts().alias("b_count"))"""))
print(lf17_count.unnest(columns='b_count').collect(), extra_info("lf17_count.unnest(columns='b_count')"))

shape: (2, 3)
┌────────┬─────────────────────┬───────┐
│ before ┆ structs             ┆ after │
│ ---    ┆ ---                 ┆ ---   │
│ str    ┆ struct[4]           ┆ str   │
╞════════╪═════════════════════╪═══════╡
│ foo    ┆ {1,"a",true,[1, 2]} ┆ baz   │
│ bar    ┆ {2,"b",null,[3]}    ┆ womp  │
└────────┴─────────────────────┴───────┘  [1;92m-->[0m [1;92mlf16.collect()[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2, 6)
┌────────┬─────┬─────┬──────┬───────────┬───────┐
│ before ┆ t_a ┆ t_b ┆ t_c  ┆ t_d       ┆ after │
│ ---    ┆ --- ┆ --- ┆ ---  ┆ ---       ┆ ---   │
│ str    ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str   │
╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
│ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
│ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
└────────┴─────┴─────┴──────┴───────────┴───────┘  [1;92m-->[0m lf16.unnest(columns='t_structs')
-------------------------------

#                                   pivot
```js
        syntax : pivot(index, on, values, aggregate_function)

        You can use pivot to reshape a dataframe from "long" to "wide" format.
        
        This pivot() is same as pandas pivot(index, columns, values) and pivot_table(index, columns, values, aggfunc).
        So watch pandas pivot() and pivot_table() explanation first in this same repository and then polars documentation.

        LazyFrame doesn't support pivot() which can be done with group_by()'.
```

In [46]:
lf18 = pl.LazyFrame({
                    "name": ["Cady", "Cady", "Karen", "Karen"],
                    "subject": ["maths", "physics", "maths", "physics"],
                    "test_1": [98, 99, 61, 58],
                    "test_2": [100, 100, 60, 60]})

print(lf18.collect(), extra_info(color_text("lf18")))
print(lf18.collect().pivot(index='name', on='subject', values='test_1'),
      extra_info("lf18.collect().pivot(index='name', on='subject', values='test_1')"))

lf19 = pl.LazyFrame(sns.load_dataset('tips').rename(columns={'sex' : 'gender'}))
print(lf19.collect().limit(5), extra_info(color_text("lf19.collect().limit(5)")))
print(lf19.collect().pivot(index='gender', values='total_bill', on='smoker', aggregate_function='sum'), 
      extra_info("lf19.collect().pivot(index='gender', values='total_bill', on='smoker', aggregate_function='sum')"))

shape: (4, 4)
┌───────┬─────────┬────────┬────────┐
│ name  ┆ subject ┆ test_1 ┆ test_2 │
│ ---   ┆ ---     ┆ ---    ┆ ---    │
│ str   ┆ str     ┆ i64    ┆ i64    │
╞═══════╪═════════╪════════╪════════╡
│ Cady  ┆ maths   ┆ 98     ┆ 100    │
│ Cady  ┆ physics ┆ 99     ┆ 100    │
│ Karen ┆ maths   ┆ 61     ┆ 60     │
│ Karen ┆ physics ┆ 58     ┆ 60     │
└───────┴─────────┴────────┴────────┘  [1;92m-->[0m [1;92mlf18[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2, 3)
┌───────┬───────┬─────────┐
│ name  ┆ maths ┆ physics │
│ ---   ┆ ---   ┆ ---     │
│ str   ┆ i64   ┆ i64     │
╞═══════╪═══════╪═════════╡
│ Cady  ┆ 98    ┆ 99      │
│ Karen ┆ 61    ┆ 58      │
└───────┴───────┴─────────┘  [1;92m-->[0m lf18.collect().pivot(index='name', on='subject', values='test_1')
------------------------------------------------------------------------------------------------------------------------

shape: (5,

#                                               unpivot
```js
        syntax: unpivot(index='columns not to unpivot()', variable_name, value_name, on='which values to be used')

        This unpivot() is same as pandas melt(id_vars = 'columns not to melt()', ignore_index, var_name, value_name) except melt() doesn't have 'on' parameter.

        Wide Dataframe = column numbers >> row numbers
        Long Dataframe = row numbers >> column numbers

        unpivot() converts a wide format dataframe into Longgg format dataframe. So 'it lessens the column numbers significantly'!

        See 'melt.png' in the pandas folder. As you can see all 'column names' inside a column 'variable' and all values of that dataframe inside a column 'value'. Thats how unpivot() works.

        But the output of that image didnt make any sense! Like why 'cse', 'ece', 'mecha' are inside value column? We can skip any columns that we dont want to unpivot() using 'index' e.g. index = 'branch' or ['branch', 'trees']

        We can also change the name of 'variable', 'value' using 'variable_name', 'value_name' respectively.
        We cam also select which values to use in the result lazyframe using 'on' e.g. on = ['2020', '2024']
```

In [47]:
lf20 = pl.LazyFrame({
        'branch':['cse','ece','mech'],
        '2020':[100,150,60],
        '2021':[120,130,80],
        '2022':[150,140,70]})

print(lf20.collect(), extra_info(color_text("lf20")))

print(lf20.unpivot(index='branch').collect(), extra_info("lf20.unpivot(index='branch').collect()"))
print(lf20.unpivot(index='branch', on=['2020', '2022'], variable_name='columns', value_name='values').collect(),
      extra_info("lf20.unpivot(index='branch', on=['2020', '2022'], variable_name='columns', value_name='values')"))

shape: (3, 4)
┌────────┬──────┬──────┬──────┐
│ branch ┆ 2020 ┆ 2021 ┆ 2022 │
│ ---    ┆ ---  ┆ ---  ┆ ---  │
│ str    ┆ i64  ┆ i64  ┆ i64  │
╞════════╪══════╪══════╪══════╡
│ cse    ┆ 100  ┆ 120  ┆ 150  │
│ ece    ┆ 150  ┆ 130  ┆ 140  │
│ mech   ┆ 60   ┆ 80   ┆ 70   │
└────────┴──────┴──────┴──────┘  [1;92m-->[0m [1;92mlf20[0m
------------------------------------------------------------------------------------------------------------------------

shape: (9, 3)
┌────────┬──────────┬───────┐
│ branch ┆ variable ┆ value │
│ ---    ┆ ---      ┆ ---   │
│ str    ┆ str      ┆ i64   │
╞════════╪══════════╪═══════╡
│ cse    ┆ 2020     ┆ 100   │
│ ece    ┆ 2020     ┆ 150   │
│ mech   ┆ 2020     ┆ 60    │
│ cse    ┆ 2021     ┆ 120   │
│ ece    ┆ 2021     ┆ 130   │
│ mech   ┆ 2021     ┆ 80    │
│ cse    ┆ 2022     ┆ 150   │
│ ece    ┆ 2022     ┆ 140   │
│ mech   ┆ 2022     ┆ 70    │
└────────┴──────────┴───────┘  [1;92m-->[0m lf20.unpivot(index='branch').collect()
-------------------------

#                                                   with_row_index
```js
        syntax: with_row_index(name='name of the index column', offset='the first number which will be incremented by 1 for the next rows')

        It adds a column at the very first to be used as Index Column.
```

In [48]:
lf20 = pl.LazyFrame({
        'branch':['cse','ece','mech'],
        '2020':[100,150,60],
        '2021':[120,130,80],
        '2022':[150,140,70]})

print(lf20.collect(), extra_info(color_text("lf20")))
print(lf20.with_row_index(name='ID', offset=1601).collect(), extra_info("lf20.with_row_index(name='ID', offset=1601)"))

shape: (3, 4)
┌────────┬──────┬──────┬──────┐
│ branch ┆ 2020 ┆ 2021 ┆ 2022 │
│ ---    ┆ ---  ┆ ---  ┆ ---  │
│ str    ┆ i64  ┆ i64  ┆ i64  │
╞════════╪══════╪══════╪══════╡
│ cse    ┆ 100  ┆ 120  ┆ 150  │
│ ece    ┆ 150  ┆ 130  ┆ 140  │
│ mech   ┆ 60   ┆ 80   ┆ 70   │
└────────┴──────┴──────┴──────┘  [1;92m-->[0m [1;92mlf20[0m
------------------------------------------------------------------------------------------------------------------------

shape: (3, 5)
┌──────┬────────┬──────┬──────┬──────┐
│ ID   ┆ branch ┆ 2020 ┆ 2021 ┆ 2022 │
│ ---  ┆ ---    ┆ ---  ┆ ---  ┆ ---  │
│ u32  ┆ str    ┆ i64  ┆ i64  ┆ i64  │
╞══════╪════════╪══════╪══════╪══════╡
│ 1601 ┆ cse    ┆ 100  ┆ 120  ┆ 150  │
│ 1602 ┆ ece    ┆ 150  ┆ 130  ┆ 140  │
│ 1603 ┆ mech   ┆ 60   ┆ 80   ┆ 70   │
└──────┴────────┴──────┴──────┴──────┘  [1;92m-->[0m lf20.with_row_index(name='ID', offset=1601)
-----------------------------------------------------------------------------------------------------------------------

#                                           struct
```js
            In pandas split() returns a 'list of strings'(['a', 'b']) but in polars splitn() / split_exact() returns a 'struct of strings'({'a', 'b'})

            This struct is same as in C, C++ like :

            Struct {
                title  = 'MD',
                first  = 'Hasibul',
                second = 'Habib'
            }

            Keeping aside the variable names, Struct = {'MD', 'Hasibul', 'Habib'} but each value has a variable BEHIND THE SCENE.

            In polars sometimes it returns a column with Struct datatype. If the datatype is 'struct[3]' it means there is 3 values inside each struct value.
            We can name these 3 value's variables name as : ('Add .struct before that struct datatype column to access all the struct's operation')
                    
                    col('names').struct.rename_fields(['title', 'first', 'second']) :  This helps when you do unnest(), by unnest() we will have 3 columns including 'title', 'first', 'second'. If we dont rename_fields(..), after unnest() we will have random names in our column names.
            
            How to access specific values in Struct?
            ----------------------------------------
                    Assuming each value inside each struct value has a variable name like above : ('title', 'first', 'second')
                        col('names').struct.field(name='title') or col('names').struct.field(name=['title', 'second'])

                    We can use index to access ONE VALUE:
                        col('names').struct[0]

```

#                                               str
```js
        If the INPUT/OUTPUT column|series has STRING OBJECT and you want to perform STRING operation on that series/column, then you have to add '.str' before that column|series e.g. col('name').str.upper()
```

#                                               split()
```js
        There are 3 split
            1) split(by, inclusive)                       by = the sperator
            2) splitn(by, n), here n = number of items after splitting, so if I want to split 1 times, n = 2
            3) split_exact(by, inclusive, n), n = the exact number to split.

        But split_exact(..) doesn't work well with by=' ' since its build for inclusive=True.
            splitn(..) works with anything well.
```

In [49]:
lf21 = pl.scan_csv(r"d:\Datasets\CampusX\titanic.csv").select(col('Name')) # Name  = "Surname, Title. RestName" (RestName can have 1 or more name) 
print(lf21.collect().head(5), extra_info(color_text("lf21.head(5)")))

#  Q : Create Separate Columns as Surname, Title, RestName
lf21.with_columns(
    col('Name').str.splitn(by=',', n=2).struct[0] .alias('Surname'),
    col('Name').str.splitn(by=',', n=2).struct[1] .str.strip_chars() .str.splitn(by=' ', n=2).struct.rename_fields(['Title', 'Restname']) .alias('Rests')
).unnest(columns='Rests') .collect()

shape: (5, 1)
┌─────────────────────────────────┐
│ Name                            │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ Braund, Mr. Owen Harris         │
│ Cumings, Mrs. John Bradley (Fl… │
│ Heikkinen, Miss. Laina          │
│ Futrelle, Mrs. Jacques Heath (… │
│ Allen, Mr. William Henry        │
└─────────────────────────────────┘  [1;92m-->[0m [1;92mlf21.head(5)[0m
------------------------------------------------------------------------------------------------------------------------



Name,Surname,Title,Restname
str,str,str,str
"""Braund, Mr. Owen Harris""","""Braund""","""Mr.""","""Owen Harris"""
"""Cumings, Mrs. John Bradley (Fl…","""Cumings""","""Mrs.""","""John Bradley (Florence Briggs …"
"""Heikkinen, Miss. Laina""","""Heikkinen""","""Miss.""","""Laina"""
"""Futrelle, Mrs. Jacques Heath (…","""Futrelle""","""Mrs.""","""Jacques Heath (Lily May Peel)"""
"""Allen, Mr. William Henry""","""Allen""","""Mr.""","""William Henry"""
…,…,…,…
"""Montvila, Rev. Juozas""","""Montvila""","""Rev.""","""Juozas"""
"""Graham, Miss. Margaret Edith""","""Graham""","""Miss.""","""Margaret Edith"""
"""Johnston, Miss. Catherine Hele…","""Johnston""","""Miss.""","""Catherine Helen ""Carrie"""""
"""Behr, Mr. Karl Howell""","""Behr""","""Mr.""","""Karl Howell"""


#                                                        slicing, contains_any and regex

In [50]:
lf22 = lf21.head(10)
print(lf22.collect(), extra_info(color_text("lf22.head(5)")))

print(lf22.select(col('Name').str.slice(offset=0, length=5)) .collect(),
                        extra_info("lf22.select(col('Name').str.slice(offset=0, length=5))")) # slicing

print(lf22.filter(col('Name').str.contains_any([' Mr.', ' Miss.'])) .collect(),
                        extra_info("lf22.filter(col('Name').str.contains_any([' Mr.', ' Miss.']))")) # contains_any. contains() works on SINGLE VALUE.

print(extra_info(f"""In polars if you want your regex pattern to be CASE INSENSITIVE, add {color_text("(?i)")} before the regex"""))

print(lf22.filter(col('Name').str.contains(pattern="(?i)^[AMB].*[^jk]$", strict=True)) .collect(), 
                        extra_info("lf22.filter(col('Name').str.contains(pattern=\"(?i)^[AMB].*[^jk]$\", strict=True))"))

shape: (10, 1)
┌─────────────────────────────────┐
│ Name                            │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ Braund, Mr. Owen Harris         │
│ Cumings, Mrs. John Bradley (Fl… │
│ Heikkinen, Miss. Laina          │
│ Futrelle, Mrs. Jacques Heath (… │
│ Allen, Mr. William Henry        │
│ Moran, Mr. James                │
│ McCarthy, Mr. Timothy J         │
│ Palsson, Master. Gosta Leonard  │
│ Johnson, Mrs. Oscar W (Elisabe… │
│ Nasser, Mrs. Nicholas (Adele A… │
└─────────────────────────────────┘  [1;92m-->[0m [1;92mlf22.head(5)[0m
------------------------------------------------------------------------------------------------------------------------

shape: (10, 1)
┌───────┐
│ Name  │
│ ---   │
│ str   │
╞═══════╡
│ Braun │
│ Cumin │
│ Heikk │
│ Futre │
│ Allen │
│ Moran │
│ McCar │
│ Palss │
│ Johns │
│ Nasse │
└───────┘  [1;92m-->[0m lf22.select(col('Name').str.slice(offset=0, length=5))
-

#                 Read other str.__ functions from [https://docs.pola.rs/api/python/stable/reference/series/string.html]

#                                               datetime()

In [107]:
time  = pl.datetime(2024, 6, 24, 1, 45) # A single datetime object
time1 = pl.date(2024, 4, 11) # A single date object

times = pl.Series(name="datetime", values=["2024-06-24 12:45:00", "2024-06-25 14:00:00"])
times = times.str.to_datetime(format="%Y-%m-%d %H:%M:%S") # there's also str.to_date(..)
# I wrote %Y-%m-%d, not %Y/%m/%d because in 'times' each value has '-' not '/'. BE CAREFUL HERE!!

print(times, extra_info(color_text("times")))
print(times.dt.day(), extra_info("times.dt.day()"))
print(times.dt.month(), extra_info("times.dt.month()"))
print(times.dt.year(), extra_info("times.dt.year()"))
print(times.dt.hour(), extra_info("times.dt.hour()"))
print(times.dt.minute(), extra_info("times.dt.minute()"))
print(times.dt.second(), extra_info("times.dt.second()"))

# There's others functions you can get access with 'times.dt.' To extract day/month name see the below block

shape: (2,)
Series: 'datetime' [datetime[μs]]
[
	2024-06-24 12:45:00
	2024-06-25 14:00:00
]  [1;92m-->[0m [1;92mtimes[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [i8]
[
	24
	25
]  [1;92m-->[0m times.dt.day()
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [i8]
[
	6
	6
]  [1;92m-->[0m times.dt.month()
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [i32]
[
	2024
	2024
]  [1;92m-->[0m times.dt.year()
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [i8]
[
	12
	14
]  [1;92m-->[0m times.dt.hour()
----------------------------------------------------------------

#                                           strftime
```js
        You can get minute, hour, day, month, year etc but not day name, month name or more other stuffs with only 'times.dt'. For this we need to use 'times.dt.strftime'.

        'strftime' = string-format-time, it returns time in STRING FORMAT, not datetime(). Here is the original pdf = [https://docs.rs/chrono/latest/chrono/format/strftime/index.html]
        But I coded the important one below blocks.
```

In [72]:
times = ( pl.Series(name="datetime", values=["2024-04-24 12:45:00", "2024-06-25 14:00:00"])
          .str.to_datetime(format="%Y-%m-%d %H:%M:%S") )

print(times, extra_info(color_text("times")))
print(times.dt.strftime("%B"), extra_info("times.dt.strftime(\"%B\") = Full month name"))
print(times.dt.strftime("%A"), extra_info("times.dt.strftime(\"%A\") = Full weekday name"))
print(times.dt.strftime("%U"), extra_info("times.dt.strftime(\"%U\") = Week number starting with Sunday (00-53)"))
print(times.dt.strftime("%j"), extra_info("times.dt.strftime(\"%j\") = Day of the year (001-366)"))

shape: (2,)
Series: 'datetime' [datetime[μs]]
[
	2024-04-24 12:45:00
	2024-06-25 14:00:00
]  [1;92m-->[0m [1;92mtimes[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"April"
	"June"
]  [1;92m-->[0m times.dt.strftime("%B") = Full month name
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"Wednesday"
	"Tuesday"
]  [1;92m-->[0m times.dt.strftime("%A") = Full weekday name
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"16"
	"25"
]  [1;92m-->[0m times.dt.strftime("%U") = Week number starting with Sunday (00-53)
------------------------------------------------------------------------------------------------------------------------

shape: (2,

In [78]:
print(times, extra_info(color_text("times")))
print(times.dt.strftime("%D"), extra_info("times.dt.strftime(\"%D\") = Month-day-year format. Same as %m/%d/%y."))
print(times.dt.strftime("%x"), extra_info("times.dt.strftime(\"%x\") = Locale's date representation (e.g., 12/31/99)."))
print(times.dt.strftime("%F"), extra_info("times.dt.strftime(\"%x\") = Year-month-day format (ISO 8601). Same as %Y-%m-%d."))
print(times.dt.strftime("%v"),
    extra_info(f"times.dt.strftime(\"{color_text('%v')}\") = Day-month-year format. Same as %e-%b-%Y. " +
               color_text("LOOOOKKKKK AT THIS FORMAT!!!")))

shape: (2,)
Series: 'datetime' [datetime[μs]]
[
	2024-04-24 12:45:00
	2024-06-25 14:00:00
]  [1;92m-->[0m [1;92mtimes[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"04/24/24"
	"06/25/24"
]  [1;92m-->[0m times.dt.strftime("%D") = Month-day-year format. Same as %m/%d/%y.
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"04/24/24"
	"06/25/24"
]  [1;92m-->[0m times.dt.strftime("%x") = Locale's date representation (e.g., 12/31/99).
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"2024-04-24"
	"2024-06-25"
]  [1;92m-->[0m times.dt.strftime("%x") = Year-month-day format (ISO 8601). Same as %Y-%m-%d.
--------------------------------------------

In [84]:
print(times, extra_info(color_text("times")))
print(times.dt.strftime("%R"), extra_info("times.dt.strftime(\"%R\") = Hour-minute format. Same as %H:%M"))
print(times.dt.strftime("%T"), extra_info("times.dt.strftime(\"%T\") = Hour-minute-second format. Same as %H:%M:%S."))
print(times.dt.strftime("%c"), extra_info("times.dt.strftime(\"%c\") = Locale's date and time (e.g., Thu Mar 3 23:05:25 2005)." +
                                color_text(" LOOOOKKKKK AT THIS FORMAT!!!")))

shape: (2,)
Series: 'datetime' [datetime[μs]]
[
	2024-06-24 12:45:00
	2024-06-25 14:00:00
]  [1;92m-->[0m [1;92mtimes[0m
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"12:45"
	"14:00"
]  [1;92m-->[0m times.dt.strftime("%R") = Hour-minute format. Same as %H:%M
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"12:45:00"
	"14:00:00"
]  [1;92m-->[0m times.dt.strftime("%T") = Hour-minute-second format. Same as %H:%M:%S.
------------------------------------------------------------------------------------------------------------------------

shape: (2,)
Series: 'datetime' [str]
[
	"Mon Jun 24 12:45:00 2024"
	"Tue Jun 25 14:00:00 2024"
]  [1;92m-->[0m times.dt.strftime("%c") = Locale's date and time (e.g., Thu Mar 3 23:05:25 2005).[1;92m LOOOOKKKKK AT THIS F

#                                           date_range
```js
        syntax: date_range(start='start date using pl.date(..)', end='end date using pl.date(..)', interval, closed='Define which sides of the range are closed (inclusive)', eager='Evaluate immediately and return a Series. If set to False (default), return an expression instead.')

        interval :
        ----------
                    = 'd' means 'day'
                    = '1d'  means from start to end the day will increase by 1.
                    = You can put '2d', '3d', '4d' any 'd'.

                    and just like this we have 1w, 1mo, 1y, 1q(1 calendar quarter). We can also combine them e.g. '1y6mo'
```

In [85]:
print(pl.date_range(start=pl.date(2024, 1, 11), end=pl.date(2024, 1, 30), interval='3d', eager=True), extra_info(
    f"pl.date_range(start=pl.date(2024, 1, 11), end=pl.date(2024, 1, 30), interval={color_text('3d')}, eager=True)"))

print(pl.date_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='1y6mo5d', eager=True), extra_info(
    f"pl.date_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval={color_text('1y6mo5d')}, eager=True)"))

shape: (7,)
Series: 'date' [date]
[
	2024-01-11
	2024-01-14
	2024-01-17
	2024-01-20
	2024-01-23
	2024-01-26
	2024-01-29
]  [1;92m-->[0m pl.date_range(start=pl.date(2024, 1, 11), end=pl.date(2024, 1, 30), interval=[1;92m3d[0m, eager=True)
------------------------------------------------------------------------------------------------------------------------

shape: (5,)
Series: 'date' [date]
[
	2024-01-11
	2025-07-16
	2027-01-21
	2028-07-26
	2030-01-31
]  [1;92m-->[0m pl.date_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval=[1;92m1y6mo5d[0m, eager=True)
------------------------------------------------------------------------------------------------------------------------



#                                       datetime_range()
```js

        This is same as date_range(), but here you can work with time(hour, minute, second) also. So the 'interval' :

            1ns (1 nanosecond)
            1us (1 microsecond)
            1ms (1 millisecond)
            1s  (1 second)
            1m  (1 minute)
            1h  (1 hour)
            1d  (1 calendar day)
            1w  (1 calendar week)
            1mo (1 calendar month)
            1q  (1 calendar quarter)
            1y  (1 calendar year)

            You can also combine here too e.g. '2y3mo3d2h10m', '2y3mo2h10m'(here I skipped 'd', so the result will contain same 'day')
        
```

In [90]:
print(pl.datetime_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='2y3mo3d2h10m', eager=True),
      extra_info("pl.datetime_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='2y3mo3d2h10m', eager=True)"))

print(pl.datetime_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='2y3mo2h10m', eager=True),
      extra_info("pl.datetime_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='2y3mo2h10m', eager=True)"))

shape: (3,)
Series: 'date' [datetime[μs]]
[
	2024-01-11 00:00:00
	2026-04-14 02:10:00
	2028-07-17 04:20:00
]  [1;92m-->[0m pl.datetime_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='2y3mo3d2h10m', eager=True)
------------------------------------------------------------------------------------------------------------------------

shape: (3,)
Series: 'date' [datetime[μs]]
[
	2024-01-11 00:00:00
	2026-04-11 02:10:00
	2028-07-11 04:20:00
]  [1;92m-->[0m pl.datetime_range(start=pl.date(2024, 1, 11), end=pl.date(2030, 5, 30), interval='2y3mo2h10m', eager=True)
------------------------------------------------------------------------------------------------------------------------



In [118]:
lf23 = pl.scan_csv(r"d:\Datasets\CampusX\expense_data.csv") .select(col('Date', 'Income/Expense', 'Amount'))
lf23 = lf23.with_columns(Date = col('Date').str.to_datetime(format="%m/%d/%Y %H:%M"))

#                       Q : In which day he(dataset owner) spent the most?

lf23 = lf23.with_columns(Day = col('Date').dt.strftime("%A")) # as we need 'day' but we've only date.
print(lf23.limit(5).collect(), extra_info("lf23.limit(5)"))

( lf23.group_by('Day').agg( Total_Amount = col('Amount').sum() ) # returns a lazyframe of [Day, Total_Amount]
  .top_k(k=1, by='Total_Amount') # returns the FIRST ROW having MAXIMUM Total_Amount
  .collect() )

shape: (5, 4)
┌─────────────────────┬────────────────┬────────┬───────────┐
│ Date                ┆ Income/Expense ┆ Amount ┆ Day       │
│ ---                 ┆ ---            ┆ ---    ┆ ---       │
│ datetime[μs]        ┆ str            ┆ f64    ┆ str       │
╞═════════════════════╪════════════════╪════════╪═══════════╡
│ 2022-03-02 10:11:00 ┆ Expense        ┆ 50.0   ┆ Wednesday │
│ 2022-03-02 10:11:00 ┆ Expense        ┆ 300.0  ┆ Wednesday │
│ 2022-03-01 19:50:00 ┆ Expense        ┆ 78.0   ┆ Tuesday   │
│ 2022-03-01 18:56:00 ┆ Expense        ┆ 30.0   ┆ Tuesday   │
│ 2022-03-01 18:22:00 ┆ Expense        ┆ 67.0   ┆ Tuesday   │
└─────────────────────┴────────────────┴────────┴───────────┘  [1;92m-->[0m lf23.limit(5)
------------------------------------------------------------------------------------------------------------------------



Day,Total_Amount
str,f64
"""Saturday""",34421.02
