# Task to experiment
1. Fetch data with schema overwriting
2. Do some basic analysis
3. Best time to use `polars`
4. Compare with `pandas` in respect of both time and efficiency

----------------------------------
- *Calculate time for each task*
----------------------------------

In [52]:
import time
import polars as pl
import pandas as pd
import numpy as np

# 1. Read data as dataframe

- Get csv data (~103 MB)
- Overwrite schema to make sure proper datatype
- `ignore` infer_schema: it scans all rows to find the proper datatype. That's why it slow and risky for large dataset 

- polars code

In [53]:
start_time = time.time()

In [54]:
df = pl.read_csv(source='data8277.csv'
                 , has_header=True
                 , separator=','
                 , try_parse_dates=True
                 , schema_overrides={"count": pl.Int32}
                #  , infer_schema=True  # costly: traberse all rows to find out correct data type
                 , ignore_errors=True
                 , encoding='utf8')

In [55]:
execution_time = time.time() - start_time
print(f"Time to fetch the csv file: {execution_time}")

Time to fetch the csv file: 2.195941209793091


In [56]:
df.head()

Year,Age,Ethnic,Sex,Area,count
i64,i64,i64,i64,i64,i32
2018,0,1,1,1,795
2018,0,1,1,2,5067
2018,0,1,1,3,2229
2018,0,1,1,4,1356
2018,0,1,1,5,180


- pandas code

In [57]:
start_time_pd = time.time()

In [58]:
df_pd = pd.read_csv(filepath_or_buffer='data8277.csv', 
                header=0, 
                delimiter=',', 
                parse_dates=True,  
                encoding='utf8')

  df_pd = pd.read_csv(filepath_or_buffer='data8277.csv',


In [59]:
execution_time_pd = time.time() - start_time_pd
print(f"Time to fetch the csv file: {execution_time_pd}")

Time to fetch the csv file: 8.393279790878296


In [60]:
df_pd.head(5)

Unnamed: 0,Year,Age,Ethnic,Sex,Area,count
0,2018,0,1,1,1,795
1,2018,0,1,1,2,5067
2,2018,0,1,1,3,2229
3,2018,0,1,1,4,1356
4,2018,0,1,1,5,180


- findings
    - super fast to parse data 
        - polars: ~2 sec
        - pandas: ~10 sec
    - supports polars native data type, not external numpy based datatype
    - try `parse dates param` makes it very efficient to detect datetime related col
    - ignore errors param helps to prevent to break the code while retrieving data

In [61]:
df.glimpse()  # a snapshot of data

Rows: 34959672
Columns: 6
$ Year   <i64> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018
$ Age    <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ Ethnic <i64> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ Sex    <i64> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ Area   <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 12
$ count  <i32> 795, 5067, 2229, 1356, 180, 738, 630, 1188, 2157, 177



# 2. Basic data analysis

a. get specific columns

- select only cols
- basic calculations with selected cols

b. create dereived col

- to make a derive col from str input as condition use `lit`

c. filter

- basic filtereing
- range

d. sort

e. group by

f. combining DF

## 2.a: Selecting cols

- polars code

In [62]:
start_time_selecting_cols = time.time()

In [63]:
df_year_age = df.select(['Year', 'Age'])
df_year_age.head()

Year,Age
i64,i64
2018,0
2018,0
2018,0
2018,0
2018,0


In [64]:
df_year_age = df.select(
    pl.col('Year')
    , (pl.col('Age') * 1.0).alias('Age*1.0')
)
df_year_age.head()

Year,Age*1.0
i64,f64
2018,0.0
2018,0.0
2018,0.0
2018,0.0
2018,0.0


In [65]:
execution_time_for_selecting_cols = time.time() - start_time_selecting_cols
print(f"execution time for selecting cols: {execution_time_for_selecting_cols}")

execution time for selecting cols: 0.09650206565856934


- pandas code

In [66]:
start_time_selecting_cols_pd = time.time()

In [67]:
df_year_age_pd = df_pd[['Year', 'Age']]
df_year_age_pd.head()

Unnamed: 0,Year,Age
0,2018,0
1,2018,0
2,2018,0
3,2018,0
4,2018,0


In [68]:
df_year_age_pd["Age*1.0"] = df_pd['Age']*1.0
df_year_age_pd = df_year_age_pd[["Year", "Age*1.0"]]
df_year_age_pd.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_year_age_pd["Age*1.0"] = df_pd['Age']*1.0


Unnamed: 0,Year,Age*1.0
0,2018,0.0
1,2018,0.0
2,2018,0.0
3,2018,0.0
4,2018,0.0


In [69]:
execution_time_for_selecting_cols_pd = time.time() - start_time_selecting_cols_pd
print(f"execution time for selecting cols: {execution_time_for_selecting_cols_pd}")

execution time for selecting cols: 0.7340149879455566


- findings
    - time comparison 
        - polars: ~0.1 sec
        - pandas: ~0.7 sec

## 2.b: Derive col

- polars code

In [70]:
start_time_creating_derive_cols = time.time()

In [71]:
df_derive = df.with_columns(
    gender = pl.when(pl.col("Sex") == 1)
    .then(pl.lit('male'))
    .when(pl.col("Sex") == 2)
    .then(pl.lit('female'))
    .otherwise(pl.lit('others'))
    )

df_derive = df_derive.drop(['Sex'])
df_derive.head()

Year,Age,Ethnic,Area,count,gender
i64,i64,i64,i64,i32,str
2018,0,1,1,795,"""male"""
2018,0,1,2,5067,"""male"""
2018,0,1,3,2229,"""male"""
2018,0,1,4,1356,"""male"""
2018,0,1,5,180,"""male"""


In [72]:
execution_time_creating_derive_cols = time.time() - start_time_creating_derive_cols
print(f"time to create derive col: {execution_time_creating_derive_cols}")

time to create derive col: 0.20151972770690918


- pandas code

In [73]:
start_time_creating_derive_cols_pd = time.time()

In [74]:
conditions = [
    (df_pd['Sex'] == 1),
    (df_pd['Sex'] == 2)
    ]

# create a list of the values we want to assign for each condition
values = ['male', 'female']

# create a new column and use np.select to assign values to it using our lists as arguments
df_derive_pd = df_pd.copy()
df_derive_pd['gender'] = np.select(conditions, values, default='others')
df_derive_pd.drop(['Sex'], axis=1, inplace=True)

# display updated DataFrame
df_derive_pd.head()

Unnamed: 0,Year,Age,Ethnic,Area,count,gender
0,2018,0,1,1,795,male
1,2018,0,1,2,5067,male
2,2018,0,1,3,2229,male
3,2018,0,1,4,1356,male
4,2018,0,1,5,180,male


In [75]:
execution_time_creating_derive_cols_pd = time.time() - start_time
print(f"Time to create derive col: {execution_time_creating_derive_cols_pd}")

Time to create derive col: 20.06527304649353


- findings
    - time comparison 
        - polars: ~0.25 sec
        - pandas: ~25 sec
    - `pandas` a bit lengthy to code 

## 2.c: Filter

- polars code

In [76]:
start_time_for_filtering = time.time()

In [77]:
df_basic_filter = df_derive.filter(
        df_derive['Year'] < 2007
    )

df_basic_filter.head()

Year,Age,Ethnic,Area,count,gender
i64,i64,i64,i64,i32,str
2006,0,1,1,615,"""male"""
2006,0,1,2,5142,"""male"""
2006,0,1,3,1809,"""male"""
2006,0,1,4,1110,"""male"""
2006,0,1,5,168,"""male"""


In [78]:
df_basic_filter_range = df_derive.filter(
    df_derive['Year'].is_between(2006, 2013)  # upper limit inclusive
    )

df_basic_filter_range.head()

Year,Age,Ethnic,Area,count,gender
i64,i64,i64,i64,i32,str
2013,0,1,1,660,"""male"""
2013,0,1,2,5502,"""male"""
2013,0,1,3,1971,"""male"""
2013,0,1,4,1212,"""male"""
2013,0,1,5,168,"""male"""


In [79]:
execution_time_for_filtering = time.time() - start_time_for_filtering
print(f"time to filter data: {execution_time_for_filtering}")

time to filter data: 0.10926365852355957


- pandas code

In [80]:
start_time_for_filtering_pd = time.time()

In [81]:
df_basic_filter_pd = df_derive_pd[df_derive_pd["Year"] < 2007]
df_basic_filter_pd.head()

Unnamed: 0,Year,Age,Ethnic,Area,count,gender
23306448,2006,0,1,1,615,male
23306449,2006,0,1,2,5142,male
23306450,2006,0,1,3,1809,male
23306451,2006,0,1,4,1110,male
23306452,2006,0,1,5,168,male


In [82]:
df_basic_filter_range_pd = df_derive_pd[(df_derive_pd["Year"] >= 2006) & (df_derive_pd["Year"] <= 2013)]
df_basic_filter_range_pd.head()

Unnamed: 0,Year,Age,Ethnic,Area,count,gender
11653224,2013,0,1,1,660,male
11653225,2013,0,1,2,5502,male
11653226,2013,0,1,3,1971,male
11653227,2013,0,1,4,1212,male
11653228,2013,0,1,5,168,male


In [83]:
execution_time_for_filtering_pd = time.time() - start_time_for_filtering_pd
print(f"time to filter data: {execution_time_for_filtering_pd}")

time to filter data: 3.1859970092773438


- findings
    - time comparison 
        - polars: ~0.6 sec
        - pandas: ~3 sec

## 2.d: Sort

- polars code

In [84]:
start_time_for_sorting = time.time()

In [85]:
df_derive = df_derive.sort(by=["Year", "count"],
                            nulls_last=True)
df_derive.head()

Year,Age,Ethnic,Area,count,gender
i64,i64,i64,i64,i32,str
2006,0,5,5,0,"""male"""
2006,0,5,12,0,"""male"""
2006,0,5,15,0,"""male"""
2006,0,5,16,0,"""male"""
2006,0,5,1,0,"""female"""


In [86]:
execution_time_for_sorting = time.time() - start_time_for_sorting
print(f"Time to sort data: {execution_time_for_sorting}")

Time to sort data: 4.089326858520508


- pandas code

In [87]:
start_time_for_sorting_pd = time.time()

In [88]:
df_derive_pd = df_derive_pd.sort_values(by=['Year', 'count'])
df_derive_pd.head()

Unnamed: 0,Year,Age,Ethnic,Area,count,gender
23306483,2006,0,1,99,..C,female
23306521,2006,0,2,99,..C,male
23306540,2006,0,2,99,..C,female
23306578,2006,0,3,99,..C,male
23306597,2006,0,3,99,..C,female


In [89]:
execution_time_for_sorting_pd = time.time() - start_time_for_sorting_pd
print(f"Time to sort data: {execution_time_for_sorting_pd}")

Time to sort data: 5.11201024055481


- findings
    - time comparison 
        - polars: ~4 sec
        - pandas: ~5 sec

## 2.e: Group by

In [90]:
start_time_for_grouping = time.time()

In [91]:
df_year_wise_count = df_derive.group_by(
    ["Year"],
    maintain_order=True
    ).agg(
        pl.col("count"). \
        sum(). \
        alias('year_wise_total_count'),

        pl.col("count"). \
        mean(). \
        round(2). \
        alias('year_wise_avg_count'),

        pl.col('gender')
        )

df_year_wise_count.head()


Year,year_wise_total_count,year_wise_avg_count,gender
i64,i32,f64,list[str]
2006,1049590488,204.45,"[""male"", ""male"", … ""others""]"
2013,1072735711,202.39,"[""male"", ""male"", … ""others""]"
2018,1217611015,212.4,"[""male"", ""male"", … ""others""]"


In [92]:
execution_time_for_grouping = time.time() - start_time_for_grouping
print(f"time to aggregate data: {execution_time_for_grouping}")

time to aggregate data: 0.4487879276275635


- pandas code: `wip...`

## 2.f: combining DF

i. joining. [doc](https://docs.pola.rs/user-guide/transformations/joins/#quick-reference-table)

ii. concat

In [93]:
df2 = pl.DataFrame(
    {
        "Year": [2006, 2013, 2018, 2019],
    }
)

df2.head()

Year
i64
2006
2013
2018
2019


## 2.f.i: joining

- polars code

In [94]:
start_time_for_joining = time.time()

In [None]:
df_left_join = df2.join(df, 
                       on="Year",
                       how="left"). \
                        sort(by=["Year"], 
                             descending=True)

df_left_join.head()

Year,Age,Ethnic,Sex,Area,count
i64,i64,i64,i64,i64,i32
2019,,,,,
2018,0.0,1.0,1.0,1.0,795.0
2018,0.0,1.0,1.0,2.0,5067.0
2018,0.0,1.0,1.0,3.0,2229.0
2018,0.0,1.0,1.0,4.0,1356.0


: 

In [None]:
df_inner_join = df.join(df2, 
                       on="Year",
                       how="inner").sort(by=["Year"], descending=True)

df_inner_join.head()

In [None]:
execution_time_for_joining = time.time() - start_time_for_joining
print(f"Time to join data: {execution_time_for_joining}")

Time to join data: 4.048587083816528


- pandas code: `wip...`

- findings
    - despite having traditional joins, it has some extra join techniques like `semi`, `anti` like `PySpark`

## 2.f.ii: concatinating

In [None]:
df3 = pl.DataFrame(
    {
        "Year": [2020, 2021, 2022, 2023],
        "Age": [0,0,0,0],
        "Ethnic": [1, 2, 3, 4],
        "Sex": [1, 2, 1, 2], 
        "Area": [1, 2, 3, 4],
        "count": [1000, 2000, 3000, 4000]
    }
)

- polars code

In [None]:
start_time_for_concatinating = time.time()

In [None]:
df_concat = pl.concat([df, df3], 
                      how="vertical_relaxed")  # vertical_relaxed: best for datatype missmatched, Int32 -> Int64

df_concat.sort(by='Year', 
                      descending=True).head(5)

Year,Age,Ethnic,Sex,Area,count
i64,i64,i64,i64,i64,i64
2023,0,4,2,4,4000
2022,0,3,1,3,3000
2021,0,2,2,2,2000
2020,0,1,1,1,1000
2018,0,1,1,1,795


In [None]:
execution_time_for_concatinating = time.time() - start_time_for_concatinating
print(f"time to concat data: {execution_time_for_concatinating}")

time to concat data: 1.4922740459442139


- pandas code: `wip...`