In [None]:
import numpy as np

In [None]:
import polars as pl

In [None]:
pl.__version__  # show the installed polars version

In [None]:
# Use the raw-content URL (raw.githubusercontent.com) rather than the GitHub HTML page, so wget saves the CSV itself.
!wget https://raw.githubusercontent.com/mattharrison/2023-Pycon-Polars/refs/heads/main/__mharrison__2020-2021.csv

In [None]:
# Option 1: read the CSV from a local file in the working directory.
file = '__mharrison__2020-2021.csv'
df = pl.read_csv(file)

# Option 2: read it directly from the URL.
# This rebinds `df`, so the frame loaded from the local file above is replaced.
url = 'https://raw.githubusercontent.com/mattharrison/2023-Pycon-Polars/refs/heads/main/__mharrison__2020-2021.csv'
df = pl.read_csv(url)

In [None]:
df  # display the loaded DataFrame

In [None]:
df.schema # column names together with their data types

In [None]:
df.describe() # summary statistics for the dataframe. Not every statistic will matter to you, but it does tell
# you the number of nulls in each column


In [None]:
df.quantile(0.25) # the 25% quantile of each column, returned as a dataframe

In [None]:
print(f'first row\n {df[0]}') # row at index 0 (the first row)
print(f'second row\n {df[1]}') # row at index 1 (the second row)
print(f'row 1-4\n {df[1:5]}') # rows at index 1 through 4 (the slice end, 5, is exclusive)

In [None]:
print(dir(pl)) # list the names available in the pl (polars) package

In [None]:
# Polars is written from the ground up: underneath the Python code in polars is a Rust layer.
# To get the speed and performance benefits, polars introduced "expressions", i.e. polars.expr.expr.Expr. An expression is just a way to describe
# what you intend to do. It holds a piece of your execution plan; expressions don't hold any data.
# Expressions therefore enable deferred computation — they are a recipe for how to compute something on a DataFrame or Series.
# You can think of one as a lazy formula that polars will evaluate later, when needed. Polars uses expressions to build efficient,
# vectorized, and parallel query plans — much like SQL or Spark.

# These expressions can be evaluated either eagerly or lazily. I will show later how we can create lazy data frames and inspect our evaluation
# plan using APIs such as "describe_plan(), describe_optimized_plan(), and explain()".

# If you drop down to the Python layer instead, you do get some flexibility, but it comes at the expense of speed.

print(type(pl.col('foo'))) # pl.col('foo') is a polars.expr.expr.Expr

# You can do many things with expressions, as we will see later; for now let's just print the dir of an expression.
print(dir(pl.col('foo'))) # list the attributes and methods available on an expression


In [None]:
# polars DataFrames have no index, so `df.index` raises AttributeError.
# When a row number is needed, add it as an explicit column with with_row_index()
# (the replacement for the deprecated with_row_count()).
df.with_row_index().head()

In [None]:
df.columns  # list of the column names

In [None]:
df.sample(20).transpose() # 20 sampled rows, transposed; just a quick way of eyeballing some of the data.

In [None]:
df.dtypes # the data type of each column; these are polars data types (e.g. pl.Float64), not pyarrow type objects

In [None]:
df.estimated_size() # rough size of the dataframe in bytes

In [None]:
df[1:5] # rows at index 1 through 4 (the slice end, 5, is exclusive)

In [None]:
# Prefer expressions over working directly on the dataframe.
# Use pl.all() instead of pl.col('*').

# Mostly you will be using 4 common APIs in polars; we will see them in action next:
# select(), with_columns(), filter(), group_by().agg()

# df.select() is used to choose columns. You can give it multiple expressions as positional arguments, or expressions as
# keyword arguments; in that case the keys you use become the names (aliases) of your columns.
df.select(pl.all()) # get all columns.

In [None]:
df.select(pl.col('Tweet id').alias('Rajat')).head() # select only the 'Tweet id' column and rename it to 'Rajat' in the result

In [None]:
# Passing a dtype to pl.col selects every column of that type: here, all Float64 columns (first 5 rows shown).
df.select(pl.col(pl.Float64)).head(5)

In [None]:
# pl.col/pl.all is very flexible. Think of it as a where clause applied to the columns of a select statement.
# Columns can be excluded by name, by regex, or by dtype. See the examples below.
# Each example is printed explicitly because a notebook cell only displays its last expression.

# exclude a single column by name
print(df.select(pl.all().exclude('Tweet id')).head())

# pl.exclude is just syntactic sugar over pl.all().exclude().
# polars only interprets a string as a regex when it starts with ^ and ends with $,
# so the pattern has to be fully anchored; a bare "^Tweet" would be taken as a literal column name.
print(df.select(pl.exclude("^Tweet.*$")).head()) # remove all columns whose name begins with Tweet

# exclude also accepts dtypes, or a list of them
print(df.select(pl.exclude([pl.Float64])).head()) # exclude the Float64 columns

df.select(["impressions"]).head()

In [None]:
# df.select(['impressions', 'impressions']) # polars doesn't allow duplicate column names

# Workaround: give the second copy its own name
df.select(['impressions', pl.col('impressions').alias('impressions_2')]) # works, because the alias makes the two names distinct

In [None]:
# Build a single 'threshold' column: 0 where impressions is below 100, otherwise 1.
is_low_impressions = pl.col('impressions') < 100
expr = pl.when(is_low_impressions).then(0).otherwise(1)
new_df = df.select(threshold=expr)  # a keyword argument names the resulting column
print(new_df)
print(new_df.unique())  # the distinct values that actually occur

In [None]:
# Renaming and replacing columns is really trivial.

# renaming - just use the df.rename() method
test_df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ['a', 'b', 'c']})
print(test_df)
print(test_df.rename({'foo': 'apple'})) # a dict mapping old names to new names
print(test_df.rename(lambda x : "c" + x[1:])) # a function applied to every column name: here the first character is replaced by "c"

# replacing
apples = pl.Series("apple", [10, 20, 30])
# NOTE(review): the polars docs describe replace_column as an in-place operation, so test_df itself may be modified here — confirm
print(test_df.replace_column(0, apples)) # replace the column at position 0

In [None]:
# Note that most operations return a new dataframe and do not modify the original dataframe.
# with_columns(pl.lit('Q0').alias('Describe')): with_columns adds a new column, pl.lit('Q0') is a literal column, i.e. a column with 'Q0' as its data,
# and alias names that column Describe.
df.quantile(0).with_columns(pl.lit('Q0').alias('Describe')).head()

In [None]:
# add describe column at the beginning instead

# DeprecationWarning: `NUMERIC_DTYPES` was deprecated in version 1.0.0. Define your own data type groups or use the `polars.selectors` module
# for selecting columns of a certain data type.

# Bind the result to its own name. Reassigning `df` here would replace the loaded data with this
# one-row summary, and every later cell that uses `df` would then operate on the wrong frame.
q0_numeric_df = df.select([
    pl.lit('Q0').alias('Describe'),  # quantile(0) is the minimum, so the row label is Q0
    pl.col(pl.NUMERIC_DTYPES).quantile(0)
])
q0_numeric_df

In [None]:
# Same as above but using selectors
# selectors allow for more intuitive selection of columns from DataFrame or LazyFrame. They extend on the pl.col() functionality.
import polars.selectors as cs

# Keep `df` intact: bind the one-row summary to its own name instead of overwriting the loaded data.
q0_selector_df = df.select([
    pl.lit('Q0').alias('Describe'),
    cs.numeric().quantile(0)  # cs.numeric() selects all numeric columns, the selector counterpart of pl.NUMERIC_DTYPES
])
q0_selector_df

In [None]:
# Look at the df.describe() output. We want to add new statistics to it: Q0, Q0.25, Q0.5, Q0.75, Q1
df.describe() # if `df` was reassigned by an earlier cell, reload it first to see the clean output for the full dataset


In [None]:
# df.quantile(0).with_columns(pl.lit(f'Q0').alias('Describe')).select(pl.col('Describe'), pl.exclude('Describe'))
# ^ debug statement for a single quantile row

# How can we do that? pl.concat can stack multiple dataframes on top of each other.
# 1. Build one row per quantile (Q0, Q0.25, Q0.5, Q0.75, Q1), each with a leading "statistic" column holding its label.
# 2. Fetch the df.describe() dataframe and put it underneath them. Easy peasy!!

# quantile(val) is expected to line up with the df.describe() output once the "statistic" column is in front,
# which is what allows the frames to be concatenated vertically.
quantile_levels = [0, 0.25, 0.5, 0.75, 1]
quantile_rows = [
    (df
     .quantile(val)
     .with_columns(pl.lit(f'Q{val}').alias('statistic'))    # with_columns appends the label column at the end ...
     .select(pl.col('statistic'), pl.exclude('statistic'))  # ... so re-select to move it to the beginning
    )
    for val in quantile_levels
]

# frames 1-5 are the quantile rows; the 6th frame is the describe() output
pl.concat([*quantile_rows, df.describe()], how='vertical')

In [None]:
# lets just put this into a function and call dataframe.pipe(myfunction)

def pd_describe(a_df: pl.DataFrame, *args, **kwargs):
    """Return the quantile rows Q0, Q0.25, Q0.5, Q0.75 and Q1 stacked on top of ``a_df.describe()``.

    Each quantile row gets a leading ``statistic`` column holding its label, so the
    rows can be concatenated vertically with the ``describe()`` output.

    Args:
        a_df: the dataframe to summarise.
        *args, **kwargs: accepted so the function can be called through
            ``DataFrame.pipe`` with extra arguments; they are not used.

    Returns:
        The five quantile rows followed by the rows of ``a_df.describe()``.
    """
    # 'statistic' first, then the original columns in their original order
    cols = ['statistic'] + list(a_df.columns)

    frames = []
    for val in (0, 0.25, 0.5, 0.75, 1):
        # with_columns appends the label at the end; select(cols) moves it to the front
        labelled = a_df.quantile(val).with_columns(pl.lit(f'Q{val}').alias('statistic'))
        frames.append(labelled.select(cols))

    # the 6th frame: the regular describe() output
    frames.append(a_df.describe())
    return pl.concat(frames, how='vertical')

In [None]:
# df.pipe offers a structured way to apply a sequence of user-defined functions (UDFs).
df.pipe(pd_describe, 1, 2, 3, name='Rajat') # pd_describe receives df as the a_df argument. The remaining arguments 1, 2, 3 and name are passed on as *args and **kwargs

# The pd_describe function is a very handy way to look at statistics for any part of the data in your dataframe.
# You could select just the columns of one dtype, call pipe on that, and get statistics for only that part of the data. See the example below.


In [None]:
# Apply pd_describe through pipe() to a subset of the data: only the Float64 columns.
df.select(pl.col(pl.Float64)).pipe(pd_describe)

In [None]:
# Just like you can call pipe() on a dataframe, you can call pipe() on a pl.Expr object.
# And just as DataFrame.pipe() returns a DataFrame, calling pipe() on a pl.Expr should return another pl.Expr.

# Let's say you have a DataFrame with one column whose values are "a: 1", "b: -2", "c: 3", "d: -4".
# Try to write pipe-based functions which extract the integer (its digits, ignoring the sign) and create a new column based on it:
# if the number is even, change it to negative, otherwise leave it untouched; then multiply it by 5. So the result should be 5, -10, 15, -20.

# We will create two user-defined functions, each of which takes a pl.Expr and returns a pl.Expr, to achieve the above.
temp_df = pl.DataFrame({'val': ["a: 1", "b: -2", "c: 3", "d: -4"]})
print(temp_df)  # printed explicitly: a bare `temp_df` in the middle of a cell shows nothing, only a cell's last expression is displayed

def extract_int_from_str(col: pl.Expr) -> pl.Expr:
    """Extract the first run of digits from a string expression and cast it to Int64.

    The pattern matches digits only, so a leading minus sign is not part of the
    extracted value.
    """
    digits = col.str.extract(r"\d+", 0)  # group 0 is the whole match
    return digits.cast(pl.Int64)

def scale_negative_even(expr: pl.Expr, k: int=5) -> pl.Expr:
    """Negate the even values of ``expr``, leave the odd ones untouched, then multiply everything by ``k``."""
    is_even = (expr % 2) == 0
    sign_adjusted = pl.when(is_even).then(-expr).otherwise(expr)
    return sign_adjusted * k

# Extra arguments given to pipe() are forwarded to the function. k=5 is passed so the output matches
# the expected result stated above (5, -10, 15, -20); the previous value of 2 produced 2, -4, 6, -8.
temp_df.with_columns(udfs=pl.col('val').pipe(extract_int_from_str).pipe(scale_negative_even, k=5))


In [None]:
# casting: select the 'impressions' column converted to Int32
df.select(pl.col('impressions').cast(pl.Int32))


In [None]:
# Put the new Int32 impressions column back alongside the other columns of the main dataframe.
# The result is not assigned to anything here, so `df` itself is left unchanged.
df.with_columns(pl.col('impressions').cast(pl.Int32))

In [None]:
import numpy as np

In [None]:
np.iinfo(np.int64)  # machine limits (min / max) of the 64-bit integer type

In [None]:
df.select(cs.numeric()).columns # names of the numeric columns; cs.numeric() means all integer and floating types.

In [None]:
# Describe only the "small" numeric columns, i.e. those whose maximum value is <= 255.
numeric_cols = df.select(cs.numeric()).columns  # names of all the numeric columns
max_by_col = {col: df[col].max() for col in numeric_cols}
small_cols = [
    col
    for col, col_max in max_by_col.items()
    # max() is None for a column with no non-null values; skip it instead of raising TypeError on `None <= 255`
    if col_max is not None and col_max <= 255
]

(df
 .select(cs.numeric())  # take all the numeric cols
 .pipe(pd_describe)     # do a describe on that
 .select(['statistic', *small_cols])  # keep the row labels next to the selected columns so each row stays identifiable
)

In [None]:
df.select(pl.col('Tweet id')).head(3)  # first 3 values of the 'Tweet id' column

In [None]:
# Laziness
# In polars, you can create a chain of operations on your dataframe and then hand the whole chain to polars at once.
# Polars will then analyze it, optimize it and run it as one big operation instead of running many smaller, un-optimized operations.

lf = df.lazy() # convert your dataframe to a lazy dataframe
res = lf.with_columns((pl.col("Tweet id") + 10).alias('Tweet id + 10')) # this won't return any data until you call collect() on it.
print(type(res))


In [None]:
# Select and slice on the LazyFrame *before* calling collect(), so the optimizer only has to
# materialize the two requested columns and the first three rows instead of the whole frame.
res.select(['Tweet id', 'Tweet id + 10']).head(3).collect()

In [None]:
# Small example frame with two integer columns.
temp_df = pl.DataFrame({
    "x": [1, 2, 3],
    "y": [10, 20, 30]
})
print(temp_df)
# Goal: create a new column which is the square of the element in the first column plus the value in the second column
def squared(expr: pl.Expr) -> pl.Expr:
    """Return the expression multiplied by itself."""
    product = expr * expr
    return product

# 1st way: use the pipe() API with a native expression
print(temp_df.with_columns((pl.col('x').pipe(squared) + pl.col('y')).alias('new_value')))

# 2nd way: use map_elements(), which takes a Python function (here a lambda) and calls it once per element.
# Expr.map_elements is significantly slower than the native expressions API above.
# Only use it if you absolutely CANNOT implement your logic otherwise.
# return_dtype states the output type up front, so polars does not have to infer it (and does not warn about it).
print(temp_df.with_columns((pl.col('x').map_elements(lambda c : c**2, return_dtype=pl.Int64) + pl.col('y')).alias('new_value')))

In [None]:
# How do group_by() and agg() work?
temp_df = pl.DataFrame({
    "team": ["A", "A", "B", "B"],
    "score": [5, 7, 6, 8]
})

# Group by "team" and calculate the total score: agg() evaluates the expression once per group.
# group_by() does not guarantee the order of the groups, so sort by the key for a reproducible output.
print(temp_df.group_by('team').agg(pl.col('score').sum()).sort('team'))

# Group by "team" and calculate the difference between the max and min scores.
# Native expressions are used here rather than a Python lambda.
print(temp_df.group_by('team').agg((pl.col('score').max() - pl.col('score').min()).alias('score_range')).sort('team'))

# Group by "team", collect each team's scores into a list, and also calculate their mean.
print(temp_df.group_by('team').agg([pl.col('score').alias('all_scores'), pl.mean('score').alias('scores_mean')]).sort(by=pl.col('team')))

In [None]:
# The value of column "x" is itself a dict with the keys "a" and "b", rather than a list of values.
temp_df = pl.DataFrame({"x": {"a": 1, "b": 2}})
print(temp_df)

In [None]:
# What is the above? You can create a DataFrame or Series whose data type is a struct. A struct contains fields. Each field is described by a pl.Field() object,
# which has a name and an associated data type. Let's create a Struct type using a list of fields, or a map/dict of field names and types.
from typing import List
import datetime
s1 = pl.Struct([pl.Field('a', pl.Int64), pl.Field('b', pl.Datetime), pl.Field('c', pl.List(pl.String))])
s2 = pl.Struct({'a': pl.Int64, 'b':pl.Datetime, 'c': pl.List(pl.String)})
# s1 and s2 describe the same struct type
print(type(s1))

# A Series of two structs; each dict supplies the values for the fields 'a', 'b' and 'c' declared in s1.
# The first 'b' value is the current time, so it differs on every run.
temp_series = pl.Series([
                         {'a': 1, 'b': datetime.datetime.now(), 'c': ['Rajat', 'Vidhu']},
                         {'a': 2, 'b': datetime.datetime(2025, 10, 10), 'c': ['Raman', 'Aashish']}
                        ], dtype=s1)
print(temp_series)

In [None]:
# Wrap the struct Series in a DataFrame and print it.
temp_df = pl.DataFrame(temp_series)
print(temp_df)