In [None]:
import numpy as np

In [None]:
import polars as pl

In [None]:
pl.__version__

In [None]:
# should be the raw download page link
!wget https://raw.githubusercontent.com/mattharrison/2023-Pycon-Polars/refs/heads/main/__mharrison__2020-2021.csv

In [None]:
file = '__mharrison__2020-2021.csv'
df = pl.read_csv(file)

# or read it directly from url
url = 'https://raw.githubusercontent.com/mattharrison/2023-Pycon-Polars/refs/heads/main/__mharrison__2020-2021.csv'
df = pl.read_csv(url)

In [None]:
df

In [None]:
df.schema # sees column names and types

In [None]:
df.describe() # some metadata about your dataframe


In [None]:
df.quantile(0.25) # get the row at the 25% quantile

In [None]:
df[0]

In [None]:
print(dir(pl)) # see things available in pl package

In [None]:
print(type(pl.col('foo'))) # there is a notion of columns in polars. type of pl.col('foo') is polars.expr.expr.Expr

# You can do many things with expressions we will see later but lets just print the dir of the expressions.
print(dir(pl.col('foo'))) # see things available in polars columns


In [None]:
df.index # no index in polars by default. need to use with_row_count()

In [None]:
df.columns

In [None]:
df.sample(20).transpose() # just a way of seeing some initial data.

In [None]:
df.dtypes # these are pyarrow types

In [None]:
df.estimated_size() # rough bytes

In [None]:
df[1:5] # print rows 1, 4

In [None]:
# Prefer to use Expressions instead of working directly on dataframe
# use pl.all() instead of pl.col('*')

# so polars has an apache arrow at its core. followed by a rust layer on top, then followed by a python layer.
# as long as possible, if you stick with Expressions, you remain at the rust layer and get speed benefits. 

# if you use the python later, yes you get some flexibility but it comes at the expense of speed.

# df.select() is used to choose columns (you can give it multiple expressions as positional arguments, or even keyword expressions as 
# keyword arguments. the keys you use become the name(alias) for your columns
df.select(pl.all()) # get all columns.

In [None]:
df.select(pl.col('Tweet id')) # get all columns.

In [None]:
# pl.col/pl.all is very flexible. Think of it as a where clause in a select statement.
# you can actually select data types i.e. pl.col(pl.Float64) will return all f64 cols
# exclude coloumns using exclude function, apply regex also on columns filtering. See examples below
df.select(pl.all().exclude('Tweet id')) # select and where clause

# pl.exclude is just a syntactic sugar over pl.all().exclude()
df.select(pl.exclude("^Tweet")) # remove all columns that begin with Tweet

df.select(pl.exclude([pl.Float64])) # exclude Float64 columns. exclude can take a list

df.select(["impressions"]).head()

In [None]:
# df.select(['impressions', 'impressions']) # polars doesnt allow duplicate columns

# Workaround
df.select(['impressions', pl.col('impressions').alias('impressions_2')]) # polars doesnt allow it

In [None]:
# select a column where all cell values are 0 if impressions is less than 100, or else 1
# df.select(threshold=pl.pl.cols('impressions') > 100 #incomplete

In [None]:
# note that most operations return a copy of the dataframe and do not modify the original dataframe
# with_columns(pl.lit('Q0).alias('Describe')) # with_columns means add a new column, pl.lit('Q0') means add a literal column, i.e. a column with Q0 as data.
# and rename it to Describe
df.quantile(0).with_columns(pl.lit('Q0').alias('Describe')).head()

In [None]:
# add describe column at the beginning instead

# DeprecationWarning: `NUMERIC_DTYPES` was deprecated in version 1.0.0. Define your own data type groups or use the `polars.selectors` module
# for selecting columns of a certain data type.
df = df.select([
    pl.lit('Q1').alias('Describe'),
    pl.col(pl.NUMERIC_DTYPES).quantile(0)
])
df

In [None]:
# Same as above but using selectors
import polars.selectors as cs

df = df.select([
    pl.lit('Q0').alias('Describe'),
    cs.by_dtype(pl.Float64).quantile(0)
])
df

In [None]:
# Look at df.describe() command. We want to add a new statistic of Q0, Q0.25, Q0.5, Q0.75, Q1
df.describe() # may be initialize the dataframe again and then see the clean output.


In [None]:
# df.quantile(0).with_columns(pl.lit(f'Q0').alias('Describe')).select(pl.col('Describe'), pl.exclude('Describe'))
# debug statement
  

# how we can do that. we can use pl.concat to concat multiple dataframes.
# lets fetch the Q0, Q0.25, Q0.5, Q0.75, Q1 quantiles rows and a new column "Describe" (at the front) with values Q0, Q0.25, Q0.5, Q0.75, Q1 
# then lets fetch the df.describe() dataframe and just contact both. Easy peasy!!.
(pl
  .concat(
    [ 
        # 1st dataframe
        *[df
        .quantile(val)
        .with_columns(pl.lit(f'Q{val}').alias('statistic')) # problem is that Describe will be added at the end. We need to move it to the beginning
        .select(pl.col('statistic'), pl.exclude('statistic'))
        for val in [0, 0.25, 0.5, 0.75, 1]
        ],
        # 6th dataframe (1st to 5th dataframes are for the quantiles
        df.describe() 
    ],
    how='vertical'
  )
)

In [None]:
# lets just put this into a function and call dataframe.pipe(myfunction)

def pd_describe(a_df: pl.DataFrame):
    cols = ['statistic', *a_df.columns]
    return (pl
        .concat(
            [ 
                # 1st dataframe
                *[a_df
                .quantile(val)
                .with_columns(pl.lit(f'Q{val}').alias('statistic')) # problem is that Describe will be added at the end. We need to move it to the beginning
                .select(cols)
                for val in [0, 0.25, 0.5, 0.75, 1]
                ],
                # 6th dataframe (1st to 5th dataframes are for the quantiles
                a_df.describe() 
            ],
            how='vertical'
          )
     )

In [None]:
# df.pipe offers a structured way to apply a sequence of user-defined functions (UDFs).
df.pipe(pd_describe) # pd_describe this df as the a_df argument

# your pd_describe function is a very handy way to look at statistics for any partial data in your data frame.
# you could select just all the I64 columns and call pipe on it and it will give you statistics for that partial data. See example below


In [None]:
# df.pipe offers a structured way to apply a sequence of user-defined functions (UDFs).
df.select(pl.col(pl.Int64)).pipe(pd_describe)

In [None]:
# casting
df.select(pl.col('impressions').cast(pl.Int32))

In [None]:
# stick the new Int32 impressinos column back in the main dataframe
df.with_columns(pl.col('impressions').cast(pl.Int32))

In [111]:
import numpy as np

In [112]:
np.iinfo(np.uint32)

iinfo(min=0, max=4294967295, dtype=uint32)

In [120]:
df.select(cs.numeric()).columns

['Tweet id',
 'impressions',
 'engagements',
 'engagement rate',
 'retweets',
 'replies',
 'likes',
 'user profile clicks',
 'url clicks',
 'hashtag clicks',
 'detail expands',
 'permalink clicks',
 'app opens',
 'app installs',
 'follows',
 'email tweet',
 'dial phone',
 'media views',
 'media engagements']

In [121]:
## 
(df
 .select(cs.numeric())  # take all the numeric cols
 .pipe(pd_describe) # do a describe on that
 .select([ # select numeric cols where max value is <= 255
     pl.col(col)
       for col in df.select(cs.numeric()).columns
     if df[col].max() <= 255 
 ])
) 

engagement rate,replies,hashtag clicks,permalink clicks,app opens,app installs,follows,email tweet,dial phone
f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.007064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.016043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.040902,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.484127,207.0,12.0,0.0,3.0,0.0,191.0,0.0,0.0
…,…,…,…,…,…,…,…,…
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.007064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.016043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.040902,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Laziness
# in polars, you can actually create a chain of User defined operations on your dataframe and then hand it together to polars
# polars will then analyze it, optimize it and run it as one big operations instead of running many smaller un-optimized operations