## Introduction
This notebook follows a Polars tutorial by Martin Bel, published as a [playlist](https://www.youtube.com/playlist?list=PLo9Vi5B84_dfAuwJqNYG4XhZMrGTF3sBx) on his YouTube channel.

Here is a link to the [GitHub repo](https://github.com/martinbel/polars-tutorial) for the tutorial.

## Environment

In [None]:
import polars as pl
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show the installed Polars version so the outputs below can be tied to a release.
pl.__version__

## Read 9GB CSV file

In [None]:
# Load data/2019-Nov.csv into a Polars DataFrame used by every later cell.
# NOTE(review): the section heading describes this file as 9GB and read_csv
# is an eager read — confirm the machine has enough memory for it.
df_pl = pl.read_csv("data/2019-Nov.csv")

In [None]:
# Dimensions of the loaded frame.
df_pl.shape

In [None]:
# Preview the first rows of the frame.
df_pl.head()

In [None]:
# Data type of each column as stored in the frame.
df_pl.dtypes

In [None]:
# Same preview as above, converted to a pandas DataFrame.
df_pl.head().to_pandas()

## Selecting and Filtering Data

In [None]:
# Index-style access: row 0, all columns.
df_pl[0,:]

In [None]:
# Index-style access: all rows, restricted to the two named columns.
df_pl[:, ['event_time', 'price']].head()

In [None]:
# Keep only rows whose price is greater than 1000, then preview them.
df_pl.filter(pl.col("price") > 1000).head()

In [None]:
# Draw 5 rows; no seed is passed, so expect different rows on each run.
df_pl.sample(5)

In [None]:
# Project the frame down to the brand and price columns, then preview.
df_pl.select(['brand', 'price']).head()

In [None]:
# Add a derived column holding price multiplied by 100, then preview.
df_pl.with_columns(
    (pl.col("price") * 100).alias("price_x_100"),
).head()

In [None]:
# Keep rows whose brand is one of the three listed values, then preview.
df_pl.filter(pl.col("brand").is_in(["apple", "samsung", "motorola"])).head()

## Computations in the select context

In [None]:
# Number of distinct product_id values in the frame.
df_pl.select(pl.col("product_id").n_unique())

In [None]:
# Summary statistics of price, one aliased output column per statistic.
price_col = pl.col("price")
df_pl.select([
    price_col.min().alias("min"),
    price_col.mean().alias("mean"),
    price_col.median().alias("median"),
    price_col.max().alias("max"),
    price_col.std().alias("std_dev"),
])

In [None]:
# Built-in descriptive summary of the price column alone.
df_pl.select("price").describe()

In [None]:
# Single-column frame holding only price; reused by the cells that follow.
price = df_pl.select("price")

In [None]:
# Histogram of a 100,000-row sample of price, drawn through pandas.
# The trailing semicolon keeps the notebook from echoing the return value.
price.sample(100_000).to_pandas().hist(bins=30);

In [None]:
# Quantile levels to evaluate on the price frame.
probs = [0, 0.25, 0.5, 0.75, 1]
# [0, 0] extracts the single value from each quantile result.
percentiles = [
    price.quantile(level)[0, 0]
    for level in probs
]

In [None]:
# Display the price values computed for each level in probs.
percentiles

In [None]:
# Tabulate each quantile level next to the price value computed for it.
pd.DataFrame({"probs": probs, "percentiles": percentiles})

## How to update columns and an intro to window functions

In [None]:
# Small head-of-frame copy used to demonstrate column updates cheaply.
df_top = df_pl.head()

In [None]:
# Add brand2: the brand value prefixed with the literal text "brand-".
df_top.with_columns(
    (pl.lit("brand-") + pl.col("brand")).alias("brand2"),
)

In [None]:
# Add two derived columns in one call: a prefixed brand and price times 100.
brand_prefixed = pl.lit("brand-") + pl.col("brand")
price_scaled = pl.col("price") * 100
df_top.with_columns([
    brand_prefixed.alias("brand2"),
    price_scaled.alias("price2"),
])

In [None]:
# Select every existing column plus the overall mean of price as price_avg.
# NOTE(review): presumably the single mean value is repeated on every row
# of the result — confirm against the cell output.
df_top.select([
    pl.all(),
    pl.col("price").mean().alias("price_avg")
])

In [None]:
# Window function: attach the mean price of each row's category_code group,
# then preview the first 10 rows.
df_pl.with_columns(
    pl.col("price").mean().over("category_code").alias("price_by_categorycode"),
).head(10)

In [None]:
# Mean price within each category_code, reused by both derived columns below.
cat_price_mean = pl.col("price").mean().over("category_code")

df_cat_window = df_pl.select([
    pl.col("category_code"),
    pl.col("price"),
    cat_price_mean.alias("price_by_category"),
    # Relative deviation from the category mean (price / mean - 1).
    (pl.col("price") / cat_price_mean - 1).alias("price_div_cat_average"),
])
df_cat_window.head()

In [None]:
# Take a 100,000-row sample of the windowed frame and hand it to pandas
# for the exploratory cells that follow.
df_cat_window_sample = df_cat_window.sample(100_000).to_pandas()

In [None]:
# pandas descriptive summary of the sampled frame.
df_cat_window_sample.describe()

In [None]:
# Rows where price / category mean - 1 is exactly -1.0.
is_minus_one = df_cat_window_sample["price_div_cat_average"] == -1.0
df_cat_window_sample[is_minus_one]

In [None]:
# Histogram of the relative deviation, with the x-axis limited to [-2, 5].
df_cat_window_sample["price_div_cat_average"].hist(bins=30, range=(-2, 5))

In [None]:
# Deciles (0.0, 0.1, ..., 1.0) of the relative deviation from the category mean.
# linspace pins both the number of levels and the exact 1.0 end point, whereas
# a float-step arange with a padded stop relies on rounding to stay within
# the [0, 1] range that quantile() accepts.
df_cat_window_sample.price_div_cat_average.quantile(np.linspace(0, 1, 11))