In [None]:
# !pip install polars

In [None]:
# Report which Polars release is installed in this kernel.
import polars
polars.__version__

In [None]:
import polars as pl

# Eight-row demo table of car models, built column by column from a dict.
df = pl.DataFrame({
    'Model': [
        'Camry', 'Corolla', 'RAV4',
        'Mustang', 'F-150', 'Escape',
        'Golf', 'Tiguan',
    ],
    'Year': [1982, 1966, 1994, 1964, 1975, 2000, 1974, 2007],
    'Engine_Min': [2.5, 1.8, 2.0, 2.3, 2.7, 1.5, 1.0, 1.4],
    'Engine_Max': [3.5, 2.0, 2.5, 5.0, 5.0, 2.5, 2.0, 2.0],
    'AWD': [False, False, True, False, True, True, True, True],
    'Company': [
        'Toyota', 'Toyota', 'Toyota',
        'Ford', 'Ford', 'Ford',
        'Volkswagen', 'Volkswagen',
    ],
})
df  # last expression of the cell, so the notebook renders the frame

In [None]:
# Data type of each column, in column order.
df.dtypes

In [None]:
# Column names, in column order.
df.columns

In [None]:
# All rows of the frame, one tuple of values per row.
df.rows()

# Selecting Columns

In [None]:
# Select one column by name; select() returns a DataFrame, not a Series.
df.select(
    'Model'
)

In [None]:
# Square-bracket indexing by column name returns a Series instead of a DataFrame.
df['Model']    # not recommended; anti-pattern

In [None]:
# Select several columns at once by passing a list of names.
df.select(
    ['Model','Company']
)

In [None]:
# get all columns of type String (this dtype was named Utf8 in older Polars releases)
df.select(
    pl.col(pl.String)
)

In [None]:
# chaining expressions: pick three columns, then order their rows by
# Engine_Max ascending and, within equal Engine_Max, by Year descending
df.select(
    pl.col(['Year','Model','Engine_Max'])
    .sort_by(['Engine_Max','Year'],descending = [False,True])
)

In [None]:
# An expression (all String columns) and a plain column name mixed in one list.
df.select(
    [pl.col(pl.String), 'Year']
)

# Selecting Rows

In [None]:
df.row(0)  # get the first row (index 0) as a tuple of values

In [None]:
df[1:3]  # returns the rows at index 1 and 2 (second and third rows); the stop index 3 is excluded

In [None]:
# Keep only the rows whose Company equals 'Toyota'.
df.filter(
    pl.col('Company') == 'Toyota'
)

In [None]:
# OR two conditions with |; each comparison sits in its own parentheses
# because | binds more tightly than ==.
df.filter(
    (pl.col('Company') == 'Toyota') | (pl.col('Company') == 'Ford')
)

# Parentheses are important: `&` and `|` bind more tightly than `==` and `>`, so wrap each comparison

In [None]:
# AND two conditions with &; each comparison sits in its own parentheses
# because & binds more tightly than == and >.
df.filter(
    (pl.col('Company') == 'Toyota') & (pl.col('Year') > 1980)
)

In [None]:
# ~ negates the condition: keep the rows whose Company is not 'Toyota'.
df.filter(
    ~(pl.col('Company') == 'Toyota')
)

In [None]:
# Keep the rows whose Company differs from 'Toyota', using the != operator.
df.filter(pl.col('Company') != 'Toyota')

# Selecting Rows and Columns

In [None]:
# Rows first, columns second: Toyota rows only, then just the Model column.
(
    df
    .filter(pl.col('Company') == 'Toyota')
    .select('Model')
)

In [None]:
# Toyota rows only, reduced to the Model and Year columns.
(
    df
    .filter(pl.col('Company') == 'Toyota')
    .select(['Model', 'Year'])
)

In [None]:
# Toyota rows only; '*' selects every column.
(
    df
    .filter(pl.col('Company') == 'Toyota')
    .select('*')
)

In [None]:
# Register the DataFrame under the SQL table name "cars" so it can be queried with SQL.
ctx = pl.SQLContext(cars = df)
# eager=True makes execute() return a DataFrame rather than a LazyFrame.
ctx.execute("SELECT * FROM cars", eager=True)

In [None]:
# Average of Engine_Min and of Engine_Max for each Company, computed in SQL.
ctx.execute('''
SELECT Company,
  AVG(Engine_Min) AS avg_engine_min,
  AVG(Engine_Max) AS avg_engine_max
FROM cars
GROUP BY Company;
''', eager=True)

In [None]:
import pandas as pd
# Load the flights CSV with pandas.
# Note: this rebinds `df`, which until now referred to the Polars car table.
df = pd.read_csv('flights.csv')
df

In [None]:
# pandas version of the query: load the file, then keep only the flights
# from SFO to SEA in month 5.
df = (
    pd.read_csv('flights.csv')
    .loc[lambda flights: (flights['MONTH'] == 5)
         & (flights['ORIGIN_AIRPORT'] == 'SFO')
         & (flights['DESTINATION_AIRPORT'] == 'SEA')]
)
df

In [None]:
# for mac (Homebrew): installs Graphviz
# NOTE(review): presumably required by q.show_graph() later in this notebook - confirm
!brew install graphviz

In [None]:
import polars as pl
# scan_csv sets up a lazy query over the file; type(q) shows the class it returns.
q = (
    pl.scan_csv('flights.csv')    
)
type(q)

In [None]:
# read_csv loads the file straight away; type(df) shows the class it returns.
df = pl.read_csv('flights.csv')
type(df)

In [None]:
# Build the lazy query one step at a time, rebinding q after each step.
q = pl.scan_csv('flights.csv')
# Keep only the three columns the filter needs.
q = q.select(['MONTH', 'ORIGIN_AIRPORT','DESTINATION_AIRPORT'])
# Flights from SFO to SEA in month 5.
q = q.filter(
    (pl.col('MONTH') == 5) &
    (pl.col('ORIGIN_AIRPORT') == 'SFO') &
    (pl.col('DESTINATION_AIRPORT') == 'SEA'))

In [None]:
# Lazy query written as a single method chain: scan the CSV, keep three
# columns, then keep the flights from SFO to SEA in month 5.
q = (
    pl.scan_csv('flights.csv')
    .select(['MONTH', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'])
    .filter(
        (pl.col('MONTH') == 5)
        & (pl.col('ORIGIN_AIRPORT') == 'SFO')
        & (pl.col('DESTINATION_AIRPORT') == 'SEA')
    )
)

In [None]:
# Draw the query plan as it looks after optimization.
q.show_graph(optimized=True)

In [None]:
# Draw the query plan as written, without optimization.
q.show_graph(optimized=False)

In [None]:
# Run the lazy query and return its result as a DataFrame.
q.collect()

In [None]:
# Eager version of the query: read_csv loads the file first, then select and
# filter run on the loaded frame.
df = (
    pl.read_csv('flights.csv')
    .select(['MONTH', 'ORIGIN_AIRPORT','DESTINATION_AIRPORT'])
    .filter(
        (pl.col('MONTH') == 5) &
        (pl.col('ORIGIN_AIRPORT') == 'SFO') &
        (pl.col('DESTINATION_AIRPORT') == 'SEA'))
)
df

In [None]:
# Start from an eagerly loaded frame and switch to lazy mode with .lazy();
# the select and filter steps are then only run by collect() below.
q = (
    pl.read_csv('flights.csv')
    .lazy()
    .select(['MONTH', 'ORIGIN_AIRPORT','DESTINATION_AIRPORT'])
    .filter(
        (pl.col('MONTH') == 5) &
        (pl.col('ORIGIN_AIRPORT') == 'SFO') &
        (pl.col('DESTINATION_AIRPORT') == 'SEA'))
)
# Execute the query, then render the resulting DataFrame explicitly.
df = q.collect()
display(df)

In [None]:
import polars as pl

# Rebuild the eight-row car table; `df` was rebound to flight data in earlier cells.
df = pl.DataFrame({
    'Model': [
        'Camry', 'Corolla', 'RAV4',
        'Mustang', 'F-150', 'Escape',
        'Golf', 'Tiguan',
    ],
    'Year': [1982, 1966, 1994, 1964, 1975, 2000, 1974, 2007],
    'Engine_Min': [2.5, 1.8, 2.0, 2.3, 2.7, 1.5, 1.0, 1.4],
    'Engine_Max': [3.5, 2.0, 2.5, 5.0, 5.0, 2.5, 2.0, 2.0],
    'AWD': [False, False, True, False, True, True, True, True],
    'Company': [
        'Toyota', 'Toyota', 'Toyota',
        'Ford', 'Ford', 'Ford',
        'Volkswagen', 'Volkswagen',
    ],
})

In [None]:
!pip install pyarrow

In [None]:
import duckdb
# The FROM clause names `df`, the Polars DataFrame held in the Python variable
# of that name; no table is created beforehand.
result = duckdb.sql('''
  SELECT 
    *
  FROM df
''')
result

In [None]:
# Convert the DuckDB relation to a Polars DataFrame.
result.pl()

In [None]:
# Summary statistics for the columns of the relation.
result.describe()

In [None]:
# Order the rows by Year (ascending is the SQL default).
result.order('Year')

In [None]:
# Order the rows by Year, largest value first.
result.order('Year DESC')

In [None]:
# Apply the function named 'min' to the Year column of the relation.
result.apply('min', 'Year')

In [None]:
# Company and Model only, sorted by Company and then by Model; returned as a Polars DataFrame.
duckdb.sql('''
  SELECT Company, Model
  FROM df
  ORDER by Company, Model
''').pl()

In [None]:
# Number of models per company, computed in SQL; returned as a Polars DataFrame.
duckdb.sql('''
  SELECT Company, count(Model) as count
  FROM df
  GROUP BY Company
''').pl()

In [None]:
# Row count per company, this time with Polars: value_counts() produces a
# struct column, which unnest() expands into separate columns.
(
    result.pl()
    .select(pl.col('Company').value_counts())
    .unnest('Company')
)

In [None]:
import duckdb
# connect() with no arguments: no database file is named here.
conn = duckdb.connect()
# customers: one row per customer, keyed by customer_id.
conn.execute('''
  CREATE TABLE customers
    (customer_id INTEGER PRIMARY KEY, name STRING)
''')
# products: one row per product, keyed by product_id.
conn.execute('''
  CREATE TABLE products
    (product_id INTEGER PRIMARY KEY, product_name STRING)
''')
# sales: quantity per (customer, product) pair; the pair is the primary key.
conn.execute('''
  CREATE TABLE sales
    (customer_id INTEGER, product_id INTEGER, qty INTEGER,
  PRIMARY KEY(customer_id,product_id))
''')

In [None]:
# Relation object for the customers table on this connection.
customers_relation = conn.table('customers')

In [None]:
# A notebook cell only renders its last bare expression, so each conversion is
# passed to display() explicitly; otherwise the pandas result is discarded.

# convert to a pandas DataFrame
display(customers_relation.df())

# convert to a Polars DataFrame
display(customers_relation.pl())

In [None]:
# Add three customers; each row is [customer_id, name].
for customer_row in ([1, 'Alice'], [2, 'Bob'], [3, 'Charlie']):
    customers_relation.insert(customer_row)

In [None]:
# Fill the products table; each row is [product_id, product_name].
products_relation = conn.table('products')
for product_row in ([10, 'Paperclips'], [20, 'Staple'], [30, 'Notebook']):
    products_relation.insert(product_row)

# Fill the sales table; each row is [customer_id, product_id, qty].
sales_relation = conn.table("sales")
for sale_row in ([1, 20, 1], [1, 10, 2], [2, 30, 7], [3, 10, 3], [3, 20, 2]):
    sales_relation.insert(sale_row)

In [None]:
# customers -> sales on customer_id, then -> products on product_id;
# both joins are inner joins.
result = (
    customers_relation
    .join(sales_relation, condition="customer_id", how="inner")
    .join(products_relation, condition="product_id", how="inner")
)

In [None]:
# Show the joined relation.
result

In [None]:
# Keep only the joined rows that belong to customer 1.
result.filter('customer_id = 1')

In [None]:
# run a SQL query against the joined relation (the Python variable `result`)
# and return the rows for customer 1 as a Polars DataFrame
conn.execute('''
  SELECT *
  FROM result
  WHERE customer_id = 1
''').pl()

In [None]:
# One output row per customer_id: the id, the customer's name and the summed
# quantity. The second argument is the grouping expression.
result.aggregate(
    'customer_id, MAX(name) AS Name, '
    'SUM(qty) as "Total Qty"',
    'customer_id',
)

In [None]:
# Keep only the name, qty and product_name columns of the joined relation.
result.project('name, qty, product_name')

In [None]:
# At most the first 3 rows of the joined relation.
result.limit(3)

In [None]:
result.limit(3,2) # at most 3 rows, skipping the first 2 (i.e. starting at the third row)