In [1]:
%%capture
%pip install polars
import polars as pl

In [2]:
# Load the source data. The first dataframe carries the `_raw` suffix so it is
# clear that it is the original dataset and not the result of an intermediate
# step.
# Modelling vocabulary: a dimension describes the record, while facts are
# measurements of the record.
quote_lines_raw = pl.read_csv("quote_lines.csv").rename(
    {
        "Vendor": "vendor",
        "Program": "program",
    }
)

# Preview the first rows of the renamed frame.
quote_lines_raw.head()

vendor,program,quote_timestamp,CPU,GPU,RAM,SSD,HDD,MOBO,NIC,PSU,TRAY,TOR,CHASSIS
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Vendor_7""","""Program_D""","""2024-09-25T03:52:19.637095""",305.84,409.06,53.43,196.14,103.49,107.72,20.98,94.19,11.57,746.67,1015.84
"""Vendor_6""","""Program_E""","""2024-09-12T00:26:22.414219""",324.21,446.35,44.91,194.27,184.53,100.12,28.8,89.99,70.05,728.08,1068.57
"""Vendor_6""","""Program_A""","""2024-09-01T04:29:41.575326""",325.75,404.6,52.24,181.26,130.24,114.01,22.66,106.22,28.0,719.14,1665.41
"""Vendor_5""","""Program_A""","""2024-09-23T18:35:08.078890""",300.29,414.38,48.73,187.55,105.42,103.5,21.64,102.29,22.98,734.33,1770.98
"""Vendor_5""","""Program_B""","""2024-09-24T11:33:41.034306""",333.53,445.41,48.16,193.25,104.02,90.87,23.0,113.9,80.13,616.13,1563.11


In [3]:
# Plain-text rendering of the frame; the printed table opens with its shape.
print(quote_lines_raw)

shape: (175, 14)
┌──────────┬───────────┬──────────────────────────┬────────┬───┬────────┬───────┬────────┬─────────┐
│ vendor   ┆ program   ┆ quote_timestamp          ┆ CPU    ┆ … ┆ PSU    ┆ TRAY  ┆ TOR    ┆ CHASSIS │
│ ---      ┆ ---       ┆ ---                      ┆ ---    ┆   ┆ ---    ┆ ---   ┆ ---    ┆ ---     │
│ str      ┆ str       ┆ str                      ┆ f64    ┆   ┆ f64    ┆ f64   ┆ f64    ┆ f64     │
╞══════════╪═══════════╪══════════════════════════╪════════╪═══╪════════╪═══════╪════════╪═════════╡
│ Vendor_7 ┆ Program_D ┆ 2024-09-25T03:52:19.6370 ┆ 305.84 ┆ … ┆ 94.19  ┆ 11.57 ┆ 746.67 ┆ 1015.84 │
│          ┆           ┆ 95                       ┆        ┆   ┆        ┆       ┆        ┆         │
│ Vendor_6 ┆ Program_E ┆ 2024-09-12T00:26:22.4142 ┆ 324.21 ┆ … ┆ 89.99  ┆ 70.05 ┆ 728.08 ┆ 1068.57 │
│          ┆           ┆ 19                       ┆        ┆   ┆        ┆       ┆        ┆         │
│ Vendor_6 ┆ Program_A ┆ 2024-09-01T04:29:41.5753 ┆ 325.75 ┆ … ┆ 106.22 ┆ 

In [4]:
# Wrapping the expression in an opening and closing parenthesis lets every
# method call sit on its own line, which sets us up for piping one step into
# the next.
(
    quote_lines_raw
    .filter(pl.col("vendor") == "Vendor_7")
    .filter(pl.col("program") == "Program_D")
)

vendor,program,quote_timestamp,CPU,GPU,RAM,SSD,HDD,MOBO,NIC,PSU,TRAY,TOR,CHASSIS
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Vendor_7""","""Program_D""","""2024-09-25T03:52:19.637095""",305.84,409.06,53.43,196.14,103.49,107.72,20.98,94.19,11.57,746.67,1015.84
"""Vendor_7""","""Program_D""","""2024-09-14T21:51:34.330801""",324.41,454.64,45.32,195.39,123.9,107.94,23.35,118.26,74.04,588.74,1548.05
"""Vendor_7""","""Program_D""","""2024-09-01T22:47:48.325236""",303.85,427.39,53.49,183.37,149.99,114.59,26.4,86.88,14.5,746.95,1646.28
"""Vendor_7""","""Program_D""","""2024-09-21T23:15:38.417376""",326.83,431.64,45.17,192.39,130.6,96.78,24.66,89.94,55.8,604.53,1591.95


In [5]:
# Example of several filter criteria supplied to a single filter call.
# The predicates are combined with a logical AND, so the order in which they
# are listed does not change the result.
# NOTE(review): the original note said polars runs whichever predicate it
# believes is fastest first — confirm against the polars documentation.
df_7D = quote_lines_raw.filter(
    pl.col("vendor") == "Vendor_7",
    pl.col("program") == "Program_D",  # Python allows a trailing comma after the last argument
)

In [6]:
# Group by and aggregate in polars: one row per (vendor, program) pair holding
# the mean of the CPU column.
df_7DCPU = df_7D.group_by("vendor", "program").agg(
    pl.col("CPU").mean()
)


In [7]:
# Show the aggregated frame with an extra column: the mean CPU value times 1.5.
df_7DCPU.with_columns(
    CPU_increase=pl.col("CPU") * 1.5,
)

vendor,program,CPU,CPU_increase
str,str,f64,f64
"""Vendor_7""","""Program_D""",315.2325,472.84875


In [8]:
# Create a new column, then use it in a new calculation. The second
# with_columns call is its own step because it reads the `increase_rate`
# column that the first call adds.
df_newcpu = (
    df_7DCPU
    .with_columns(increase_rate=pl.lit(1.5))
    .with_columns(new_cpu_price=pl.col("CPU") * pl.col("increase_rate"))
)


In [11]:
# Remove the helper `increase_rate` column, leaving the CPU value and the
# price derived from it.
df_final = df_newcpu.drop("increase_rate")

print(df_final)

shape: (1, 4)
┌──────────┬───────────┬──────────┬───────────────┐
│ vendor   ┆ program   ┆ CPU      ┆ new_cpu_price │
│ ---      ┆ ---       ┆ ---      ┆ ---           │
│ str      ┆ str       ┆ f64      ┆ f64           │
╞══════════╪═══════════╪══════════╪═══════════════╡
│ Vendor_7 ┆ Program_D ┆ 315.2325 ┆ 472.84875     │
└──────────┴───────────┴──────────┴───────────────┘


In [None]:
# Reload the CSV without the rename step, so the frame keeps the file's
# original `Vendor` / `Program` column headers.
# NOTE(review): this rebinds `quote_lines_raw`, which the earlier filter cells
# query through lowercase `vendor` / `program` columns. Re-running those cells
# after this one would not find those columns — consider a distinct name.
quote_lines_raw = pl.read_csv("quote_lines.csv")


In [None]:

# Reshape from wide to long: the identifying columns stay as-is, every other
# column becomes a (components, cost) pair on its own row. Then add a column
# holding the cost times 1.5.
quote_lines_raw.unpivot(
    index=["Vendor", "Program", "quote_timestamp"],
    variable_name="components",
    value_name="cost",
).with_columns(
    new_cost=pl.col("cost") * 1.5,
)