# Reading and Writing Data

In [22]:
import polars as pl
# Display the installed Polars version so readers can compare it with their own.
pl.__version__

'1.20.0'

## Format Overview

## Reading CSV Files

In [23]:
# Print the raw CSV file with the shell's `cat` to inspect it before parsing.
! cat data/penguins.csv

"rowid","species","island","bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","sex","year"
"1","Adelie","Torgersen",39.1,18.7,181,3750,"male",2007
"2","Adelie","Torgersen",39.5,17.4,186,3800,"female",2007
"3","Adelie","Torgersen",40.3,18,195,3250,"female",2007
"4","Adelie","Torgersen",NA,NA,NA,NA,NA,2007
"5","Adelie","Torgersen",36.7,19.3,193,3450,"female",2007
"6","Adelie","Torgersen",39.3,20.6,190,3650,"male",2007
"7","Adelie","Torgersen",38.9,17.8,181,3625,"female",2007
"8","Adelie","Torgersen",39.2,19.6,195,4675,"male",2007
"9","Adelie","Torgersen",34.1,18.1,193,3475,NA,2007
"10","Adelie","Torgersen",42,20.2,190,4250,NA,2007
"11","Adelie","Torgersen",37.8,17.1,186,3300,NA,2007
"12","Adelie","Torgersen",37.8,17.3,180,3700,NA,2007
"13","Adelie","Torgersen",41.1,17.6,182,3200,"female",2007
"14","Adelie","Torgersen",38.6,21.2,191,3800,"male",2007
"15","Adelie","Torgersen",34.6,21.1,198,4400,"male",2007
"16","Adelie","Torgersen",36.6,17.8,185,3700,"female",2007
"17","Ade

In [24]:
# Parse the CSV file with read_csv's default settings.
penguins = pl.read_csv(
    source="data/penguins.csv",
)
penguins

rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
i64,str,str,str,str,str,str,str,i64
1,"""Adelie""","""Torgersen""","""39.1""","""18.7""","""181""","""3750""","""male""",2007
2,"""Adelie""","""Torgersen""","""39.5""","""17.4""","""186""","""3800""","""female""",2007
3,"""Adelie""","""Torgersen""","""40.3""","""18""","""195""","""3250""","""female""",2007
4,"""Adelie""","""Torgersen""","""NA""","""NA""","""NA""","""NA""","""NA""",2007
5,"""Adelie""","""Torgersen""","""36.7""","""19.3""","""193""","""3450""","""female""",2007
…,…,…,…,…,…,…,…,…
340,"""Chinstrap""","""Dream""","""55.8""","""19.8""","""207""","""4000""","""male""",2009
341,"""Chinstrap""","""Dream""","""43.5""","""18.1""","""202""","""3400""","""female""",2009
342,"""Chinstrap""","""Dream""","""49.6""","""18.2""","""193""","""3775""","""male""",2009
343,"""Chinstrap""","""Dream""","""50.8""","""19""","""210""","""4100""","""male""",2009


## Parsing Missing Values Correctly

In [25]:
# Treat the literal string "NA" as a missing value while parsing.
penguins = pl.read_csv(
    source="data/penguins.csv",
    null_values="NA",
)
penguins

rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
i64,str,str,f64,f64,i64,i64,str,i64
1,"""Adelie""","""Torgersen""",39.1,18.7,181,3750,"""male""",2007
2,"""Adelie""","""Torgersen""",39.5,17.4,186,3800,"""female""",2007
3,"""Adelie""","""Torgersen""",40.3,18.0,195,3250,"""female""",2007
4,"""Adelie""","""Torgersen""",,,,,,2007
5,"""Adelie""","""Torgersen""",36.7,19.3,193,3450,"""female""",2007
…,…,…,…,…,…,…,…,…
340,"""Chinstrap""","""Dream""",55.8,19.8,207,4000,"""male""",2009
341,"""Chinstrap""","""Dream""",43.5,18.1,202,3400,"""female""",2009
342,"""Chinstrap""","""Dream""",49.6,18.2,193,3775,"""male""",2009
343,"""Chinstrap""","""Dream""",50.8,19.0,210,4100,"""male""",2009


In [26]:
# Count the nulls per column, then transpose so each column becomes a row.
(
    penguins
    .null_count()
    .transpose(include_header=True, column_names=["null_count"])
)

column,null_count
str,u32
"""rowid""",0
"""species""",0
"""island""",0
"""bill_length_mm""",2
"""bill_depth_mm""",2
"""flipper_length_mm""",2
"""body_mass_g""",2
"""sex""",11
"""year""",0


## Reading Files with Encodings Other Than UTF-8

In [6]:
# This raises a ComputeError:
# pl.read_csv("data/directors.csv")

In [7]:
# Read the file while decoding its bytes as EUC-CN.
pl.read_csv(
    source="data/directors.csv",
    encoding="EUC-CN",
)

name,born,country
str,i64,str
"""考侯""",1930,"""泣塑"""
"""Verhoeven""",1938,"""オランダ"""
"""弟宏""",1942,"""泣塑"""
"""Tarantino""",1963,"""势柜"""


In [8]:
import chardet


def detect_encoding(filename: str) -> str:
    """Return the most probable character encoding for a file.

    The file is read in binary mode and its bytes are passed to
    ``chardet.detect``; the encoding name it reports is returned.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet (for example ``"EUC-JP"``).

    Raises:
        ValueError: If chardet reports no encoding (``None``), so callers
            never receive a non-string despite the ``-> str`` contract.
    """
    with open(filename, "rb") as f:
        raw_data = f.read()

    # Detection happens after the ``with`` block, so the file is already closed.
    encoding = chardet.detect(raw_data)["encoding"]
    if encoding is None:
        raise ValueError(f"Could not detect the encoding of {filename!r}")
    return encoding


# Ask the helper which encoding the directors file most likely uses.
detect_encoding(
    filename="data/directors.csv",
)

'EUC-JP'

In [9]:
# Read the file while decoding its bytes as EUC-JP.
pl.read_csv(
    source="data/directors.csv",
    encoding="EUC-JP",
)

name,born,country
str,i64,str
"""深作""",1930,"""日本"""
"""Verhoeven""",1938,"""オランダ"""
"""宮崎""",1942,"""日本"""
"""Tarantino""",1963,"""米国"""


## Reading Excel Spreadsheets

In [27]:
# Load the spreadsheet into a DataFrame.
songs = pl.read_excel(
    source="data/top2000-2023.xlsx",
)
songs

positie,titel,artiest,jaar
i64,str,str,i64
1,"""Bohemian Rhapsody""","""Queen""",1975
2,"""Roller Coaster""","""Danny Vera""",2019
3,"""Hotel California""","""Eagles""",1977
4,"""Piano Man""","""Billy Joel""",1974
5,"""Fix You""","""Coldplay""",2005
…,…,…,…
1996,"""Charlie Brown""","""Coldplay""",2011
1997,"""Beast Of Burden""","""Bette Midler""",1984
1998,"""It Was A Very Good Year""","""Frank Sinatra""",1968
1999,"""Hou Van Mij""","""3JS""",2008


## Working with Multiple Files

In [28]:
# The `?` wildcard matches a single character, so this reads 2010.csv through 2019.csv.
pl.read_csv(
    source="data/stock/nvda/201?.csv",
)

symbol,date,open,high,low,close,adj close,volume
str,str,f64,f64,f64,f64,f64,i64
"""NVDA""","""2010-01-04""",4.6275,4.655,4.5275,4.6225,4.240429,80020400
"""NVDA""","""2010-01-05""",4.605,4.74,4.605,4.69,4.30235,72864800
"""NVDA""","""2010-01-06""",4.6875,4.73,4.6425,4.72,4.32987,64916800
"""NVDA""","""2010-01-07""",4.695,4.715,4.5925,4.6275,4.245015,54779200
"""NVDA""","""2010-01-08""",4.59,4.67,4.5625,4.6375,4.254189,47816800
…,…,…,…,…,…,…,…
"""NVDA""","""2019-12-24""",59.549999,59.827499,59.205002,59.654999,59.422798,13886400
"""NVDA""","""2019-12-26""",59.689999,60.080002,59.5,59.797501,59.564739,18285200
"""NVDA""","""2019-12-27""",59.950001,60.084999,58.952499,59.217499,58.987,25464400
"""NVDA""","""2019-12-30""",58.997501,59.049999,57.764999,58.080002,57.853928,25805600


In [47]:
# Read every CSV file matched by the glob pattern below data/stock/ into one DataFrame.
all_stocks = pl.read_csv(
    source="data/stock/**/*.csv",
)
all_stocks

symbol,date,open,high,low,close,adj close,volume
str,str,f64,f64,f64,f64,f64,i64
"""ASML""","""1999-01-04""",11.765625,12.28125,11.765625,12.140625,7.522523,1801867
"""ASML""","""1999-01-05""",11.859375,14.25,11.71875,13.96875,8.655257,8241600
"""ASML""","""1999-01-06""",14.25,17.601563,14.203125,16.875,10.456018,16400267
"""ASML""","""1999-01-07""",14.742188,17.8125,14.53125,16.851563,10.441495,17722133
"""ASML""","""1999-01-08""",16.078125,16.289063,15.023438,15.796875,9.787995,10696000
…,…,…,…,…,…,…,…
"""TSM""","""2023-06-26""",102.019997,103.040001,100.089996,100.110001,99.125954,8560000
"""TSM""","""2023-06-27""",101.150002,102.790001,100.019997,102.080002,101.076591,9732000
"""TSM""","""2023-06-28""",100.5,101.879997,100.220001,100.919998,99.927986,8160900
"""TSM""","""2023-06-29""",101.339996,101.519997,100.019997,100.639999,99.650742,7383900


In [48]:
import calendar

# Build one ASML file path per leap year in the range 1999-2023.
filenames = [
    f"data/stock/asml/{year}.csv"
    for year in filter(calendar.isleap, range(1999, 2024))
]

filenames

['data/stock/asml/2000.csv',
 'data/stock/asml/2004.csv',
 'data/stock/asml/2008.csv',
 'data/stock/asml/2012.csv',
 'data/stock/asml/2016.csv',
 'data/stock/asml/2020.csv']

In [30]:
# Read each listed file separately and stack the resulting DataFrames vertically.
pl.concat(
    [pl.read_csv(path) for path in filenames]
)

symbol,date,open,high,low,close,adj close,volume
str,str,f64,f64,f64,f64,f64,i64
"""ASML""","""2000-01-03""",43.875,43.875,41.90625,43.640625,27.040424,1121600
"""ASML""","""2000-01-04""",41.953125,42.5625,40.59375,40.734375,25.239666,968800
"""ASML""","""2000-01-05""",39.28125,39.703125,37.757813,39.609375,24.542597,1458133
"""ASML""","""2000-01-06""",36.75,37.59375,35.226563,37.171875,23.032274,3517867
"""ASML""","""2000-01-07""",36.867188,38.0625,36.65625,38.015625,23.555077,1631200
…,…,…,…,…,…,…,…
"""ASML""","""2020-12-24""",478.950012,484.600006,477.079987,483.089996,468.836365,271900
"""ASML""","""2020-12-28""",487.140015,488.720001,478.429993,480.23999,466.070496,449300
"""ASML""","""2020-12-29""",489.450012,489.450012,482.51001,484.01001,469.729218,377200
"""ASML""","""2020-12-30""",488.130005,492.660004,488.0,489.910004,475.455231,381900


## Reading Parquet

In [14]:
%%time
trips = pl.read_parquet("data/taxi/yellow_tripdata_*.parquet")
trips

CPU times: user 5.04 s, sys: 15.4 s, total: 20.4 s
Wall time: 15 s


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2022-12-31 23:46:00,2023-01-01 00:11:00,,11.56,,,16,36,0,39.55,0.0,0.5,8.21,0.0,1.0,49.26,,
2,2022-12-31 23:13:24,2022-12-31 23:29:08,,5.06,,,75,50,0,26.23,0.0,0.5,0.0,0.0,1.0,30.23,,
2,2022-12-31 23:00:49,2022-12-31 23:26:57,,13.35,,,168,197,0,47.73,0.0,0.5,9.85,0.0,1.0,59.08,,
1,2022-12-31 23:02:50,2022-12-31 23:16:05,,0.0,,,238,116,0,12.74,0.0,0.5,0.0,0.0,1.0,16.74,,


## Reading JSON and NDJSON

### JSON

In [31]:
# Print the raw JSON file with the shell's `cat` to see its nested structure.
! cat data/pokedex.json

{
  "pokemon": [{
    "id": 1,
    "num": "001",
    "name": "Bulbasaur",
    "img": "http://www.serebii.net/pokemongo/pokemon/001.png",
    "type": [
      "Grass",
      "Poison"
    ],
    "height": "0.71 m",
    "weight": "6.9 kg",
    "candy": "Bulbasaur Candy",
    "candy_count": 25,
    "egg": "2 km",
    "spawn_chance": 0.69,
    "avg_spawns": 69,
    "spawn_time": "20:00",
    "multipliers": [1.58],
    "weaknesses": [
      "Fire",
      "Ice",
      "Flying",
      "Psychic"
    ],
    "next_evolution": [{
      "num": "002",
      "name": "Ivysaur"
    }, {
      "num": "003",
      "name": "Venusaur"
    }]
  }, {
    "id": 2,
    "num": "002",
    "name": "Ivysaur",
    "img": "http://www.serebii.net/pokemongo/pokemon/002.png",
    "type": [
      "Grass",
      "Poison"
    ],
    "height": "0.99 m",
    "weight": "13.0 kg",
    "candy": "Bulbasaur Candy",
    "candy_count": 100,
    "egg": "Not in Eggs",
    "spawn_chance": 0.042,
    "avg_spawns": 4.2,
    "spawn_time"

In [32]:
# Load the JSON document; the nested data needs reshaping before it is tabular.
pokedex = pl.read_json(
    source="data/pokedex.json",
)
pokedex

pokemon
list[struct[17]]
"[{1,""001"",""Bulbasaur"",""http://www.serebii.net/pokemongo/pokemon/001.png"",[""Grass"", ""Poison""],""0.71 m"",""6.9 kg"",""Bulbasaur Candy"",""2 km"",0.69,69.0,""20:00"",[1.58],[""Fire"", ""Ice"", … ""Psychic""],25,null,[{""002"",""Ivysaur""}, {""003"",""Venusaur""}]}, {2,""002"",""Ivysaur"",""http://www.serebii.net/pokemongo/pokemon/002.png"",[""Grass"", ""Poison""],""0.99 m"",""13.0 kg"",""Bulbasaur Candy"",""Not in Eggs"",0.042,4.2,""07:00"",[1.2, 1.6],[""Fire"", ""Ice"", … ""Psychic""],100,[{""001"",""Bulbasaur""}],[{""003"",""Venusaur""}]}, … {151,""151"",""Mew"",""http://www.serebii.net/pokemongo/pokemon/151.png"",[""Psychic""],""0.41 m"",""4.0 kg"",""None"",""Not in Eggs"",0.0,0.0,""N/A"",null,[""Bug"", ""Ghost"", ""Dark""],null,null,null}]"


In [33]:
# explode: one row per list element; unnest: one column per struct field.
(
    pokedex
    .explode("pokemon")
    .unnest("pokemon")
    .select(["id", "name", "type", "height", "weight"])
)

id,name,type,height,weight
i64,str,list[str],str,str
1,"""Bulbasaur""","[""Grass"", ""Poison""]","""0.71 m""","""6.9 kg"""
2,"""Ivysaur""","[""Grass"", ""Poison""]","""0.99 m""","""13.0 kg"""
3,"""Venusaur""","[""Grass"", ""Poison""]","""2.01 m""","""100.0 kg"""
4,"""Charmander""","[""Fire""]","""0.61 m""","""8.5 kg"""
5,"""Charmeleon""","[""Fire""]","""1.09 m""","""19.0 kg"""
…,…,…,…,…
147,"""Dratini""","[""Dragon""]","""1.80 m""","""3.3 kg"""
148,"""Dragonair""","[""Dragon""]","""3.99 m""","""16.5 kg"""
149,"""Dragonite""","[""Dragon"", ""Flying""]","""2.21 m""","""210.0 kg"""
150,"""Mewtwo""","[""Psychic""]","""2.01 m""","""122.0 kg"""


### NDJSON

In [34]:
# Print the raw NDJSON file with the shell's `cat`.
! cat data/wikimedia.ndjson

{"$schema":"/mediawiki/recentchange/1.0.0","meta":{"uri":"https://en.wikipedia.org/wiki/EFL_Championship","request_id":"ea0541fb-4e72-4fc3-82f0-6c26651b2043","id":"0416300b-980c-45bb-b0a2-c9d7a9e2b7eb","dt":"2023-07-29T07:51:39Z","domain":"en.wikipedia.org","stream":"mediawiki.recentchange","topic":"eqiad.mediawiki.recentchange","partition":0,"offset":4820784717},"id":1659529639,"type":"edit","namespace":0,"title":"EFL Championship","title_url":"https://en.wikipedia.org/wiki/EFL_Championship","comment":"/* League champions, runners-up and play-off finalists */","timestamp":1690617099,"user":"87.12.215.232","bot":false,"notify_url":"https://en.wikipedia.org/w/index.php?diff=1167689309&oldid=1166824248","minor":false,"length":{"old":91108,"new":91166},"revision":{"old":1166824248,"new":1167689309},"server_url":"https://en.wikipedia.org","server_name":"en.wikipedia.org","server_script_path":"/w","wiki":"enwiki","parsedcomment":"<span dir=\"auto\"><span class=\"autocomment\"><a href=\"/wik

In [35]:
from json import loads
from pprint import pprint

# Pretty-print only the first line of the file, parsed as a JSON document.
# The encoding is passed explicitly because the record contains non-ASCII
# characters and open() otherwise falls back to the platform's default encoding.
with open("data/wikimedia.ndjson", encoding="utf-8") as f:
    pprint(loads(f.readline()))

{'$schema': '/mediawiki/recentchange/1.0.0',
 'bot': False,
 'comment': '/* League champions, runners-up and play-off finalists */',
 'id': 1659529639,
 'length': {'new': 91166, 'old': 91108},
 'meta': {'domain': 'en.wikipedia.org',
          'dt': '2023-07-29T07:51:39Z',
          'id': '0416300b-980c-45bb-b0a2-c9d7a9e2b7eb',
          'offset': 4820784717,
          'partition': 0,
          'request_id': 'ea0541fb-4e72-4fc3-82f0-6c26651b2043',
          'stream': 'mediawiki.recentchange',
          'topic': 'eqiad.mediawiki.recentchange',
          'uri': 'https://en.wikipedia.org/wiki/EFL_Championship'},
 'minor': False,
 'namespace': 0,
 'notify_url': 'https://en.wikipedia.org/w/index.php?diff=1167689309&oldid=1166824248',
 'parsedcomment': '<span dir="auto"><span class="autocomment"><a '
                  'href="/wiki/EFL_Championship#League_champions,_runners-up_and_play-off_finalists" '
                  'title="EFL Championship">→\u200eLeague champions, '
                  'ru

In [36]:
# Parse the newline-delimited JSON file into a DataFrame.
wikimedia = pl.read_ndjson(
    source="data/wikimedia.ndjson",
)
wikimedia

$schema,meta,id,type,namespace,title,title_url,comment,timestamp,user,bot,notify_url,minor,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment
str,struct[9],i64,str,i64,str,str,str,i64,str,bool,str,bool,struct[2],struct[2],str,str,str,str,str
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/EFL_Championship"",""ea0541fb-4e72-4fc3-82f0-6c26651b2043"",""0416300b-980c-45bb-b0a2-c9d7a9e2b7eb"",""2023-07-29T07:51:39Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820784717}",1659529639,"""edit""",0,"""EFL Championship""","""https://en.wikipedia.org/wiki/…","""/* League champions, runners-u…",1690617099,"""87.12.215.232""",false,"""https://en.wikipedia.org/w/ind…",false,"{91108,91166}","{1166824248,1167689309}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""","""<span dir=""auto""><span class=""…"
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/Lim_Sang-choon"",""01a0f468-7553-48db-b553-7ac392b2187c"",""97e4dc39-fb32-4774-9c9a-b2caea391c9e"",""2023-07-29T07:51:42Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820784731}",1659529640,"""edit""",0,"""Lim Sang-choon""","""https://en.wikipedia.org/wiki/…","""""",1690617102,"""Preferwiki""",false,"""https://en.wikipedia.org/w/ind…",false,"{9807,10480}","{1167689034,1167689310}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""",""""""
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/Higher"",""5f053899-a2ab-4dec-8e98-1d01cd86093d"",""0da41aa2-ceb6-443f-8a0a-0633f83de6ec"",""2023-07-29T07:51:44Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820784748}",1659529642,"""edit""",0,"""Higher""","""https://en.wikipedia.org/wiki/…","""/* Albums */ add""",1690617104,"""Ss112""",false,"""https://en.wikipedia.org/w/ind…",false,"{5452,5548}","{1162509981,1167689312}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""","""<span dir=""auto""><span class=""…"
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/International_Poker_Rules"",""10c4886c-95f5-4cd2-8db9-333cb45f041b"",""af9cc405-26b2-485d-b20c-edd21b2a2a4c"",""2023-07-29T07:51:44Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820784763}",1659529643,"""edit""",0,"""International Poker Rules""","""https://en.wikipedia.org/wiki/…","""Nominated for deletion; see [[…",1690617104,"""Piotrus""",false,"""https://en.wikipedia.org/w/ind…",false,"{2452,2896}","{1055827921,1167689313}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""","""Nominated for deletion; see <a…"
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/Abdul_Hamid_Khan_Bhashani"",""198e47ae-fa33-4059-970a-550536e7bc7c"",""2012e36f-9fa1-49dc-ba68-5946de740cd4"",""2023-07-29T07:51:45Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820784800}",1659529653,"""edit""",0,"""Abdul Hamid Khan Bhashani""","""https://en.wikipedia.org/wiki/…","""Rescuing 1 sources and tagging…",1690617105,"""InternetArchiveBot""",true,"""https://en.wikipedia.org/w/ind…",false,"{31503,31687}","{1163358967,1167689318}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""","""Rescuing 1 sources and tagging…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/Havering_Residents_Association"",""d563edd1-2a19-491b-9604-aea1b7c57a67"",""f5b5b53d-8024-4c4d-810b-e3c04b8f2392"",""2023-07-29T07:53:58Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820788373}",1659529961,"""edit""",0,"""Havering Residents Association""","""https://en.wikipedia.org/wiki/…","""/* 2018 election */ +map""",1690617238,"""MRSC""",false,"""https://en.wikipedia.org/w/ind…",false,"{7590,7707}","{1167689281,1167689476}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""","""<span dir=""auto""><span class=""…"
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/Olha_Kharlan"",""7caa4fe3-2ddf-4a57-b6d4-6eee505f5e49"",""b7b84f63-69f7-4317-82b2-df4a3e853723"",""2023-07-29T07:53:55Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820788404}",1659529963,"""edit""",0,"""Olha Kharlan""","""https://en.wikipedia.org/wiki/…","""Ce""",1690617235,"""2603:7000:2101:AA00:2C88:EF86:…",false,"""https://en.wikipedia.org/w/ind…",false,"{65119,65323}","{1167666634,1167689477}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""","""Ce"""
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/Mukim_Kota_Batu"",""1032a354-1171-47a2-bdb3-cbcb78a070bf"",""a2b2601e-5adc-4ed2-b8fc-292d128009d7"",""2023-07-29T07:53:58Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820788418}",1659529964,"""edit""",0,"""Mukim Kota Batu""","""https://en.wikipedia.org/wiki/…","""""",1690617238,"""Pangalau""",false,"""https://en.wikipedia.org/w/ind…",false,"{7178,7189}","{1150066841,1167689474}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""",""""""
"""/mediawiki/recentchange/1.0.0""","{""https://en.wikipedia.org/wiki/User:IDK1213safasx/sandbox"",""0f29a0a7-de5a-4197-a8dd-98b6a4eb5f6f"",""5c2219b5-7b3e-4720-944b-94afb5764b76"",""2023-07-29T07:53:59Z"",""en.wikipedia.org"",""mediawiki.recentchange"",""eqiad.mediawiki.recentchange"",0,4820788422}",1659529965,"""edit""",2,"""User:IDK1213safasx/sandbox""","""https://en.wikipedia.org/wiki/…","""""",1690617239,"""94.101.29.27""",false,"""https://en.wikipedia.org/w/ind…",false,"{2122,2122}","{1167356449,1167689478}","""https://en.wikipedia.org""","""en.wikipedia.org""","""/w""","""enwiki""",""""""


In [37]:
# Rename the top-level "id" before unnesting: the "meta" struct has its own
# "id" field, and the two names would otherwise collide.
(
    wikimedia
    .rename({"id": "edit_id"})
    .unnest("meta")
    .select(["timestamp", "title", "user", "comment"])
)

timestamp,title,user,comment
i64,str,str,str
1690617099,"""EFL Championship""","""87.12.215.232""","""/* League champions, runners-u…"
1690617102,"""Lim Sang-choon""","""Preferwiki""",""""""
1690617104,"""Higher""","""Ss112""","""/* Albums */ add"""
1690617104,"""International Poker Rules""","""Piotrus""","""Nominated for deletion; see [[…"
1690617105,"""Abdul Hamid Khan Bhashani""","""InternetArchiveBot""","""Rescuing 1 sources and tagging…"
…,…,…,…
1690617238,"""Havering Residents Association""","""MRSC""","""/* 2018 election */ +map"""
1690617235,"""Olha Kharlan""","""2603:7000:2101:AA00:2C88:EF86:…","""Ce"""
1690617238,"""Mukim Kota Batu""","""Pangalau""",""""""
1690617239,"""User:IDK1213safasx/sandbox""","""94.101.29.27""",""""""


## Other File Formats

In [38]:
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_Latin_abbreviations"

# Take the first table that pandas parses from the page and convert it to Polars.
pl.from_pandas(
    pd.read_html(url)[0]
)

abbreviation,Latin,translation,usage and notes
str,str,str,str
"""AD""","""anno Domini""","""""in the year of the Lord""""","""Used to label or number years …"
"""a.i.""","""ad interim""","""""temporarily""""","""Used in business organizationa…"
"""a.m.""","""ante meridiem""","""""before midday""[1]""","""Used on the twelve-hour clock …"
"""ca. c.""","""circa""","""""around"", ""about"", ""approximat…","""Used with dates to indicate ""a…"
"""Cap.""","""capitulus""","""""chapter""""","""Used before a chapter number o…"
…,…,…,…
"""SOS""","""si opus sit""","""""if there is need"", ""if occasi…","""A prescription indication that…"
"""sic""","""sic""","""""thus""""","""Used when quoting text that co…"
"""stat.""","""statim""","""""immediately""""","""Often used in medical contexts…"
"""viz.""","""videlicet""","""""namely"", ""to wit"", ""precisely…","""In contradistinction to ""i.e.""…"


## Querying Databases

In [None]:
# Join films to their categories inside the database and load ten rows.
# The URI points at data/sakila.db, the same database file the next cell uses.
pl.read_database_uri(
    query="""
    SELECT
        f.film_id,
        f.title,
        c.name AS category,
        f.rating,
        f.length / 60.0 AS length
    FROM
        film AS f,
        film_category AS fc,
        category AS c
    WHERE
        fc.film_id = f.film_id
        AND fc.category_id = c.category_id
    LIMIT 10
    """,
    uri="sqlite:::data/sakila.db",
)

In [None]:
# Load the three tables in full, then perform the joins in Polars.
db = "sqlite:::data/sakila.db"
films = pl.read_database_uri(query="SELECT * FROM film", uri=db)
film_categories = pl.read_database_uri(
    query="SELECT * FROM film_category", uri=db
)
categories = pl.read_database_uri(query="SELECT * FROM category", uri=db)

# The suffixes rename right-hand columns whose names clash with the left frame.
(
    films
    .join(film_categories, on="film_id", suffix="_fc")
    .join(categories, on="category_id", suffix="_c")
    .select(
        "film_id",
        "title",
        pl.col("name").alias("category"),
        "rating",
        pl.col("length") / 60,
    )
    .head(10)
)

## Writing Data

### CSV Format

In [49]:
# Write the combined stock data as CSV into the data/ directory,
# using the same relative path style as the Excel and Parquet writers.
all_stocks.write_csv("data/all_stocks.csv")

### Excel Format

In [50]:
# Write the combined stock data to an Excel workbook.
all_stocks.write_excel(
    workbook="data/all_stocks.xlsx",
)

<xlsxwriter.workbook.Workbook at 0x7bc19bfe7c80>

### Parquet Format

In [51]:
# Write the combined stock data to a Parquet file.
all_stocks.write_parquet(
    file="data/all_stocks.parquet",
)

### Other Considerations

## Takeaways