## DuckDB's httpfs Extension

In [1]:
import duckdb

# Open a DuckDB connection with default arguments, then install and load the
# httpfs extension on it so the cells below can query files over HTTP(S).
conn = duckdb.connect()
httpfs_setup_sql = '''
  INSTALL httpfs;
  LOAD httpfs;
'''
conn.execute(httpfs_setup_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x75e3d6c5c770>

## Querying CSV and Parquet Files Remotely

### Accessing CSV Files

In [2]:
# Query the AMZN price history directly from its raw GitHub URL (no local
# download step) and return every row and column as a DataFrame.
amzn_all_sql = '''
  SELECT 
    *
  FROM 
    'https://raw.githubusercontent.com/weimenglee/DuckDB_Book/main/AMZN.csv';    
'''
conn.execute(amzn_all_sql).df()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1997-05-15,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,1997-05-16,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,1997-05-19,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,1997-05-20,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,1997-05-21,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
5660,2019-11-11,1778.000000,1780.000000,1767.130005,1771.650024,1771.650024,1946000
5661,2019-11-12,1774.660034,1786.219971,1771.910034,1778.000000,1778.000000,2037600
5662,2019-11-13,1773.390015,1775.000000,1747.319946,1753.109985,1753.109985,2989500
5663,2019-11-14,1751.430054,1766.589966,1749.560059,1754.599976,1754.599976,2264800


In [3]:
# Same remote CSV as above, but keep only the rows whose Date falls in 2018.
amzn_2018_sql = '''
SELECT
  *
FROM
  'https://raw.githubusercontent.com/weimenglee/DuckDB_Book/main/AMZN.csv'
WHERE year(Date) = 2018;
'''
conn.execute(amzn_2018_sql).df()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-01-02,1172.000000,1190.000000,1170.510010,1189.010010,1189.010010,2694500
1,2018-01-03,1188.300049,1205.489990,1188.300049,1204.199951,1204.199951,3108800
2,2018-01-04,1205.000000,1215.869995,1204.660034,1209.589966,1209.589966,3022100
3,2018-01-05,1217.510010,1229.140015,1210.000000,1229.140015,1229.140015,3544700
4,2018-01-08,1236.000000,1253.079956,1232.030029,1246.869995,1246.869995,4279500
...,...,...,...,...,...,...,...
246,2018-12-24,1346.000000,1396.030029,1307.000000,1343.959961,1343.959961,7220000
247,2018-12-26,1368.890015,1473.160034,1363.010010,1470.900024,1470.900024,10411800
248,2018-12-27,1454.199951,1469.000000,1390.310059,1461.640015,1461.640015,9722000
249,2018-12-28,1473.349976,1513.469971,1449.000000,1478.020020,1478.020020,8829000


In [4]:
# read_csv is given a list of URLs, so the AMZN and GOOG files are read by a
# single query and returned together in one DataFrame.
amzn_goog_sql = '''
SELECT
  *
FROM read_csv([
  'https://raw.githubusercontent.com/weimenglee/DuckDB_Book/main/AMZN.csv',
  'https://raw.githubusercontent.com/weimenglee/DuckDB_Book/main/GOOG.csv'
]);
'''
conn.execute(amzn_goog_sql).df()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1997-05-15,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,1997-05-16,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,1997-05-19,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,1997-05-20,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,1997-05-21,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
5911,2020-07-13,1550.000000,1577.131958,1505.243042,1511.339966,1511.339966,1846400
5912,2020-07-14,1490.310059,1522.949951,1483.500000,1520.579956,1520.579956,1585000
5913,2020-07-15,1523.130005,1535.329956,1498.000000,1513.640015,1513.640015,1610700
5914,2020-07-16,1500.000000,1518.689941,1486.310059,1518.000000,1518.000000,1519300


### Accessing Parquet Files

In [5]:
# Read every column of the remote travel-insurance Parquet file.
# The URL is written as two adjacent SQL string literals on consecutive lines
# purely to keep the line short; together they form the single path
# .../DuckDB_Book/raw/main/travel%20insurance.parquet.
insurance_all_sql = '''
SELECT
  *
FROM
  'https://github.com/weimenglee/DuckDB_Book' 
  '/raw/main/travel%20insurance.parquet';
'''
conn.execute(insurance_all_sql).df()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.70,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41
...,...,...,...,...,...,...,...,...,...,...,...
63321,JZI,Airlines,Online,Basic Plan,No,111,JAPAN,35.0,12.25,M,31
63322,JZI,Airlines,Online,Basic Plan,No,58,CHINA,40.0,14.00,F,40
63323,JZI,Airlines,Online,Basic Plan,No,2,MALAYSIA,18.0,6.30,M,57
63324,JZI,Airlines,Online,Basic Plan,No,3,VIET NAM,18.0,6.30,M,63


In [6]:
# Inspect the schema (column names and types) of the remote Parquet file
# without selecting any of its rows.
insurance_schema_sql = '''
DESCRIBE TABLE
  'https://github.com/weimenglee/DuckDB_Book/raw/main/travel%20insurance.parquet';
'''
conn.execute(insurance_schema_sql).df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,Agency,VARCHAR,YES,,,
1,Agency Type,VARCHAR,YES,,,
2,Distribution Channel,VARCHAR,YES,,,
3,Product Name,VARCHAR,YES,,,
4,Claim,VARCHAR,YES,,,
5,Duration,BIGINT,YES,,,
6,Destination,VARCHAR,YES,,,
7,Net Sales,DOUBLE,YES,,,
8,Commision (in value),DOUBLE,YES,,,
9,Gender,VARCHAR,YES,,,


In [7]:
# Project just two columns from the Parquet file. "Agency Type" contains a
# space, so it has to be written as a double-quoted identifier.
insurance_agency_sql = '''
SELECT
  Agency, "Agency Type"
FROM
  'https://github.com/weimenglee/DuckDB_Book'
  '/raw/main/travel%20insurance.parquet';
'''
conn.execute(insurance_agency_sql).df()

Unnamed: 0,Agency,Agency Type
0,CBH,Travel Agency
1,CBH,Travel Agency
2,CWT,Travel Agency
3,CWT,Travel Agency
4,CWT,Travel Agency
...,...,...
63321,JZI,Airlines
63322,JZI,Airlines
63323,JZI,Airlines
63324,JZI,Airlines


In [8]:
# Aggregate over the remote Parquet file: mean of the age column.
insurance_avg_age_sql = '''
SELECT
  avg(age)
FROM
  'https://github.com/weimenglee/DuckDB_Book'
  '/raw/main/travel%20insurance.parquet';
'''
conn.execute(insurance_avg_age_sql).df()

Unnamed: 0,avg(age)
0,39.969981


In [9]:
# Count the rows of the remote Parquet file.
insurance_row_count_sql = '''
SELECT
  count(*)
FROM
  'https://github.com/weimenglee/DuckDB_Book'
  '/raw/main/travel%20insurance.parquet';
'''
conn.execute(insurance_row_count_sql).df()

Unnamed: 0,count_star()
0,63326


## Querying Hugging Face Datasets

#### Reading the Dataset using hf:// Paths

In [10]:
import duckdb

# Start this section on a new connection; `conn` is rebound here, so the
# connection used in the previous section is no longer reachable through it.
conn = duckdb.connect()

# Hugging Face dataset files are addressed with an hf:// path built as:
#   hf:// + datasets/ + {username/dataset_name} + / + {file_name.extension}
tips_csv_sql = '''
  SELECT 
    *
  FROM
    'hf://datasets/scikit-learn/tips/tips.csv';
'''
conn.execute(tips_csv_sql).df()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,False,Sun,Dinner,2
1,10.34,1.66,Male,False,Sun,Dinner,3
2,21.01,3.50,Male,False,Sun,Dinner,3
3,23.68,3.31,Male,False,Sun,Dinner,2
4,24.59,3.61,Female,False,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,False,Sat,Dinner,3
240,27.18,2.00,Female,True,Sat,Dinner,2
241,22.67,2.00,Male,True,Sat,Dinner,2
242,17.82,1.75,Male,False,Sat,Dinner,2


In [11]:
# Copy the remote tips dataset into a table named Tips on this connection, so
# later queries can refer to it by name instead of by its hf:// path.
create_tips_sql = '''
CREATE TABLE Tips AS
FROM
  'hf://datasets/scikit-learn/tips/tips.csv';
'''
conn.execute(create_tips_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x75e39a6052b0>

In [12]:
# Read back the Tips table created in the previous cell.
select_tips_sql = '''
SELECT
  *
FROM
  Tips
'''
conn.execute(select_tips_sql).df()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,False,Sun,Dinner,2
1,10.34,1.66,Male,False,Sun,Dinner,3
2,21.01,3.50,Male,False,Sun,Dinner,3
3,23.68,3.31,Male,False,Sun,Dinner,2
4,24.59,3.61,Female,False,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,False,Sat,Dinner,3
240,27.18,2.00,Female,True,Sat,Dinner,2
241,22.67,2.00,Male,True,Sat,Dinner,2
242,17.82,1.75,Male,False,Sat,Dinner,2


#### Accessing Files Within a Folder

In [13]:
# When the file lives inside a folder of the dataset repository, the folder
# becomes part of the hf:// path:
#   hf:// + datasets/ + {username/dataset_name} + / + {folder_name/} + {file_name.extension}
adult_census_sql = '''
SELECT
  *
FROM
  'hf://datasets/AiresPucrs/adult-census-income'
  '/data/train-00000-of-00001-7e70ed54d8cbb057.parquet'
'''
conn.execute(adult_census_sql).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


#### Querying Multiple Files Using the Glob Syntax

Here is a list of glob patterns:
| Glob | Description |
|-------|---------------------------------------------------------------|
| * | Matches any number of characters (including none) |
| ** | Matches any number of subdirectories (including none) |
| ? | Matches any single character |
| [abc] | Matches one character given in the brackets |
| [a-z] | Matches one character from the range given in the brackets |

In [14]:
# *.csv matches every CSV file at the top level of the dataset repository.
# Caveat: all files matched by the glob must share the same schema; if they
# do not, the query raises an exception.
city_country_sql = '''
SELECT
  *
FROM
  'hf://datasets/gabrielwu/city_country/*.csv';
'''
conn.execute(city_country_sql).df()

Unnamed: 0,question,answer0,answer1,answer2,answer3,label
0,The city of Quebec is in,Canada,Martinique,Uruguay,Uzbekistan,0
1,The city of Sao Paulo is in,Brazil,Grenada,Japan,Montenegro,0
2,The city of Klang is in,Malaysia,Denmark,Iran,Chile,0
3,The city of Sorong is in,Indonesia,Macedonia,Madagascar,Bahrain,0
4,The city of Zhengzhou is in,China,Swaziland,Mongolia,Djibouti,0
...,...,...,...,...,...,...
2474,The city of Karaj is in,Iran,Cyprus,Botswana,Tokelau,0
2475,The city of Yunfu is in,China,New Caledonia,Liberia,Brunei Darussalam,0
2476,The city of Huaibei is in,China,Ghana,Oman,Morocco,0
2477,The city of Luoyang is in,China,Niue,Brazil,New Caledonia,0


In [15]:
# Same glob as above, narrowed to the questions that mention "Huaibei".
huaibei_sql = '''
SELECT
  *
FROM
  'hf://datasets/gabrielwu/city_country/*.csv'
WHERE question LIKE '%Huaibei%';
'''
conn.execute(huaibei_sql).df()

Unnamed: 0,question,answer0,answer1,answer2,answer3,label
0,The city of Huaibei is in,China,Ghana,Oman,Morocco,0


In [16]:
# Glob over the Parquet files in the dataset's data/ folder and keep only the
# question text of rows containing the word "happened".
happened_questions_sql = '''
SELECT
  question
FROM
  'hf://datasets/Stanford/web_questions/data/*.parquet'
WHERE question LIKE '%happened%';
'''
conn.execute(happened_questions_sql).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,question
0,what happened after mr. sugihara died?
1,what happened in bosnia in the 90s?
2,what happened to jill valentine?
3,what happened at chernobyl?
4,what happened to dunkirk during ww2?
5,what happened at benghazi?
6,what happened to pope john paul ii?
7,what do they think happened to natalee holloway?
8,what happened to daddy yankee?
9,what would have happened if germany had won ww1?


### Working with Private Hugging Face Datasets

#### Performing authentication - CONFIG provider method

In [17]:
import os
from getpass import getpass

import duckdb

# Obtain the Hugging Face token without ever writing it into the notebook:
# use the HF_TOKEN environment variable when it is set, otherwise prompt for it
# (getpass does not echo the value, so it is not captured in the cell output).
# You can get a token by signing up at https://huggingface.co/join and then
# generating one from your account settings.
hf_access_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')

# The token is spliced into a SQL string literal below, so double any single
# quote to keep the statement well-formed whatever the value contains.
hf_token_literal = hf_access_token.replace("'", "''")

# CONFIG-style secret: the token is passed explicitly in the statement.
# This method is ideal when you need to authenticate using a specific, known
# token, such as when tokens are rotated frequently or you need to set explicit
# credentials for each service.
conn = duckdb.connect()
conn.execute(f'''
CREATE SECRET hf_token (
  TYPE HUGGINGFACE,
  TOKEN '{hf_token_literal}'
);
''')

<duckdb.duckdb.DuckDBPyConnection at 0x75e399e80db0>

In [18]:
# Query a file from the Wei-Meng/StockPrices dataset; this connection
# authenticates with the hf_token secret created in the previous cell.
private_amzn_sql = '''
SELECT
  *
FROM
  'hf://datasets/Wei-Meng/StockPrices/AMZN.csv';
'''
conn.execute(private_amzn_sql).df()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1997-05-15,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,1997-05-16,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,1997-05-19,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,1997-05-20,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,1997-05-21,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
5660,2019-11-11,1778.000000,1780.000000,1767.130005,1771.650024,1771.650024,1946000
5661,2019-11-12,1774.660034,1786.219971,1771.910034,1778.000000,1778.000000,2037600
5662,2019-11-13,1773.390015,1775.000000,1747.319946,1753.109985,1753.109985,2989500
5663,2019-11-14,1751.430054,1766.589966,1749.560059,1754.599976,1754.599976,2264800


#### Performing authentication - CREDENTIAL_CHAIN provider method

In [19]:
%pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface_hub)
  Downloading hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Downloading huggingface_hub-0.34.4-py3-none-any.whl (561 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m4.6 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m17.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: hf-xet, huggingface_hub
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [huggingface_hub]
[1A[2KSuccessfully installed hf-xet-1.1.10 huggingface_hub-0.34.4
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Log in to Hugging Face so the access token is stored locally, where the
# CREDENTIAL_CHAIN secret provider used in the next cell can find it.
# `huggingface-cli` is a shell command rather than an IPython magic, so the
# login is done through the huggingface_hub Python API. The token is prompted
# for with getpass (not echoed) instead of being typed into the notebook.
from getpass import getpass

from huggingface_hub import login

login(token=getpass('Hugging Face token: '))

In [20]:
import duckdb

# New connection for the CREDENTIAL_CHAIN variant, so the secret name hf_token
# can be defined again from scratch.
conn = duckdb.connect()

# No token appears in this statement: the CREDENTIAL_CHAIN provider is asked to
# locate the credentials itself (presumably the ones saved by the Hugging Face
# login step above).
credential_chain_secret_sql = '''
  CREATE SECRET hf_token (
    TYPE HUGGINGFACE,
    PROVIDER CREDENTIAL_CHAIN
  );
'''
private_amzn_chain_sql = '''
  SELECT 
    *
  FROM 
    'hf://datasets/Wei-Meng/StockPrices/AMZN.csv';
'''
conn.execute(credential_chain_secret_sql)
conn.execute(private_amzn_chain_sql).df()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1997-05-15,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,1997-05-16,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,1997-05-19,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,1997-05-20,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,1997-05-21,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
5660,2019-11-11,1778.000000,1780.000000,1767.130005,1771.650024,1771.650024,1946000
5661,2019-11-12,1774.660034,1786.219971,1771.910034,1778.000000,1778.000000,2037600
5662,2019-11-13,1773.390015,1775.000000,1747.319946,1753.109985,1753.109985,2989500
5663,2019-11-14,1751.430054,1766.589966,1749.560059,1754.599976,1754.599976,2264800


In [21]:
# Close the DuckDB connection currently bound to `conn`.
conn.close()