In [None]:
!pip install duckdb==0.4.0

In [2]:
import duckdb
import pandas as pd
import sqlalchemy

In [3]:
# Reload the `sql` IPython extension and set three SqlMagic options.
# NOTE(review): the option meanings below follow the ipython-sql docs — confirm
# against the installed version.
%reload_ext sql
# autopandas: presumably makes SQL-magic results come back as pandas DataFrames.
%config SqlMagic.autopandas = True
# feedback: presumably silences the per-statement status messages.
%config SqlMagic.feedback = False
# displaycon: presumably hides the connection string echoed with each query.
%config SqlMagic.displaycon = False

DuckDB provides an out-of-the-box experience: you can connect to an existing database, or create a new one, with the same command, `duckdb.connect("<path-to-database>", read_only=False)`. DuckDB also lets you create a temporary in-memory database by calling `duckdb.connect()` with no arguments. `conn.execute()` runs a query against the connected database.

In [5]:
# Open the on-disk DuckDB database at ../dbs/dbnbs1 in read-write mode
# (read_only=False). `conn` is the handle the following cells run SQL through.
conn = duckdb.connect("../dbs/dbnbs1", read_only=False)

In [6]:
# The database lives in a file, so the table survives between kernel sessions.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE raises
# once test_table already exists, and the table would otherwise keep
# accumulating rows from earlier runs.
conn.execute("CREATE OR REPLACE TABLE test_table (i INTEGER, j STRING)")
# Seed two rows, then show only column i as a pandas DataFrame.
conn.execute("INSERT INTO test_table VALUES (1, 'one'),(9,'nine')")
conn.execute("SELECT i from test_table").fetchdf()

Unnamed: 0,i
0,1
1,9


In [7]:
# Both inserts share one prepared-statement text; the `?` placeholders are
# filled from the Python values passed alongside it.
insert_sql = "INSERT INTO test_table VALUES (?, ?)"

# Single row.
conn.execute(insert_sql, [2, 'two'])

# Several rows in one call: one inner list per row.
more_rows = [[3, 'three'], [4, 'four']]
conn.executemany(insert_sql, more_rows)

# Display the full table as a pandas DataFrame.
conn.execute("SELECT * from test_table ").fetchdf()

Unnamed: 0,i,j
0,1,one
1,9,nine
2,2,two
3,3,three
4,4,four


## CSV to Table

In [8]:
# Read the CSV with pandas and register the frame with the connection under
# the name "bank_df", so SQL can refer to it like a table.
df = pd.read_csv("../data/bank_data.csv")
conn.register("bank_df", df)

# Query the registered frame and return the result as a DataFrame.
recovery_sql = "SELECT actual_recovery_amount FROM bank_df WHERE age > 27"
conn.execute(recovery_sql).fetchdf()

Unnamed: 0,actual_recovery_amount
0,346.385000
1,520.710000
2,221.585000
3,201.815000
4,262.445000
...,...
1434,15388.195990
1435,9709.909257
1436,30209.181790
1437,20386.232190


## Relations

The Relation API lets you build queries programmatically by chaining relational commands. In short, you call Python methods on a relation and display the result. The displayed result contains a relation tree, the result columns, and a preview of the result rows.

In [9]:
# Build a relation directly over the CSV file (no pandas step involved).
# `rel` is reused by every cell in this section.
rel = duckdb.from_csv_auto("../data/bank_data.csv")
# A bare relation as the last expression displays its tree, columns and preview.
rel

---------------------
--- Relation Tree ---
---------------------
read_csv_auto(../data/bank_data.csv)

---------------------
-- Result Columns  --
---------------------
- id (INTEGER)
- expected_recovery_amount (INTEGER)
- actual_recovery_amount (DOUBLE)
- recovery_strategy (VARCHAR)
- age (INTEGER)
- sex (VARCHAR)

---------------------
-- Result Preview  --
---------------------
id	expected_recovery_amount	actual_recovery_amount	recovery_strategy	age	sex	
INTEGER	INTEGER	DOUBLE	VARCHAR	INTEGER	VARCHAR	
[ Rows: 10]
2030	194	263.540000	Level 0 Recovery	19	Male	
1150	486	416.090000	Level 0 Recovery	25	Female	
380	527	429.350000	Level 0 Recovery	27	Male	
1838	536	296.990000	Level 0 Recovery	25	Male	
1995	541	346.385000	Level 0 Recovery	34	Male	
731	548	520.710000	Level 0 Recovery	35	Male	
221	549	221.585000	Level 0 Recovery	33	Male	
1932	560	373.720000	Level 0 Recovery	19	Female	
1828	562	201.865000	Level 0 Recovery	22	Female	
2001	565	504.885000	Level 0 Recovery	27	Male	



In [10]:
# Show the alias currently attached to the relation.
rel.alias

'../data/bank_data.csv'

In [15]:
# set_alias() hands back a relation, so rebind `rel` to keep the new alias,
# then read the alias back to confirm it changed.
rel = rel.set_alias('bank_data')
rel.alias

'bank_data'

In [12]:
# Show the relation's type tag.
rel.type

'SUBQUERY_RELATION'

In [13]:
# List the relation's column names.
rel.columns

['id',
 'expected_recovery_amount',
 'actual_recovery_amount',
 'recovery_strategy',
 'age',
 'sex']

In [16]:
# Apply a row filter given as a SQL condition string; the result is displayed
# as a new relation and `rel` itself is not reassigned.
rel.filter('age > 18')

---------------------
--- Relation Tree ---
---------------------
Filter [age > 18]
  read_csv_auto(../data/bank_data.csv)

---------------------
-- Result Columns  --
---------------------
- id (INTEGER)
- expected_recovery_amount (INTEGER)
- actual_recovery_amount (DOUBLE)
- recovery_strategy (VARCHAR)
- age (INTEGER)
- sex (VARCHAR)

---------------------
-- Result Preview  --
---------------------
id	expected_recovery_amount	actual_recovery_amount	recovery_strategy	age	sex	
INTEGER	INTEGER	DOUBLE	VARCHAR	INTEGER	VARCHAR	
[ Rows: 10]
2030	194	263.540000	Level 0 Recovery	19	Male	
1150	486	416.090000	Level 0 Recovery	25	Female	
380	527	429.350000	Level 0 Recovery	27	Male	
1838	536	296.990000	Level 0 Recovery	25	Male	
1995	541	346.385000	Level 0 Recovery	34	Male	
731	548	520.710000	Level 0 Recovery	35	Male	
221	549	221.585000	Level 0 Recovery	33	Male	
1932	560	373.720000	Level 0 Recovery	19	Female	
1828	562	201.865000	Level 0 Recovery	22	Female	
2001	565	504.885000	Level 0 Recovery	27	

In [18]:
# Chain relation operations one step per line: filter rows, sort by sex,
# keep a single column, take the first two rows, and convert to pandas.
(
    rel.filter('age > 27')
    .order('sex')
    .project('actual_recovery_amount')
    .limit(2)
    .to_df()
)

Unnamed: 0,actual_recovery_amount
0,278.72
1,245.0


In [21]:
# Aggregate over the whole relation using a SQL aggregate expression string.
rel.aggregate("sum(actual_recovery_amount)")

---------------------
--- Relation Tree ---
---------------------
Aggregate [sum(actual_recovery_amount)]
  read_csv_auto(../data/bank_data.csv)

---------------------
-- Result Columns  --
---------------------
- sum(actual_recovery_amount) (DOUBLE)

---------------------
-- Result Preview  --
---------------------
sum(actual_recovery_amount)	
DOUBLE	
[ Rows: 1]
7529821.469511	



In [23]:
# Self-join: give each side of the join its own alias so the join condition
# can tell the two copies of `id` apart, then materialise as a DataFrame.
left_side = rel.set_alias('a')
right_side = rel.set_alias('b')
left_side.join(right_side, 'a.id=b.id').to_df()

Unnamed: 0,id,expected_recovery_amount,actual_recovery_amount,recovery_strategy,age,sex,id_2,expected_recovery_amount_2,actual_recovery_amount_2,recovery_strategy_2,age_2,sex_2
0,2030,194,263.540000,Level 0 Recovery,19,Male,2030,194,263.540000,Level 0 Recovery,19,Male
1,1150,486,416.090000,Level 0 Recovery,25,Female,1150,486,416.090000,Level 0 Recovery,25,Female
2,380,527,429.350000,Level 0 Recovery,27,Male,380,527,429.350000,Level 0 Recovery,27,Male
3,1838,536,296.990000,Level 0 Recovery,25,Male,1838,536,296.990000,Level 0 Recovery,25,Male
4,1995,541,346.385000,Level 0 Recovery,34,Male,1995,541,346.385000,Level 0 Recovery,34,Male
...,...,...,...,...,...,...,...,...,...,...,...,...
1877,361,9785,15388.195990,Level 4 Recovery,65,Female,361,9785,15388.195990,Level 4 Recovery,65,Female
1878,196,9857,9709.909257,Level 4 Recovery,68,Female,196,9857,9709.909257,Level 4 Recovery,68,Female
1879,313,9859,30209.181790,Level 4 Recovery,58,Female,313,9859,30209.181790,Level 4 Recovery,58,Female
1880,1781,9920,20386.232190,Level 4 Recovery,67,Female,1781,9920,20386.232190,Level 4 Recovery,67,Female


## Queries

In [29]:
# Run SQL straight against the CSV file. Adjacent string literals are joined
# by Python at compile time, so this is a single query string.
res = duckdb.query(
    "SELECT sex, SUM(expected_recovery_amount) as Expected, "
    " SUM(actual_recovery_amount) as Actual "
    " FROM '../data/bank_data.csv' "
    " WHERE recovery_strategy LIKE 'Level 4 Recovery' "
    " GROUP BY sex "
    " HAVING sex = 'Female'"
)

# Convert the query result to a pandas DataFrame for display.
res.df()

Unnamed: 0,sex,Expected,Actual
0,Female,867087.0,1597423.0
