# **Columnar Tutorial**

In [1]:
# Smoke test: confirms that the kernel runs code and displays its output.
greeting = "Hello World"
print(greeting)

Hello World


### **SETUP**

In [2]:
import mysql.connector
import clickhouse_connect
import pandas as pd
import time

In [3]:
# MySQL connection settings, passed as keyword arguments to
# mysql.connector.connect().
MYSQL_CONFIG = dict(
    host="mysql",
    user="user",
    password="password",
    database="testdb",
)

# ClickHouse connection settings, passed as keyword arguments to
# clickhouse_connect.get_client().
CLICKHOUSE_CONFIG = dict(
    host="clickhouse",
    user="default",
    password="password",
    port=8123,
)

### **TEST DES CONTENEURS**

In [None]:
# Connexion MySQL
def test_mysql():
    conn = mysql.connector.connect(**MYSQL_CONFIG)
    cursor = conn.cursor()

    cursor.execute("DROP TABLE IF EXISTS test_table;")
    cursor.execute("""
        CREATE TABLE test_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name CHAR(10),
            value INT
        );
    """)

    cursor.execute("INSERT INTO test_table (name, value) VALUES ('test', 100), ('test', 200);")
    conn.commit()

    cursor.execute("SELECT * FROM test_table LIMIT 1;")
    print("MySQL :", cursor.fetchone())

    cursor.close()
    conn.close()

In [None]:
# Connexion ClickHouse
def test_clickhouse():
    client = clickhouse_connect.get_client(**CLICKHOUSE_CONFIG)

    client.command("DROP TABLE IF EXISTS test_table;")
    client.command("""
        CREATE TABLE test_table (
            id UInt32,
            name String,
            value Int32
        ) ENGINE = MergeTree()
        ORDER BY id;
    """)

    client.insert("test_table", [[1, 'test', 100], [2, 'test', 200]])

    result = client.query("SELECT * FROM test_table LIMIT 1;")
    print("ClickHouse :", result.result_rows[0])

In [None]:
# Runs the MySQL smoke test. It should print the first inserted row:
# MySQL : (1, 'test', 100)
test_mysql()

MySQL :  (1, 'test', 100)


In [None]:
# Runs the ClickHouse smoke test. It should print the first inserted row:
# ClickHouse : (1, 'test', 100)
test_clickhouse()

ClickHouse :  (1, 'test', 100)


Instanciation du client ClickHouse et de la connexion MySQL utilisés dans la suite du notebook

In [8]:
# Notebook-wide handles reused by the cells below:
# `client` talks to ClickHouse, `conn` is the MySQL connection.
client = clickhouse_connect.get_client(**CLICKHOUSE_CONFIG)
conn = mysql.connector.connect(**MYSQL_CONFIG)

Fonction pour simplifier les requêtes mysql

In [9]:
def sql_query(query):
    """Run ``query`` on the notebook-wide MySQL connection and return all rows.

    Args:
        query: SQL statement to execute, passed unchanged to ``cursor.execute``.

    Returns:
        The value of ``cursor.fetchall()`` for the executed statement.

    Raises:
        Whatever the cursor raises while executing or fetching; the cursor is
        closed before the exception propagates.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Always release the cursor, including when execute()/fetchall() fails.
        cursor.close()

Chargement du dataset. Il contient un ensemble d'informations sur des trajets de taxi à New York.
Le dataset a été réduit à 50 000 lignes.

In [10]:
# Columns read from the CSV; the other columns of the file are not loaded.
taxi_columns = [
    "VendorID",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "tolls_amount",
]
df = pd.read_csv("./dataset/input_data.csv", usecols=taxi_columns)

## Comparaison 1 : Chargement des données

In [None]:
# Number of rows used by the row-by-row insertion timing cells.
NB_ROWS_TO_INSERT = 5000

On insère `NB_ROWS_TO_INSERT` lignes une par une afin de mesurer le temps que met chaque système à les ajouter.

In [15]:
# Time NB_ROWS_TO_INSERT single-row INSERTs into MySQL, one commit per row.
cursor = conn.cursor()
try:
    # Start from an empty table so the cell can be re-run.
    cursor.execute("DROP TABLE IF EXISTS nyc_taxi;")

    cursor.execute("""
        CREATE TABLE nyc_taxi (
            id INT AUTO_INCREMENT PRIMARY KEY,
            vendor_id INT,
            passenger_count INT,
            trip_distance FLOAT,
            fare_amount FLOAT,
            tip_amount FLOAT,
            tolls_amount FLOAT
        );
    """)

    conn.commit()

    # Select the columns by name so the values line up with the INSERT column
    # list whatever the column order of the CSV, and use head() so that exactly
    # NB_ROWS_TO_INSERT rows are inserted (breaking on `i == NB_ROWS_TO_INSERT`
    # after the insert added one row too many).
    rows_to_insert = df.loc[
        :,
        [
            "VendorID",
            "passenger_count",
            "trip_distance",
            "fare_amount",
            "tip_amount",
            "tolls_amount",
        ],
    ].head(NB_ROWS_TO_INSERT)

    # perf_counter() is a monotonic, high-resolution clock meant for timing.
    t0 = time.perf_counter()
    for _, row in rows_to_insert.iterrows():
        cursor.execute("""
            INSERT INTO nyc_taxi (vendor_id, passenger_count, trip_distance, fare_amount, tip_amount, tolls_amount)
            VALUES (%s, %s, %s, %s, %s, %s)
        """, tuple(row))
        conn.commit()
    t1 = time.perf_counter()
    print("Time required to add %d rows one by one : " % len(rows_to_insert), t1 - t0)
finally:
    # Release the cursor even if a statement fails.
    cursor.close()

Time required to add 10000 rows one by one :  29.56834125518799


True

In [16]:
# Time NB_ROWS_TO_INSERT single-row inserts into ClickHouse.
# Start from an empty table so the cell can be re-run.
client.command("DROP TABLE IF EXISTS nyc_taxi;")

client.command("""
    CREATE TABLE nyc_taxi (
        id UInt32,
        vendor_id UInt8,
        passenger_count UInt8,
        trip_distance Float32,
        fare_amount Float32,
        tip_amount Float32,
        tolls_amount Float32
    ) ENGINE = MergeTree()
    ORDER BY id;
""")

# head() bounds the loop to exactly NB_ROWS_TO_INSERT rows (breaking on
# `i == NB_ROWS_TO_INSERT` after the insert added one row too many).
rows_to_insert = df.head(NB_ROWS_TO_INSERT)

# perf_counter() is a monotonic, high-resolution clock meant for timing.
t0 = time.perf_counter()
for i, row in rows_to_insert.iterrows():
    # The DataFrame index is used as the row id.
    client.insert("nyc_taxi", [(i, row["VendorID"], row["passenger_count"], row["trip_distance"], row["fare_amount"], row["tip_amount"], row["tolls_amount"])])
t1 = time.perf_counter()
print("Time required to add %d rows one by one : " % len(rows_to_insert), t1 - t0)

Time required to add 10000 rows one by one :  51.34179663658142


On observe que ClickHouse est beaucoup plus lent que MySQL pour l'insertion des données ligne par ligne.
C'est dû à la différence de mode de stockage :
- Pour MySQL, on ajoute 1 liste de N éléments
- Pour ClickHouse, on ajoute 1 élément dans N listes

Afin de pallier ce problème, on utilise la fonction `insert` avec un tableau de données pour ajouter un paquet de lignes en une seule fois.
Ajoutons maintenant l'ensemble des données dans les deux bases de données.

In [17]:
# Load the whole dataset into MySQL with a single commit at the end.
cursor = conn.cursor()
try:
    # Start from an empty table so the cell can be re-run.
    cursor.execute("DROP TABLE IF EXISTS nyc_taxi;")

    cursor.execute("""
        CREATE TABLE nyc_taxi (
            id INT AUTO_INCREMENT PRIMARY KEY,
            vendor_id INT,
            passenger_count INT,
            trip_distance FLOAT,
            fare_amount FLOAT,
            tip_amount FLOAT,
            tolls_amount FLOAT
        );
    """)

    conn.commit()

    # Select the columns by name so the values line up with the INSERT column
    # list whatever the column order of the CSV.
    rows_to_insert = df.loc[
        :,
        [
            "VendorID",
            "passenger_count",
            "trip_distance",
            "fare_amount",
            "tip_amount",
            "tolls_amount",
        ],
    ]

    for _, row in rows_to_insert.iterrows():
        cursor.execute("""
            INSERT INTO nyc_taxi (vendor_id, passenger_count, trip_distance, fare_amount, tip_amount, tolls_amount)
            VALUES (%s, %s, %s, %s, %s, %s)
        """, tuple(row))

    conn.commit()
finally:
    # Release the cursor even if a statement fails.
    cursor.close()

True

In [18]:
# Load the whole dataset into ClickHouse with one batched insert.
# Start from an empty table so the cell can be re-run.
client.command("DROP TABLE IF EXISTS nyc_taxi;")

client.command("""
    CREATE TABLE nyc_taxi (
        id UInt32,
        vendor_id UInt8,
        passenger_count UInt8,
        trip_distance Float32,
        fare_amount Float32,
        tip_amount Float32,
        tolls_amount Float32
    ) ENGINE = MergeTree()
    ORDER BY id;
""")

# One list per row, in table column order; the DataFrame index is the row id.
# (A first pass that built a 5-column list and was immediately overwritten
# has been removed: it only cost an extra full iteration over the frame.)
data = [
    [i, row["VendorID"], row["passenger_count"], row["trip_distance"], row["fare_amount"], row["tip_amount"], row["tolls_amount"]]
    for i, row in df.iterrows()
]

client.insert("nyc_taxi", data)

<clickhouse_connect.driver.summary.QuerySummary at 0x7f0ee8f9dfd0>

On voit ici qu'il y a un réel gain de temps lors de l'ajout par batch.
L'ajout progressif des données est une limitation importante de ClickHouse et des bases de données orientées colonnes en général.

In [19]:
# Row-count check: print the number of rows stored in each database.
count_statement = "SELECT COUNT(*) FROM nyc_taxi;"

result = sql_query(count_statement)
mysql_total = result[0][0]
print("MySQL Total Rows:", mysql_total)

result = client.query(count_statement)
clickhouse_total = result.result_rows[0][0]
print("ClickHouse Total Rows:", clickhouse_total)

MySQL Total Rows: 500000
ClickHouse Total Rows: 500000


## Comparaison 2 : Requête (global)

On définit plusieurs requêtes pour étudier la différence de temps d'exécution selon la base de données.

In [24]:
# Benchmark statements, run unchanged against both MySQL and ClickHouse:
#   A: full table read
#   B: full table read sorted on two columns
#   C: sum over one column
#   D: total row count
#   E: count over one column
query_A, query_B, query_C, query_D, query_E = (
    "SELECT * FROM nyc_taxi;",
    "SELECT * FROM nyc_taxi ORDER BY passenger_count ASC, fare_amount DESC;",
    "SELECT SUM(trip_distance) FROM nyc_taxi;",
    "SELECT COUNT(*) FROM nyc_taxi;",
    "SELECT COUNT(passenger_count) FROM nyc_taxi;",
)

In [41]:
# Average execution time of each global query on both databases.
iterations = 10
for query in [query_A, query_B, query_C, query_D, query_E]:
    # perf_counter() is a monotonic, high-resolution clock; time.time() can be
    # adjusted by the system and is too coarse for millisecond-scale queries.
    t0 = time.perf_counter()
    for _ in range(iterations):
        resultsql = sql_query(query)
    # t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
    t1 = time.perf_counter()
    for _ in range(iterations):
        resultcol = client.query(query)
    t2 = time.perf_counter()
    print("Requête : ", query)
    print("MySQL :      ", (t1 - t0)/iterations)
    print("ClickHouse : ", (t2 - t1)/iterations)
    print(" ")

Requête :  SELECT * FROM nyc_taxi;
MySQL :       0.7422834873199463
ClickHouse :  0.08609867095947266
 
Requête :  SELECT * FROM nyc_taxi ORDER BY passenger_count ASC, fare_amount DESC;
MySQL :       0.960255217552185
ClickHouse :  0.06942627429962159
 
Requête :  SELECT SUM(trip_distance) FROM nyc_taxi;
MySQL :       0.05632903575897217
ClickHouse :  0.004126310348510742
 
Requête :  SELECT COUNT(*) FROM nyc_taxi;
MySQL :       0.010936164855957031
ClickHouse :  0.002720212936401367
 
Requête :  SELECT COUNT(passenger_count) FROM nyc_taxi;
MySQL :       0.052386474609375
ClickHouse :  0.002329587936401367
 


Try it yourself !

In [None]:
# Playground: put your own SQL statement in `query` and run the cell.
query = ""
iterations = 10
if not query.strip():
    # An empty statement would be sent to both databases as-is; skip the
    # benchmark so the notebook still runs from top to bottom.
    print("Set `query` to a SQL statement before running this cell.")
else:
    # perf_counter() is a monotonic, high-resolution clock meant for timing.
    t0 = time.perf_counter()
    for _ in range(iterations):
        resultsql = sql_query(query)
    # t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
    t1 = time.perf_counter()
    for _ in range(iterations):
        resultcol = client.query(query)
    t2 = time.perf_counter()
    print("Requête : ", query)
    print("MySQL :      ", (t1 - t0)/iterations)
    print("ClickHouse : ", (t2 - t1)/iterations)

## Comparaison 3 : Requête (specific)

In [50]:
# Lookup of a single row by id, averaged over `iterations` runs.
query = "SELECT * FROM nyc_taxi WHERE id=1523"
iterations = 10
# perf_counter() is a monotonic, high-resolution clock; time.time() is too
# coarse and adjustable for sub-millisecond measurements like this one.
t0 = time.perf_counter()
for _ in range(iterations):
    resultsql = sql_query(query)
# t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
t1 = time.perf_counter()
for _ in range(iterations):
    resultcol = client.query(query)
t2 = time.perf_counter()
print("MySQL :      ", (t1 - t0)/iterations)
print("ClickHouse : ", (t2 - t1)/iterations)

MySQL :       0.0005139350891113281
ClickHouse :  0.0030791521072387694


In [51]:
# Lookup of two rows by id, averaged over `iterations` runs.
query = "SELECT * FROM nyc_taxi WHERE id=30523 OR id=28645"
iterations = 10
# perf_counter() is a monotonic, high-resolution clock; time.time() is too
# coarse and adjustable for sub-millisecond measurements like this one.
t0 = time.perf_counter()
for _ in range(iterations):
    resultsql = sql_query(query)
# t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
t1 = time.perf_counter()
for _ in range(iterations):
    resultcol = client.query(query)
t2 = time.perf_counter()
print("MySQL :      ", (t1 - t0)/iterations)
print("ClickHouse : ", (t2 - t1)/iterations)

MySQL :       0.00045096874237060547
ClickHouse :  0.0055588483810424805


Les accès à des lignes précises sont bien plus longs avec ClickHouse qu'avec MySQL !

In [53]:
# Filter mixing an id match with a condition on tip_amount, averaged over
# `iterations` runs.
query = "SELECT * FROM nyc_taxi WHERE id = 30523 OR tip_amount > 0"
iterations = 10
# perf_counter() is a monotonic, high-resolution clock meant for timing.
t0 = time.perf_counter()
for _ in range(iterations):
    resultsql = sql_query(query)
# t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
t1 = time.perf_counter()
for _ in range(iterations):
    resultcol = client.query(query)
t2 = time.perf_counter()
print("MySQL :      ", (t1 - t0)/iterations)
print("ClickHouse : ", (t2 - t1)/iterations)

MySQL :       0.4327935457229614
ClickHouse :  0.05872609615325928


In [55]:
# Range filter on fare_amount, averaged over `iterations` runs.
query = "SELECT * FROM nyc_taxi WHERE fare_amount > 1 AND fare_amount < 12 "
iterations = 10
# perf_counter() is a monotonic, high-resolution clock meant for timing.
t0 = time.perf_counter()
for _ in range(iterations):
    resultsql = sql_query(query)
# t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
t1 = time.perf_counter()
for _ in range(iterations):
    resultcol = client.query(query)
t2 = time.perf_counter()
print("MySQL :      ", (t1 - t0)/iterations)
print("ClickHouse : ", (t2 - t1)/iterations)

MySQL :       0.500825047492981
ClickHouse :  0.056356501579284665


À l'inverse, le filtrage portant sur une colonne est bien plus rapide avec ClickHouse.

In [None]:
# Equality filter on fare_amount, averaged over `iterations` runs.
query = "SELECT * FROM nyc_taxi WHERE fare_amount = 1.0"
iterations = 10
# perf_counter() is a monotonic, high-resolution clock meant for timing.
t0 = time.perf_counter()
for _ in range(iterations):
    resultsql = sql_query(query)
# t1 is both the end of the MySQL runs and the start of the ClickHouse runs.
t1 = time.perf_counter()
for _ in range(iterations):
    resultcol = client.query(query)
t2 = time.perf_counter()
print("MySQL :      ", (t1 - t0)/iterations)
print("ClickHouse : ", (t2 - t1)/iterations)

MySQL :       0.10245535373687745
ClickHouse :  0.005701088905334472
