# Progress Bar for Merge or Concat Operation with tqdm in Pandas

https://datascientyst.com/progress-bar-merge-concat-operation-tqdm-pandas/

## Step 1: Install Dask and TQDM

```bash
pip install tqdm
pip install dask

pip install tqdm -U
pip install dask -U
```

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import dask.dataframe as dd

n = 450000
maxa = 700

# Seed NumPy's global generator so the synthetic keys/values (and therefore
# every merge result below) are identical on each Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Two frames of n rows each: an integer join key drawn from [0, maxa) and a
# random integer payload drawn from [0, 1e8).
df1 = pd.DataFrame({
    'lkey': np.random.randint(0, maxa, n),
    'lvalue': np.random.randint(0, int(1e8), n),
})
df2 = pd.DataFrame({
    'rkey': np.random.randint(0, maxa, n),
    'rvalue': np.random.randint(0, int(1e8), n),
})

# Dask views of the same data, split into 3 partitions each.
sd1 = dd.from_pandas(df1, npartitions=3)
sd2 = dd.from_pandas(df2, npartitions=3)

## Step 2: Add progress bar for merge on two DataFrames

### context `with TqdmCallback(desc="compute")`

In [2]:
from tqdm.dask import TqdmCallback
from dask.diagnostics import ProgressBar
# Register dask's own text progress bar; register() installs it globally, so
# it keeps printing for compute() calls in later cells as well.
dask_bar = ProgressBar()
dask_bar.register()

# Build the lazy merge first, then execute it with a tqdm bar that is active
# only for the duration of the `with` block.
lazy_merge = sd1.merge(sd2, left_on='lkey', right_on='rkey')
with TqdmCallback(desc="compute"):
    lazy_merge.compute()

[                                        ] | 0% Completed |  0.0s

compute:   0%|          | 0/31 [00:00<?, ?it/s]

[########################################] | 100% Completed |  6.2s


### globally

In [3]:
# Alternative to the context manager: register the tqdm callback once so it
# applies to every subsequent compute() call.
cb = TqdmCallback(desc="global")
cb.register()

# Describe the join lazily, then materialise it into `df`.
lazy_merge = sd1.merge(sd2, left_on='lkey', right_on='rkey')
df = lazy_merge.compute()

[                                        ] | 0% Completed |  0.0s

global:   0%|          | 0/31 [00:00<?, ?it/s]

[########################################] | 100% Completed |  6.2s


## Compare Pandas vs Dask merge performance

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import dask.dataframe as dd

# Smaller inputs for the timing comparison below.
n = 45000
maxa = 700

# Seed NumPy's global generator so the benchmark inputs (and the displayed
# merge results) are reproducible across kernel restarts.
SEED = 42
np.random.seed(SEED)

# Two frames of n rows each: an integer join key drawn from [0, maxa) and a
# random integer payload drawn from [0, 1e8).
df1 = pd.DataFrame({
    'lkey': np.random.randint(0, maxa, n),
    'lvalue': np.random.randint(0, int(1e8), n),
})
df2 = pd.DataFrame({
    'rkey': np.random.randint(0, maxa, n),
    'rvalue': np.random.randint(0, int(1e8), n),
})

# Dask views of the same data, split into 3 partitions each.
sd1 = dd.from_pandas(df1, npartitions=3)
sd2 = dd.from_pandas(df2, npartitions=3)

In [5]:
# Baseline: time the in-memory pandas merge of df1 and df2 on lkey == rkey.
%timeit df1.merge(df2, left_on='lkey', right_on='rkey')

10 loops, best of 3: 74.6 ms per loop


In [6]:
%timeit sd1.merge(sd2, left_on='lkey', right_on='rkey')

10 loops, best of 3: 20.8 ms per loop


In [7]:
# Display the pandas merge result (rows joined where lkey == rkey) as the cell output.
df1.merge(df2, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,lvalue,rkey,rvalue
0,190,74120230,190,22194194
1,190,74120230,190,89365878
2,190,74120230,190,46824129
3,190,74120230,190,73708072
4,190,74120230,190,38743003
...,...,...,...,...
2894666,126,24460833,126,13984908
2894667,126,24460833,126,73954895
2894668,126,24460833,126,60934273
2894669,126,24460833,126,30962768


In [8]:
# Same join expressed through dask: describe it lazily, then compute() to
# execute the graph and show the materialised result as the cell output.
lazy_merge = sd1.merge(sd2, left_on='lkey', right_on='rkey')
lazy_merge.compute()

[                                        ] | 0% Completed |  0.0s

global:   0%|          | 0/31 [00:00<?, ?it/s]

[########################################] | 100% Completed |  0.3s


Unnamed: 0,lkey,lvalue,rkey,rvalue
0,427,81656682,427,25865193
1,427,81656682,427,76718330
2,427,81656682,427,41007254
3,427,81656682,427,16779446
4,427,81656682,427,74411877
...,...,...,...,...
969739,650,6095988,650,12634050
969740,650,6095988,650,83722535
969741,650,6095988,650,61420075
969742,650,6095988,650,33649078
