## cuDF vs pandas

All credit for the data manipulation goes to @GIBA and his notebook [Article_id pairs in 3s using cuDF](https://www.kaggle.com/code/titericz/article-id-pairs-in-3s-using-cudf).


In this notebook we simply compare the performance of cuDF with that of pandas. The comparison is not exhaustive, but it gives an idea of whether it is worth using cuDF (GPU) instead of pandas (CPU) for data manipulation. Since the APIs they provide are nearly identical, the two libraries are very easy to compare.

In [None]:
import numpy as np
import pandas as pd
import cudf

In [None]:
import time
import gc

def mytimeit(f, n_exec=5, **kwargs):
    """Time repeated calls of ``f(**kwargs)``.

    Parameters
    ----------
    f : callable
        Function to benchmark. It is called with ``kwargs`` only.
    n_exec : int, default 5
        Number of timed executions.
    **kwargs
        Keyword arguments forwarded unchanged to ``f`` on every call.

    Returns
    -------
    times : list of float
        Duration of each call, in seconds, in execution order.
    res
        Return value of the last call, or ``None`` if ``f`` was never
        called (``n_exec`` of 0 or less).
    """
    times = []
    # Bound up front so that n_exec <= 0 returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    res = None
    for _ in range(n_exec):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted and skew a measurement.
        start = time.perf_counter()
        res = f(**kwargs)
        times.append(time.perf_counter() - start)
        # Collect garbage between runs, outside the timed region.
        gc.collect()
    return times, res

In [None]:
def calc_pairs(train):
    """Return, for each article, the article most often bought with it.

    Two articles count as bought together when they share the same
    (``customer_id``, ``t_dat``) pair in ``train``.

    Parameters
    ----------
    train : DataFrame
        Frame with ``customer_id``, ``t_dat`` and ``article_id`` columns.
        The notebook passes both pandas and cuDF frames.

    Returns
    -------
    DataFrame
        Columns ``article_id``, ``pair`` and ``count``: one row per article
        that has at least one companion, ordered by ``article_id``
        descending. ``count`` is the number of co-occurrences of the pair.
    """
    basket_keys = ['customer_id', 't_dat']

    # Gather each basket (one customer on one date) into a list of articles
    # and attach that list to every row of the basket.
    baskets = (
        train.groupby(basket_keys)['article_id']
        .agg(list)
        .rename('pair')
        .reset_index()
    )
    pairs = train[basket_keys + ['article_id']].merge(
        baskets, on=basket_keys, how='left'
    )
    del baskets
    gc.collect()

    # Unroll the lists: one row per (article, article from the same basket).
    pairs = pairs[['article_id', 'pair']].explode(column='pair')
    gc.collect()

    # An article paired with itself carries no information; drop those rows.
    pairs = pairs.loc[pairs['article_id'] != pairs['pair']].reset_index(drop=True)
    gc.collect()

    # Number of occurrences of every (article, companion) combination.
    pairs = pairs.groupby(['article_id', 'pair']).size().rename('count').reset_index()
    gc.collect()

    # Within each article, bring the most frequent companion to the front.
    pairs = pairs.sort_values(['article_id', 'count'], ascending=False).reset_index(drop=True)
    gc.collect()

    # Keep only the first (most frequent) companion of each article.
    is_top = pairs.groupby('article_id')['pair'].cumcount() == 0
    pairs = pairs.loc[is_top].reset_index(drop=True)
    gc.collect()

    return pairs

## cuDF

In [None]:
# Load only the columns the benchmark uses. 'price' and 'sales_channel_id'
# were previously parsed and then deleted straight away; selecting columns at
# read time avoids allocating them at all.
# NOTE(review): assumes the CSV holds exactly these three columns plus the two
# dropped ones - confirm against the dataset.
train = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
)

# Replace customer_id and t_dat with integer codes to save memory and speed up
# processing. int16 for t_dat assumes fewer than 32,768 distinct dates.
train['customer_id'] = train['customer_id'].factorize()[0].astype('int32')
train['t_dat'] = train['t_dat'].factorize()[0].astype('int16')
gc.collect()

# Number of rows and columns of train, then a preview of the first rows.
print(train.shape)
train.head(10)

In [None]:
# Mean calc_pairs run time on cuDF for random samples of 1M, 2M, ..., 15M rows.
metrics_cudf = []
for n_rows in range(1_000_000, 16_000_000, 1_000_000):
    # The sample is drawn once per size, before timing starts.
    run_times, _ = mytimeit(calc_pairs, train=train.sample(n=n_rows))
    metrics_cudf.append(np.mean(run_times))

In [None]:
# Drop the cuDF frame: nothing below uses it, and removing the name lets its
# memory be released (assuming no other references remain) before the same CSV
# is loaded again with pandas.
del train

## pandas

In [None]:
# Load only the columns the benchmark uses. 'price' and 'sales_channel_id'
# were previously parsed and then deleted straight away; selecting columns at
# read time avoids allocating them at all.
# NOTE(review): assumes the CSV holds exactly these three columns plus the two
# dropped ones - confirm against the dataset.
pandas_train = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
)

# Replace customer_id and t_dat with integer codes to save memory and speed up
# processing. int16 for t_dat assumes fewer than 32,768 distinct dates.
pandas_train['customer_id'] = pandas_train['customer_id'].factorize()[0].astype('int32')
pandas_train['t_dat'] = pandas_train['t_dat'].factorize()[0].astype('int16')
gc.collect()

# Number of rows and columns of pandas_train, then a preview of the first rows.
print(pandas_train.shape)
pandas_train.head(10)

In [None]:
# Mean calc_pairs run time on pandas for random samples of 1M, 2M, ..., 15M rows.
metrics_pandas = []
for n_rows in range(1_000_000, 16_000_000, 1_000_000):
    # The sample is drawn once per size, before timing starts.
    run_times, _ = mytimeit(calc_pairs, train=pandas_train.sample(n=n_rows))
    metrics_pandas.append(np.mean(run_times))

## Results

In [None]:
import matplotlib.pyplot as plt

# Wide, short figure so the 15 sample sizes spread out along the x axis.
plt.rcParams["figure.figsize"] = (30, 5)

# Same sample sizes as the two benchmark loops: 1M, 2M, ..., 15M rows.
lengths = list(range(1_000_000, 16_000_000, 1_000_000))

# One line per library; pandas is drawn first, then cuDF.
for label, series in (("pandas", metrics_pandas), ("cuDF", metrics_cudf)):
    plt.plot(lengths, series, 'o-', label=label)

plt.xlabel('DataFrame length')
plt.ylabel('Execution time(s)')
plt.title('cuDF vs pandas')
plt.legend()
plt.show()

In [None]:
# Ratio of the mean run times over all sample sizes; a value above 1 means
# cuDF was faster. The "x" marks the number as a multiplier.
speedup = np.mean(metrics_pandas) / np.mean(metrics_cudf)
print(f'cuDF is {speedup:.2f}x faster than pandas')