# How to make your Pandas code faster

### 1. Accelerated Array Computing

In [1]:
import numpy as np
import pandas as pd
from numba import jit

# node numbers of the graph
nodes = np.arange(0, 6, 1)

# generate random fake connections
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])

In [2]:
record

array([[1, 4],
       [2, 4],
       [4, 3],
       ...,
       [5, 2],
       [3, 0],
       [4, 4]])

In [3]:
# create adjacency matrix:
am = np.zeros((len(nodes), len(nodes)))
am

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [4]:
# simple iterate through the records:
for row, col in record:
    
    am[row, col] += 1
    
am  

array([[148., 156., 194., 153., 178., 177.],
       [168., 150., 188., 163., 166., 177.],
       [185., 151., 173., 177., 157., 179.],
       [161., 148., 187., 163., 154., 162.],
       [153., 159., 161., 176., 168., 178.],
       [160., 173., 169., 169., 150., 169.]])

In [5]:
# let's wrap the above code into function and time it:

def adjacent1(record, am):
    for row, col in record:
        am[row, col] += 1
    return am

# clear history
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])
am = np.zeros((len(nodes), len(nodes)))

In [6]:
%timeit adjacent1(record, am)

7.68 ms ± 55.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# let's use the power of numba

@jit(nopython=True, parallel=True)
def adjacent2(record, am):
    for row in range(record.shape[0]):
        am[record[row,:][0], record[row, :][1]] += 1 # this twisted roundabout is due to not supported feature for iterating 2d arrays
    return am

# clear history
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])
am = np.zeros((len(nodes), len(nodes)))

In [8]:
%timeit adjacent2(record, am)

8.7 µs ± 2.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


### 2. Converting a Pandas DataFrame to Numpy array

In [9]:
# let's create a pandas df from `record`

df_record = pd.DataFrame(record, columns=['o', 'd'])
df_record

Unnamed: 0,o,d
0,1,2
1,3,1
2,2,2
3,4,3
4,0,5
5,4,0
6,2,3
7,5,2
8,1,4
9,4,3


In [10]:
# let's rewrite adjacent function with pandas

def adjacent3(record, am):
    for _, row in record.iterrows():
        am[row['o'], row['d']] += 1
    return am

# clear history
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])
df_record = pd.DataFrame(record, columns=['o', 'd'])
am = np.zeros((len(nodes), len(nodes)))

In [11]:
# time the pandas version
%timeit adjacent3(df_record, am)

358 ms ± 5.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# suppose the df_record is what we have in the first place.
# we want to leverage numba power to speed up our code
# so have to convert pandas df to numpy array first:

def adjacent4(record, am):
    record = record.to_numpy()
    record = record[1:, :]
    
    return adjacent2(record, am)

# clear history
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])
df_record = pd.DataFrame(record, columns=['o', 'd'])
am = np.zeros((len(nodes), len(nodes)))    

In [13]:
%timeit adjacent4(df_record, am)

15.5 µs ± 43.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


The convertion from pandas dataframe to numpy array slows down more than 7 times compared with adjacent2.

Thus it is not a good idea to use a lot of pandas if you are performance sensitive.

However, it is still 35.1ms / 8.3us ~= 4000+ times faster!

We can expect that more rows the record got, the more obvious is the acceleration:

When set record has 6000 rows (10 times), the speed boost is 355ms / 15.3us ~= 20000+

In [14]:
# let's go through a real call:
# clear history
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])
df_record = pd.DataFrame(record, columns=['o', 'd'])
am = np.zeros((len(nodes), len(nodes)))

res = adjacent4(df_record, am)
res

array([[169., 168., 175., 173., 156., 174.],
       [179., 172., 183., 151., 172., 170.],
       [165., 160., 148., 197., 165., 166.],
       [159., 177., 162., 172., 155., 148.],
       [151., 176., 170., 166., 165., 170.],
       [187., 150., 144., 176., 156., 172.]])

In [15]:
# another real call of the slow pandas version:
record = np.array([np.random.choice(nodes, 2) for i in range(6000)])
df_record = pd.DataFrame(record, columns=['o', 'd'])
am = np.zeros((len(nodes), len(nodes)))

res = adjacent3(df_record, am)
res

array([[172., 174., 151., 157., 171., 162.],
       [153., 167., 160., 150., 183., 173.],
       [165., 177., 184., 157., 169., 169.],
       [179., 161., 166., 138., 157., 161.],
       [184., 193., 159., 160., 154., 179.],
       [188., 170., 159., 147., 166., 185.]])

See, this would eventually result in HUGE difference when you have a big dataset.