# Testing CuDF's GPU acceleration w/ Pandas dataframes

Dataset used: UK property price data from Kaggle:

https://www.kaggle.com/datasets/willianoliveiragibin/uk-property-price-data-1995-2023-04/data

In [3]:
#import cudf

In [153]:
!pip install line_profiler

Collecting line_profiler
  Downloading line_profiler-4.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (715 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m715.2/715.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: line_profiler
Successfully installed line_profiler-4.1.2


In [4]:
import pandas as pd
import numpy as np
import random
import itertools

In [154]:
%load_ext line_profiler

In [4]:
#%load_ext cudf.pandas

In [189]:
# Inspired by Rob Mulla - https://www.youtube.com/watch?v=u4_c2LDi4b8&list=PL7RwtdVQXQ8qxBH6ugYn50D0M5u--2Xx4&index=15
# Good result - a good result for a team in their category (e.g. points for backmarker, podium for midfield, win for leading)
# Assuming 21 races a season.
def rand_color(paint, base, base_col):
    '''
        Returns a string joining randomly picked elements from the three lists passed as args.
    '''
    # No wonder it's so slow (for loop). Args can be of different size
    cols_array = np.empty(size, dtype = object)
    for iteration in range(size):
        color = ''
        color = color + str(random.choice(paint))
        color = color + ' ' + str(random.choice(base))
        color = color + ' ' + str(random.choice(base_col))
        cols_array[iteration] = color
    print(cols_array[:10])
    return cols_array
def make_data(size):
    paint_type = ['Gloss', 'Matte', 'Iridescent', 'Semi-gloss', 'Satin', 'Low-gloss']
    color_type = ['Light', 'Dark', 'Deep', 'Pale', 'Chromatic', 'Racing', 'Radiant']
    base_colors = ['Red', 'Green', 'Blue', 'White', 'Black', 'Yellow']
    df = pd.DataFrame()
    df['year'] = np.random.randint(1990, 2023, size)
    df['team'] = np.random.choice(['leading', 'midfield', 'backmarker'], size)
    df['best_finish'] = np.random.randint(1, 20, size)
    df['points_avg'] = np.random.randint(1, 26, size)
    df['total_points'] = np.multiply(df['points_avg'].to_numpy(), 21)
    # This is the slowest, taking 90% of the time
    car_colors = rand_color(paint_type, color_type, base_colors)
    print(car_colors)
    df['color_type'] = np.random.choice(car_colors, size)
    df['car_color'] = np.random.choice(paint_type + color_type + base_colors, size)
    df['good_res'] = np.random.choice(['yes', 'no'], size)
    df['prob_good_res'] = np.random.uniform(0, 1, size)
    return df

In [3]:
#paint_type = ['Gloss', 'Matte', 'Iridescent', 'Semi-gloss', 'Satin', 'Low-gloss']
#color_type = ['Light', 'Dark', 'Deep', 'Pale', 'Chromatic', 'Racing', 'Radiant']
#base_colors = ['Red', 'Green', 'Blue', 'White', 'Black', 'Yellow']
#car_colors = rand_color(paint_type, color_type, base_colors)

In [181]:
#%lprun -f make_data make_data(10 ** 5)

['Gloss Deep White', 'Satin Deep Red', 'Iridescent Racing White', 'Iridescent Dark Black', 'Iridescent Dark Yellow', 'Semi-gloss Dark Blue', 'Gloss Dark Green', 'Semi-gloss Radiant White', 'Low-sheen Chromatic Black', 'Low-sheen Dark Green']


In [2]:
#df2 = make_data(10 ** 4)
#df2.to_csv('data.csv')

In [6]:
%%time
df = pd.read_csv('data.csv')

CPU times: user 17.7 s, sys: 4.04 s, total: 21.7 s
Wall time: 27.3 s


#### Wall time 27 seconds. Eek.

In [7]:
#df = pd.read_csv('data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000000 entries, 0 to 19999999
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Unnamed: 0     int64  
 1   year           int64  
 2   team           object 
 3   best_finish    int64  
 4   points_avg     int64  
 5   total_points   int64  
 6   color_type     object 
 7   car_color      object 
 8   good_res       object 
 9   prob_good_res  float64
dtypes: float64(1), int64(5), object(4)
memory usage: 1.5+ GB


First, we'll import the data and look at it. 

Note the datatype for price. We're explicitly setting this, due to the original dataset being parsed as a string. 

In [17]:
len(df)

10000000

A simple operation to find unique values. In this case, we're going to grab for unique Towns/Cities.

In [8]:
# get unique values for a given column:
which_unique_col = 'color_type'
uniques = df[which_unique_col].unique()
print(len(uniques))

210


In [11]:
df['color_type'][:500]

0      Semi-gloss Chromatic Green
1              Matte Racing White
2                 Gloss Light Red
3           Iridescent Dark White
4               Gloss Pale Yellow
                  ...            
495       Iridescent Radiant Blue
496       Iridescent Radiant Blue
497            Gloss Racing Green
498              Matte Deep Black
499              Matte Dark White
Name: color_type, Length: 500, dtype: object

In [12]:
uniques[:10]

array(['Semi-gloss Chromatic Green', 'Matte Racing White',
       'Gloss Light Red', 'Iridescent Dark White', 'Gloss Pale Yellow',
       'Semi-gloss Chromatic Red', 'Iridescent Chromatic Blue',
       'Satin Deep Black', 'Gloss Radiant Red', 'Satin Deep Yellow'],
      dtype=object)

Now let's do a slightly more complicated operation. We're going to find the average price for each town/city.

In [13]:
df['best_finish'].unique()

array([12, 15,  7, 13, 17, 19,  9, 18, 10,  6,  4,  3,  1, 16,  8, 14,  5,
       11,  2])

In [14]:
%%timeit
years = sorted(df['year'].unique())

115 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
years = sorted(df['year'].unique())
print(years)
uniques_points = {}

[1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]


In [16]:
%%timeit
grouped_df = df.groupby([which_unique_col], as_index = False)[['color_type', 'total_points']]
uniques_points = grouped_df.mean().values.tolist()

2.24 s ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
uniques_points = {}

In [11]:
%%time
for unique in uniques:
    uniques_points[unique] = df[df[which_unique_col] == unique]['total_points'].mean()
len(uniques_points.items())

CPU times: user 4min 2s, sys: 59.4 ms, total: 4min 2s
Wall time: 4min 2s


210

#### With 1.5GB of data, GroupBy way - 2.24 seconds
#### Old way - 4 min 2 seconds ( ~240 secs) 
#### GroupBy was 119x faster!

### 10^4 dataset:
#### Using GroupBy was 72 times faster!!
#### Old way - 228ms +- 4.32ms
#### This way - 3.17ms

In [17]:
# Also try - groupby.transform('mean')
grouped_df = df.groupby([which_unique_col], as_index = False)[['color_type', 'total_points']]
mean_points = grouped_df.mean()
print(mean_points)

                    color_type  total_points
0        Gloss Chromatic Black    272.878697
1         Gloss Chromatic Blue    273.177468
2        Gloss Chromatic Green    272.432141
3          Gloss Chromatic Red    273.302508
4        Gloss Chromatic White    272.762697
..                         ...           ...
205    Semi-gloss Radiant Blue    272.921620
206   Semi-gloss Radiant Green    273.080040
207     Semi-gloss Radiant Red    273.083033
208   Semi-gloss Radiant White    273.395964
209  Semi-gloss Radiant Yellow    273.056376

[210 rows x 2 columns]


In [18]:
uniques_points = mean_points.values.tolist()
print(len(uniques_points))
uniques_points[:10]

210


[['Gloss Chromatic Black', 272.87869701082667],
 ['Gloss Chromatic Blue', 273.17746788990826],
 ['Gloss Chromatic Green', 272.4321411740064],
 ['Gloss Chromatic Red', 273.30250795294813],
 ['Gloss Chromatic White', 272.76269691241333],
 ['Gloss Chromatic Yellow', 272.2264190558174],
 ['Gloss Dark Black', 272.9953761952693],
 ['Gloss Dark Blue', 272.9275161005177],
 ['Gloss Dark Green', 272.962793052172],
 ['Gloss Dark Red', 272.21319086219603]]

In [15]:
#%%timeit
## Debugging code - ignore
#uniques_points[unique] = [grouped_df.mean() for unique in uniques]
uniques_points = {}

In [16]:
%%time
for unique in uniques:
    uniques_points[unique] = df[df[which_unique_col] == unique]['total_points'].mean()

CPU times: user 4min 2s, sys: 30.1 ms, total: 4min 2s
Wall time: 4min 2s


In [17]:
#for unique in uniques:
#    uniques_points[unique] = df[df[which_unique_col] == unique]['total_points'].mean()
#len(uniques_points.items())

In [24]:
print(list(uniques_points.items())[:5])

[('Semi-gloss Chromatic Green', 273.06433742524393), ('Matte Racing White', 272.3175567234163), ('Gloss Light Red', 272.7523714690006), ('Iridescent Dark White', 273.5581453660269), ('Gloss Pale Yellow', 271.95177028451)]


Next, we'll grab data just for 2022:

In [27]:
# Use string or number in vectorized version
# df_2022 = df[df['year'].str.startswith('2022')]
df_green = df[df[which_unique_col] == 'Matte Racing Green']

In [28]:
print(len(df_green))
df_green.tail()

95039


Unnamed: 0.1,Unnamed: 0,year,team,best_finish,points_avg,total_points,color_type,car_color,good_res,prob_good_res
19997389,19997389,2011,backmarker,2,19,399,Matte Racing Green,Red,yes,0.227745
19997795,19997795,2019,midfield,3,5,105,Matte Racing Green,Matte,yes,0.044618
19998788,19998788,2016,backmarker,8,20,420,Matte Racing Green,Yellow,no,0.902482
19999151,19999151,1995,leading,7,23,483,Matte Racing Green,Gloss,no,0.708475
19999888,19999888,2008,leading,7,20,420,Matte Racing Green,Chromatic,yes,0.163311


Grab the lower 20% and upper 20% averages for properties in 2022.

In [29]:
recent_lower_upper = {}

In [81]:
%%timeit
grouped_df = df.groupby([which_unique_col], as_index = False)
for unique in uniques:
    points = grouped_df.get_group(unique)['total_points']
    #print(points)
    sorted_points = points.sort_values()

    # bottom 20% of the prices
    bottom_20_percent_points = sorted_points[:int(0.2 * len(sorted_points))]

    # average of the bottom 20%
    average_of_bottom_20_percent = bottom_20_percent_points.mean()

    # top 20% of the prices
    top_20_percent_points = sorted_points[int(0.8 * len(sorted_points)):]
    # average of the top 20%
    average_of_top_20_percent = top_20_percent_points.mean()

    recent_lower_upper[unique] = (average_of_bottom_20_percent, average_of_top_20_percent)

133 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [93]:
%%timeit
grouped_df = df.groupby([which_unique_col], as_index = False)
for unique in uniques:
    points = df.iloc[grouped_df.indices.get(unique)]['total_points']
    #print(points)
    sorted_points = points.sort_values()

    # bottom 20% of the prices
    bottom_20_percent_points = sorted_points[:int(0.2 * len(sorted_points))]

    # average of the bottom 20%
    average_of_bottom_20_percent = bottom_20_percent_points.mean()

    # top 20% of the prices
    top_20_percent_points = sorted_points[int(0.8 * len(sorted_points)):]
    # average of the top 20%
    average_of_top_20_percent = top_20_percent_points.mean()

    recent_lower_upper[unique] = (average_of_bottom_20_percent, average_of_top_20_percent)
    
    ## Doesn't use deprecated method

134 ms ± 2.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
### Groupby inside is slower here.
### Grouping outside halves the time

Then from here, you might calculate range or a ratio to determine variance for a given area and then find standouts:

In [26]:
# Use np.where and this function
# NumPy array slicing returns a view - check GC effects
def point_ratios(pandas_df):
    # Same team identifier for every element in group
    # To Numpy here - changed time hit from 32% to 18%
    team_name = pandas_df['color_type'].to_numpy()[0]
    # Changing this to NumPy array doesn't make a big difference, because...
    points = pandas_df['total_points']
    sorted_points = np.sort(points)
    # This is a NumPy array
    # sorted_points.shape[0] - gets the length of the array
    # bottom 20% of the prices
    bottom_20_percent_points = sorted_points[:int(0.2 * sorted_points.shape[0])]

    # average of the bottom 20%
    average_of_bottom_20_pct = np.mean(bottom_20_percent_points)

    # top 20% of the prices
    top_20_percent_points = sorted_points[int(0.8 * sorted_points.shape[0]):]
    # average of the top 20%
    average_of_top_20_pct = np.mean(top_20_percent_points)
    
    return team_name, (average_of_bottom_20_pct, average_of_top_20_pct)
    
grouped_df = df.groupby([which_unique_col], as_index = False)
unique_pts_df = [df.iloc[grouped_df.indices.get(unique)] for unique in uniques]
recent_lower_upper = dict(map(point_ratios, unique_pts_df))
print(list(recent_lower_upper)[:5])

['Semi-gloss Chromatic Green', 'Matte Racing White', 'Gloss Light Red', 'Iridescent Dark White', 'Gloss Pale Yellow']


In [21]:
%%time
grouped_df = df.groupby([which_unique_col], as_index = False)
unique_pts_df = [df.iloc[grouped_df.indices.get(unique)] for unique in uniques]
recent_lower_upper = dict(map(point_ratios, unique_pts_df))

CPU times: user 13.4 s, sys: 39.8 ms, total: 13.5 s
Wall time: 13.5 s


In [175]:
%lprun -f point_ratios dict(map(point_ratios, unique_pts_df))

In [27]:
%%time
for unique in uniques:
    points = df.where(df[which_unique_col] == unique)['total_points']
    sorted_points = points.sort_values()

    # bottom 20% of the prices
    bottom_20_percent_points = sorted_points[:int(0.2 * len(sorted_points))]

    # average of the bottom 20%
    average_of_bottom_20_percent = bottom_20_percent_points.mean()

    # top 20% of the prices
    top_20_percent_points = sorted_points[int(0.8 * len(sorted_points)):]
    # average of the top 20%
    average_of_top_20_percent = top_20_percent_points.mean()

    recent_lower_upper[unique] = (average_of_bottom_20_percent, average_of_top_20_percent)

CPU times: user 17min 6s, sys: 5min 35s, total: 22min 42s
Wall time: 19min 43s


### 1.5GB dataset
#### Old way - 19 min amd 43 seconds - 1183 secs
#### New way (GroupBy and vectorized) - 13.5 secs!!
#### 87x faster!

### 10^4 dataset (686kB)
#### Using PD mask without groupby - 1.19 seconds
#### Using PD where - 1.08 seconds
#### df[df[unique_col] == unique] - 294ms (SentDex way)
#### Groupby outside - 133ms
#### List comprehension and map w/ NumPy vector - 60.6ms
#### Above w/ NumPy operations instead of Pd - still 60-61ms, not big difference
#### With profiler optimizations - 59ms

In [135]:
recent_lower_upper = {}
for unique in uniques:
    points = df.where(df[which_unique_col] == unique)['total_points']
    sorted_points = points.sort_values()

    # bottom 20% of the prices
    bottom_20_percent_points = sorted_points[:int(0.2 * len(sorted_points))]

    # average of the bottom 20%
    average_of_bottom_20_percent = bottom_20_percent_points.mean()

    # top 20% of the prices
    top_20_percent_points = sorted_points[int(0.8 * len(sorted_points)):]
    # average of the top 20%
    average_of_top_20_percent = top_20_percent_points.mean()

    recent_lower_upper[unique] = (average_of_bottom_20_percent, average_of_top_20_percent)
    
#recent_lower_upper

In [31]:
recent_lower_upper = {}
grouped_df = df.groupby([which_unique_col], as_index = False)
for unique in uniques:
    points = df.iloc[grouped_df.indices.get(unique)]['total_points']
    #print(points)
    sorted_points = points.sort_values()

    # bottom 20% of the prices
    bottom_20_percent_points = sorted_points[:int(0.2 * len(sorted_points))]

    # average of the bottom 20%
    average_of_bottom_20_percent = bottom_20_percent_points.mean()

    # top 20% of the prices
    top_20_percent_points = sorted_points[int(0.8 * len(sorted_points)):]
    # average of the top 20%
    average_of_top_20_percent = top_20_percent_points.mean()

    recent_lower_upper[unique] = (average_of_bottom_20_percent, average_of_top_20_percent)

TypeError: 'dict_items' object is not subscriptable

In [33]:
list(recent_lower_upper.items())[:20]

[('Semi-gloss Chromatic Green', (63.12118350645263, 482.89974819011644)),
 ('Matte Racing White', (62.883247666002305, 482.3204489432003)),
 ('Gloss Light Red', (62.90425030134689, 483.2993553796971)),
 ('Iridescent Dark White', (63.13273781538944, 483.65049101546174)),
 ('Gloss Pale Yellow', (62.57513171759747, 482.72118018967336)),
 ('Semi-gloss Chromatic Red', (63.13744894753377, 482.0763914341065)),
 ('Iridescent Chromatic Blue', (62.77917865550614, 482.5012554927809)),
 ('Satin Deep Black', (63.55085948588551, 483.42609335576117)),
 ('Gloss Radiant Red', (63.246707407017176, 483.1792129800348)),
 ('Satin Deep Yellow', (62.894300844125205, 483.1882667505505)),
 ('Satin Dark Black', (63.38711202142745, 482.3493330532507)),
 ('Iridescent Racing White', (63.338584751567524, 482.4036354056902)),
 ('Iridescent Deep Blue', (63.417065097462306, 483.5428181149522)),
 ('Satin Deep Green', (62.80136914715719, 482.58739614359615)),
 ('Iridescent Deep White', (63.48950276243094, 482.8872987477

In [46]:
ratios = {}
ranges = {}

In [45]:
%%timeit
for color in recent_lower_upper:
    total_range = recent_lower_upper[color][1] - recent_lower_upper[color][0]
    ratio = recent_lower_upper[color][1] / recent_lower_upper[color][0]
    #confirm ratio is a number:
    if not np.isnan(ratio) and not np.isnan(total_range):
        #print(f"City: {city}, ratio: {ratio}, total_range: {total_range}")

        ratios[color] = ratio
        ranges[color] = total_range
    
sorted_ratios = sorted(ratios.items(), key=lambda x: x[1], reverse=True)
sorted_ranges = sorted(ranges.items(), key=lambda x: x[1], reverse=True)

770 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Above operation timeit 

#### 10^4 dataset - 1.25ms
#### 1.5GB dataset - 770us

In [51]:
def color_ratios(lower_upper):
    '''
        Input: Dict
        Output: Dict, Dict
        Takes in a dictionary with the team's color as the key and a tuple of their lower and upper points
        bands as the value.
        
        Returns 2 dictionaries - the sorted ratios and ranges.
    '''
    ratios_fn = {}
    ranges_fn = {}
    for key, value_tuple in lower_upper.items():
        # Upper range - lower range
        total_range = value_tuple[1] - value_tuple[0]
        ratio = value_tuple[1] / value_tuple[0]
        # confirming these two are numbers:
        if not np.isnan(ratio) and not np.isnan(total_range):
            ratios[key] = ratio
            ranges[key] = total_range
    return ratios_fn, ranges_fn

ratios, ranges = color_ratios(recent_lower_upper)
# Sort by highest/upper values
sorted_ratios = sorted(ratios.items(), key=lambda x: x[1], reverse=True)
sorted_ranges = sorted(ranges.items(), key=lambda x: x[1], reverse=True)
print(f"Top 5 ratios: {sorted_ratios[:5]}")
print(f"Top 5 ranges: {sorted_ranges[:5]}")

Top 5 ratios: []
Top 5 ranges: []


In [41]:
%%timeit 
ratios, ranges = color_ratios(recent_lower_upper)
sorted_ratios = sorted(ratios.items(), key=lambda x: x[1], reverse=True)
sorted_ranges = sorted(ranges.items(), key=lambda x: x[1], reverse=True)

730 µs ± 10.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Above operation timeit 

#### 10^4 dataset - 1.1ms
#### 1.5GB dataset - 730us

In [35]:
ratios = {}
ranges = {}

for color in recent_lower_upper:
    total_range = recent_lower_upper[color][1] - recent_lower_upper[color][0]
    ratio = recent_lower_upper[color][1] / recent_lower_upper[color][0]
    #confirm ratio is a number:
    if not np.isnan(ratio) and not np.isnan(total_range):
        #print(f"City: {city}, ratio: {ratio}, total_range: {total_range}")

        ratios[color] = ratio
        ranges[color] = total_range

# top 5 ratios:
sorted_ratios = sorted(ratios.items(), key=lambda x: x[1], reverse=True)
# top 5 ranges:
sorted_ranges = sorted(ranges.items(), key=lambda x: x[1], reverse=True)

print(f"Top 5 ratios: {sorted_ratios[:5]}")
print(f"Top 5 ranges: {sorted_ranges[:5]}")

Top 5 ratios: [('Gloss Light Yellow', 7.786666689154867), ('Satin Racing Yellow', 7.777448874528821), ('Iridescent Dark Yellow', 7.766192321479855), ('Matte Pale White', 7.758228689253698), ('Gloss Pale Blue', 7.757793073707121)]
Top 5 ranges: [('Iridescent Dark Yellow', 421.49190686674854), ('Gloss Light Yellow', 421.43583161789104), ('Satin Racing Yellow', 421.1534437184212), ('Matte Pale White', 420.9854972293599), ('Semi-gloss Radiant Black', 420.954413869469)]


In [194]:
ratios = {}
ranges = {}

In [195]:
%%timeit
for color in recent_lower_upper:
    total_range = recent_lower_upper[color][1] - recent_lower_upper[color][0]
    ratio = recent_lower_upper[color][1] / recent_lower_upper[color][0]
    #confirm ratio is a number:
    if not np.isnan(ratio) and not np.isnan(total_range):
        #print(f"City: {city}, ratio: {ratio}, total_range: {total_range}")

        ratios[color] = ratio
        ranges[color] = total_range
# top 5 ratios:
sorted_ratios = sorted(ratios.items(), key=lambda x: x[1], reverse=True)
# top 5 ranges:
sorted_ranges = sorted(ranges.items(), key=lambda x: x[1], reverse=True)

1.14 ms ± 9.15 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [58]:
# find the earliest date of transfer:
df['total_points'].max()

525

In [38]:
%%time
df_sorted = df.sort_values(by=['total_points'], ascending=False)
# sort the dataframe by price:

CPU times: user 6.51 s, sys: 1.57 s, total: 8.08 s
Wall time: 8.26 s


### Sorting times: 
#### 10^4 dataset - 1.12ms (timeit)
#### 1.5GB (2 * 10^7) dataset - 6.5-8.08s (time)

In [36]:
df_sorted = df.sort_values(by=['total_points'], ascending=False)
df_sorted.tail()

Unnamed: 0.1,Unnamed: 0,year,team,best_finish,points_avg,total_points,color_type,car_color,good_res,prob_good_res
5807675,5807675,1996,backmarker,9,1,21,Matte Racing White,Iridescent,yes,0.553809
535641,535641,1998,backmarker,14,1,21,Iridescent Dark Black,Blue,no,0.788625
17058124,17058124,1998,backmarker,7,1,21,Gloss Chromatic Black,Satin,no,0.479035
4479228,4479228,2015,leading,14,1,21,Satin Chromatic Yellow,Red,yes,0.61943
7352900,7352900,2018,backmarker,19,1,21,Satin Light Yellow,Light,no,0.41526


## To my knowledge, there's no NumPy way to sort an entire dataframe by one column.
## I don't suggest doing the following - this is just for my curiosity.

In [None]:
%%time
## Crashed on 1.5GB dataset, 6x slower on 10^4 dataset.
np_array = np.array(df)
ind = np_array[np_array[:,5].argsort()[::-1]]

In [209]:
np_array = np.array(df)
# argsort if you want to get indices
# argsort[::-1] if you want to sort it in reverse.
ind = np_array[np_array[:,5].argsort()[::-1]]
print(ind[:20])

[[1625 2020 'midfield' 18 25 525 'Satin Dark Blue' 'Semi-gloss' 'yes'
  0.9681931275436216]
 [7023 2010 'backmarker' 6 25 525 'Satin Racing Green' 'Black' 'no'
  0.1661721593275459]
 [1572 2017 'midfield' 16 25 525 'Matte Racing White' 'Gloss' 'yes'
  0.8633248666243962]
 [8719 2013 'leading' 16 25 525 'Semi-gloss Pale White' 'Matte' 'no'
  0.9180138266069688]
 [8715 2000 'midfield' 10 25 525 'Gloss Light Blue' 'Satin' 'yes'
  0.5955981496937104]
 [9710 2021 'backmarker' 10 25 525 'Gloss Racing Black' 'Iridescent' 'no'
  0.2101602352432614]
 [1570 2006 'backmarker' 10 25 525 'Semi-gloss Chromatic Blue' 'Blue'
  'yes' 0.0384736718552681]
 [1569 2021 'backmarker' 18 25 525 'Gloss Racing Red' 'Semi-gloss' 'no'
  0.3934838673996677]
 [2013 2022 'backmarker' 17 25 525 'Gloss Deep Green' 'Light' 'yes'
  0.0735095071798956]
 [9714 2016 'backmarker' 13 25 525 'Matte Chromatic Black' 'Chromatic'
  'no' 0.0175290060712458]
 [8698 1993 'leading' 16 25 525 'Semi-gloss Chromatic Yellow' 'Yellow'
  