In [17]:
import pandas as pd
import gc

# Exercise 05. Pandas optimizations

## Task 1. Read the fines.csv file that you saved in the previous exercise.

In [18]:
# Load the fines data saved by the previous exercise. The file is plain
# comma-separated, so the default delimiter and pandas' default C parser are
# sufficient; forcing engine='python' only selects the slower pure-Python parser.
df = pd.read_csv('../data/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,2018
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,2008
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1994
3,X582HE161RUS,2.0,2000.0,Ford,Focus,1987
4,92918M178RUS,1.0,5700.0,Ford,Focus,2000
...,...,...,...,...,...,...
925,2727271RUS,2.0,8500.0,Lada,Vesta,2015
926,2627271RUS,2.0,1800.0,Kia,Rio,2018
927,2527271RUS,1.0,3200.0,Hyundai,Solaris,2019
928,2427271RUS,2.0,1500.0,Skoda,Octavia,2016


## Task 2. Iterations

In [19]:
%%timeit
def calculate_fines_iloc(df):
    """Compute ``Fines / Refund * Year`` row by row using positional lookups.

    Every row is fetched individually with ``df.iloc`` inside a plain Python
    loop, which makes this the baseline (slow) variant of the calculation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that provides the 'Fines', 'Refund' and 'Year' columns.

    Returns
    -------
    list
        One computed value per row, in row order.
    """
    strange_values = []
    n_rows = len(df)
    for position in range(n_rows):
        record = df.iloc[position]
        fines_per_refund = record['Fines'] / record['Refund']
        strange_values.append(fines_per_refund * record['Year'])
    return strange_values

# Store the loop result as the 'strange' column; this assignment is part of the timed cell body.
df['strange'] = calculate_fines_iloc(df)

15.1 ms ± 198 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
%%timeit
def calculate_fines_iterrows(df):
    """Compute ``Fines / Refund * Year`` row by row using ``DataFrame.iterrows``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that provides the 'Fines', 'Refund' and 'Year' columns.

    Returns
    -------
    list
        One computed value per row, in iteration order.
    """
    strange_values = []
    # iterrows yields (index label, row Series); the label is not needed here.
    for _, record in df.iterrows():
        fines_per_refund = record['Fines'] / record['Refund']
        strange_values.append(fines_per_refund * record['Year'])
    return strange_values

# Store the iterrows-based result as the 'strange' column; this assignment is part of the timed cell body.
df['strange'] = calculate_fines_iterrows(df)

13.2 ms ± 95.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
%%timeit
# Row-wise apply: with axis=1 the lambda is called once per row and receives that row as a Series.
df['strange'] = df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

3.4 ms ± 44.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
%%timeit
# Vectorized arithmetic on whole columns (pandas Series) instead of a per-row Python loop.
df['strange'] = df['Fines'] / df['Refund'] * df['Year']

76.2 μs ± 6.18 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [23]:
%%timeit
# Same arithmetic on the underlying NumPy arrays (.values), skipping the Series wrapper around each operand.
df['strange'] = df['Fines'].values / df['Refund'].values * df['Year'].values

34.8 μs ± 1.24 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [24]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,strange
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,2018,3228800.0
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,2008,13052000.0
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1994,4187400.0
3,X582HE161RUS,2.0,2000.0,Ford,Focus,1987,1987000.0
4,92918M178RUS,1.0,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...,...
925,2727271RUS,2.0,8500.0,Lada,Vesta,2015,8563750.0
926,2627271RUS,2.0,1800.0,Kia,Rio,2018,1816200.0
927,2527271RUS,1.0,3200.0,Hyundai,Solaris,2019,6460800.0
928,2427271RUS,2.0,1500.0,Skoda,Octavia,2016,1512000.0


## Task 3. Indexing

In [25]:
%%timeit
# Boolean-mask lookup: compares every CarNumber value with the target, then filters the frame by the mask.
df[df['CarNumber'] == 'O136HO197RUS']

103 μs ± 5.94 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [26]:
# Index the frame by CarNumber so rows can be looked up by label.
# The guard keeps this cell re-runnable: once CarNumber is the index it is no
# longer a column, so a second set_index('CarNumber') would raise KeyError.
if df.index.name != 'CarNumber':
    df = df.set_index('CarNumber')
df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,strange
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2.0,3200.0,Ford,Focus,2018,3228800.0
E432XX77RUS,1.0,6500.0,Toyota,Camry,2008,13052000.0
7184TT36RUS,1.0,2100.0,Ford,Focus,1994,4187400.0
X582HE161RUS,2.0,2000.0,Ford,Focus,1987,1987000.0
92918M178RUS,1.0,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...
2727271RUS,2.0,8500.0,Lada,Vesta,2015,8563750.0
2627271RUS,2.0,1800.0,Kia,Rio,2018,1816200.0
2527271RUS,1.0,3200.0,Hyundai,Solaris,2019,6460800.0
2427271RUS,2.0,1500.0,Skoda,Octavia,2016,1512000.0


In [27]:
%%timeit
# Label lookup through the frame's index (CarNumber after set_index).
# NOTE(review): .loc returns a Series for a unique label and a DataFrame if the label repeats — confirm CarNumber uniqueness.
row = df.loc['O136HO197RUS']

23.4 μs ± 457 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Task 4. Downcasting

In [28]:
# Baseline memory report; memory_usage='deep' measures the real size of object (string) data instead of estimating it.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to 2327271RUS
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Refund   930 non-null    float64
 1   Fines    930 non-null    float64
 2   Make     930 non-null    object 
 3   Model    919 non-null    object 
 4   Year     930 non-null    int64  
 5   strange  930 non-null    float64
dtypes: float64(3), int64(1), object(2)
memory usage: 236.0 KB


In [29]:
# Build the downcast frame in one step: assign() works on a copy of df, so the
# original frame keeps its 64-bit columns for comparison.
optimized_df = df.assign(
    # Floating-point columns are narrowed to 32-bit floats.
    Fines=df['Fines'].astype('float32'),
    strange=df['strange'].astype('float32'),
    # downcast='integer' lets pandas pick the smallest integer dtype that holds the values.
    Refund=pd.to_numeric(df['Refund'], downcast='integer'),
    Year=pd.to_numeric(df['Year'], downcast='integer'),
)
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to 2327271RUS
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Refund   930 non-null    int8   
 1   Fines    930 non-null    float32
 2   Make     930 non-null    object 
 3   Model    919 non-null    object 
 4   Year     930 non-null    int16  
 5   strange  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 216.9 KB


## Task 5. Categories

In [30]:
# Convert the two text columns to the category dtype in a single astype call;
# columns not named in the mapping keep their current dtypes.
optimized_df = optimized_df.astype({'Make': 'category', 'Model': 'category'})
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to 2327271RUS
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   Refund   930 non-null    int8    
 1   Fines    930 non-null    float32 
 2   Make     930 non-null    category
 3   Model    919 non-null    category
 4   Year     930 non-null    int16   
 5   strange  930 non-null    float32 
dtypes: category(2), float32(2), int16(1), int8(1)
memory usage: 108.9 KB


## Task 6. Memory cleanup

In [31]:
gc.collect()
%reset_selective ^df$