# Efficient Coding | Time and Memory Management

In [16]:
import pandas as pd
import numpy as np

In [19]:
#create dataset
def create_df(size=1000):
  df = pd.DataFrame()
  df['league'] = np.random.choice(['L1', 'L2', 'L3'], size=size)
  df['age'] = np.random.randint(15, 45, size)
  df['team_strip'] = np.random.choice(['blue', 'green', 'red', 'yellow', 'black'], size)
  df['prob'] = np.random.uniform(0,1,size)
  df['win'] = np.random.choice(['yes', 'no'], size)
  return df

In [38]:
df = create_df(10_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   league      object 
 1   age         int64  
 2   team_strip  object 
 3   prob        float64
 4   win         object 
dtypes: float64(1), int64(1), object(3)
memory usage: 381.5+ MB


In [39]:
%timeit df.groupby(['team_strip', 'league'])['age'].rank()
%timeit df.groupby(['team_strip', 'league'])['prob'].rank()
%timeit df.groupby(['team_strip', 'league', 'win'])['prob'].rank()

6.58 s ± 982 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
8.26 s ± 192 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
9.4 s ± 469 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
df['team_strip'] = df['team_strip'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column      Dtype   
---  ------      -----   
 0   league      object  
 1   age         int64   
 2   team_strip  category
 3   prob        float64 
 4   win         object  
dtypes: category(1), float64(1), int64(1), object(2)
memory usage: 314.7+ MB


In [41]:
df['league'] = df['league'].astype('category')
df['win'] = df['win'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column      Dtype   
---  ------      -----   
 0   league      category
 1   age         int64   
 2   team_strip  category
 3   prob        float64 
 4   win         category
dtypes: category(3), float64(1), int64(1)
memory usage: 181.2 MB


## Int Downcasting 
- int8 [-128, 127]
- int16 [-32768, 32767]
- int64 [-9223772036854785808, 9223772036854785807]

In [42]:
df['age'] = df['age'].astype('int8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column      Dtype   
---  ------      -----   
 0   league      category
 1   age         int8    
 2   team_strip  category
 3   prob        float64 
 4   win         category
dtypes: category(3), float64(1), int8(1)
memory usage: 114.4 MB


## Float downcasting

In [43]:
df['prob'] = df['prob'].astype('float16')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column      Dtype   
---  ------      -----   
 0   league      category
 1   age         int8    
 2   team_strip  category
 3   prob        float16 
 4   win         category
dtypes: category(3), float16(1), int8(1)
memory usage: 57.2 MB


## Bool type casting

In [44]:
df['win'].map({'yes': True, 'no': False})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column      Dtype   
---  ------      -----   
 0   league      category
 1   age         int8    
 2   team_strip  category
 3   prob        float16 
 4   win         category
dtypes: category(3), float16(1), int8(1)
memory usage: 57.2 MB


In [45]:
%timeit df.groupby(['team_strip', 'league'])['age'].rank()
%timeit df.groupby(['team_strip', 'league'])['prob'].rank()
%timeit df.groupby(['team_strip', 'league', 'win'])['prob'].rank()

3.76 s ± 240 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
6.34 s ± 404 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
6.63 s ± 401 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
