In [1]:
import pandas as pd

In [2]:
from pandas_help import colour_dtype

In [3]:
!pwd

/Users/sanganichaitanya/Downloads/Pers/workshop/pandas-tutorial


In [4]:
trips_data = pd.read_csv("datasets/biketrip.csv")

In [5]:
trips_data.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,4576,63.0,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,
1,4607,70.0,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138.0
2,4130,71.0,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214.0
3,4251,77.0,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060.0
4,4299,83.0,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103.0


In [6]:
trips_data.head().style.applymap(colour_dtype)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,
1,4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138.0
2,4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214.0
3,4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060.0
4,4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103.0


### Memory usage of data frame

Pandas stores the data on RAM by grouping same dtypes in blocks. This helps in optimising for storage and aceessing them quickly ( think of it like legos ) 

<div class="row">
  <div class="column">
    <img src="datasets/int block.png" alt="alternate text" height = "120" width = "80" style="float: left; margin-left: 20%">
  </div>
  <div class="column">
    <img src="datasets/float block.png" height = "30" width = "80" style="float: left;">
  </div>
  <div class="column">
    <img src="datasets/object block.png" height = "300" width = "200" style="float: left;">
  </div>
</div>

<br>



The columns and row indices are missing here. **Block Manager** class of Pandas takes care of these mappings. 
This data is put on RAM for quick retrieval and computation, making it a challenge to work on medium/ large datasets. 

Hence optimizing for dtypes plays crucial role in improving performance and storage.

In [7]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669959 entries, 0 to 669958
Data columns (total 11 columns):
id                    669959 non-null int64
duration              669930 non-null float64
start_date            669959 non-null object
start_station_name    669959 non-null object
start_station_id      669959 non-null int64
end_date              669959 non-null object
end_station_name      669959 non-null object
end_station_id        669959 non-null int64
bike_id               669959 non-null int64
subscription_type     669959 non-null object
zip_code              661650 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 56.2+ MB


In [8]:
trips_data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669959 entries, 0 to 669958
Data columns (total 11 columns):
id                    669959 non-null int64
duration              669930 non-null float64
start_date            669959 non-null object
start_station_name    669959 non-null object
start_station_id      669959 non-null int64
end_date              669959 non-null object
end_station_name      669959 non-null object
end_station_id        669959 non-null int64
bike_id               669959 non-null int64
subscription_type     669959 non-null object
zip_code              661650 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 303.2 MB


In [9]:
# Mean and total usage of each dtype

for dtype in ['float', 'int', 'object']:
    chosen_dtype = trips_data.select_dtypes(include=[dtype])
    col_num = chosen_dtype.shape[1]
    mean_usage_bytes = chosen_dtype.memory_usage(deep=True).mean()
    sum_usage_bytes = chosen_dtype.memory_usage(deep=True).sum()
    mean_usage_mb = mean_usage_bytes/1024**2
    sum_usage_mb = sum_usage_bytes/1024**2
    print("Average memory usage for {} {} column(s): {:03.2f} MB".format(col_num,dtype, mean_usage_mb))
    print("Total memory usage for {} {} column(s): {:03.2f} MB".format(col_num, dtype, sum_usage_mb))

Average memory usage for 1 float column(s): 2.56 MB
Total memory usage for 1 float column(s): 5.11 MB
Average memory usage for 4 int column(s): 4.09 MB
Total memory usage for 4 int column(s): 20.45 MB
Average memory usage for 6 object column(s): 39.66 MB
Total memory usage for 6 object column(s): 277.61 MB


### Optimizing Numeric Columns

Integer and float dtypes are the numeric dtypes. There are sub dtypes of these based on the length and size of the dytpes.

The numeric columns are stored as Numpy ndarray, which is based on C arrays. Hence they are stored as continuous block of memory and access to this data is pretty fast and accurate. To understand optimize dtypes, we need to understand the subtypes better. 

- Integer dtypes
    - int8 
    - int16
    - int32
    - int64

`int8` can hold values ranging from -128 to 127 (2^8 values for 8 bits)

Similarly for float there are `float16`, `float32` and `float64`. The ```-ve``` sign in the integer dtypes occupies some memory. Using unsigned integer or `unit` solves this problem. 

Let's convert and optimize the int objects by `downcast`

In [10]:
# Lets create a function to calculate memory usage for objects

def get_mem_usage(obj):
    col_num = obj.shape[1]
    mean_usage_bytes = obj.memory_usage(deep=True).mean()
    sum_usage_bytes = obj.memory_usage(deep=True).sum()
    mean_usage_mb = mean_usage_bytes/1024**2
    sum_usage_mb = sum_usage_bytes/1024**2
    print("Average memory usage for {} column(s): {:03.2f} MB".format(col_num, mean_usage_mb))
    print("Total memory usage for {} column(s): {:03.2f} MB".format(col_num, sum_usage_mb))

In [11]:
# optimizing int objects

int_dtypes = trips_data.select_dtypes(include=['int'])

In [12]:
get_mem_usage(int_dtypes)

Average memory usage for 4 column(s): 4.09 MB
Total memory usage for 4 column(s): 20.45 MB


In [13]:
optimized_int_dtypes = int_dtypes.apply(pd.to_numeric, downcast='unsigned')

In [14]:
get_mem_usage(optimized_int_dtypes)

Average memory usage for 4 column(s): 1.02 MB
Total memory usage for 4 column(s): 5.11 MB


In [15]:
# Similarly optimizing float objects

float_dtypes = trips_data.select_dtypes(include=['float'])
get_mem_usage(float_dtypes)

Average memory usage for 1 column(s): 2.56 MB
Total memory usage for 1 column(s): 5.11 MB


In [16]:
optimized_float_dtypes = float_dtypes.apply(pd.to_numeric, downcast='float')
get_mem_usage(optimized_float_dtypes)

Average memory usage for 1 column(s): 1.28 MB
Total memory usage for 1 column(s): 2.56 MB


### Optimizing Object dtypes

Though we reduced the memory usage of numeric columns by 60%, majority is made up by string/ object dtype columns. To understand why these dtypes take up a lot of space as compared to numeric arrays, need to dig a little into how a Python works.

A numpy array is a continuous block of memory with a single pointer to the array. While in case of Python List/ object class, it points to continuos block of pointers which in turn point to the data. This is because Python being high level intrepreted language, doesn't have a lot of control on memory mamanement and scatters the data on the memory. 

<img src="datasets/array_vs_list.png" height = "700" width = "550">


With pointers, each taking up a byte of memory and variable sizes of each string, memory is used very inefficiently. 

Converting the categorical values to int saves a lot of memory and a lot of performance. With `.astype('category')` (introduced from pandas 0.15), the categorical values are converted to int values with a mapping dictionary. 

In [17]:
object_dtypes = trips_data.select_dtypes(include=['object'])

In [18]:
object_dtypes

Unnamed: 0,start_date,start_station_name,end_date,end_station_name,subscription_type,zip_code
0,8/29/2013 14:13,South Van Ness at Market,8/29/2013 14:14,South Van Ness at Market,Subscriber,
1,8/29/2013 14:42,San Jose City Hall,8/29/2013 14:43,San Jose City Hall,Subscriber,95138
2,8/29/2013 10:16,Mountain View City Hall,8/29/2013 10:17,Mountain View City Hall,Subscriber,97214
3,8/29/2013 11:29,San Jose City Hall,8/29/2013 11:30,San Jose City Hall,Subscriber,95060
4,8/29/2013 12:02,South Van Ness at Market,8/29/2013 12:04,Market at 10th,Subscriber,94103
5,8/29/2013 18:54,Golden Gate at Polk,8/29/2013 18:56,Golden Gate at Polk,Subscriber,94109
6,8/29/2013 13:25,Santa Clara at Almaden,8/29/2013 13:27,Adobe on Almaden,Subscriber,95112
7,8/29/2013 14:02,San Salvador at 1st,8/29/2013 14:04,San Salvador at 1st,Subscriber,95112
8,8/29/2013 17:01,South Van Ness at Market,8/29/2013 17:03,South Van Ness at Market,Subscriber,94103
9,8/29/2013 11:33,San Jose City Hall,8/29/2013 11:35,MLK Library,Subscriber,95060


In [19]:
# Start station, end station and subscription type, zip code are categorical variables

object_dtypes_cat = object_dtypes[['start_station_name', 'end_station_name', 'subscription_type', 'zip_code']]

In [20]:
get_mem_usage(object_dtypes_cat)

Average memory usage for 4 column(s): 37.22 MB
Total memory usage for 4 column(s): 186.09 MB


In [21]:
optimized_object_dtypes_cat = pd.DataFrame()
for column in object_dtypes_cat.columns:
    optimized_object_dtypes_cat[column] = object_dtypes_cat[column].astype('category')

In [22]:
get_mem_usage(optimized_object_dtypes_cat)

Average memory usage for 4 column(s): 0.79 MB
Total memory usage for 4 column(s): 3.96 MB


> The string type datetime objects are converted to Python datetime objects (64 byte) because they ease the time series analysis. 

In [23]:
object_dtypes_dates = object_dtypes[['start_date', 'end_date']]

In [24]:
get_mem_usage(object_dtypes_dates)

Average memory usage for 2 column(s): 30.51 MB
Total memory usage for 2 column(s): 91.52 MB


In [25]:
optimized_object_dtypes_dates = pd.DataFrame()
for column in object_dtypes_dates.columns:
    print(column)
    optimized_object_dtypes_dates[column] = pd.to_datetime(object_dtypes_dates[column])

start_date
end_date


In [26]:
get_mem_usage(optimized_object_dtypes_dates)

Average memory usage for 2 column(s): 3.41 MB
Total memory usage for 2 column(s): 10.22 MB


In [27]:
# Lets do all the optimizations together
optimize_trips_data = trips_data.copy()

In [28]:
int_dtypes = optimize_trips_data.select_dtypes(include=['int'])
optimize_trips_data[int_dtypes.columns] = int_dtypes.apply(pd.to_numeric, downcast='unsigned')

In [29]:
float_dtypes = optimize_trips_data.select_dtypes(include=['float'])
optimize_trips_data[float_dtypes.columns] = float_dtypes.apply(pd.to_numeric, downcast='float')

In [30]:
cat_dtypes = optimize_trips_data[['start_station_name', 'end_station_name', 'subscription_type', 'zip_code']]

for col in cat_dtypes.columns:
    optimize_trips_data[col] = cat_dtypes[col].astype('category')

In [31]:
date_dtypes = optimize_trips_data[['start_date', 'end_date']]

In [32]:
for col in date_dtypes.columns:
    print(col)
    optimize_trips_data[col] = pd.to_datetime(date_dtypes[col])

start_date
end_date


In [33]:
get_mem_usage(trips_data)

Average memory usage for 11 column(s): 25.26 MB
Total memory usage for 11 column(s): 303.17 MB


In [34]:
get_mem_usage(optimize_trips_data)

Average memory usage for 11 column(s): 1.82 MB
Total memory usage for 11 column(s): 21.85 MB


Reduced the size of the data frame by ~95%