# Dataset
Daily ridership data of 577 000 rows.

Cols:
 - route: Column 0. The bus route name.
 - date: Column 1. A date string of the form MM/DD/YYYY.
 - daytype: Column 2. A day type code (U=Sunday/Holiday, A=Saturday, W=Weekday)
 - rides: Column 3. Total number of riders (integer)

Finding efficient ways to use it.

In [1]:
f = open('Data/ctabus.csv')

# Tracemalloc for memory baseline
- Current = 12459796 = 12MB
- Peak = 37182052 = 37MB

List of strings takes 4x compared to one string
List of tuples takes 10x compared to one string 

In [14]:
# Data into single string
import tracemalloc

f = open('Data/ctabus.csv')
tracemalloc.start()
data = f.read()
len(data)
current, peak = tracemalloc.get_traced_memory()
print(current, peak)

12459796 37182052


In [15]:
# Data into list of strings
import tracemalloc

f = open('Data/ctabus.csv')
tracemalloc.start()
data = f.readlines()
len(data)
current, peak = tracemalloc.get_traced_memory()
print(current, peak)

45445022 57814157


In [16]:
import csv

def read_rides_as_tuples(filename):
    '''
    Read the bus ride data as a list of tuples
    '''
    records = []
    with open(filename) as f:
        rows = csv.reader(f)
        headings = next(rows)     # Skip headers
        for row in rows:
            route = row[0]
            date = row[1]
            daytype = row[2]
            rides = int(row[3])
            record = (route, date, daytype, rides)
            records.append(record)
    return records

if __name__ == '__main__':
    import tracemalloc
    tracemalloc.start()
    rows = read_rides_as_tuples('Data/ctabus.csv')
    print('Memory Use: Current %d, Peak %d' % tracemalloc.get_traced_memory())

Memory Use: Current 169245695, Peak 169276145


# Potential data structures

Create readrides in all ways and see which is most memory efficient when dealing with **whole** dataset

## Results

- tuple
  - 1st place
  - Current 169245695, Peak 169276145
- dict
  - 3rd place
  - Current 261807653, Peak 385381738
- class
  - 2nd place
  - Current 215482050, Peak 431591439
- named tuple
  - 5th place
  - Current 1712783780, Peak 1882703601
- class with slots
  - 4th place
  - Current 1703567671, Peak 1882703601
- pandas
  - Around slotted class
  - Current 1631708868, Peak 1882703601

In [None]:
# A tuple
row = (route, date, daytype, rides)

# A dictionary
row = {
    'route': route,
    'date': date,
    'daytype': daytype,
    'rides': rides,
}

# A class
class Row:
    def __init__(self, route, date, daytype, rides):
        self.route = route
        self.date = date
        self.daytype = daytype
        self.rides = rides

# A named tuple
from collections import namedtuple
Row = namedtuple('Row', ['route', 'date', 'daytype', 'rides'])

# A class with __slots__
class Row:
    __slots__ = ['route', 'date', 'daytype', 'rides']
    def __init__(self, route, date, daytype, rides):
        self.route = route
        self.date = date
        self.daytype = daytype
        self.rides = rides

In [17]:
# Dictionary

import csv

def read_rides_as_dict(filename):
    '''
    Read the bus ride data as a list of dicts
    '''
    records = []
    with open(filename) as f:
        rows = csv.reader(f)
        headings = next(rows)     # Skip headers
        for row in rows:
            route = row[0]
            date = row[1]
            daytype = row[2]
            rides = int(row[3])
            record = {
                'route': route,
                'date': date,
                'daytype': daytype,
                'rides': rides,
                }
            records.append(record)
    return records

if __name__ == '__main__':
    import tracemalloc
    tracemalloc.start()
    rows = read_rides_as_dict('Data/ctabus.csv')
    print('Memory Use: Current %d, Peak %d' % tracemalloc.get_traced_memory())

Memory Use: Current 261807653, Peak 385381738


In [18]:
# Class

import csv

class Row:
    def __init__(self, route, date, daytype, rides):
        self.route = route
        self.date = date
        self.daytype = daytype
        self.rides = rides

def read_rides_as_class(filename):
    '''
    Read the bus ride data as a list of class objects
    '''
    records = []
    with open(filename) as f:
        rows = csv.reader(f)
        headings = next(rows)     # Skip headers
        for row in rows:
            route = row[0]
            date = row[1]
            daytype = row[2]
            rides = int(row[3])
            record = Row(route, date, daytype, rides)
            records.append(record)
    return records

if __name__ == '__main__':
    import tracemalloc
    tracemalloc.start()
    rows = read_rides_as_class('Data/ctabus.csv')
    print('Memory Use: Current %d, Peak %d' % tracemalloc.get_traced_memory())

Memory Use: Current 215482050, Peak 431591439


In [20]:
# Named tuple

import csv
from collections import namedtuple

Record = namedtuple('Record', ['route', 'date', 'daytype', 'rides'])

def read_rides_as_named_tuple(filename):
    '''
    Read the bus ride data as a list of named tuples
    '''
    records = []
    with open(filename) as f:
        rows = csv.reader(f)
        headings = next(rows)     # Skip headers
        for row in rows:
            route = row[0]
            date = row[1]
            daytype = row[2]
            rides = int(row[3])
            record = Record(route, date, daytype, rides)
            records.append(record)
    return records

if __name__ == '__main__':
    import tracemalloc
    tracemalloc.start()
    rows = read_rides_as_named_tuple('Data/ctabus.csv')
    print('Memory Use: Current %d, Peak %d' % tracemalloc.get_traced_memory())

Memory Use: Current 1712783780, Peak 1882703601


In [21]:
# Class with slots

import csv

class Row:
    __slots__ = ['route', 'date', 'daytype', 'rides']
    def __init__(self, route, date, daytype, rides):
        self.route = route
        self.date = date
        self.daytype = daytype
        self.rides = rides

def read_rides_as_class_with_slots(filename):
    '''
    Read the bus ride data as a list of class objects
    '''
    records = []
    with open(filename) as f:
        rows = csv.reader(f)
        headings = next(rows)     # Skip headers
        for row in rows:
            route = row[0]
            date = row[1]
            daytype = row[2]
            rides = int(row[3])
            record = Row(route, date, daytype, rides)
            records.append(record)
    return records

if __name__ == '__main__':
    import tracemalloc
    tracemalloc.start()
    rows = read_rides_as_class_with_slots('Data/ctabus.csv')
    print('Memory Use: Current %d, Peak %d' % tracemalloc.get_traced_memory())        

Memory Use: Current 1703567671, Peak 1882703601


In [22]:
# Pandas

import pandas as pd

def read_rides_as_pandas(filename):
    '''
    Read the bus ride data as a pandas dataframe
    '''
    records = pd.read_csv(filename)
    return records

if __name__ == '__main__':
    import tracemalloc
    tracemalloc.start()
    rows = read_rides_as_pandas('Data/ctabus.csv')
    print('Memory Use: Current %d, Peak %d' % tracemalloc.get_traced_memory())  

Memory Use: Current 1631708868, Peak 1882703601
