Reading data more cleverly - no hardcoded columns
Implemented in reader.py

In [4]:
import csv

# One converter per CSV column, applied positionally (name, shares, price).
columntypes = [str, int, float]

# newline='' hands newline handling over to the csv module, as its
# documentation requires for file objects passed to csv.reader; without it,
# quoted fields containing embedded newlines are not parsed correctly.
with open('Data/portfolio.csv', 'r', newline='') as f:
    data = csv.reader(f)

    header = next(data)  # first line: the column names
    row = next(data)     # first data line; every field is still a string

    # Dynamically convert values to correct datatype
    record = [func(val) for func, val in zip(columntypes, row)]
    print(record)

    # Add headers (complete dict)
    print(dict(zip(header, record)))

    # Whole thing in one line
    record = {
        name: func(val)
        for name, func, val in zip(header, columntypes, row)
    }
    print(record)

['AA', 100, 32.2]
{'name': 'AA', 'shares': 100, 'price': 32.2}
{'name': 'AA', 'shares': 100, 'price': 32.2}


More memory-efficient lookup using object ids instead of string values

Using sys.intern, we can use the id of each object to count the number of unique "strings"
- Once interned, equal strings share a single cached object, so they can be compared by identity as done here
- The more the values overlap, the more efficient it is and the less memory is used (similar to clustering)

In [6]:
# Normal way / 216MB

import reader
import tracemalloc

# Begin recording allocations before any data is loaded.
tracemalloc.start()

# Every row becomes a dict; the list supplies one converter per column.
rows = reader.read_csv_as_dicts('Data/ctabus.csv', [str, str, str, int])

# Collect the distinct route values, compared by string content.
routes = set(entry['route'] for entry in rows)
print(len(routes))

# Kept as the final expression so the notebook displays the
# (current, peak) traced sizes in bytes.
tracemalloc.get_traced_memory()

181


(216102998, 216124647)

In [7]:
# Utilizing ids of interned string objects / 188MB

import reader
import tracemalloc
from sys import intern

# NOTE(review): no tracemalloc.stop() or reset is visible between this
# start() and the one in the earlier cell, so the peak figure displayed
# below may include the earlier run's allocations - confirm.
tracemalloc.start()

# Same load as before, except the first column goes through sys.intern,
# so equal values in that column end up as one shared string object.
rows = reader.read_csv_as_dicts('Data/ctabus.csv', [intern, str, str, int])

# With interned values, counting distinct object ids counts distinct routes.
routes = set(id(entry['route']) for entry in rows)
print(len(routes))

# Kept as the final expression so the notebook displays the
# (current, peak) traced sizes in bytes.
tracemalloc.get_traced_memory()

181


(188310787, 404408707)