# Case Study

## Dicts from Zip

In [None]:
import os

import pandas as pd
import seaborn as sns
from pyprojroot import here

In [None]:
# Load the "taxis" example dataset through seaborn; the result is used as a
# pandas DataFrame throughout the rest of the notebook.
taxis = sns.load_dataset("taxis")

In [None]:
# Keep the column labels; they serve as the dictionary keys further down.
taxis_colnames = taxis.columns
taxis_colnames

In [None]:
# Pull the first row (all columns) out as a plain Python list of cell values.
taxis_firstrow = list(taxis.iloc[0, :])
taxis_firstrow

In [None]:
# zip pairs each column label with the first-row value at the same position;
# dict() then consumes those pairs as key/value items.
zipped_vals = zip(taxis_colnames, taxis_firstrow)
dict(zipped_vals)

## Functional Approach

In [None]:
def dict_from_lists(l1, l2):
    """Build a dictionary by pairing two sequences element-wise.

    Items of ``l1`` become the keys; the items of ``l2`` at the same positions
    become the values. Pairing stops at the end of the shorter input.
    """
    return dict(zip(l1, l2))

In [None]:
# Same pairing as the zip/dict cell earlier, now done through the helper.
dict_from_lists(taxis_colnames, taxis_firstrow)

## List comprehension for iteration

In [None]:
# Convert every row of the DataFrame into a list of cell values (a list of
# lists), then show the first two rows as a spot check.
all_taxis_listed = taxis.values.tolist()
all_taxis_listed[0:2]

In [None]:
# One dictionary per row: the column labels are paired with each row's values.
taxis_listdict = [
    dict_from_lists(taxis_colnames, record)
    for record in all_taxis_listed
]
# Show the first two dictionaries as a spot check.
taxis_listdict[:2]

## Back to a DF

In [None]:
# Rebuild a DataFrame from the list of row dictionaries and preview it.
taxis2 = pd.DataFrame(taxis_listdict)
taxis2.head()

In [None]:
# Check that the round trip reproduced the original data.
# DataFrame.equals returns a single bool covering shape, values and dtypes
# (NaNs in the same position count as equal). The builtin all() is not suitable
# here: iterating a DataFrame yields its column labels, so
# all(taxis == taxis2) is truthy no matter what the cells contain.
taxis.equals(taxis2)

In [None]:
# Drop the reconstructed copy; it is not used again.
del taxis2

## Streaming Files & Generators

This section goes over the convention `with open(filenm) as f:`.

The `with` statement uses a context manager: it binds the open file to the variable `f` and closes the file automatically when the block ends.

The connection to the file behaves like a generator object: it is an iterator that hands back one line at a time.

In [None]:
# Make sure the target folder exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.join(here(), "data"), exist_ok=True)
# The row index is written as the first CSV column (the to_csv default), which
# shifts every data column one place to the right in the file. The cells that
# read the raw file index fields by position, so this layout is kept as is.
taxis.to_csv(os.path.join(here(), "data", "taxis.csv"))
# Free the in-memory copy; the following cells stream the data from disk.
del taxis

In [None]:
with open(os.path.join(here(), "data", "taxis.csv")) as taxicabs:
    # Read and discard the header line so only data rows are counted.
    taxicabs.readline()
    table = {}
    # Fixed line count with no end-of-file check: past the last row,
    # readline() returns "" and the field lookup below raises IndexError.
    for row in range(1000):
        rowvals = taxicabs.readline().split(",")
        # field 10 holds the payment method value
        rowval = rowvals[10]
        # dict.get supplies 0 the first time a value is seen
        table[rowval] = table.get(rowval, 0) + 1

In [None]:
table
# Asking for more rows than the file contains results in:
# IndexError: list index out of range
# NOTE(review): the original note gave 700 as the example row count, but the
# loop above already reads 1000 rows - presumably a larger figure (e.g. 7000)
# was meant; confirm against the dataset's row count.

## Generators & Lazy Eval

As the above approach broke if you specified too many rows, lazy evaluation can be employed with generators to keep going until you're out of rows.

In [None]:
def read_all_data(some_file):
    """Lazily yield lines from an open file until it is exhausted.

    ``some_file`` must be an open file connection (anything with a
    ``readline`` method). Each line is yielded as read; iteration stops at the
    first falsy result from ``readline``, i.e. the end of the file.
    """
    line = some_file.readline()
    # An empty result marks the end of the file.
    while line:
        yield line
        line = some_file.readline()

In [None]:
# Calling a generator function only creates the generator object shown here;
# none of its body runs until a value is requested. The argument is a path
# string rather than an open file, so actually iterating this object would fail
# on the readline() call - the cell only demonstrates the lazy object itself.
read_all_data(os.path.join(here(), "data", "taxis.csv"))

In [None]:
with open(os.path.join(here(), "data", "taxis.csv")) as taxicabs:
    # Wrap the open file in the generator; nothing is read yet.
    taxi_gen = read_all_data(taxicabs)
    # Each next() call reads exactly one more line: the header, then the first data row.
    print(next(taxi_gen))
    print(next(taxi_gen))

The above proves that the file read process can be achieved with a yield generator. Now let's go back to `read_all_data()` and tabulate values until we run out of rows.

In [None]:
with open(os.path.join(here(), "data", "taxis.csv")) as taxicabs:
    # Read and discard the header line so only data rows are counted.
    taxicabs.readline()
    table = {}
    # The generator replaces the fixed-count loop (for row in range(1000)):
    # it keeps yielding lines until the file is exhausted, so no row count is needed.
    for line in read_all_data(taxicabs):
        rowvals = line.split(",")
        # field 10 holds the payment method value
        rowval = rowvals[10]
        # dict.get supplies 0 the first time a value is seen
        table[rowval] = table.get(rowval, 0) + 1

In [None]:
# Display the counts built by the loop above (one entry per distinct value in field 10).
table

## Pandas chunksize

This argument produces an iterable `reader` object:

In [None]:
# With chunksize set, read_csv returns a reader object instead of a DataFrame;
# each item pulled from it is a DataFrame of up to 5 rows.
taxi_reader = pd.read_csv(os.path.join(here(), "data", "taxis.csv"), chunksize=5)
taxi_reader

In [None]:
# Pull the next chunk from the reader and print it.
print(next(taxi_reader))

In [None]:
# Close the reader's underlying file handle explicitly rather than relying on
# garbage collection, then drop the name.
taxi_reader.close()
del taxi_reader

In [None]:
# read in 2000 lines and filter to pickup zone Hudson Sq
# Only the first chunk is needed, so the reader is used as a context manager
# (supported since pandas 1.2) and its file handle is closed as soon as the
# chunk has been read.
with pd.read_csv(
    os.path.join(here(), "data", "taxis.csv"), chunksize=2000
) as taxi_reader:
    taxis = next(taxi_reader)

In [None]:
# Keep only the rows picked up in Hudson Sq. A column is added to this frame
# later on, so take an explicit copy: assigning to a filtered slice of `taxis`
# triggers pandas' chained-assignment (SettingWithCopy) warning.
hudson_pickups = taxis[taxis["pickup_zone"] == "Hudson Sq"].copy()
hudson_pickups.head()

In [None]:
# extract the fare and tip columns to work out a new percentage tip column
# zip pairs the two columns row by row as (fare, tip) tuples.
charges = zip(hudson_pickups["fare"], hudson_pickups["tip"])
# A zip object can only be consumed once, so materialise it as a list for reuse.
charge_list = list(charges)
charge_list

In [None]:
# use a list comprehension to calculate the perc tip
# A zero fare would make tip / fare fail (ZeroDivisionError for Python floats),
# so those rows get NaN instead of aborting the whole comprehension.
hudson_pickups["tip_perc"] = [
    round(tip / fare * 100, 2) if fare else float("nan")
    for fare, tip in charge_list
]
hudson_pickups.head()

In [None]:
# Draw a box plot of the tip percentage column computed above.
hudson_pickups.boxplot(column="tip_perc")