# Reading from csv
* This document provides basic syntax and a few examples of converting CSV files into various Python objects.

## Setup

In [4]:
import csv
# Extend notebook viewport
# display and HTML are imported from IPython.display, the public home of these helpers
# (importing them from IPython.core.display is deprecated)
from IPython.display import display, HTML
# inject a style rule so the notebook container uses the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
# magic function to list the contents of the current directory
    # you will see that there are no csvs here; that is because I keep the sample data within a data directory
%ls 

[0m[01;34mdata[0m/  Pypoll_starter.ipynb  working_with_csvs.ipynb


In [7]:
# If I list the contents of that directory you will see the file I am looking for: SalesJan2009.csv
%ls ../../data

SalesJan2009.csv


In [9]:
# this means that I want to reference a file that is:
    # two directories up: ../..
    # inside a data subdirectory: data
    # called SalesJan2009.csv: name
csv_path = '../../data/SalesJan2009.csv'

## Example: Read and Print

In [10]:
# example: reading a csv and printing its first five rows (header included)
    # open(file location, mode) -- modes: 'r' read, 'w' write, 'rb' read bytes, 'wb' write bytes, etc.
    # newline='' hands line-ending handling to the csv module, as its documentation recommends,
    # so quoted fields that contain line breaks are parsed correctly
with open(csv_path, 'r', newline='') as csv_file:
    # csv.reader returns an iterable (each iteration returns one row as a list of strings)
    # enumerate numbers the rows from 1 so no manual counter is needed
    for row_number, row in enumerate(csv.reader(csv_file), start=1):
        print(row)
        # stop once five rows have been printed
        if row_number >= 5:
            break
        

['Transaction_date', 'Product', 'Price', 'Payment_Type', 'Name', 'City', 'State', 'Country', 'Account_Created', 'Last_Login', 'Latitude', 'Longitude']
['1/2/09 6:17', 'Product1', '1200', 'Mastercard', 'carolina', 'Basildon', 'England', 'United Kingdom', '1/2/09 6:00', '1/2/09 6:08', '51.5', '-1.1166667']
['1/2/09 4:53', 'Product1', '1200', 'Visa', 'Betina', 'Parkville                   ', 'MO', 'United States', '1/2/09 4:42', '1/2/09 7:49', '39.195', '-94.68194']
['1/2/09 13:08', 'Product1', '1200', 'Mastercard', 'Federica e Andrea', 'Astoria                     ', 'OR', 'United States', '1/1/09 16:21', '1/3/09 12:32', '46.18806', '-123.83']
['1/3/09 14:44', 'Product1', '1200', 'Visa', 'Gouya', 'Echuca', 'Victoria', 'Australia', '9/25/05 21:13', '1/3/09 14:22', '-36.1333333', '144.75']


## Strategies for working with csvs
* Below I lay out a few strategies that I have used to work with CSVs.
* This is not an exhaustive list.

### Append to external list, then work with list

In [6]:
# Strategy: keep the header in `columns` and every data row in the `csv_contents` list
csv_contents = []
# newline='' hands line-ending handling to the csv module, as its documentation recommends
with open(csv_path, 'r', newline='') as csv_file:
    # csv.reader returns an iterator (each iteration returns one row as a list of strings)
    reader = csv.reader(csv_file)
    # the first row holds the column headers; an empty file gives an empty header list
    columns = next(reader, [])
    # every remaining row is data
    csv_contents.extend(reader)

print(f"Columns: {columns}")
# join each of the first five rows into one comma separated line for display
preview_rows = [', '.join(row) for row in csv_contents[:5]]
print(
    "CSV Contents: (first 5 rows, formatted)\n\t{}".format('\n\t'.join(preview_rows))
)

Columns: ['Transaction_date', 'Product', 'Price', 'Payment_Type', 'Name', 'City', 'State', 'Country', 'Account_Created', 'Last_Login', 'Latitude', 'Longitude']
CSV Contents: (first 5 rows, formatted)
	1/2/09 6:17, Product1, 1200, Mastercard, carolina, Basildon, England, United Kingdom, 1/2/09 6:00, 1/2/09 6:08, 51.5, -1.1166667
	1/2/09 4:53, Product1, 1200, Visa, Betina, Parkville                   , MO, United States, 1/2/09 4:42, 1/2/09 7:49, 39.195, -94.68194
	1/2/09 13:08, Product1, 1200, Mastercard, Federica e Andrea, Astoria                     , OR, United States, 1/1/09 16:21, 1/3/09 12:32, 46.18806, -123.83
	1/3/09 14:44, Product1, 1200, Visa, Gouya, Echuca, Victoria, Australia, 9/25/05 21:13, 1/3/09 14:22, -36.1333333, 144.75
	1/4/09 12:56, Product2, 3600, Visa, Gerd W , Cahaba Heights              , AL, United States, 11/15/08 15:47, 1/4/09 12:45, 33.52056, -86.8025


### Create column specific Dictionary, then work with columns as lists

In [7]:
# Strategy: build a dictionary with one key per column, each holding the list of that column's values
# newline='' hands line-ending handling to the csv module, as its documentation recommends
with open(csv_path, 'r', newline='') as csv_file:
    # csv.reader returns an iterator (each iteration returns one row as a list of strings)
    reader = csv.reader(csv_file)
    # the first row holds the column headers; an empty file gives an empty header list
    columns = next(reader, [])
    print(f"This is what columns looks like: {columns}")
    # create a dictionary key with an empty list for each column
    csv_columns = {column_name: [] for column_name in columns}
    for row_number, line in enumerate(reader):
        # for the first data row only, print out what the zipped object looks like
        if row_number == 0:
            print("\nThis is what zip(columns, line) returns: (With some additional formatting added)\n\t",
                "\n\t".join(
                    [f"{col}: {val}" for col, val in zip(columns, line)]
                )
            )
        # zip pairs each value with its column name so it can be appended to the right list
        for column_name, value in zip(columns, line):
            csv_columns[column_name].append(value)

# now we have a dictionary that has a key for each column, with a list of that column's values as its value
print(f'\nOutput:\nDictionary Keys: \n\t{csv_columns.keys()}')
print(f'Example Values List (truncated): \n\t {csv_columns["Country"][:10]}')

This is what columns looks like: ['Transaction_date', 'Product', 'Price', 'Payment_Type', 'Name', 'City', 'State', 'Country', 'Account_Created', 'Last_Login', 'Latitude', 'Longitude']

This is what zip(columns, line) returns: (With some additional formatting added)
	 Transaction_date: 1/2/09 6:17
	Product: Product1
	Price: 1200
	Payment_Type: Mastercard
	Name: carolina
	City: Basildon
	State: England
	Country: United Kingdom
	Account_Created: 1/2/09 6:00
	Last_Login: 1/2/09 6:08
	Latitude: 51.5
	Longitude: -1.1166667

Output:
Dictionary Keys: 
	dict_keys(['Transaction_date', 'Product', 'Price', 'Payment_Type', 'Name', 'City', 'State', 'Country', 'Account_Created', 'Last_Login', 'Latitude', 'Longitude'])
Example Values List (truncated): 
	 ['United Kingdom', 'United States', 'United States', 'Australia', 'United States', 'United States', 'United States', 'United States', 'Israel', 'France']


### Implement filtering/ logic/ etc. while reading
* In this example, assume that I only want to answer: "What is the average purchase price in the United States?"
* For this we don't need to store the whole CSV in memory; instead, we can pull out the necessary values while reading and discard the rest.

In [8]:
# Strategy: filter while reading -- only a running sum and a count are kept, never the whole file
running_sum, n_entries = 0, 0
# the columns we need; each value is filled in with that column's index once the header is read
col_idxs = {'Price': None, 'Country': None}
country_match = "United States"

# newline='' hands line-ending handling to the csv module, as its documentation recommends
with open(csv_path, 'r', newline='') as csv_file:
    # csv.reader returns an iterator (each iteration returns one row as a list of strings)
    reader = csv.reader(csv_file)
    # the first row holds the column headers
    header = next(reader, [])
    print(f"Columns: {header}")
    # find the index location of each wanted column (list.index raises ValueError if one is missing)
    for k in col_idxs:
        idx = header.index(k)
        print(f"Column: \"{k}\" is in position: {idx}")
        col_idxs[k] = idx
    print(f"Current dictionary: {col_idxs}")

    price_idx, country_idx = col_idxs['Price'], col_idxs['Country']
    for line in reader:
        # skip rows whose country does not match
        if line[country_idx] != country_match:
            continue
        # some prices are delimited with ,'s: strip them before casting to integer
        running_sum += int(line[price_idx].replace(',', ''))
        n_entries += 1

# calculate average (guarded so that zero matching rows does not raise ZeroDivisionError)
if n_entries:
    print(f"The average price of a transaction in {country_match} is : ${round(running_sum/n_entries, 2)}")
    print(f"This was caluculated by taking the Sum of purchases: {running_sum}\n\tDivided by the total number of purchases: {n_entries}")
else:
    print(f"No transactions found for {country_match}: cannot calculate an average price")

Columns: ['Transaction_date', 'Product', 'Price', 'Payment_Type', 'Name', 'City', 'State', 'Country', 'Account_Created', 'Last_Login', 'Latitude', 'Longitude']
Column: "Price" is in position: 2
Column: "Country" is in position: 7
Current dictionary: {'Price': 2, 'Country': 7}
The average price of a transaction in United States is : $1619.87
This was caluculated by taking the Sum of purchases: 750000
	Divided by the total number of purchases: 463


### Read CSV with Pandas, then work with dataframe
* Moving forward, this will be the main method of reading CSVs (because, as you will see, it's really easy).

In [9]:
# import module
import pandas as pd
# the three timestamp columns (positions 0, 8 and 9 in the header), parsed into datetimes while reading
date_columns = ['Transaction_date', 'Account_Created', 'Last_Login']
# read the csv into the df dataframe, using the first row as the column names
df = pd.read_csv(csv_path, header=0, parse_dates=date_columns)

In [10]:
# show the first five rows of the dataframe as the cell's output
df.head()

Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
0,2009-01-02 06:17:00,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,2009-01-02 06:00:00,2009-01-02 06:08:00,51.5,-1.116667
1,2009-01-02 04:53:00,Product1,1200,Visa,Betina,Parkville,MO,United States,2009-01-02 04:42:00,2009-01-02 07:49:00,39.195,-94.68194
2,2009-01-02 13:08:00,Product1,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,2009-01-01 16:21:00,2009-01-03 12:32:00,46.18806,-123.83
3,2009-01-03 14:44:00,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,2005-09-25 21:13:00,2009-01-03 14:22:00,-36.133333,144.75
4,2009-01-04 12:56:00,Product2,3600,Visa,Gerd W,Cahaba Heights,AL,United States,2008-11-15 15:47:00,2009-01-04 12:45:00,33.52056,-86.8025


# Summary
Hopefully these examples show a few ways to convert CSVs into a Python data type that you are familiar with. There are, of course, many other ways to accomplish the same tasks. With these examples as templates, you should be able to modify the existing code to suit your own approach. I hope you enjoyed this demo!