[Return to Data Page](https://social-lorax.github.io/guides/i2.html)

In [1]:
#pip install pandas 

In [2]:
import pandas as pd 
import numpy as np
%matplotlib inline

# Data in Python

### Equity

* Where does this data come from?
* Why was this data collected?
* How was this data generated?
* Is this data demographically representative?
* Who is included and who is excluded from this data?
* Whose voices, lives, and experiences are missing?
* How much can this data be disaggregated by race, gender, ethnicity, etc.?
* Are the categories mutually exclusive and fully inclusive?
* Are there “other” categories and, if so, who does that include?
* Who stands to benefit from this data?
* Who might be harmed by the collection or publication of this data?

(See more in Urban Institute’s [Do No Harm Guide](https://www.urban.org/research/publication/do-no-harm-guide-applying-equity-awareness-data-visualization/view/full_report))

![](https://github.com/social-lorax/howto_codebooks/blob/master/images/underlines/python_underline.png?raw=true)

# Importing 

`pandas` covers the basics, like CSV files

In [None]:
# Load a CSV from a local file path (placeholder) into a DataFrame
pd.read_csv(filepath_or_buffer="path/to/file.csv")

In [None]:
# The same reader also accepts a URL (placeholder) in place of a local path
pd.read_csv(filepath_or_buffer="url")

<br> 

`pandas` can also deal with Excel files

In [None]:
#pip install openpyxl  # pandas reads .xlsx through openpyxl; xlrd >= 2.0 only handles legacy .xls

In [None]:
# Load an Excel workbook (placeholder path) into a DataFrame; the extension is .xlsx
pd.read_excel("path/to/file.xlsx")

![](https://github.com/social-lorax/howto_codebooks/blob/master/images/underlines/python_underline.png?raw=true)

# Creating 


In [3]:
# Column-oriented input: each key becomes a column name and each list holds
# that column's values, one entry per row.
# NOTE(review): "County" and "Captial" look like typos for "Country" and
# "Capital"; they are kept as-is so the code matches the displayed output.
data_dict = dict(
    County=["United States", "Canada", "Mexico"],
    Captial=["Washington D.C.", "Ottawa", "Ciudad de Mexico"],
    Population_mil=[328.2, 37.59, 127.6],
)

pd.DataFrame(data_dict)

Unnamed: 0,County,Captial,Population_mil
0,United States,Washington D.C.,328.2
1,Canada,Ottawa,37.59
2,Mexico,Ciudad de Mexico,127.6


![](https://github.com/social-lorax/howto_codebooks/blob/master/images/underlines/python_underline.png?raw=true)

# Exporting 

In [None]:
# `df` stands for any existing DataFrame; to_csv is a method of the frame itself.
# index=False leaves the row index out of the written file.
df.to_csv("path/to/file.csv", index = False)

![](https://github.com/social-lorax/howto_codebooks/blob/master/images/underlines/python_underline.png?raw=true)

# APIs 

In [4]:
import requests #the module for making HTTP requests in Python; provides GET functionality

try: #module name depends on the Python version: urllib2 on Python 2, urllib.request on Python 3
    import urllib2 as urllib #URL handling module
except ImportError:
    import urllib.request as urllib
    
import json
import glob #Unix style pathname pattern expansion
import sys

### NYC Open Data

In [5]:
# Ask the NYC Open Data endpoint for records matching one pickup zone and one
# drop-off zone; the filters are sent as URL query parameters.
parameter = {'pulocationid':149, 'dolocationid':132}
url =  "https://data.cityofnewyork.us/resource/t29m-gskq.json"
r = requests.get(url = url, params=parameter, timeout=30) # without a timeout the call can wait indefinitely
r.raise_for_status() # fail with a clear HTTP error instead of indexing into an error payload
data = r.json()

# Show the second returned record as a sample of the fields available
data[1]

{'vendorid': '1',
 'tpep_pickup_datetime': '2018-02-26T05:07:58.000',
 'tpep_dropoff_datetime': '2018-02-26T05:31:54.000',
 'passenger_count': '1',
 'trip_distance': '15.90',
 'ratecodeid': '1',
 'store_and_fwd_flag': 'N',
 'pulocationid': '149',
 'dolocationid': '132',
 'payment_type': '1',
 'fare_amount': '43',
 'extra': '0.5',
 'mta_tax': '0.5',
 'tip_amount': '7',
 'tolls_amount': '0',
 'improvement_surcharge': '0.3',
 'total_amount': '51.3'}

In [6]:
def getFare(df, min_fare=2.5, max_fare=300, timeout=30):
    """Return the average fare for each origin-destination pair in ``df``.

    For every row, the NYC Open Data endpoint is queried with the row's
    ``Start_Zone`` as ``pulocationid`` and its ``End_Zone`` as
    ``dolocationid``. The ``fare_amount`` values of the returned records that
    pass the plausibility filter are then averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Start_Zone`` and ``End_Zone`` columns whose values can
        be converted with ``int()``.
    min_fare, max_fare : float, optional
        Exclusive bounds of the plausibility filter; only fares strictly
        between them are kept. Defaults reproduce the original 2.5 / 300.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list
        One mean fare per row of ``df``, in row order. The entry is ``nan``
        when a pair yields no usable fare.

    Raises
    ------
    Whatever ``requests.get`` or ``Response.json`` raise (network failure,
    timeout, non-JSON body); these are not caught here.
    """
    # The endpoint is identical for every row, so build it once
    url = "https://data.cityofnewyork.us/resource/t29m-gskq.json"

    fares = [] # one average fare per row of df

    for start_zone, end_zone in zip(df['Start_Zone'], df['End_Zone']):

        # specify parameters for making request
        parameters = {'pulocationid': int(start_zone), 'dolocationid': int(end_zone)}

        r = requests.get(url=url, params=parameters, timeout=timeout)
        data = r.json()

        # Anything other than a list of records (e.g. an error object) has no
        # fares to average; treat it as "no observations" for this pair.
        if not isinstance(data, list):
            data = []

        odFare = []

        for obs in data: # each returned observation for this pair
            try:
                fare = float(obs['fare_amount'])
            except (KeyError, TypeError, ValueError):
                # record has no fare or an unparseable one: skip just this record
                continue

            # sanity check: keep only plausible fares
            if min_fare < fare < max_fare:
                odFare.append(fare)

        # append the mean of the fares retrieved above; np.mean([]) would warn,
        # so an empty result is recorded as nan explicitly
        fares.append(np.mean(odFare) if odFare else np.nan)

    return fares

In [7]:
# Two example trips, both starting in zone 149; each row is one
# origin-destination pair to price.
data_dict = dict(
    Trip=["Home to JFK", "Home to LaGuardia"],
    Start_Zone=[149, 149],
    End_Zone=[132, 138],
)

ods = pd.DataFrame(data_dict)

# getFare returns one average per row, in row order, so it lines up as a column
ods["Avg_Fare"] = getFare(ods)

ods

Unnamed: 0,Trip,Start_Zone,End_Zone,Avg_Fare
0,Home to JFK,149,132,42.875
1,Home to LaGuardia,149,138,62.125


![](https://github.com/social-lorax/howto_codebooks/blob/master/images/underlines/python_underline.png?raw=true)