In [3]:
import os
import requests
from urllib.parse import urljoin

In [2]:
!pip install --upgrade ..


Processing /Users/elizabeth/Documents/urbanlabs/NCHRP 08-110/working/forecastcards


Building wheels for collected packages: forecastcards
  Running setup.py bdist_wheel for forecastcards ... [?25ldone
[?25h  Stored in directory: /private/var/folders/60/xd2kny110pxfz3ln611jq7hm0000gn/T/pip-ephem-wheel-cache-2_z9g1mg/wheels/d0/d4/f9/2a525455b03f69bc95fcbc11f800fcc010800425cdf26ca30d
Successfully built forecastcards
Installing collected packages: forecastcards
Successfully installed forecastcards-0.1.0.dev0


In [4]:
import forecastcards

# Data Validation and Preparation

1. Find and map where the data is
2. Validate data conforms to schema
3. Combine data
4. Clean and format data


## 1 - Map the Cards

Finds all the relevant cards and assigns them a type in order to compare to the data schema.

Returns a dictionary of card locations by card type.

**`map_cards`**`(  
    repo_loc = default_repo_api,   
    subdirs  = default_subdirs 
  ):`


  - **`repo_loc`** - API tree URL for data to be used
  - **`subdirs`**  - list of subdirs to search through



In [197]:
schema_locs = { 'poi'         : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/poi-schema.json",
                "scenario"    : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/scenario-schema.json",
                "project"     : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/project-schema.json",
                "observations": "https://raw.github.com/e-lo/forecast-cards/master/spec/en/observations-schema.json",
                "forecast"    : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/forecast-schema.json",
}



data_reports = forecastcards.validate_cards(card_locs,schema_locs)

default_data_loc = os.path.join('forecastcards','examples')
projects         = []
exclude_projects = []

#should add cards to map, not initiate
# map = map_cards()
#         add_cards()
#            sniff, add method

import glob, csv
from goodtables import validate
import requests,csv

def get_csv_from_url(url):
    with requests.Session() as s:
        download = s.get(url)

        decoded_content = download.content.decode('utf-8')

        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        return cr

def raw_github_url(username, repository, branch='master'):
    gh = "https://raw.githubusercontent.com/"+str(username)+"/"+str(repository)+"/"+str(branch)+"/"
    return gh

def api_github_url(username, repository, branch='master'):
    gh = "https://api.github.com/repos/"+str(username)+"/"+str(repository)+"/git/trees/"+str(branch)+"?recursive=1"
    return gh


class Cardset:
    def __init__(self,
                 data_loc ='../forecastcards/examples', 
                 select_projects = [], 
                 exclude_projects = [], 
                 schemas ={},
                 schema_locs = schema_locs,
                ):
        
        self.card_locs = {'poi'         : [],
                          'scenario'    : [],
                          'observations': [],
                          'forecast'    : [],
                          'project'     : []}
                            
        self.data_locs = []
        self.schema_locs = schema_locs # dict by card type of urls to JSON files or schema instances
        
        # list of all projects
        self.projects = []
        
        # validation status
        self.validated_projects   = []
        self.unvalidated_projects = []
        self.invalid_projects     = []
        
        # valid project requires following valid components
        self.validity_requires    = ['project','poi','observations','scenario','forecast']
        
        # add initial projects
        self.add_projects(data_loc, select_projects=select_projects, exclude_projects=exclude_projects)
    
    def validate_project(self, p_card_locs, schema_locs={}, validity_requires = []):
        
        if not schema_locs:
            schema_locs = self.schemas_locs
            
        if not validity_requires: 
            validity_requires = self.validity_requires
            
        ## todo validate against local schemas
        valid        = True
        fail_reports = []
        
        for card_type, locs in p_card_locs.items():
            #print ("validating",card_type, locs)
        
            ##todo add other validation checks here
            for card in locs:
                report = validate(card ,schema=requests.get(schema_locs[card_type]).json())
                if not report['valid']:
                    if card_type in validity_requires:
                        valid = False
                    print ("Validation Error:", card)
                    fail_reports.append(report)
                    
        return valid, fail_reports
    
    def check_project_id(self, project_id, select_projects=[], exclude_projects=[]):
        '''
        :returns: True if project should be added; False if it shouldn't
        '''
        # check to see if project should be added
        if project_id in self.projects:
            print("Excluding", project_id, " already in cardset")
            return False
        elif projects and project_id not in projects:
            print("Excluding", project_id, " because it isn't in project list")
            return False
        elif exclude_projects and project_id in exclude_projects:
            print("Excluding", project_id, " because it is in the exclusion list")
            return False
        else:
            return True
    
    def add_github_projects(self, data_loc, select_projects=[], exclude_projects=[]):
        '''
        :data_loc: is expecting a dictionary with username, repository, branch
        
        This is messy right now b/c we have to loop though several times. 
        1. to find project IDs and their directories that we want to import
        2. to find all the files in those directories
        3. to validate the project once we have all those files together
        
        It would be a lot easier if we were able to take the directory names as the project id or similar.
        '''
        import requests
        import re
        
        repo_loc = api_github_url(
            username = data_loc['username'], 
            repository = data_loc['repository'], 
            branch = data_loc['branch'])
        
        repo_raw = raw_github_url(
            username = data_loc['username'], 
            repository = data_loc['repository'], 
            branch = data_loc['branch']) 
        
        
        subdirs  = ['forecastcards/examples']
        projdirs_to_import = {}
        cards_by_project = {}
        
        r = requests.get(repo_loc)
        rj = r.json()
        #if verbose: print(rj)
        card_locs = {
               "poi": [],
               "scenario": [],
               "project": [],
               "observations": [],
               "forecast": [],
        }
        
        # First loop through is to find all the projects that we want to import
        # We need to get their project ids as well as the folders they live in
        for file in rj['tree']:
            #print (file['path'])
            
            # don't look at things that aren't files
            if file['type']!='blob': continue

            # split path into a list
            path_list = file['path'].split("/")

            # don't look in wrong subdirs
            if subdirs and not any(s in file['path'] for s in subdirs): continue
           
            # find project*.csv and get project id
            if path_list[-1][-4:].lower()==".csv" and path_list[-1][0:7].lower() == "project":
                print("opening",urljoin(repo_raw,file['path']))
                # open project*csv to see if we have a good project id
                
                proj_csv = list(get_csv_from_url(urljoin(repo_raw,file['path'])))
                #print(proj_csv)
            
                assert(proj_csv[0][0] == 'project_id')
                project_id   = proj_csv[1][0].strip().lower()
                
                project_path = os.path.dirname(file['path'])
                #print(project_path)
            
                # check to see if we should be adding this project
                if not self.check_project_id(project_id, select_projects=select_projects, exclude_projects=exclude_projects):
                    continue                                       
                
                # add this directory to the list to import, and to overall project lists
                projdirs_to_import[project_path] = project_id
                cards_by_project[project_id] = {
                    'project' : [urljoin(repo_raw,file['path'])],
                    'scenario': [],
                    'forecast': [],
                    'observations' :[],
                    'poi':[],
                }
                self.projects.append(project_id)
                self.unvalidated_projects.append(project_id)
                
                
        # second loop back through files to find the ones in the right project directory
        for file in rj['tree']:

            #make sure you should be importing this file
            if not any( pdirs in file['path'] for pdirs in projdirs_to_import.keys() ):
                continue

            ## don't look at things that aren't files
            if file['type']!='blob': continue

            # split path into a list
            path_list = file['path'].split("/")

            if path_list[-1][-4:].lower()!=".csv": continue

            if path_list[-1][0:8].lower()=="scenario":
                project_path = os.path.dirname(file['path'])
                project_id = projdirs_to_import[project_path]
                cards_by_project[project_id]['scenario'].append(urljoin(repo_raw,file['path']))
                #print("adding scenario:",file['path'])
            if path_list[-1][0:8].lower()=="forecast":
                project_path = os.path.dirname(os.path.dirname(file['path']))
                project_id = projdirs_to_import[project_path]
                cards_by_project[project_id]['forecast'].append(urljoin(repo_raw,file['path']))
                #print("adding forecast:",file['path'])
            if path_list[-1][0:12].lower()=="observations":
                project_path = os.path.dirname(os.path.dirname(file['path']))
                project_id = projdirs_to_import[project_path]
                cards_by_project[project_id]['observations'].append(urljoin(repo_raw,file['path']))
                #print("adding observation:",file['path'])
            if path_list[-1][0:3].lower()=="poi":
                project_path = os.path.dirname(file['path'])
                project_id = projdirs_to_import[project_path]
                cards_by_project[project_id]['poi'].append(urljoin(repo_raw,file['path']))
                #print("adding poi:",file['path'])
                
        #Third loop is through the cards_by_project, which need to be validated all together
        failed_validation_reports = []
        for project_id, cards in cards_by_project.items():
            p_valid, fail_reports = self.validate_project(cards,self.schema_locs)
            
            if p_valid: 
                self.validated_projects.append(project_id)
                self.unvalidated_projects.remove(project_id)
                print("adding",project_id," - valid")
                for k,v in self.card_locs.items():
                    v += cards[k]
                else: 
                    self.invalid_projects.append(project_id)
            
            failed_validation_reports.append(fail_reports)
        return failed_validation_reports
    
    def add_local_projects(self, data_loc, select_projects=[], exclude_projects=[]):
        
        #find projects by searching for the project csv file
        project_loc = os.path.join(data_loc,'**/project*.csv')
        
        failed_validation_reports = []
        
        ##todo make more windows safe
        for filepath in glob.iglob(project_loc, recursive=True):
            with open(filepath, 'r') as f:
                proj_csv = list(csv.reader(f))
            
            assert(proj_csv[0][0] == 'project_id')
            project_id   = proj_csv[1][0].strip().lower()

            project_path = os.path.dirname(filepath)
            
            #check to see if we should be adding this project
            if not self.check_project_id(project_id, select_projects=select_projects, exclude_projects=exclude_projects):
                continue
                
            self.projects.append(project_id)
            self.unvalidated_projects.append(project_id)
                
            p_card_locs = {'poi'         : glob.glob(os.path.join(project_path,'poi*.csv'),recursive=True),
                           'scenario'    : glob.glob(os.path.join(project_path,'scenario*.csv'),recursive=True),
                           'observations': glob.glob(os.path.join(project_path,'**/observations*.csv'),recursive=True),
                           'forecast'    : glob.glob(os.path.join(project_path,'**/forecast*.csv'),recursive=True),
                           'project'     : [filepath]}
            
            p_valid, fail_reports = self.validate_project(p_card_locs,self.schema_locs)
            
            if p_valid: 
                self.validated_projects.append(project_id)
                self.unvalidated_projects.remove(project_id)
                print("adding",project_id," - valid")
                for k,v in self.card_locs.items():
                    v += p_card_locs[k]
            else: 
                self.invalid_projects.append(project_id)
            
            failed_validation_reports.append(fail_reports)
    
        return failed_validation_reports
                
    
    def add_projects(self, data_loc, select_projects=[], exclude_projects=[]):
        '''
        
        '''
        self.data_locs.append(data_loc)
        
        if type(data_loc) is dict:
            reports = self.add_github_projects(data_loc,select_projects=select_projects, exclude_projects=exclude_projects)
        
        else:
            reports = self.add_local_projects(data_loc,select_projects=select_projects, exclude_projects=exclude_projects)
        
                
card_map = Cardset({'username':'e-lo','repository':'forecastcards','branch':'master'})
#card_map

https://api.github.com/repos/e-lo/forecastcards/git/trees/master?recursive=1
https://api.github.com/repos/e-lo/forecastcards/git/trees/66b3e4fac72da8d179fa2cbd4a06870624b321ba?recursive=1
https://raw.githubusercontent.com/e-lo/forecastcards/master/
https://raw.githubusercontent.com/e-lo/forecastcards/master/
opening https://raw.githubusercontent.com/e-lo/forecastcards/master/forecastcards/examples/ecdot-rx123-yellowbrickroadhov/project.csv
adding rx123  - valid


In [165]:
u = 'https://raw.githubusercontent.com/e-lo/forecastcards/master/forecastcards/examples/ecdot-rx123-yellowbrickroadhov/project.csv'

with requests.Session() as s:
    
    download = s.get(u)

    decoded_content = download.content.decode('utf-8')
    c = csv.reader(decoded_content.splitlines(), delimiter=',')
    cr = list(c)
    print(c)




<_csv.reader object at 0x11778ac88>


## 2 - Validate Forecast Card Data

Uses [Frictionless Good Tables](https://github.com/frictionlessdata/goodtables-py) to validate that the data matches the schemas.

Returns a dictionary of reports by card type.

**`validate_cards`**`(  
    card_locs,
    schemas_loc 
   ):`


  - **`card_locs`** - dictionary of `card type`: list of files
  - **`schemas_loc`** - dictionary of `card type` : schema locations

**TIP: ** If data doesn't validate, try to resolve with the GUI at  https://try.goodtables.io

In [0]:
schema_locs = { 'poi'         : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/poi-schema.json",
                "scenario"    : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/scenario-schema.json",
                "project"     : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/project-schema.json",
                "observations": "https://raw.github.com/e-lo/forecast-cards/master/spec/en/observations-schema.json",
                "forecast"    : "https://raw.github.com/e-lo/forecast-cards/master/spec/en/forecast-schema.json",
}

data_reports = forecastcards.validate_cards(card_locs,schema_locs)
    

In [0]:
data_reports['poi']

## 3 - Combine Data

Returns a dictionary of reports by card type.

**`validate_cards`**`(  
    card_locs 
    schemas_loc 
   ):`


  - **`card_locs`** - dictionary of `card type`: list of files
  - **`schemas_loc`** - dictionary of `card type` : schema locations

In [0]:
all_df = forecastcards.combine_data(card_locs)

In [0]:
all_df.dtypes

## 4 - Clean and Recode

- Fix missing values
- Code categorical variables
- Scale for estimation

Note that this entire process can be exceuted by calling `default_data_clean(df)`

### Fix Missing Values

Returns a dataframe that has some missing data recoded to 'missing' and some records dropped because they didn't have minimum values.

**`fix_missing_values`**`(  
    dataframe
    recode_na_vars = default_recode_na_vars
    no_na_vars     = default_no_na_vars
   ):`


  - **`recode_na_vars`** - list of variables to recode NA to "missing"
  - **`no_na_vars`** - list of variables where having an NA isn't acceptable

In [0]:
recode_na_vars = ['forecast_system_type', 'area_type', 'forecaster_type', 'state', 'agency', 'functional_class','facility_type','project_type']
no_na_vars     = ['scenario_date','forecast_creation_date','forecast_value','obs_value']

select_df = forecastcards.fix_missing_values(all_df,
                                             recode_na_vars=recode_na_vars,
                                             no_na_vars=no_na_vars)

### Create Categorical Variables

1. Add categorical variables for project size (cutoff: 30k), scenario decade and forecast decade.

   Returns a dataframe.

   **`create_default_categorical_vars`**`(  
    dataframe
   )`
   
2. Recodes categorical variables to dummy variables.

    Returns a dataframe.

    **`categorical_to_dummy`**`(  
    dataframe
    categorical_cols_list=default_categorical_cols,
    required_vars = default_required_vars
   ):`
   
 
 - **`categorical_cols_list`** - list of columns that will be recoded  
 - **`required_vars`** - list of variables that will be kept

In [0]:
select_df = forecastcards.create_default_categorical_vars(select_df)
estimate_df = forecastcards.categorical_to_dummy(select_df)

In [0]:
estimate_df.dtypes

### Scale 

Returns a dataframe that has the dummy variables scaled to the forecast value so that the estimation isn't biased.

**`scale_dummies_by_forecast`**`(  
    dataframe
    no_scale_cols=default_no_scale_cols
   ):`


  - **`no_scale_cols`** - list of variables that won't be scaled

In [0]:
scaled_df = forecastcards.scale_dummies_by_forecast_value(estimate_df)

# Dataset ready for Estimation

In [0]:
scaled_df.describe()
scaled_df

In [0]:
estimate_df.plot.scatter(y='forecast_value',
                         x='obs_value')
