# Cleaner Notebook
Appends pipeline reports together

In [64]:
import csv
import dateutil
import glob
import json
import os
import pandas as pd
import logging
import numpy as np
pd.set_option('display.max_columns', 500)

In [65]:
logging.basicConfig(level=logging.INFO)

# Actual Code

In [66]:
#Define Functions

def clean_df(df):
    # Standardize best_stat
    df.best_stat = df.best_stat.apply(lambda x: x.strip().upper())

    # Standardize best_date
    df.best_date = df.best_date.apply(maybe_format_date)
    
    # Standardize apn to 7 digits (with leading 0)
    df['apn']=df['apn'].apply(lambda x: x.zfill(7))
    
    # Compile "First Filed" variable
    df['firstfiled'] = pd.to_datetime(df['firstfiled'])
    df['planning_filed'] = pd.to_datetime(df['planning_filed'])
    df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])
    
    def rule(value):
        if pd.isnull(value['firstfiled']):
            if pd.isnull(value['planning_filed']) & pd.notnull(value['dbi_filed']):
                return value['dbi_filed']
            elif pd.isnull(value['dbi_filed']) & pd.notnull(value['planning_filed']):
                return value['planning_filed']
            elif pd.notnull(value['dbi_filed']) & pd.notnull(value['planning_filed']):
                return value[['planning_filed', 'dbi_filed']].min()
        else:
            return value['firstfiled']
    
    df['firstfiled'] = df.apply(rule, axis = 1)
    

    # Standardize APN
    # Some APN values are "APN XXXXXX", some are just "XXXXXXX", standardize this.
    records_with_apn = df.apn.notnull()
    df.apn = df.apn[records_with_apn].apply(lambda x: x.split()[-1])

    # Standardize Lat/Long
    records_with_location_attribute = df.location.notnull()
    if 'x' not in df.columns:
        df.x = df.location[records_with_location_attribute].apply(get_long_from_glob)
        df.y = df.location[records_with_location_attribute].apply(get_lat_from_glob)
    elif 'x' in df.columns:
        df.x[records_with_location_attribute] = df.location[records_with_location_attribute].apply(get_long_from_glob)
        df.y[records_with_location_attribute] = df.location[records_with_location_attribute].apply(get_lat_from_glob)

    # Get address into separate fields for cases where it is concatenated with the lat,long
    if 'address' not in df.columns:
        records_with_address_in_location_attribute = df.location.apply(lambda x: not pd.isnull(x) and '\n' in x)
        df.address = df.location[records_with_address_in_location_attribute].apply(get_address_from_glob)
    else:
        df['address2'] = df.address #df.address2 syntax would result in chained assignment
        records_with_address_in_location_attribute = df.location.apply(lambda x: not pd.isnull(x) and '\n' in x) #and df.address.apply(lambda x: pandas.isnull(x)) 
        df.address[records_with_address_in_location_attribute] = df.location[records_with_address_in_location_attribute].apply(get_address_from_glob)
        records_with_address_field = df.address2.notnull() #need additional layer of cleaning because some filed have zip codes in location field instead of lat long. These usually have separate address field to use. 
        df.address[records_with_address_field] = df.address2[records_with_address_field]

    # Standardize address by uppercasing and removing punctuation
    df.address = df.address.apply(standardize_address)

    # Classify neighborhoods using point-in-polygon approach. Brian: Skipping this for now
    #df['classified_neighborhood'] = geo_classifier.classify_df(df)

    return df

def maybe_format_date(s):
    """
    Use advanced date parsing (python-dateutil) to parse
    the first 10 chars of each line.
    """
    if len(s) > 0:
        try:
            return dateutil.parser.parse(s.strip()[:10]).strftime("%Y-%m-%d")
        except Exception as e:
            logging.exception("Date formatting failed for {}".format(s))
            return s
    return s
    
def get_coords_tuple_from_address_lat_long_glob(s):
    """
    For some quarters, the address and lat/long tuple are
    in the same column, concatenated together with a newline

    Returns: tuple
    """
    if len(s)>0:
        lat_long_tuple = s.split('\n')[-1]
        # Dirty hack, the lat long tuple happens to be valid python syntax so.. YOLO
        return eval(lat_long_tuple)
    else:
        return s

def get_lat_from_glob(s):
    if len(s) > 0:
        try:
            return get_coords_tuple_from_address_lat_long_glob(s)[0]
        except Exception as e:
            logging.exception("Lat long glob parsing failed for {}".format(s))
            return np.nan
    else:
        return np.nan

def get_long_from_glob(s):
    if len(s) > 0:
        try:
            return get_coords_tuple_from_address_lat_long_glob(s)[1]
        except Exception as e:
            logging.exception("Lat long glob parsing failed for {}".format(s))
            return np.nan
    else:
        return  np.nan
    
def get_address_from_glob(s):
    return s.split('\n')[0]

def standardize_address(s):
    return s.upper().replace('.', '').replace(',', '')

def main():
    ################################################################################
    # Allows for a map of column names to be used to convert field names and output
    # the result into a dict object
    #
    def load_csv_with_mapping(csvfile, column_mapping):
        mapped_data = []
        for row in csv.DictReader(csvfile):
            mapped_row = dict((column_mapping[k], v) for k, v in row.items())
            mapped_data.append(mapped_row)
        return mapped_data
    
     ################################################################################
    # Given a file with the column names, create a list of dictionaries with
    # additional entries for the year and quarter created from the date field to
    # ease future transforms
    #
    # see https://docs.google.com/spreadsheets/d/1ikjaHDLf-iCGBCQ1KmSIXVEiVNbX8pQzW26yYqhrH3U/edit#gid=1633784412
    #
    column_mappings = []
    for mapping_filename in os.listdir('raw/columnnames/'):
        with open('raw/columnnames/' + mapping_filename, 'r') as mapping_file:
            column_mapping = {}
            column_mapping['year'] = mapping_filename[0:4]
            column_mapping['quarter'] = mapping_filename[5:6]
            for row in csv.DictReader(mapping_file):
                column_mapping[row['key']] = row['value']
            column_mappings.append(column_mapping)


    all_housing_data = pd.DataFrame()
    
    ################################################################################
    # Load each file based on filename conventions and apply the column cleaning
    #
    for column_mapping in column_mappings:
        filename = 'San_Francisco_Development_Pipeline_%s_Quarter_%s' % (
            column_mapping['year'],
            column_mapping['quarter'],
        )

        housing_data_csv_filename = 'raw/' + filename + '.csv'
        print('loading ' + housing_data_csv_filename)

        #trying mac roman encoding below because of character issue in 2016 Q1 data (row 522)
        with open(housing_data_csv_filename, 'r', encoding='mac_roman') as csvfile:
            housing_data = load_csv_with_mapping(csvfile, column_mapping)
            housing_data_json_filename = 'cleaned/' + filename + '.json'
            df = pd.DataFrame(housing_data)
            df['original_data_file'] = housing_data_csv_filename
            df['report_year'] = housing_data_csv_filename.replace(".csv",'').split("_")[4]
            df['report_quarter'] = housing_data_csv_filename.replace(".csv",'').split("_")[6]
            all_housing_data = all_housing_data.append(df, ignore_index=True)
            print('writing ' + housing_data_json_filename)
            df.to_json(housing_data_json_filename, orient='records')
    
    all_housing_data.to_csv("cleaned/all_quarters_merged_PRECLEAN.csv")
    df = clean_df(all_housing_data)
    df.to_csv("cleaned/all_quarters_merged.csv")
    
    

In [67]:
#run code
main()

loading raw/San_Francisco_Development_Pipeline_2015_Quarter_1.csv
writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_1.json
loading raw/San_Francisco_Development_Pipeline_2017_Quarter_2.csv
writing cleaned/San_Francisco_Development_Pipeline_2017_Quarter_2.json
loading raw/San_Francisco_Development_Pipeline_2011_Quarter_4.csv
writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_4.json
loading raw/San_Francisco_Development_Pipeline_2013_Quarter_4.csv
writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_4.json
loading raw/San_Francisco_Development_Pipeline_2015_Quarter_2.csv
writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_2.json
loading raw/San_Francisco_Development_Pipeline_2015_Quarter_3.csv
writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_3.json
loading raw/San_Francisco_Development_Pipeline_2017_Quarter_1.csv
writing cleaned/San_Francisco_Development_Pipeline_2017_Quarter_1.json
loading raw/San_Francisco_Development_Pip

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
