# CIT Data Pipeline: Formatting

In this notebook, we ingest the avaiable, pre-populated data and format it for proper SQL uploading. 

In [282]:
import pandas as pd
import numpy as np

import requests

In [283]:
def conform_headers(to_conform: [], existing_header: []) -> []:
    conformed_header = []
    """Given the two lists representing column headers, make one 
    header conform to another.
    
    :param to_conform: Column to conform.
    :type to_conform: [str]
    
    :param existing_header: Column for existing conformation.
    :type existing_header: [str] 
    
    :return conformed_header: List of column header conforming to existing header
    :return type: [str]
    """
    for column in existing_header:
        if column in to_conform:
            conformed_header.append(column)
    
    return conformed_header

In [284]:
# Unit Test
def test_conform_headers():
    to_conform = ["yes", "no"]
    existing_header = ["no"]
    conformed_header = conform_headers(to_conform, existing_header)
    assert conformed_header == ["no"]    
test_conform_headers()

In [285]:
def cherry_pick_dataframe(dataframe, conformed_header) -> pd.DataFrame:
    picked_dataframe = pd.DataFrame(columns=conformed_header)
    for column in conformed_header:
        picked_dataframe[column] = dataframe[column]
    return picked_dataframe

## Load the Data

In [286]:
files = pd.read_excel("CIT_Newly_added_Catalog_0521.xlsx")
print("Current Header:")
print(list(files.columns))
files.head()

Current Header:
['Plan Name', 'Date Added', 'Suggested By', 'Url', 'Plan Resolution', 'Planning Method', 'Land Conservation ', 'Unnamed: 7', 'Unnamed: 8', 'RESTORE GOALS', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']


Unnamed: 0,Plan Name,Date Added,Suggested By,Url,Plan Resolution,Planning Method,Land Conservation,Unnamed: 7,Unnamed: 8,RESTORE GOALS,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,NaT,,,,,Aquisition,Easement,Stewardship,Habitat,Water Quality,Resources/Species,Community Resilience,Gulf Economy,Code
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,


This pings which file links actually point to PDFs. Written by Ethan.

## Conform Incoming Column Names with Existing Column Names

### Find Current Labels

### Definitions:
Existing labels  
**incoming_header:** header in the incoming CSV  
**existing_header:** header in the existing db 

In [287]:
meshpoint = 6
excel_header_1 = list(files.columns)[0:meshpoint]
excel_header_2 = list(files.iloc[0,meshpoint:])
incoming_header = excel_header_1 + excel_header_2
print("Incoming Header: \n", incoming_header)

Incoming Header: 
 ['Plan Name', 'Date Added', 'Suggested By', 'Url', 'Plan Resolution', 'Planning Method', 'Aquisition', 'Easement ', 'Stewardship', 'Habitat', 'Water Quality ', 'Resources/Species', 'Community Resilience', 'Gulf Economy', 'Code']


In [288]:
existing_header = ['id', 'plan_name', 'plan_url', 
                   'plan_resolution', 'planning_method', 'aquisition', 
                   'easement', 'stewardship', 'plan_timeframe', 
                   'agency_lead', 'geo_extent', 'habit', 
                   'water_quality', 'resource_species', 'community_resilience', 
                   'ecosystem_resilience', 'gulf_economy', 'related_state',
                   'status', 'is_new', 'existing_planid', 'username']

print("Existing Header: \n", existing_header)

Existing Header: 
 ['id', 'plan_name', 'plan_url', 'plan_resolution', 'planning_method', 'aquisition', 'easement', 'stewardship', 'plan_timeframe', 'agency_lead', 'geo_extent', 'habit', 'water_quality', 'resource_species', 'community_resilience', 'ecosystem_resilience', 'gulf_economy', 'related_state', 'status', 'is_new', 'existing_planid', 'username']


## Strip Column Names

In [289]:
files.columns = incoming_header
files = files[1:]
files.head()

Unnamed: 0,Plan Name,Date Added,Suggested By,Url,Plan Resolution,Planning Method,Aquisition,Easement,Stewardship,Habitat,Water Quality,Resources/Species,Community Resilience,Gulf Economy,Code
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,
5,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,


### Relabel The Incoming Column Names

In [290]:
# Reduced given dataframe. 
# Columns ending with '_1' do not exist in the current database header.
incoming_header_remap = ['plan_name', 'date_added_1', 'suggested_by_1', 
                         'plan_url', 'plan_resolution', 'planning_method', 
                         'aquisition', 'easement', 'stewardship', 
                         'habitat', 'water_quality', 'resource_species', 
                         'community_resilience', 'gulf_economy', 'code_1']

rename_dictionary = {}
for i in range(len(incoming_header)):
    rename_dictionary[incoming_header_remap[i]] = incoming_header[i]

files.rename(columns=rename_dictionary).head()

Unnamed: 0,Plan Name,Date Added,Suggested By,Url,Plan Resolution,Planning Method,Aquisition,Easement,Stewardship,Habitat,Water Quality,Resources/Species,Community Resilience,Gulf Economy,Code
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,
5,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,


### Remap The Given DataFrame To Conform To Existing Columns

In [291]:
files_df_remapped = pd.DataFrame(columns=incoming_header_remap)
print(list(files_df_remapped.columns))

for column in incoming_header_remap:
    files_df_remapped[column] = files[rename_dictionary[column]]

['plan_name', 'date_added_1', 'suggested_by_1', 'plan_url', 'plan_resolution', 'planning_method', 'aquisition', 'easement', 'stewardship', 'habitat', 'water_quality', 'resource_species', 'community_resilience', 'gulf_economy', 'code_1']


### Check Process Outcome

There should be some data here, not just headers.

In [292]:
files_df_remapped.head(2)

Unnamed: 0,plan_name,date_added_1,suggested_by_1,plan_url,plan_resolution,planning_method,aquisition,easement,stewardship,habitat,water_quality,resource_species,community_resilience,gulf_economy,code_1
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,


### Check The Incoming Column Remapping Worked

In [293]:
print(files_df_remapped.columns, "\n")
print(files.columns, "\n")
column_number_difference = len(files_df_remapped.columns) - len(files.columns)
assert column_number_difference == 0
print("Column Number Difference:", column_number_difference)

Index(['plan_name', 'date_added_1', 'suggested_by_1', 'plan_url',
       'plan_resolution', 'planning_method', 'aquisition', 'easement',
       'stewardship', 'habitat', 'water_quality', 'resource_species',
       'community_resilience', 'gulf_economy', 'code_1'],
      dtype='object') 

Index(['Plan Name', 'Date Added', 'Suggested By', 'Url', 'Plan Resolution',
       'Planning Method', 'Aquisition', 'Easement ', 'Stewardship', 'Habitat',
       'Water Quality ', 'Resources/Species', 'Community Resilience',
       'Gulf Economy', 'Code'],
      dtype='object') 

Column Number Difference: 0


## Conform Incoming Labels With Existing Ones 

In [294]:
conformed_header = conform_headers(list(files_df_remapped.columns), existing_header)

In [295]:
files_df_reduced = cherry_pick_dataframe(files_df_remapped, conformed_header)

## Expand Columns For Incoming Dataframe With New Ones

In [296]:
# YAH

# source: https://towardsdatascience.com/4-methods-for-adding-columns-to-pandas-dataframes-dd0696863c16

files_df_expanded = files_df_reduced.copy()
existing_header_copy = existing_header

for i in range(len(existing_header_copy)):  
    current_column = existing_header[i] 
    if current_column not in  list(files_df_reduced.columns):
        files_df_expanded.insert(i, current_column, "null")
    

Unnamed: 0,plan_name,plan_url,plan_resolution,planning_method,aquisition,easement,stewardship,water_quality,resource_species,community_resilience,gulf_economy
1,Habitat Management Plan - Baldwin County Meado...,na,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,na,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,na,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes
5,Fishery Management Plan for Spanish Mackerel,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...
...,...,...,...,...,...,...,...,...,...,...,...
291,Green Links Regional CLIP Database,https://www.fws.gov/panamacity/resources/Green...,geopolitical,,,,yes,,"assist conservation, listed species, green inf...","assist conservation, listed species, green inf...",
292,Waterbird Conservation for the Americas: North...,https://www.fws.gov/migratorybirds/pdf/managem...,geopolitical,,yes,,yes,,"protect, restore, and manage populations",education and outreach,
293,West Florida Comprehensive Economic Developmen...,https://www.ecrc.org/document_center/Programs/...,geopolitical,,,,yes,,resource protection and sustainability as econ...,"make appealing to residents and visitors, prov...",economic development strategies
294,Comprehensive Economic Development Strategy fo...,http://www.ncfrpc.org/Publications/CEDS/Withla...,geopolitical,,yes,,yes,oncrease long-term sustainability of regional ...,"support, protect, and enhance the regions natu...","workforce to add value, high quality education...",economic development strategies


In [197]:
list(files_df_reduced_2.columns)
# insert_position_headername_what values...
# files_df_reduced_2.insert(0,"d", "null")
files_df_reduced_2

Unnamed: 0,d,plan_name,plan_url,plan_resolution,planning_method,aquisition,easement,stewardship,water_quality,resource_species,community_resilience,gulf_economy
1,,Habitat Management Plan - Baldwin County Meado...,na,,,,,,,,,
2,,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,na,,,,,,,,,
3,,Management Plan for the - Audubon Bird Sanctuary,na,,,,,,,,,
4,,Apalachee Region Comprehensive Economic Develo...,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes
5,,Fishery Management Plan for Spanish Mackerel,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...
...,...,...,...,...,...,...,...,...,...,...,...,...
291,,Green Links Regional CLIP Database,https://www.fws.gov/panamacity/resources/Green...,geopolitical,,,,yes,,"assist conservation, listed species, green inf...","assist conservation, listed species, green inf...",
292,,Waterbird Conservation for the Americas: North...,https://www.fws.gov/migratorybirds/pdf/managem...,geopolitical,,yes,,yes,,"protect, restore, and manage populations",education and outreach,
293,,West Florida Comprehensive Economic Developmen...,https://www.ecrc.org/document_center/Programs/...,geopolitical,,,,yes,,resource protection and sustainability as econ...,"make appealing to residents and visitors, prov...",economic development strategies
294,,Comprehensive Economic Development Strategy fo...,http://www.ncfrpc.org/Publications/CEDS/Withla...,geopolitical,,yes,,yes,oncrease long-term sustainability of regional ...,"support, protect, and enhance the regions natu...","workforce to add value, high quality education...",economic development strategies


In [None]:
## Add New Empty Columns
# files = files.reindex(columns=[*files.columns.tolist( ), *new_columns], fill_value="")

Check the outcome of the process.

In [None]:
# files.head()

Double check the columns are what you expect.

In [59]:
# files.columns

Verify things look correct.

### Fill Missing Rows 

Trouble importing empties. I believe these should be filled with NULL. In Python as a dataframe, this exists as NaN, but after exporting, there is no data filled in when this happens. Try filling in with option as adjusted below.

<img src="figures/bloq_importing_nofill_csv.png"
     alt="Markdown Monster icon"
     width = 600 
     style="float: left; margin-right: 10px;" />

Preview the csv conversion.

### Fill In Missing Rows (skip for now)

### Write The ID Column To Match Existing Plans 
### **Note may not need to do this now**

This should be automated. Ultimately, we need this to be more automated where it picks up the exactly column number from the existing plans OR this is taken care of by SQL.

In [None]:
#  len(files.index) 

In [None]:
# rows =  len(files.index) 
# values = list(range(344,344 + rows))

# # Insert ID column to the dataframe
# files.insert(0, "id", values)

In [36]:
files_df_reduced.head()

Unnamed: 0,plan_name,date_added_1,suggested_by_1,plan_url,plan_resolution,planning_method,aquisition,easement,stewardship,habitat,water_quality,resource_species,community_resilience,gulf_economy,code_1
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,
5,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,


## Write To A CSV

In [14]:
files_df_reduced.to_csv(r'CIT_Newly_added_Catalog_0521.csv', na_rep='NULL', index=False)

### Review 

In [15]:
files_df_reduced_as_csv = pd.read_csv("CIT_Newly_added_Catalog_0521.csv")

In [16]:
files_df_reduced_as_csv.head()

Unnamed: 0,plan_name,date_added_1,suggested_by_1,plan_url,plan_resolution,planning_method,aquisition,easement,stewardship,habitat,water_quality,resource_species,community_resilience,gulf_economy,code_1
0,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
1,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,
4,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,


# Trouble Shooting

Fix any issues and blockers faced here with some experimentation.

## FXD BLQ 0: 'is_new' column with invalid Boolean values

FIX: Set values in Boolean columns to exactly True or False. Nan does not count.

**NOTE:** `True` in Python may not register as `TRUE` or `t` in SQL. It may not be projecting properly. Try this fix first.

In [None]:
# files_ascsv = files_ascsv.assign(is_new="TRUE")

In [None]:
# files = files_ascsv

In [None]:
# files.to_csv(r'CIT_Newly_added_Catalog_0521.csv', index=False)

In [None]:
# files.head()

In [None]:
# files.to_csv(r'CIT_Newly_added_Catalog_0521.csv', na_rep='NULL', index=False)

## FXD BLQ 1: 
ERROR:  extra data after last expected column 
CONTEXT:  COPY plans, line 1: ",Unnamed: 0,Unnamed: 0.1,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aq..."

In [None]:
# files.head()

## FXD BLQ 2: 

ERROR:  invalid input syntax for type integer: "id"
CONTEXT:  COPY plans, line 1, column id: "id"

In [None]:
# files.head()

In [None]:
# files['is_new']

## FXD BLQ 3: 

**SOL** make sure you use HEADER yes option.

ERROR:  invalid input syntax for type integer: "id"
CONTEXT:  COPY plans, line 1, column id: "id"

In [None]:
# files.head()

## BLQ 4: Fixing Misaligned Columns From Data Process

It looks like some are just out of order.

In [10]:
# list(files.columns)

['Plan Name',
 'Date Added',
 'Suggested By',
 'Url',
 'Plan Resolution',
 'Planning Method',
 'Aquisition',
 'Easement ',
 'Stewardship',
 'Habitat',
 'Water Quality ',
 'Resources/Species',
 'Community Resilience',
 'Gulf Economy',
 'Code']

**NOTES** See below for plans SQL from PGAdmin

CREATE TABLE public.plans
(
    id integer NOT NULL DEFAULT nextval('plans_id_seq'::regclass),
    plan_name text COLLATE pg_catalog."default" NOT NULL,
    plan_url text COLLATE pg_catalog."default" NOT NULL,
    plan_resolution text COLLATE pg_catalog."default",
    planning_method text COLLATE pg_catalog."default",
    acquisition text COLLATE pg_catalog."default",
    easement text COLLATE pg_catalog."default",
    stewardship text COLLATE pg_catalog."default",
    plan_timeframe text COLLATE pg_catalog."default",
    agency_lead text COLLATE pg_catalog."default",
    geo_extent text COLLATE pg_catalog."default",
    habitat text COLLATE pg_catalog."default",
    water_quality text COLLATE pg_catalog."default",
    resources_species text COLLATE pg_catalog."default",
    community_resilience text COLLATE pg_catalog."default",
    ecosystem_resilience text COLLATE pg_catalog."default",
    gulf_economy text COLLATE pg_catalog."default",
    related_state text COLLATE pg_catalog."default",
    status text COLLATE pg_catalog."default",
    is_new boolean,
    existing_planid integer,
    username text COLLATE pg_catalog."default",
    CONSTRAINT plans_pkey PRIMARY KEY (id)