# CIT Data Pipeline: Formatting

In this notebook, we ingest the avaiable, pre-populated data and format it for proper SQL uploading. 

In [273]:
import pandas as pd
import numpy as np

import requests

from termcolor import colored

In [274]:
files = pd.read_excel("CIT_Newly_added_Catalog_0521.xlsx")
print("Current Header:")
print(list(files.columns))
files.head()

Current Header:
['Plan Name', 'Date Added', 'Suggested By', 'Url', 'Plan Resolution', 'Planning Method', 'Land Conservation ', 'Unnamed: 7', 'Unnamed: 8', 'RESTORE GOALS', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']


Unnamed: 0,Plan Name,Date Added,Suggested By,Url,Plan Resolution,Planning Method,Land Conservation,Unnamed: 7,Unnamed: 8,RESTORE GOALS,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,NaT,,,,,Aquisition,Easement,Stewardship,Habitat,Water Quality,Resources/Species,Community Resilience,Gulf Economy,Code
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,


This pings which file links actually point to PDFs. Written by Ethan.

### Mesh The Existing Column Names

In [275]:
meshpoint = 6
excel_header_1 = list(files.columns)[0:meshpoint]
excel_header_2 = list(files.iloc[0,meshpoint:])
incoming_header = excel_header_1 + excel_header_2
print("Incoming Header: \n", incoming_header)

Incoming Header: 
 ['Plan Name', 'Date Added', 'Suggested By', 'Url', 'Plan Resolution', 'Planning Method', 'Aquisition', 'Easement ', 'Stewardship', 'Habitat', 'Water Quality ', 'Resources/Species', 'Community Resilience', 'Gulf Economy', 'Code']


In [276]:
existing_header = ['id', 'plan_name', 'plan_url', 
                   'plan_resolution', 'planning_method', 'aquisition', 
                   'easement', 'stewardship', 'plan_timeframe', 
                   'agency_lead', 'geo_extent', 'habit', 
                   'water_quality', 'resource_species', 'community_resilience', 
                   'ecosystem_resilience', 'gulf_economy', 'related_state',
                   'status', 'is_new', 'existing_planid', 'username']
print("Existing Header: \n", existing_header)

Existing Header: 
 ['id', 'plan_name', 'plan_url', 'plan_resolution', 'planning_method', 'aquisition', 'easement', 'stewardship', 'plan_timeframe', 'agency_lead', 'geo_extent', 'habit', 'water_quality', 'resource_species', 'community_resilience', 'ecosystem_resilience', 'gulf_economy', 'related_state', 'status', 'is_new', 'existing_planid', 'username']


Relabel Incoming Header

In [None]:
# Map incoming header with existing header.
relabel_index = [0, 3, 4, 5, 6, 7, 8, 9]
remapped_header = ['plan_name', ]

In [277]:


# Note: the easiest thing is to perform a manual assignment because these names do NOT match. 
# The first real option is to expand the existing dataframe

## YAH
# # Determine the Mismatched Columns
# start_count = len(files.columns)
# final_count = len(incoming_header) 
# column_deficit = final_count - start_count

# # Replace the current header with new header name
# files.columns = incoming_header[:start_count]

# # Replace 
# new_columns = new_header[start_count:final_count]

# files

# # Add New Empty Columns
# files = files.reindex(columns=[*files.columns.tolist( ), *new_columns], fill_value="")


## Add first column


Unnamed: 0,Plan Name,Date Added,Suggested By,Url,Plan Resolution,Planning Method,Land Conservation,Unnamed: 7,Unnamed: 8,RESTORE GOALS,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,NaT,,,,,Aquisition,Easement,Stewardship,Habitat,Water Quality,Resources/Species,Community Resilience,Gulf Economy,Code
1,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
2,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
3,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,,,,,,
4,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,Green Links Regional CLIP Database,2018-02-27,FL Fish and Wildlife Conservation Commission,https://www.fws.gov/panamacity/resources/Green...,geopolitical,,,,yes,"assist conservation, listed species, green inf...",,"assist conservation, listed species, green inf...","assist conservation, listed species, green inf...",,REG
292,Waterbird Conservation for the Americas: North...,2018-02-27,FL Fish and Wildlife Conservation Commission,https://www.fws.gov/migratorybirds/pdf/managem...,geopolitical,,yes,,yes,"protect, restore, and manage habitat",,"protect, restore, and manage populations",education and outreach,,REG
293,West Florida Comprehensive Economic Developmen...,2018-02-27,FL Fish and Wildlife Conservation Commission,https://www.ecrc.org/document_center/Programs/...,geopolitical,,,,yes,,,resource protection and sustainability as econ...,"make appealing to residents and visitors, prov...",economic development strategies,REG
294,Comprehensive Economic Development Strategy fo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.ncfrpc.org/Publications/CEDS/Withla...,geopolitical,,yes,,yes,,oncrease long-term sustainability of regional ...,"support, protect, and enhance the regions natu...","workforce to add value, high quality education...",economic development strategies,REG


## Reassign Header

In [269]:
# Strip the header row.
files = files.iloc[1:, :]

### Add Empty Columns Where No Data Exists

In [137]:
# Determine the Mismatched Columns
start_count = len(files.columns)
final_count = len(incoming_header) 
column_deficit = final_count - start_count

# Replace the current header with new header name
files.columns = incoming_header[:start_count]

# Replace 
new_columns = new_header[start_count:final_count]

In [138]:
# Add New Empty Columns
files = files.reindex(columns=[*files.columns.tolist( ), *new_columns], fill_value="")

Check the outcome of the process.

In [139]:
files.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
0,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
1,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
2,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
3,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,True,,
4,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,True,,


Double check the columns are what you expect.

In [140]:
files.columns

Index(['plan_name', 'date_added', 'suggested_by', 'url', 'plan_resolution',
       'planning_method', 'aquisition', 'easement', 'stewardship', 'habit',
       'water_quality', 'resource_species', 'community_resilience',
       'gulf_economy', 'code', 'related_state', 'status', 'is_new',
       'existing_planid', 'username'],
      dtype='object')

Verify things look correct.

### Fill Missing Rows 

Trouble importing empties. I believe these should be filled with NULL. In Python as a dataframe, this exists as NaN, but after exporting, there is no data filled in when this happens. Try filling in with option as adjusted below.

<img src="figures/bloq_importing_nofill_csv.png"
     alt="Markdown Monster icon"
     width = 600 
     style="float: left; margin-right: 10px;" />

Preview the csv conversion.

### Fill In Missing Rows (skip for now)

### Write The ID Column To Match Existing Plans

This should be automated. Ultimately, we need this to be more automated where it picks up the exactly column number from the existing plans OR this is taken care of by SQL.

In [141]:
 len(files.index) 

295

In [142]:
rows =  len(files.index) 
values = list(range(344,344 + rows))

# Insert ID column to the dataframe
files.insert(0, "id", values)

In [143]:
files.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
1,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,,,
2,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,,,
3,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,,,
4,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,,,
5,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,,,


In [144]:
files.to_csv(r'CIT_Newly_added_Catalog_0521.csv', na_rep='NULL', index=False)

### Review 

In [145]:
files_ascsv = pd.read_csv("CIT_Newly_added_Catalog_0521.csv")

In [146]:
files_ascsv.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
0,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,,,
1,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,,,
2,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,,,
3,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,,,
4,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,,,


# Trouble Shooting

Fix any issues and blockers faced here with some experimentation.

## BLQ 0: 'is_new' column with invalid Boolean values

FIX: Set values in Boolean columns to exactly True or False. Nan does not count.

**NOTE:** `True` in Python may not register as `TRUE` or `t` in SQL. It may not be projecting properly. Try this fix first.

In [147]:
files_ascsv = files_ascsv.assign(is_new="TRUE")

In [148]:
files = files_ascsv

In [149]:
files.to_csv(r'CIT_Newly_added_Catalog_0521.csv', index=False)

In [150]:
files.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
0,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
1,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
2,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
3,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,True,,
4,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,True,,


In [151]:
files.to_csv(r'CIT_Newly_added_Catalog_0521.csv', na_rep='NULL', index=False)

## FXD BLQ 1: 
ERROR:  extra data after last expected column 
CONTEXT:  COPY plans, line 1: ",Unnamed: 0,Unnamed: 0.1,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aq..."

In [152]:
files.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
0,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
1,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
2,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
3,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,True,,
4,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,True,,


## FXD BLQ 2: 

ERROR:  invalid input syntax for type integer: "id"
CONTEXT:  COPY plans, line 1, column id: "id"

In [153]:
files.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
0,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
1,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
2,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
3,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,True,,
4,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,True,,


In [154]:
files['is_new']

0      TRUE
1      TRUE
2      TRUE
3      TRUE
4      TRUE
       ... 
290    TRUE
291    TRUE
292    TRUE
293    TRUE
294    TRUE
Name: is_new, Length: 295, dtype: object

## FXD BLQ 3: 

**SOL** make sure you use HEADER yes option.

ERROR:  invalid input syntax for type integer: "id"
CONTEXT:  COPY plans, line 1, column id: "id"

In [155]:
files.head()

Unnamed: 0,id,plan_name,date_added,suggested_by,url,plan_resolution,planning_method,aquisition,easement,stewardship,...,water_quality,resource_species,community_resilience,gulf_economy,code,related_state,status,is_new,existing_planid,username
0,344,Habitat Management Plan - Baldwin County Meado...,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
1,345,THE MOBILE PENINSULA - CORRIDOR MASTER PLAN,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
2,346,Management Plan for the - Audubon Bird Sanctuary,2017-12-11,Jeniffer Roberts,na,,,,,,...,,,,,,,,True,,
3,347,Apalachee Region Comprehensive Economic Develo...,2018-02-27,FL Fish and Wildlife Conservation Commission,http://www.nado.org/wp-content/uploads/2014/08...,Regional,,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,,,True,,
4,348,Fishery Management Plan for Spanish Mackerel,2018-02-27,FL Fish and Wildlife Conservation Commission,http://sedarweb.org/docs/wsupp/S17RD03%20ASMFC...,GCR,,,,,...,,Manage Spanish mackerel resourse,,Minimize disruptions of markets for Spanish ma...,,,,True,,


## BLQ 4: Fixing Misaligned Columns From Data Process

It looks like some are just out of order.

In [156]:
list(files.columns)

['id',
 'plan_name',
 'date_added',
 'suggested_by',
 'url',
 'plan_resolution',
 'planning_method',
 'aquisition',
 'easement',
 'stewardship',
 'habit',
 'water_quality',
 'resource_species',
 'community_resilience',
 'gulf_economy',
 'code',
 'related_state',
 'status',
 'is_new',
 'existing_planid',
 'username']

**NOTES** See below for plans SQL from PGAdmin

CREATE TABLE public.plans
(
    id integer NOT NULL DEFAULT nextval('plans_id_seq'::regclass),
    plan_name text COLLATE pg_catalog."default" NOT NULL,
    plan_url text COLLATE pg_catalog."default" NOT NULL,
    plan_resolution text COLLATE pg_catalog."default",
    planning_method text COLLATE pg_catalog."default",
    acquisition text COLLATE pg_catalog."default",
    easement text COLLATE pg_catalog."default",
    stewardship text COLLATE pg_catalog."default",
    plan_timeframe text COLLATE pg_catalog."default",
    agency_lead text COLLATE pg_catalog."default",
    geo_extent text COLLATE pg_catalog."default",
    habitat text COLLATE pg_catalog."default",
    water_quality text COLLATE pg_catalog."default",
    resources_species text COLLATE pg_catalog."default",
    community_resilience text COLLATE pg_catalog."default",
    ecosystem_resilience text COLLATE pg_catalog."default",
    gulf_economy text COLLATE pg_catalog."default",
    related_state text COLLATE pg_catalog."default",
    status text COLLATE pg_catalog."default",
    is_new boolean,
    existing_planid integer,
    username text COLLATE pg_catalog."default",
    CONSTRAINT plans_pkey PRIMARY KEY (id)