# Compile and Clean PHMSA Data

## Introduction

The plan is to download and clean the data available on the PHMSA website.

## Data Sources
* https://www.phmsa.dot.gov/data-and-statistics/phmsa-data-and-statistics
* https://www.phmsa.dot.gov/data-and-statistics/pipeline/source-data

## Imports and Notebook Setup

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg

from glob import glob
from zipfile import ZipFile
from urllib.request import urlopen
import os

from IPython.display import IFrame
from IPython.display import FileLink

# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# Plot styling. These calls run in order, so where their settings overlap
# the later call takes precedence.
sns.set()
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')

## Get Raw Data

In [7]:
# Scratch directory for downloaded archives and the files extracted from them.
# exist_ok=True keeps the cell safe to re-run without a separate existence check.
os.makedirs('../temp_files/', exist_ok=True)

### Annual Report Data

In [40]:
# Download the 2010-present hazardous liquid annual report archive into the
# scratch directory.
url_annual_reports_2010_present = 'https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_hazardous_liquid_2010_present.zip'
# The context managers close both the HTTP response and the output file,
# even when the download or the write fails part-way through.
with urlopen(url_annual_reports_2010_present) as url_annual_reports, \
        open('../temp_files/annual_report_zipFile.zip', 'wb') as output:
    output.write(url_annual_reports.read())

In [41]:
# Open the downloaded annual report archive for reading; the next cell extracts it.
zip_obj = ZipFile('../temp_files/annual_report_zipFile.zip', 'r')

In [52]:
# Unpack every member of the annual report archive into the scratch directory.
zip_obj.extractall(path='../temp_files/')
# The data-field key is a PDF; keep the first PDF found in the scratch directory.
pdf_paths = glob('../temp_files/*.pdf')
data_key_annual = pdf_paths[0]
data_key_annual

'../temp_files\\Hazardous Liquid Annual Form - PHMSA F 7000-1.1 (rev 2020) - Data Fields.pdf'

In [48]:
# Load the 2021 annual report workbook extracted from the archive above
# and display its (rows, columns) shape.
annual_2021 = pd.read_excel('../temp_files/annual_hazardous_liquid_2021.xlsx')
annual_2021.shape

(717, 88)

In [54]:
# Only ../temp_files/ is created earlier in the notebook, so make sure the
# raw-data directory exists before writing into it.
os.makedirs('../data/raw_data/', exist_ok=True)
annual_2021.to_csv('../data/raw_data/annual_2021.csv', index=False)

### Hazardous Liquid Incident Data

In [6]:
# Open the download of the 2010-present hazardous liquid accident archive.
# `url` holds the open HTTP response; its contents are read and written to
# disk in the next cell.
url_2010_present = 'https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/accident_hazardous_liquid_jan2010_present.zip'
url = urlopen(url_2010_present)

In [8]:
# Save the downloaded incident archive to the scratch directory.
# The context managers close both the HTTP response opened in the previous
# cell and the output file, even when the write fails.
with url, open('../temp_files/zipFile.zip', 'wb') as output:
    output.write(url.read())

In [9]:
# Open the downloaded incident archive for reading (rebinds zip_obj, which
# previously referred to the annual report archive).
zip_obj = ZipFile('../temp_files/zipFile.zip', 'r')

# Reference for the extract() call used in the next cell:
# https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.extract

In [23]:
# Extract the first archive member into the scratch directory and the second
# into the data-key reference directory; extract() returns the extracted path.
# NOTE(review): this relies on the member order inside the archive (data file
# first, PDF data key second) - confirm it still holds when PHMSA refreshes the zip.
data_file = zip_obj.extract(zip_obj.filelist[0],path="../temp_files")
data_key = zip_obj.extract(zip_obj.filelist[1],path="../reference/data_keys")
data_key

'..\\reference\\data_keys\\Hazardous Liquid Accident PHMSA F7000 1 Rev 3-2021 Data fields.pdf'

In [31]:
# Parse the extracted incident data file, decoding it as windows-1252,
# and display its (rows, columns) shape.
incid = pd.read_table(data_file, encoding='windows-1252', low_memory=False)
incid.shape

(4919, 654)

In [32]:
# Only ../temp_files/ is created earlier in the notebook, so make sure the
# raw-data directory exists before writing into it.
os.makedirs('../data/raw_data/', exist_ok=True)
incid.to_parquet('../data/raw_data/incidents_2010_present.parquet', index=False)

## Clean Data

### Hazardous Liquid Incident Data

In [33]:
# Reload the raw incident data from the parquet file written above and
# preview the first rows.
hlidf = pd.read_parquet('../data/raw_data/incidents_2010_present.parquet')
hlidf.head()

Unnamed: 0,REPORT_RECEIVED_DATE,IYEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,REPORT_TYPE,OPERATOR_ID,NAME,OPERATOR_STREET_ADDRESS,OPERATOR_CITY_NAME,OPERATOR_STATE_ABBREVIATION,OPERATOR_POSTAL_CODE,LOCAL_DATETIME,TIME_ZONE,DAYLIGHT_SAVINGS_IND,LOCATION_LATITUDE,LOCATION_LONGITUDE,COMMODITY_RELEASED_TYPE,COMMODITY_SUBTYPE,COMMODITY_DETAILS,BLEND_DETAILS,BIO_DIESEL_DETAILS,UNINTENTIONAL_RELEASE_BBLS,INTENTIONAL_RELEASE_BBLS,RECOVERED_BBLS,FATALITY_IND,NUM_EMP_FATALITIES,NUM_CONTR_FATALITIES,NUM_ER_FATALITIES,NUM_WORKER_FATALITIES,NUM_GP_FATALITIES,FATAL,INJURY_IND,NUM_EMP_INJURIES,NUM_CONTR_INJURIES,NUM_ER_INJURIES,NUM_WORKER_INJURIES,NUM_GP_INJURIES,INJURE,ACCIDENT_IDENTIFIER,ACCIDENT_DETAILS,OPERATOR_TYPE,INCIDENT_IDENTIFIED_DATETIME,SYSTEM_PART_INVOLVED,ON_OFF_SHORE,STATUS_WHEN_IDENTIFIED,SHUTDOWN_DUE_ACCIDENT_IND,SHUTDOWN_EXPLAIN,SHUTDOWN_DATETIME,RESTART_DATETIME,STILL_SHUTDOWN_IND,COMMUNICATION_STATE_FED_IND,PARTY_INITIATED_COMMUNICATION,INITIAL_RESPONDER_COM_DATETIME,ON_SITE_DATETIME,CONFIRMED_DISCOVERY_DATETIME,NRC_RPT_DATETIME,NRC_RPT_NUM,ADDITIONAL_NRC_REPORT_NUMBERS,IGNITE_IND,IGNITE_DATETIME,HOW_EXTINGUISHED,HOW_EXTINGUISHED_OTHER_DETAIL,CONSUMED_BY_FIRE_IN_BARRELS,EXPLODE_IND,UPSTREAM_ACTION_TAKEN,UPSTREAM_OPRTNL_CNTRL_DETAIL,UPSTREAM_VALVE_CLOSE_DATETIME,UPSTREAM_VALVE_TYPE_IND,DOWNSTREAM_ACTION_TAKEN,DOWNSTREAM_OPRTNL_CNTRL_DETAIL,DOWNSTREAM_VLV_CLOSE_DATETIME,DOWNSTREAM_VALVE_TYPE_IND,NOTIFY_QUALIFIED_INDIV_IND,QUALIFIED_INDIV_NOTIF_DATETIME,OIL_SPILL_REMOVAL_ORG_IND,OSRO_ACTIVATED_DATETIME,OSRO_ARRIVED_ON_SITE_DT,NUM_PUB_EVACUATED,PIPE_FAC_NAME,SEGMENT_NAME,ONSHORE_STATE_ABBREVIATION,ONSHORE_POSTAL_CODE,ONSHORE_CITY_NAME,ONSHORE_COUNTY_NAME,DESIGNATED_LOCATION,DESIGNATED_NAME,FEDERAL,LOCATION_TYPE,INCIDENT_AREA_TYPE,INCIDENT_AREA_SUBTYPE,INCIDENT_AREA_DETAILS,DEPTH_OF_COVER,CROSSING,BRIDGE_CROSSING_IND,BRIDGE_TYPE,RAILROAD_CROSSING_IND,RAILROAD_TYPE,ROAD_CROSSING_IND,ROAD_TYPE,WATER_CROSSING_IND,...,EMAT_PREV_PROPUL_METHOD,CPCM_RECENT_YEAR,CPCM_RCNT_PROPUL
_METHOD,CPCM_PREVIOUS_YEAR,CPCM_PREV_PROPUL_METHOD,OTHER_TOOL,OTHER_RECENT_YEAR,OTHER_RCNT_PROPUL_METHOD,OTHER_PREVIOUS_YEAR,OTHER_PREV_PROPUL_METHOD,INSP_COMPL_BEFORE_DAMAGE_IND,HAS_HYDRTST_CONDUC_BEFORE_IND,HYDRTST_MOST_RCNT_YEAR,HYDRTST_MOST_RCNT_PRESSURE,DIRECT_ASMNT_CONDUCTED,DIRECT_ASMNT_AT_PNT_ACCDNT_YR,DIRECT_ASMNT_PNT_NOT_IDNTF_YR,ASMNT_ECDA_RCNT_IND,ASMNT_ECDA_RCNT_YEAR,ASMNT_OTHER_RCNT_IND,ASMNT_OTHER_TYPE,ASMNT_OTHER_RCNT_YEAR,NON_DESTRUCTIVE_EXAM_IND,EXM_RADIOGRAPHY_RCNT_YEAR,EXM_WAVE_ULTRASONIC_RCNT_YEAR,EXM_HANDL_ULTRASONIC_RCNT_YEAR,EXM_WET_MGNT_PARTCL_RCNT_YEAR,EXM_DRY_MGNT_PARTCL_RCNT_YEAR,EXM_OTHER_TYPE,EXM_OTHER_RCNT_YEAR,EXM_RADIOGRAPHY_RCNT_IND,EXM_WAVE_ULTRASONIC_RCNT_IND,EXM_HANDL_ULTRASONIC_RCNT_IND,EXM_WET_MGNT_PARTCL_RCNT_IND,EXM_DRY_MGNT_PARTCL_RCNT_IND,EXM_OTHER_RCNT_IND,EXTRNL_COR_GALVANIC_IND,EXTRNL_COR_ATMOSPHERIC_IND,EXTRNL_COR_STRAY_CURRENT_IND,EXTRNL_COR_MICROBIOLOGIC_IND,EXTRNL_COR_SELECTIVE_SEAM_IND,INTRNL_COR_CORROSIVE_CMDTY_IND,INTRNL_COR_WTR_DRPOUT_ACID_IND,INTRNL_COR_MICROBIOLOGIC_IND,INTRNL_COR_EROSION_IND,NF_EARTH_MOVEMENT_IND,NF_HEAVY_RAINS_IND,NF_LIGHTNING_IND,NF_TEMPERATURE_IND,NF_HIGH_WINDS_IND,NF_VEGITATION_ROOT_IND,EXCVTN_DMG_OPERATOR_IND,EXCVTN_DMG_OP_CONTRACTOR_IND,EXCVTN_DMG_THIRD_PARTY_IND,EXCVTN_DMG_PREVIOUS_DAMAGE_IND,OSF_NEARBY_INDUSTRIAL_IND,OSF_VEHICLE_IND,OSF_BOAT_IND,OSF_OTHER_MARITIME_IND,OSF_ELECTRICAL_ARCING_IND,OSF_PREVIOUS_MECHANICAL_IND,OSF_INTENTIONAL_IND,PWF_DESIGN_IND,PWF_CONSTRUCTION_IND,PWF_INSTALLATION_IND,PWF_FABRICATION_IND,PWF_MANUFACTURING_IND,PWF_ENV_STRESS_CORROSION_IND,PWF_ENV_SULFIDE_STRESS_IND,PWF_ENV_HYDROGEN_STRESS_IND,PWF_ENV_HARD_SPOT_IND,EQF_CONTROL_RELEAF_IND,EQF_PUMP_EQUIPMENT_IND,EQF_THREADED_COUPLING_IND,EQF_NON_THREADED_IND,EQF_DEFECTIVE_FITTING_IND,EQF_EQUIPMENT_BODY_IND,IO_DAMAGE_BY_OPERATOR_IND,IO_TANK_VESSEL_IND,IO_VALVE_POSITION_IND,IO_EQUIPMENT_OVERPRESSURE_IND,IO_NOT_INSTALLED_PROPERLY_IND,IO_WRONG_EQUIPMENT_IND,IO_INADEQUATE_PROCEDURE_IND,IO_NO_PROCEDURE_IND,IO_FOLLOW_
PROCEDURE_IND,PREPARER_NAME,PREPARER_TITLE,PREPARER_EMAIL,PREPARER_TELEPHONE,PREPARER_FAX,PREPARED_DATE,LOCAL_CONTACT_NAME,LOCAL_CONTACT_EMAIL,LOCAL_CONTACT_TELEPHONE,AUTHORIZER_NAME,AUTHORIZER_TELEPHONE,AUTHORIZER_TITLE,AUTHORIZER_EMAIL,NARRATIVE
0,11/29/2022,2022,20220264,37392,ORIGINAL FINAL,40295,"FUNDARE RESOURCES OPERATING COMPANY, LLC",5251 DTC PKWY STE #950,GREENWOOD VILLAGE,CO,80111,11/14/2022 10:00,MOUNTAIN,YES,40.78112,-103.9448,CRUDE OIL,,,,,28.19,,25.5,NO,,,,,,0,NO,,,,,,0,"LOCAL OPERATING PERSONNEL, INCLUDING CONTRACTORS",,CONTRACTOR WORKING FOR THE OPERATOR,11/14/2022 12:38,ONSHORE PUMP/METER STATION EQUIPMENT AND PIPING,ONSHORE,"NORMAL OPERATION, INCLUDES PAUSES BETWEEN BATC...",YES,,11/14/2022 12:42,11/14/2022 16:15,,YES,OPERATOR,11/15/2022 8:00,11/14/2022 12:55,11/14/2022 12:38,11/21/2022 2:30,1353138,,NO,,,,,,,,,,,,,,YES,11/14/2022 6:33,NO,,,0.0,REDTAIL GAS GATHERING SYSTEM,CR 110 EAST,CO,80742,NEW RAYMER,WELD,MILEPOST,10.5,NO,TOTALLY CONTAINED ON OPERATOR-CONTROLLED PROPERTY,ABOVEGROUND,TYPICAL ABOVEGROUND FACILITY PIPING OR APPURTE...,,,NO,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SYDNEY SMITH,DIRECTOR EHSR,ssmith@fundareresources.com,3039104511,,11/28/2022,SYDNEY SMITH,ssmith@fundareresources.com,3039104511,SYDNEY SMITH,3039104511,DIRECTOR EHSR,ssmith@fundareresources.com,"ON NOVEMBER 14, 2022 A THREE-INCH VALVE FROZE ..."
1,11/29/2022,2022,20220263,37390,ORIGINAL FINAL,22855,"FLINT HILLS RESOURCES, LC",4111 EAST 37TH STREET NORTH,WICHITA,KS,67220,11/3/2022 15:05,CENTRAL,YES,44.881773,-92.956596,CRUDE OIL,,,,,0.5,,0.5,NO,,,,,,0,NO,,,,,,0,"LOCAL OPERATING PERSONNEL, INCLUDING CONTRACTORS",,OPERATOR EMPLOYEE,11/3/2022 15:05,"ONSHORE PIPELINE, INCLUDING VALVE SITES",ONSHORE,"NORMAL OPERATION, INCLUDES PAUSES BETWEEN BATC...",YES,,11/3/2022 15:39,11/3/2022 21:17,,NO,,,11/3/2022 15:05,11/3/2022 15:05,,NRC NOTIFICATION NOT REQUIRED,,NO,,,,,,VALVE CLOSURE,,11/3/2022 15:17,REMOTELY CONTROLLED,VALVE CLOSURE,,11/3/2022 15:39,REMOTELY CONTROLLED,NO,,NO,,,0.0,MINNESOTA PIPELINE LINE / LINE 1,HUGO TO COTTAGE GROVE / 5608,MN,55129,WOODBURY,WASHINGTON,MILEPOST,245,NO,PIPELINE RIGHT-OF-WAY,UNDERGROUND,EXPOSED DUE TO EXCAVATION,,36.0,NO,,,,,,,,...,,,,,,HELICAL MFL,2021.0,FREE SWIMMING,2016.0,FREE SWIMMING,,YES,2014.0,1519.0,NO,,,,,,,,NO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,MELINA PEREZ,COMPLIANCE MANAGMENT SYSTEMS SPECIALIST,melina.perez@fhr.com,361-242-8781,,11/29/2022,KASON LAUBER,kason.lauber@fhr.com,3167080732,KASON LAUBER,3167080732,COMPLIANCE SPECIALIST,kason.lauber@fhr.com,ON 11/03/2022 FLINT HILLS RESOURCES WAS ON SIT...
2,11/15/2022,2022,20220253,37358,ORIGINAL FINAL,31174,"SHELL PIPELINE CO., L.P.",150 NORTH DAIRY ASHFORD ROAD WCK BLDG. A,HOUSTON,TX,77079,11/2/2022 17:45,CENTRAL,YES,29.977491,-93.936475,CRUDE OIL,,,,,3.75,,3.75,NO,,,,,,0,NO,,,,,,0,"LOCAL OPERATING PERSONNEL, INCLUDING CONTRACTORS",,OPERATOR EMPLOYEE,11/2/2022 17:45,ONSHORE TERMINAL/TANK FARM EQUIPMENT AND PIPING,ONSHORE,"NORMAL OPERATION, INCLUDES PAUSES BETWEEN BATC...",YES,,11/2/2022 18:00,11/6/2022 20:08,,YES,OPERATOR,11/2/2022 18:24,11/2/2022 18:24,11/2/2022 17:45,11/2/2022 19:24,1351585,1351751.0,NO,,,,,,,,,,,,,,YES,11/2/2022 17:45,YES,11/2/2022 18:30,11/2/2022 18:50,0.0,"20"" ZYDECO PIPELINE",PORT NECHES TANK FARM,TX,77651,PORT NECHES,JEFFERSON COUNTY,MILEPOST,PN TANK FARM,NO,TOTALLY CONTAINED ON OPERATOR-CONTROLLED PROPERTY,UNDERGROUND,UNDER SOIL,,66.0,NO,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JEREME HAVARD,OPERATIONS SUPPORT COORDINATOR,jereme.havard@shell.com,4095402781,,11/15/2022,JEREME HAVARD,jereme.havard@shell.com,4095402781,DEBORAH PRICE,9364990580,INTEGRITY AND REGULATORY SERVICES MANAGER,deborah.price@shell.com,"AROUND 17:45 ON NOVEMBER 2, 2022, THE STATION ..."
3,11/29/2022,2022,20220265,37394,ORIGINAL FINAL,22855,"FLINT HILLS RESOURCES, LC",4111 EAST 37TH STREET NORTH,WICHITA,KS,67220,10/30/2022 18:30,CENTRAL,NO,27.911192,-97.411412,CRUDE OIL,,,,,0.19,,0.19,NO,,,,,,0,NO,,,,,,0,"SCADA-BASED INFORMATION (SUCH AS ALARM(S), ALE...",,,10/30/2022 17:44,ONSHORE PUMP/METER STATION EQUIPMENT AND PIPING,ONSHORE,"NORMAL OPERATION, INCLUDES PAUSES BETWEEN BATC...",YES,,10/30/2022 17:44,10/31/2022 17:44,,YES,OPERATOR,10/30/2022 19:45,10/30/2022 18:30,10/30/2022 18:30,,NRC NOTIFICATION NOT REQUIRED,,NO,,,,,,,,,,,,,,NO,,NO,,,0.0,MIDWAY PUMP STATION,"60510010 MIDWAY TO EAST WHITE POINT 10""",TX,78390,Not Within a Municipality,SAN PATRICIO,SURVEY STATION NO.,520 + 90,NO,TOTALLY CONTAINED ON OPERATOR-CONTROLLED PROPERTY,ABOVEGROUND,TYPICAL ABOVEGROUND FACILITY PIPING OR APPURTE...,,,NO,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,YES,,,,,,,,,,,,,,MERLE TEETER,SR COMPLIANCE SPECIALIST,merle.teeter@fhr.com,409-273-9482,,11/29/2022,MERLE TEETER,merle.teeter@fhr.com,409-273-9482,MERLE TEETER,409-273-9482,SR COMPLIANCE SPECIALIST,merle.teeter@fhr.com,"ON OCTOBER 30, 2022 AT APPROXIMATELY 17:44, A ..."
4,11/28/2022,2022,20220261,37386,ORIGINAL FINAL,22610,"MAGELLAN PIPELINE COMPANY, LP","ONE WILLIAMS CENTER, MD OTC-9 P.O. BOX 22186, ...",TULSA,OK,74172,10/28/2022 11:10,CENTRAL,YES,45.103927,-95.082402,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,GASOLINE (NON-ETHANOL),,,,0.24,,0.12,NO,,,,,,0,NO,,,,,,0,"LOCAL OPERATING PERSONNEL, INCLUDING CONTRACTORS",,OPERATOR EMPLOYEE,10/28/2022 11:10,ONSHORE PUMP/METER STATION EQUIPMENT AND PIPING,ONSHORE,"NORMAL OPERATION, INCLUDES PAUSES BETWEEN BATC...",YES,,10/28/2022 11:10,10/28/2022 12:30,,YES,OPERATOR,10/28/2022 12:06,10/28/2022 11:10,10/28/2022 11:10,,NRC NOTIFICATION NOT REQUIRED,,NO,,,,,,,,,,,,,,YES,10/28/2022 11:10,NO,,,0.0,WILLMAR,780,MN,56201,WILLMAR,KANDIYOHI,MILEPOST,0,NO,TOTALLY CONTAINED ON OPERATOR-CONTROLLED PROPERTY,ABOVEGROUND,TYPICAL ABOVEGROUND FACILITY PIPING OR APPURTE...,,,NO,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,YES,CAROLINE RATLIFF,STRATEGIC DATA ENGINEER,caroline.ratliff@magellanlp.com,9185747065,,10/28/2022,,,,GRACE RIESS,9185747165,SUPV. INTEGRITY MGMT AND REGULATORY COMPLIANCE,grace.riess@magellanlp.com,"AT APPROXIMATELY 10:30AM, MAGELLAN OPERATIONS ..."


In [34]:
hlidf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4919 entries, 0 to 4918
Columns: 654 entries, REPORT_RECEIVED_DATE to NARRATIVE
dtypes: float64(158), int64(7), object(489)
memory usage: 24.5+ MB


In [None]:
# Columns kept for the release-volume analysis: when/where/who, cause,
# commodity, released and recovered volumes, and the free-text narrative.
incident_vol_columns = [
    'LOCAL_DATETIME',
    'NAME',
    'ONSHORE_CITY_NAME',
    'ONSHORE_STATE_ABBREVIATION',
    'REPORT_TYPE',
    'ON_OFF_SHORE',
    'CAUSE',
    'CAUSE_DETAILS',
    'STRESS_SUBTYPE',
    'STRESS_DETAILS',
    'SYSTEM_PART_INVOLVED',
    'ITEM_INVOLVED',
    'LOCATION_LATITUDE',
    'LOCATION_LONGITUDE',
    'COMMODITY_RELEASED_TYPE',
    'UNINTENTIONAL_RELEASE_BBLS',
    'RECOVERED_BBLS',
    'NARRATIVE',
]
incident_vol = incid[incident_vol_columns]
incident_vol.head(3)

In [None]:
incident_vol['NAME'].nunique()

In [None]:
incident_vol['NAME'].value_counts()

In [None]:
# Filter out HVL flammable commodities. The trailing space in 'HVL ' keeps
# "(NON-HVL)" commodity types from matching.
# regex=False makes this a literal substring test; na=False treats a missing
# commodity type as "no match" so the mask stays boolean and `~` cannot fail on NaN.
mask = ~incident_vol['COMMODITY_RELEASED_TYPE'].str.contains('HVL ', regex=False, na=False)

In [None]:
incid_vol_liquid_and_co2 = incident_vol[mask]

In [None]:
# Drop CO2 releases so only liquid commodities remain.
# regex=False makes this a literal substring test; na=False treats a missing
# commodity type as "no match" so the mask stays boolean and `~` cannot fail on NaN.
mask2 = ~incid_vol_liquid_and_co2['COMMODITY_RELEASED_TYPE'].str.contains('CO2', regex=False, na=False)
incid_vol_liquid = incid_vol_liquid_and_co2[mask2]
#incid_vol_liquid['COMMODITY_RELEASED_TYPE']

In [None]:
incid_vol_liquid.info()

In [None]:
incid_vol_liquid['NAME'].nunique()

In [None]:
incid_vol_liquid['NAME'].value_counts()

In [None]:
incid_vol_liquid['UNINTENTIONAL_RELEASE_BBLS'].sum()

In [None]:
incid_vol_liquid['RECOVERED_BBLS'].sum()

In [None]:
# Directory prefix for the figures saved by this and the following plotting
# cells. It was referenced by every savefig call but never defined in the
# notebook, which raised NameError.
# NOTE(review): '../figures/' is an assumption modelled on the sibling ../data
# and ../reference directories - confirm the intended location.
fig_prefix = '../figures/'
os.makedirs(fig_prefix, exist_ok=True)

# Released vs. recovered volume for every liquid-commodity incident.
plt.scatter(incid_vol_liquid['UNINTENTIONAL_RELEASE_BBLS'], incid_vol_liquid['RECOVERED_BBLS'])
plt.xlabel('volume released (bbls)')
plt.ylabel('volume recovered (bbls)')
plt.title('PHMSA Reported Pipeline Incidents 2010 - Present\n(Liquid Commodity)')
plt.savefig(fig_prefix + "liquid-released-v-recovered.png", dpi=350)

In [None]:
# look at outliers > 20000 bbl release
mask3 = incid_vol_liquid['UNINTENTIONAL_RELEASE_BBLS'] > 20000

In [None]:
large_releases = incid_vol_liquid[mask3]

In [None]:
# large_releases is a filtered slice of incid_vol_liquid, so rebind to the
# re-indexed copy instead of mutating the slice in place. The old index labels
# are kept in the new 'index' column, exactly as with inplace=True.
large_releases = large_releases.reset_index()

In [None]:
large_releases

In [None]:
large_releases.info()

## Export large releases to geojson file

In [None]:
import json
import geojson

In [None]:
##### SOMETHING NOT WORKING RIGHT

In [None]:
def data2geojson(df, path='map.geojson'):
    """Write the point locations in *df* to a GeoJSON MultiPoint file.

    Args:
        df: DataFrame with numeric ``LOCATION_LONGITUDE`` and
            ``LOCATION_LATITUDE`` columns, one row per point.
        path: Output file path. Defaults to ``'map.geojson'``, the file the
            original implementation always wrote to.

    Rows missing either coordinate are skipped, because NaN is not valid JSON.
    """
    # GeoJSON positions are ordered (longitude, latitude).
    coords = df[['LOCATION_LONGITUDE', 'LOCATION_LATITUDE']].dropna()
    # tolist() yields plain Python floats in [lon, lat] pairs, which the json
    # module can serialise directly.
    points = coords.astype(float).to_numpy().tolist()
    # A MultiPoint is a plain JSON object, so the standard-library json module
    # (imported earlier in the notebook) is enough to write it.
    with open(path, 'w') as fp:
        json.dump({'type': 'MultiPoint', 'coordinates': points}, fp, sort_keys=True)

In [None]:
large_releases[['LOCATION_LONGITUDE', 'LOCATION_LATITUDE']]

In [None]:
# Write two hard-coded points to map.geojson as a GeoJSON MultiPoint.
# Each tuple is (longitude, latitude): the first value of the first point,
# -102.856912, is outside the valid latitude range.
with open('map.geojson', 'w') as fp:
    geojson.dump(geojson.MultiPoint([(-102.856912,48.524251),(-84.972510,42.243290)]), fp, sort_keys=True)

## Review Chevron releases

In [None]:
incid_vol_liquid.info()

In [None]:
incid_cvx = incid_vol_liquid[incid_vol_liquid['NAME'].str.contains('CHEVRON')]

In [None]:
incid_cvx

In [None]:
incid_cvx_all = incid[(incid['NAME'].str.contains('CHEVRON')) & (incid['ONSHORE_STATE_ABBREVIATION'] == 'CO')]

In [None]:
incid_cvx_all

## Generate Word Clouds

Websites:

https://www.youtube.com/watch?v=95p3cVkqYHQ

https://www.youtube.com/watch?v=d_zt5XjWVn4

https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python

TODO: Generate word cloud from notes columns.

In [None]:
incid_vol_liquid.to_csv('2017-03-01-incidents.csv')

In [None]:
# Percentage of the unintentionally released volume that was recovered.
# incid_vol_liquid is a filtered slice of another frame, so add the column via
# assign(), which returns a new DataFrame, instead of writing into the slice
# (that triggers pandas' SettingWithCopyWarning).
incid_vol_liquid = incid_vol_liquid.assign(
    PERCENT_RECOV=(incid_vol_liquid['RECOVERED_BBLS'] / incid_vol_liquid['UNINTENTIONAL_RELEASE_BBLS']) * 100
)
incid_vol_liquid['PERCENT_RECOV'].head()

In [None]:
incid_vol_liquid['PERCENT_RECOV'].plot(kind = 'hist', bins = 10)
plt.xlabel('percent recovery')
plt.ylabel('number of incidents')
plt.title('PHMSA Reported Pipeline Incidents 2010 - Present\n(Liquid Commodity)')
plt.savefig(fig_prefix + "liquid-percent-recovery.png", dpi=350)

In [None]:
plt.scatter(incid_vol_liquid['UNINTENTIONAL_RELEASE_BBLS'], incid_vol_liquid['PERCENT_RECOV'])
plt.xlabel('volume released (bbls)')
plt.ylabel('percent recovered')
plt.title('PHMSA Reported Pipeline Incidents 2010 - Present\n(Liquid Commodity)')
plt.savefig(fig_prefix + "liquid-percent-recovered-by-vol-released.png", dpi=350)

In [None]:
incid_vol_liquid['CAUSE'].value_counts().plot(kind = 'pie', legend=False)
plt.savefig(fig_prefix + "major-incident-causes-pie.png", dpi=350)

In [None]:
incid_vol_liquid.columns

In [None]:
incid_vol_liquid[['CAUSE', 'CAUSE_DETAILS']].head(3)

In [None]:
causes = incid_vol_liquid['CAUSE_DETAILS'].value_counts().sort_values(ascending=True)
causes.tail()

In [None]:
# sns.barplot(causes.values, causes.index)
causes.plot.barh()
plt.xlabel('number of incidents')
plt.title('Causes of PHMSA Reported Pipeline Incidents\n(2010 - Present for Liquid Commodity)')
plt.savefig(fig_prefix + "frequency-of-causes.png", dpi=350)

In [None]:
incid_vol_liquid['UNINTENTIONAL_RELEASE_BBLS'].sum()

In [None]:
mask4 = incid_vol_liquid['CAUSE_DETAILS'] == 'INTERNAL CORROSION'

In [None]:
incid_ic = incid_vol_liquid[mask4]

In [None]:
incid_ic['UNINTENTIONAL_RELEASE_BBLS'].sum()

In [None]:
# Total unintentionally released volume (bbls) for each CAUSE_DETAILS value.
causes_dict = {}
for cause_name in causes.index:
    rows_for_cause = incid_vol_liquid['CAUSE_DETAILS'] == cause_name
    released_bbls = incid_vol_liquid.loc[rows_for_cause, 'UNINTENTIONAL_RELEASE_BBLS'].sum()
    print(f'Cause {cause_name} resulted in total releases of {released_bbls} bbls from 2010 to present')
    causes_dict[cause_name] = released_bbls

In [None]:
causes_dict

In [None]:
causes_series = pd.Series(causes_dict).sort_values(ascending=True)
causes_series.tail()

In [None]:
# sns.barplot(causes_series.values, causes_series.index)
causes_series.plot.barh()
plt.xlabel('total volume released (bbls)')
plt.title('Causes of PHMSA Reported Pipeline Incidents\n(2010 - Present for Liquid Commodity)')
plt.savefig(fig_prefix + "liquid-released-by-cause.png", dpi=350)

In [None]:
causes.head()
causes['INTENTIONAL DAMAGE']

In [None]:
# Average unintentionally released volume (bbls) per incident for each
# CAUSE_DETAILS value: total volume divided by the number of incidents.
causes_rate_dict = {}
for item in causes.index:
    item_bbl_sum = incid_vol_liquid[incid_vol_liquid['CAUSE_DETAILS'] == item]['UNINTENTIONAL_RELEASE_BBLS'].sum()
    item_count = causes[item]
    item_bbl_avg = item_bbl_sum / item_count
    # Report the per-release average; the original printed the total volume
    # under an "average" label.
    print('Cause {} resulted in average of {} bbls per release from 2010 to present'.format(item, item_bbl_avg))
    causes_rate_dict[item] = item_bbl_avg

In [None]:
causes_rate_series = pd.Series(causes_rate_dict).sort_values(ascending=True)
causes_rate_series.tail()

In [None]:
causes_rate_series.plot.barh()
plt.xlabel('total volume released PER incident (bbls)')
plt.title('Causes of PHMSA Reported Pipeline Incidents\n(2010 - Present for Liquid Commodity)')
plt.savefig(fig_prefix + "liquid-released-per-incident-by-cause.png", dpi=350)

In [None]:
mask5 = incid_vol_liquid['CAUSE_DETAILS'] == 'ENVIRONMENTAL CRACKING-RELATED'
env_cracking = incid_vol_liquid[mask5]

In [None]:
env_cracking[['NAME', 'STRESS_SUBTYPE', 'STRESS_DETAILS', 'UNINTENTIONAL_RELEASE_BBLS']]

In [None]:
# Ten largest unintentional releases. DataFrame.sort(columns=...) was removed
# from pandas; sort_values(by=...) is its replacement.
incid_vol_top_10 = incid_vol_liquid.sort_values(by='UNINTENTIONAL_RELEASE_BBLS', ascending=False).head(10)

In [None]:
incid_vol_top_10[['UNINTENTIONAL_RELEASE_BBLS', 'NARRATIVE']]

In [None]:
incid_vol_top_10['NARRATIVE'][1475]

In [None]:
incid_vol_top_10['NARRATIVE'][2601]

In [None]:
incid_vol_top_10['NARRATIVE'][2299]

In [None]:
mask6 = incid_vol_liquid['NAME'].str.contains('PLAINS')

In [None]:
plains = incid_vol_liquid[mask6]

In [None]:
plains.columns

In [None]:
plains[['ONSHORE_CITY_NAME', 'ONSHORE_STATE_ABBREVIATION']]

In [None]:
mask7 = plains['ONSHORE_STATE_ABBREVIATION'] == 'CA'

In [None]:
plains_ca = plains[mask7]

In [None]:
plains_ca['NARRATIVE'].str.contains('SANTA')

In [None]:
plains_ca['NARRATIVE'][692]

In [None]:
# Full record for the incident whose narrative was shown in the previous cell.
# `incidents` is not defined anywhere in this notebook; the raw incident frame
# is `incid`. 692 is an index label (it was used as one on plains_ca above),
# so look it up with .loc rather than by position.
incid.loc[692]