In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np

### Create a Summary Dataframe (tdf) for Data from the most recent year:

In [8]:
# Build `tdf`: one summary row per operator from the most recent annual report.
year = 2020
filename = 'PHMSA_Distribution_Data_2010_2020/annual_gas_distribution_' + str(year) + '.xlsx'
tdf = pd.read_excel(filename)
# The row at position 1 of the frame as read holds the column labels; promote it
# to the header and discard the two leading non-data rows.
tdf.columns = tdf.iloc[1]
tdf = tdf.drop([0,1]).reset_index(drop=True)

# Total service mileage = average service length * number of services / 5280.
# NOTE(review): presumably AVERAGE_LENGTH is reported in feet - confirm against the PHMSA form.
FEET_PER_MILE = 5280
tdf['SRVCS_MILES_TOTAL'] = [
    avg_length * num_services / FEET_PER_MILE
    for avg_length, num_services in zip(tdf['AVERAGE_LENGTH'], tdf['NUM_SRVCS_TOTAL'])
]

# Column groups, selected by substring of the column label.
basic_info_cols = ['OPERATOR_NAME','OPERATOR_ID','OPERATOR_TYPE','STOP','MMILES_TOTAL','SRVCS_MILES_TOTAL','AVERAGE_LENGTH','NUM_SRVCS_TOTAL']
mmiles_historic_cols = [str(col) for col in tdf.columns if "MMILES_BY" in col]
srvs_historic_cols = [str(col) for col in tdf.columns if "NUM_SRVS_BY" in col]
leaks_cols = [str(col) for col in tdf.columns if "TOTAL_LEAKS" in col]
hazleaks_cols = [str(col) for col in tdf.columns if "TOTAL_HAZLEAKS" in col]
tdf = tdf.filter(basic_info_cols + mmiles_historic_cols + srvs_historic_cols + leaks_cols + hazleaks_cols)

# Per-operator totals across every leak / hazardous-leak column. The columns are
# selected by name (not by hard-coded position) so the totals stay correct if the
# number of columns kept ahead of them ever changes.
tot_leaks = [sum(tdf.loc[row, leaks_cols]) for row in tdf.index]
tot_hazleaks = [sum(tdf.loc[row, hazleaks_cols]) for row in tdf.index]

# The individual leak columns and the precomputed decade totals are no longer needed.
tdf = tdf.drop(columns=leaks_cols + hazleaks_cols)
tdf = tdf.drop(columns=['MMILES_BY_DCD_TOTAL','NUM_SRVS_BY_DCD_TOTAL'])
tdf['TOTAL_LEAKS_'+str(year)] = tot_leaks
tdf['HAZARDOUS_LEAKS_'+str(year)] = tot_hazleaks

# Main miles older than 50 years: every decade before 1970 plus 1.5/10 of the
# 1970-79 decade. The pre-1940 column is included here; it was previously left
# out while the 1940-49 column was counted twice.
# NOTE(review): the 1.5/10 share of the 1970s presumably approximates the part of
# that decade already past 50 years - confirm the intended cutoff.
over_50_full_decade_cols = [
    'MMILES_BY_DCD_PRE1940',
    'MMILES_BY_DCD_1940_TO_1949',
    'MMILES_BY_DCD_1950_TO_1959',
    'MMILES_BY_DCD_1960_TO_1969',
]
tdf['Main Lines over 50 Years Old'] = [
    int(sum(tdf.loc[row, over_50_full_decade_cols]) + (1.5/10)*tdf.loc[row,'MMILES_BY_DCD_1970_TO_1979'])
    for row in tdf.index
]

### Create a Main Mileage and System Age Summary File:

In [15]:
# Build `mmiles`: per-operator main mileage by installation decade, plus a
# mileage-weighted average age of the mains.

# Source column in `tdf` -> presentation name, in output order.
mmiles_column_names = {
    'OPERATOR_NAME': 'Operator Name',
    'OPERATOR_ID': 'Operator ID',
    'OPERATOR_TYPE': 'Operator Type',
    'STOP': 'State',
    'MMILES_TOTAL': 'Total Main Miles',
    'MMILES_BY_DCD_UNK': 'Main Miles Built in Unknown Decade',
    'MMILES_BY_DCD_PRE1940': 'Main Miles Built Pre-1940',
    'MMILES_BY_DCD_1940_TO_1949': 'Main Miles Built 1940-49',
    'MMILES_BY_DCD_1950_TO_1959': 'Main Miles Built 1950-59',
    'MMILES_BY_DCD_1960_TO_1969': 'Main Miles Built 1960-69',
    'MMILES_BY_DCD_1970_TO_1979': 'Main Miles Built 1970-79',
    'MMILES_BY_DCD_1980_TO_1989': 'Main Miles Built 1980-89',
    'MMILES_BY_DCD_1990_TO_1999': 'Main Miles Built 1990-99',
    'MMILES_BY_DCD_2000_TO_2009': 'Main Miles Built 2000-09',
    'MMILES_BY_DCD_2010_TO_2019': 'Main Miles Built 2010-19',
    'MMILES_BY_DCD_2020_TO_2029': 'Main Miles Built 2020-29',
    'Main Lines over 50 Years Old': 'Main Miles over 50 Years Old',
}
# Renaming by label (rather than assigning a positional list of names) keeps each
# name attached to the right column; a missing source column raises a KeyError here.
mmiles = tdf.loc[:, list(mmiles_column_names)].rename(columns=mmiles_column_names)

# Year against which ages are measured.
REFERENCE_YEAR = 2021
# One representative installation year per decade column, in column order.
# NOTE(review): 1925 for "Pre-1940" and 2020 for "2020-29" look like chosen
# stand-ins rather than decade midpoints - confirm.
ages = [1925,1945,1955,1965,1975,1985,1995,2005,2015,2020]
# The dated decade columns: everything after the unknown-decade column and
# before the trailing over-50 column.
cols = list(mmiles.columns[6:-1])
assert len(cols) == len(ages), "each decade column needs exactly one representative year"

mileage_age = []
for row in mmiles.index:
    # Sum of miles * age over the dated decades.
    weighted_age = sum(mmiles.at[row, col] * (REFERENCE_YEAR - built) for col, built in zip(cols, ages))
    try:
        # Average over miles with a known decade only, truncated to whole years.
        dated_miles = mmiles.at[row,'Total Main Miles'] - mmiles.at[row,'Main Miles Built in Unknown Decade']
        mileage_age.append(int(weighted_age / dated_miles))
    except (ZeroDivisionError, ValueError, OverflowError, TypeError):
        # No dated mileage (division by zero), or a missing / non-numeric value:
        # record 0 for this operator, as before, without hiding unrelated errors.
        mileage_age.append(0)
mmiles['Main Mileage Average Age'] = mileage_age

# Final column order: identifying info and headline figures first, decades after.
mmiles = mmiles.filter(['Operator Name','Operator ID','Operator Type','State','Total Main Miles','Main Mileage Average Age','Main Miles over 50 Years Old','Main Miles Built in Unknown Decade','Main Miles Built Pre-1940','Main Miles Built 1940-49','Main Miles Built 1950-59','Main Miles Built 1960-69','Main Miles Built 1970-79','Main Miles Built 1980-89','Main Miles Built 1990-99','Main Miles Built 2000-09','Main Miles Built 2010-19','Main Miles Built 2020-29'])
#mmiles.to_csv('PHMSA_Cleaned_Data/PHMSA_Main_Mileage_2020.csv')


### Create a Main Mileage Time Series File:

In [18]:
# Build a per-operator time series of MMILES_BY_DCD_TOTAL for report years 2010-2020.
#
# NOTE(review): this cell rebinds the notebook-level names `tdf`, `year` and
# `filename`; after it runs, `tdf` is the unfiltered 2020 report and `year` is the
# string '2020'. That rebinding is kept as-is in case later cells rely on it, but
# it means the summary and main-mileage cells must be re-run before reusing their `tdf`.

# Seed one record per operator found in the current `tdf`, with a 0 placeholder
# for every report year.
opid_to_years = dict()
for opid, name, operator_type, state in zip(tdf['OPERATOR_ID'], tdf['OPERATOR_NAME'], tdf['OPERATOR_TYPE'], tdf['STOP']):
    record = {'Operator Name': name, 'Operator Type': operator_type, 'State': state}
    for y in range(2010,2021):
        record[str(y)] = 0
    opid_to_years[opid] = record

# Fill in each year's value from that year's report (a single loop covers 2010-2020).
for report_year in range(2010,2021):
    year = str(report_year)
    filename = 'PHMSA_Distribution_Data_2010_2020/annual_gas_distribution_' + str(year) + '.xlsx'
    tdf = pd.read_excel(filename)
    # Same header handling as for the summary frame: labels are in the row at position 1.
    tdf.columns = tdf.iloc[1]
    tdf = tdf.drop([0,1]).reset_index(drop=True)
    if 'OPERATOR_ID' not in tdf.columns or 'MMILES_BY_DCD_TOTAL' not in tdf.columns:
        # Best effort: a report without the expected columns leaves this year's
        # placeholders at 0 instead of aborting the whole series.
        continue
    for row in tdf.index:
        opid = tdf.at[row,'OPERATOR_ID']
        # Operators that were not seeded above are skipped.
        if opid in opid_to_years:
            opid_to_years[opid][year] = tdf.at[row,'MMILES_BY_DCD_TOTAL']

# One row per operator; then move the operator id out of the index into a column.
mmiles10 = pd.DataFrame(opid_to_years).T
mmiles10a = mmiles10.copy().reset_index()
mmiles10a.columns = ['Operator ID'] + list(mmiles10.columns)

In [19]:
# Preview the first rows of the per-operator, per-year table.
mmiles10a.head()

Unnamed: 0,Operator ID,Operator Name,Operator Type,State,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,18,"ABBYVILLE, CITY OF",Municipal Owned,KS,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
1,27,ABITA SPRINGS NAT GAS & WATER,Municipal Owned,LA,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0
2,45,"ADAIRSVILLE, CITY OF",Municipal Owned,GA,70.5,72.4,73.7,74.23,79.43,86.7,91.68,87.73,88.31,90.09,114.31
3,49,TOWN OF ADAMSVILLE GAS DEPT,Municipal Owned,TN,158.3,159.01,164.247,164.247,165.444,168.36,171.05,171.05,171.05,171.141,171.141
4,54,"ADEL GAS DEPT, CITY OF",Municipal Owned,GA,36.5,36.5,36.5,36.5,36.5,36.5,50.0,55.6,55.6,58.7,58.31
