In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Specify vars to parse as dates
date_vars = ['limit_begin_date', 'limit_end_date', 'monitoring_period_end_date', 'monitoring_period_end_month', 'monitoring_period_quarter_start', 'monitoring_period_quarter_end', 'value_received_date', 'value_received_month', 'value_received_quarter_start', 'value_received_quarter_end']

In [3]:
# Load in DMR data
MS_dmrs = pd.read_csv('~/Documents/DMR work/DMR Python Projects/dmr-permit-limit-changes/dmr_data/MS_dmrs.csv', dtype={
                            'perm_feature_nmbr': object,
                             'limit_unit_code': object,
                             'standard_unit_code': object,
                             'dmr_unit_desc': str,
                             'dmr_unit_code': object
                         }, parse_dates=date_vars, date_parser=lambda t: pd.to_datetime(t, errors='coerce'))

  MS_dmrs = pd.read_csv('~/Documents/DMR work/DMR Python Projects/dmr-permit-limit-changes/dmr_data/MS_dmrs.csv', dtype={


Constructing limit value time series

In [99]:
# For each NPDES permit, permit features, parameter, and limit type-- construct time series of limit values over time

# Variables to include in time series
limit_ids = ['limit_id', 'limit_value_id']
limit_times = ['limit_begin_date', 'limit_end_date']
limit_unique = ['npdes_permit_id', 'perm_feature_nmbr', 'parameter_desc', 'limit_value_type_code', 'statistical_base_code', 'statistical_base_type_code']
limit_vals = ['limit_value_nmbr', 'limit_value_standard_units', 'standard_unit_desc']

# Obtain limit values over time from DMR data
# Exclude missing limit values
excl_missing_lv = MS_dmrs.dropna(subset=['limit_value_nmbr'])
# Drop DMRs with duplicate limit variables
limit_time_series = excl_missing_lv[limit_ids + limit_unique + limit_times + limit_vals].drop_duplicates()
print('DMRs with duplicate limit variables dropped: ', len(MS_dmrs) - len(limit_time_series))

# Sort limit values
limit_time_series.sort_values(by=limit_unique + limit_times + limit_vals, inplace=True)
limit_time_series.head()

KeyError: "['wbd_huc12'] not in index"

In [5]:
# First, finding duplicates on limit id x time
find_dups = limit_time_series.groupby(limit_ids + limit_times).size().reset_index(name='counts')

# Print the counts of limits id x time combinations by how many times they appear in the dataset
find_dups['counts'].value_counts()

1    107403
2      4545
Name: counts, dtype: int64

In [6]:
# Flag duplicates on all known identifiers
find_dups_all = limit_time_series.groupby(limit_unique + limit_times).size().reset_index(name='counts') # Count number of duplicates using the unique identifiers and time as ID variables
find_dups_all = limit_time_series.merge(find_dups_all, how='left', on=limit_unique + limit_times)
find_dups_all['counts'].value_counts()

1     102893
2      13374
4        148
10        40
3         24
8          8
6          6
Name: counts, dtype: int64

In [7]:
# For now, drop limits if they are not unique on our identifiers
pre_length = len(limit_time_series)
limit_time_series = find_dups_all[find_dups_all['counts']==1]
print("Number of limits dropped: ", pre_length-len(limit_time_series))
# notes about duplicated data:
# some of the duplicates (e.g. for permit_id MS0003115 pH) have depulicated limit values; can just drop the id variable and then de-dup
# some duplicates contain different limit values for different observations (e.g. for permit_id MS0034436 Flow, in conduit or thru treatment plant). this at least doesn't seem to change over time? not sure what happens to the limit values
# for now, ignore

Number of limits dropped:  13600


Constructing flag for actual permit limit change

In [8]:
# Compare to flagged value; if limit_value_standard_units != prev then there was a change
limit_time_series_df = limit_time_series
limit_time_series_df['prev_limit_value_nmbr'] = limit_time_series_df.groupby(limit_unique)[['limit_value_nmbr']].shift()

limit_time_series_df['prev_limit_value_standard_units'] = limit_time_series_df.groupby(limit_unique)[['limit_value_standard_units']].shift()
condition1 = limit_time_series_df['prev_limit_value_nmbr'].notna()
condition2 = limit_time_series_df['limit_value_nmbr'] != limit_time_series_df['prev_limit_value_nmbr']
limit_time_series_df['limit_change'] = np.where(condition1 & condition2, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  limit_time_series_df['prev_limit_value_nmbr'] = limit_time_series_df.groupby(limit_unique)[['limit_value_nmbr']].shift()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  limit_time_series_df['prev_limit_value_standard_units'] = limit_time_series_df.groupby(limit_unique)[['limit_value_standard_units']].shift()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [9]:
#  Calculate limit change % difference
limit_time_series_df['limit_change_pct'] = 100*(limit_time_series_df['limit_value_standard_units']-limit_time_series_df['prev_limit_value_standard_units'])/limit_time_series_df['prev_limit_value_standard_units']
limit_time_series_df['limit_change_pct'] = np.where(limit_time_series_df['limit_change']==0, np.nan, limit_time_series_df['limit_change_pct']) # if there is no limit change, represent limit_change_pct as NaN

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  limit_time_series_df['limit_change_pct'] = 100*(limit_time_series_df['limit_value_standard_units']-limit_time_series_df['prev_limit_value_standard_units'])/limit_time_series_df['prev_limit_value_standard_units']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  limit_time_series_df['limit_change_pct'] = np.where(limit_time_series_df['limit_change']==0, np.nan, limit_time_series_df['limit_change_pct']) # if there is no limit change, represent limit_change_pct as NaN


In [10]:
limit_time_series_df['limit_change'].value_counts()

0    97125
1     5768
Name: limit_change, dtype: int64

In [68]:
limit_changes = limit_time_series_df[limit_time_series_df['limit_change']==1]
limits_unchanged = limit_time_series_df[limit_time_series_df['limit_change']==0]

How many limit series have never changed?

In [73]:
limits_unchanged.drop_duplicates(subset=['limit_value_id', 'limit_id']).shape[0]

97114

How many limit series have ever changed?

In [83]:
limit_changes.drop_duplicates(subset=['limit_value_id', 'limit_id']).shape[0]

5768

In [47]:
impaired_waters = pd.read_csv('impaired_waters.csv', dtype={'HUC12': object})
permits = pd.read_csv('MS_permits.csv', dtype={'wbd_huc12': object}).rename(columns={'wbd_huc12':'HUC12'})

In [49]:
impaired_permits = permits.merge(impaired_waters, how='inner', on='HUC12')
impaired_permits.to_csv('impaired_permits.csv')

In [122]:
impaired_permits.drop_duplicates(subset=['HUC12']).shape[0]

154

How many impaired waters were left behind after the merge?

In [118]:
hucs = list(impaired_permits['HUC12'].unique())
unmatched = impaired_waters[impaired_waters['HUC12'].apply(lambda x: x not in hucs)]
len(unmatched.drop_duplicates(subset=['HUC12']))

49

49 impaired waters were not matched by HUC12 code to the MS permits list. What do these waters look like?

In [51]:
unmatched

Unnamed: 0,AUID,ASSESSMENT_ID,ASSESSMENT_UNIT_NAME,FIRST LISTED,EPA_IR_CAT,WATER_TYPE,Acres,SqKm,LAST_ASSESSED,LAST_MODIFIED,Overall_Status,HUC12,HUC12_NAME
2,MS931011,MS931011,WALNUT LAKE,2022.0,5,LAKE/RESERVOIR/POND,142.855946,0.578118,2022,2022,Not Supporting,80302040804,Brushy Bayou-Coldwater River
3,MS938312,MS938312,LAKE HENRY,2016.0,5,LAKE/RESERVOIR/POND,154.714323,0.626107,2018,2020,Not Supporting,80302070802,Lake Henry
4,MS990411,MS990411,LAKE BEULAH,,5,LAKE/RESERVOIR/POND,993.896204,4.022155,2018,2020,Not Supporting,80201000302,Lake Beulah-Mississippi River
5,MS100211,MS100211,SPRING CREEK,2018.0,5,STREAM/CREEK/RIVER,8.350559,,2022,2022,Not Supporting,80602010101,Savannah Creek-Spring Creek
7,101112,101112,SAND CREEK,2008.0,5,STREAM/CREEK/RIVER,4.574525,,2022,2022,Not Supporting,80602010301,Calabrella Creek
10,103913,103913,CROOKED CREEK,2018.0,5,STREAM/CREEK/RIVER,3.509387,,2022,2022,Not Supporting,80602010903,Ousley Creek-Seneasha Creek
11,103914,103914,PARKER CREEK,2018.0,5,STREAM/CREEK/RIVER,5.621625,,2022,2022,Not Supporting,80602010903,Ousley Creek-Seneasha Creek
12,104511,104511,PEPPER CREEK / RUCKER CREEK,2012.0,5,STREAM/CREEK/RIVER,4.57123,,2014,2020,Not Supporting,80602020407,Pepper Creek-Big Black River
13,104812,104812,BIG CYPRESS CREEK,2012.0,5,STREAM/CREEK/RIVER,2.843281,,2012,2020,Not Supporting,80602020101,Cypress Creek-Big Cypress Creek
17,107611,107611,PORTER CREEK,2012.0,5,STREAM/CREEK/RIVER,9.066825,,2014,2020,Not Supporting,80602020704,Porter Creek


How many limit series had a limit change and an impairment?

In [92]:
impaired_changed_permits = impaired_permits.merge(limit_changes, how='inner', on='npdes_permit_id')
impaired_changed_permits.drop_duplicates(subset=['limit_value_id', 'limit_id']).shape[0]

787

How many limit series correspond to an impaired water but never had a limit change?

In [71]:
impaired_unchanged_permits = impaired_permits.merge(limits_unchanged, how='inner', on='npdes_permit_id')
impaired_unchanged_permits.drop_duplicates(subset=['limit_value_id', 'limit_id']).shape[0]

16983

How permits could not be matched to any limit?

In [120]:
impaired_permits.merge(limit_time_series_df, how='inner', on='npdes_permit_id').drop_duplicates(subset=['limit_value_id', 'limit_id']).shape[0]-787-16983

0

How many limit series had a limit change after their first impairment?

In [93]:
impaired_changed_permits['FIRST LISTED '] = pd.to_datetime(impaired_changed_permits['FIRST LISTED '], errors='coerce')
impaired_changed_permits = impaired_changed_permits.rename(columns={'FIRST LISTED ': 'Year first listed'})
impaired_changed_permits['delay'] = impaired_changed_permits.limit_begin_date-impaired_changed_permits['Year first listed']
delays = impaired_changed_permits.drop_duplicates(subset=['limit_id', 'limit_value_id'])['delay']

impaired_changed_permits[impaired_changed_permits['delay'].dt.days>=0].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]

490

In [79]:
impaired_changed_permits.to_csv('impaired_changed_permits.csv')

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.hist(delays.dt.days)

In [104]:
impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365*3)].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]

118

In [105]:
impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365*2)].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]

79

In [106]:
impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365)].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]

36

In [138]:
print('Numbers are in terms of limit series')
A = impaired_unchanged_permits.drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0] # Unchanged impaired water limit series
unchanged_permits = permits.merge(limits_unchanged, how='inner', on='npdes_permit_id')
B = unchanged_permits.drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0] # Unchanged limit series
C = impaired_changed_permits.drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0] # Changed impaired water limit series
D = permits.merge(limit_changes, how='inner', on='npdes_permit_id').drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0] # Changed water limit series

E = impaired_changed_permits[impaired_changed_permits['delay'].dt.days>=0].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]
F = impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365*3)].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]
G = impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365*2)].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]
H = impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365)].drop_duplicates(subset=['limit_id', 'limit_value_id']).shape[0]

pd.DataFrame({'Limit constant': [A, B-A], 'Limit change ever':[C, D-C], 'Limit change on/after first impairment':[E, 'NA'], 'Limit change within 3 yrs of first impairment': [F, 'NA'], 'Limit change within 2 yrs of first impairment': [G, 'NA'], 'Limit change within 1 yr of first impairment': [H, 'NA']}, {'Never impaired', 'Ever impaired'}).T

Numbers are in terms of limit series


Unnamed: 0,Ever impaired,Never impaired
Limit constant,16983,80131.0
Limit change ever,787,4981.0
Limit change on/after first impairment,490,
Limit change within 3 yrs of first impairment,118,
Limit change within 2 yrs of first impairment,79,
Limit change within 1 yr of first impairment,36,


What about in terms of waters?

In [135]:
print('Numbers are in terms of HUC12 waters')
limit_change_hucs = list(impaired_changed_permits.drop_duplicates(subset='HUC12')['HUC12'])
A = impaired_unchanged_permits[impaired_unchanged_permits['HUC12'].apply(lambda x: x not in limit_change_hucs)].drop_duplicates(subset='HUC12').shape[0] # All never-changing limit impaired HUCs

unchanged_permits = permits.merge(limits_unchanged, how='inner', on='npdes_permit_id')
B = unchanged_permits[unchanged_permits['HUC12'].apply(lambda x: x not in limit_change_hucs)].drop_duplicates(subset=['HUC12']).shape[0] # All never-changing limit HUCs
C = impaired_changed_permits.drop_duplicates(subset=['HUC12']).shape[0] # Impaired limit change HUCs
D = permits.merge(limit_changes, how='inner', on='npdes_permit_id').drop_duplicates(subset=['HUC12']).shape[0] # All limit change HUCs

E = impaired_changed_permits[impaired_changed_permits['delay'].dt.days>=0].drop_duplicates(subset=['HUC12']).shape[0]
F = impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365*3)].drop_duplicates(subset=['HUC12']).shape[0]
G = impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365*2)].drop_duplicates(subset=['HUC12']).shape[0]
H = impaired_changed_permits[(impaired_changed_permits['delay'].dt.days>=0) & (impaired_changed_permits['delay'].dt.days<365)].drop_duplicates(subset=['HUC12']).shape[0]

pd.DataFrame({'All limits constant': [A, B-A], 'Limit change ever':[C, D-C], 'Limit change on/after first impairment':[E, 'NA'], 'Limit change within 3 yrs of first impairment': [F, 'NA'], 'Limit change within 2 yrs of first impairment': [G, 'NA'], 'Limit change within 1 yr of first impairment': [H, 'NA']}, {'Never impaired', 'Ever impaired'}).T

Numbers are in terms of HUC12 waters


Unnamed: 0,Ever impaired,Never impaired
All limits constant,47,609.0
Limit change ever,66,327.0
Limit change on/after first impairment,45,
Limit change within 3 yrs of first impairment,18,
Limit change within 2 yrs of first impairment,15,
Limit change within 1 yr of first impairment,10,
