In [1]:
from datetime import date
import json
from pathlib import Path
import re

from pandas import DataFrame


# Presidential years treated as the last year of an administration. These are
# the final values of the Clinton, Bush, Obama and Trump ranges below.
# NOTE(review): the final year of the Biden range (2024) is not listed here --
# confirm whether it should be added.
END_OF_ADMIN = (2000, 2008, 2016, 2020)

# Administration name -> party code ("D" or "R") and the presidential years it
# covers. A presidential year runs from February through the following January
# (see convert_to_presidential_year), so an administration inaugurated in
# January of year Y starts at presidential year Y.
PRESIDENTIAL_ADMINS = {
    "Clinton": {
        "party": "D",
        # Starts at 1993: presidential year 1992 (Feb 1992 - Jan 1993) belongs
        # to the preceding administration.
        "years": range(1993, 2001),
    },
    "Bush": {
        "party": "R",
        "years": range(2001, 2009),
    },
    "Obama": {
        "party": "D",
        "years": range(2009, 2017),
    },
    "Trump": {
        "party": "R",
        "years": range(2017, 2021),
    },
    "Biden": {
        "party": "D",
        "years": range(2021, 2025),
    },
}


def load_json(path: Path, file_name: str) -> dict | list:
    """Import data from .json format.

    Args:
        path (Path): Path of directory where file is located.
        file_name (str): Name of .json file (without file extension; e.g., "file_name").

    Returns:
        dict | list: JSON object.
    """        
    with open(path / f"{file_name}.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    
    return data


def extract_date(string):
    
    res = re.compile("\d{4}-\d{2}-\d{2}", re.I).match(string)
    
    if isinstance(res, re.Match):
        return date.fromisoformat(res[0])
    else:
        return None


def json_to_df(
        data: dict | list, 
        has_metadata: bool = True, 
        date_cols: list | tuple = ("effective", "received", "published")):
    
    if has_metadata:
        results = data.get("results")
    else:
        results = data
    
    df = DataFrame(results)
    
    # convert date columns to datetime.date format
    for col in date_cols:
        df.loc[:, f"{col}_dt"] = [extract_date(x) if isinstance(x, str) else x for x in df.loc[:, col]]
        df.loc[:, f"{col}_year"] = [x.year if isinstance(x, date) else x for x in df.loc[:, f"{col}_dt"]]
        df.loc[:, f"{col}_month"] = [x.month if isinstance(x, date) else x for x in df.loc[:, f"{col}_dt"]]
    
    return df


def convert_to_presidential_year(df: DataFrame, date_col: str = "published"):
    """Add a "presidential_year" column derived from a calendar-date column.

    The new column equals "<date_col>_year", except that rows whose
    "<date_col>_month" is 1 (January) are assigned the previous year.

    Args:
        df (DataFrame): Input data containing "<date_col>_year" and
            "<date_col>_month" columns. It is not modified.
        date_col (str, optional): Prefix of the year/month columns to use.
            Defaults to "published".

    Returns:
        DataFrame: Deep copy of the input with the "presidential_year" column.
    """
    year_name = f"{date_col}_year"
    month_name = f"{date_col}_month"

    result = df.copy(deep=True)
    result["presidential_year"] = result[year_name]

    # January rows count toward the preceding presidential year
    in_january = result[month_name].eq(1)
    result.loc[in_january, "presidential_year"] = result.loc[in_january, year_name] - 1
    return result


def define_presidential_terms(
        df: DataFrame, 
        end_of_term: list | tuple = END_OF_ADMIN, 
        terms: dict = PRESIDENTIAL_ADMINS):
    """Flag end-of-term years and Democratic administrations for each row.

    Both flags are computed row by row from the "presidential_year" column,
    so the input may be unsorted and may contain repeated or skipped years.

    Args:
        df (DataFrame): Input data with a "presidential_year" column. It is
            not modified.
        end_of_term (list | tuple, optional): Presidential years to flag as
            the end of a term. Defaults to END_OF_ADMIN.
        terms (dict, optional): Mapping of administration name to a dict with
            a "party" code and an iterable of "years". Defaults to
            PRESIDENTIAL_ADMINS.

    Returns:
        DataFrame: Deep copy of the input with two added 0/1 columns:
            "end_of_term" (1 when the row's year is in `end_of_term`) and
            "democratic_admin" (1 when the party for the row's year is "D").

    Raises:
        ValueError: If a row's presidential year is not covered by any entry
            in `terms`, since its party cannot be determined.
    """
    df_copy = df.copy(deep=True)
    years = df_copy["presidential_year"].tolist()

    final_years = set(end_of_term)
    df_copy["end_of_term"] = [1 if year in final_years else 0 for year in years]

    # year -> party lookup, so every row is labelled from its own year
    # instead of relying on the rows matching the order of `terms`
    party_by_year = {
        year: admin.get("party")
        for admin in terms.values()
        for year in admin.get("years")
    }
    uncovered = list(dict.fromkeys(y for y in years if y not in party_by_year))
    if uncovered:
        raise ValueError(
            f"No presidential administration defined for year(s): {uncovered}"
        )

    df_copy["democratic_admin"] = [
        1 if party_by_year[year] == "D" else 0 for year in years
    ]
    return df_copy


def save_csv(df: DataFrame, path: Path, file_name: str) -> None:
    """Save processed data in .csv format.

    The file is written as UTF-8, without the index, using "\\n" as the line
    terminator, and a confirmation message is printed afterwards.

    Args:
        df (DataFrame): Data to write.
        path (Path): Directory in which the file is created.
        file_name (str): Name of the .csv file (without file extension; e.g., "file_name").
    """
    # newline="" disables newline translation; without it, text mode on
    # Windows rewrites the requested "\n" terminator as "\r\n".
    with open(path / f"{file_name}.csv", "w", encoding="utf-8", newline="") as f:
        df.to_csv(f, index=False, lineterminator="\n")

    print(f"Saved data to {path}.")

def groupby_year(df: DataFrame, 
                 year_col: str = "published", 
                 agg_col: str = "control_number", 
                 agg_func: str = "nunique"):
    """Aggregate one column per year and return the result as a flat table.

    Args:
        df (DataFrame): Input data containing "<year_col>_year" and `agg_col`.
        year_col (str, optional): Prefix of the year column to group on.
            Defaults to "published".
        agg_col (str, optional): Column to aggregate. Defaults to
            "control_number".
        agg_func (str, optional): Aggregation applied to `agg_col`. Defaults
            to "nunique".

    Returns:
        DataFrame: One row per year, with the aggregated column renamed to
            "major_rules".
    """
    group_key = f"{year_col}_year"
    per_year = df.groupby([group_key]).agg({agg_col: agg_func})
    per_year = per_year.reset_index()
    return per_year.rename(columns={agg_col: "major_rules"})


# df.groupby(["agency", "subagency"])["control_number"].agg("count")


if __name__ == "__main__":

    # profile time elapsed
    import time
    start = time.process_time()

    # the raw data is expected in a "raw_data" folder one level above this
    # file's own directory
    p = Path(__file__)
    major_path = p.parents[1]
    data_path = major_path.joinpath("raw_data")
    if not data_path.is_dir():
        # nothing below can run without the input directory, so stop here
        raise SystemExit("Cannot locate data.")

    # accepted answers to the yes/no prompt
    y_inputs = ["y", "yes", "true"]
    n_inputs = ["n", "no", "false"]
    valid_inputs = y_inputs + n_inputs

    # keep asking until a valid answer is given, then run the pipeline once
    while True:

        # print prompts to console
        major_prompt = input("Process only major rules? [yes/no]: ").strip().lower()

        # check user inputs
        if major_prompt not in valid_inputs:
            print(f"Invalid input. Must enter one of the following: {', '.join(valid_inputs)}.")
            continue

        # set major_only param and pick the matching input file
        major_only = major_prompt in y_inputs
        data_file = "rule_detail_major" if major_only else "rule_detail_all"

        # call processing pipeline
        data = load_json(data_path, data_file)
        df = json_to_df(data)
        df = convert_to_presidential_year(df, "received")
        grouped = groupby_year(df, year_col="presidential")
        output = define_presidential_terms(grouped)
        print("Aggregated data:", output, sep="\n")
        # NOTE(review): the output name is the same for both answers --
        # confirm that is intended when all rules are processed.
        save_csv(output, major_path, "major_rules_received_year_test")
        break

    # calculate time elapsed (reported once, after the pipeline has finished)
    stop = time.process_time()
    print(f"CPU time: {stop - start:0.1f} seconds")

Aggregated data:
    presidential_year  major_rules  end_of_term  democratic_admin
0                1996           52            0                 1
1                1997           59            0                 1
2                1998           75            0                 1
3                1999           51            0                 1
4                2000           87            1                 1
5                2001           58            0                 0
6                2002           52            0                 0
7                2003           50            0                 0
8                2004           67            0                 0
9                2005           52            0                 0
10               2006           54            0                 0
11               2007           65            0                 0
12               2008          102            1                 0
13               2009           80            0            

PermissionError: [Errno 13] Permission denied: 'h:\\Projects\\Reg-Stats\\data\\major_rules\\major_rules_received_year_test.csv'

In [2]:
df

Unnamed: 0,url,title,agency,subagency,type,description,priority,effective,received,published,...,effective_dt,effective_year,effective_month,received_dt,received_year,received_month,published_dt,published_year,published_month,presidential_year
0,https://www.gao.gov/fedrules/208127,Federal Rules: National Organic Program (Nop);...,Department of Agriculture,Agricultural Marketing Service,Major,NATIONAL ORGANIC PROGRAM (NOP); ORGANIC LIVEST...,Significant/Substantive,2024-01-02T13:00:00Z,2023-11-08T13:00:00Z,2023-11-08T13:00:00Z,...,2024-01-02,2024.0,1.0,2023-11-08,2023,11,2023-11-08,2023,11,2023
1,https://www.gao.gov/fedrules/208130,Federal Rules: Medicare Program; End-Stage Ren...,Department of Health and Human Services,Centers for Medicare & Medicaid Services,Major,MEDICARE PROGRAM; END-STAGE RENAL DISEASE PROS...,Significant/Substantive,2024-01-01T13:00:00Z,2023-11-01T13:00:00Z,2023-11-01T13:00:00Z,...,2024-01-01,2024.0,1.0,2023-11-01,2023,11,2023-11-01,2023,11,2023
2,https://www.gao.gov/fedrules/208129,Federal Rules: Medicare Program; Hospital Outp...,Department of Health and Human Services,Centers for Medicare & Medicaid Services,Major,MEDICARE PROGRAM; HOSPITAL OUTPATIENT PROSPECT...,Significant/Substantive,2024-01-01T13:00:00Z,2023-11-01T13:00:00Z,2023-11-01T13:00:00Z,...,2024-01-01,2024.0,1.0,2023-11-01,2023,11,2023-11-01,2023,11,2023
3,https://www.gao.gov/fedrules/208125,Federal Rules: Revocation Of Uses Of Partially...,Department of Health and Human Services,Food and Drug Administration,Major,REVOCATION OF USES OF PARTIALLY HYDROGENATED O...,Significant/Substantive,2023-12-22T13:00:00Z,2023-10-31T13:00:00Z,2023-10-31T13:00:00Z,...,2023-12-22,2023.0,12.0,2023-10-31,2023,10,2023-10-31,2023,10,2023
4,https://www.gao.gov/fedrules/208118,Federal Rules: Energy Conservation Program: En...,Department of Energy,Department of Energy,Major,ENERGY CONSERVATION PROGRAM: ENERGY CONSERVATI...,Significant/Substantive,2023-12-05T13:00:00Z,2023-10-30T13:00:00Z,2023-10-30T13:00:00Z,...,2023-12-05,2023.0,12.0,2023-10-30,2023,10,2023-10-30,2023,10,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2107,https://www.gao.gov/fedrules/103071,Federal Rules: Amendment To The Commission'S R...,Independent Government Entities,Federal Communications Commission,Major,AMENDMENT TO THE COMMISSION'S RULES REGARDING ...,Significant/Substantive,1996-08-12T13:00:00Z,1996-05-06T13:00:00Z,1996-05-06T13:00:00Z,...,1996-08-12,1996.0,8.0,1996-05-06,1996,5,1996-05-06,1996,5,1996
2108,https://www.gao.gov/fedrules/101498,Federal Rules: Open Access Same-Time Informati...,Independent Government Entities,Federal Energy Regulatory Commission,Major,OPEN ACCESS SAME-TIME INFORMATION SYSTEM AND S...,Significant/Substantive,1996-07-09T13:00:00Z,1996-04-25T13:00:00Z,1996-04-25T13:00:00Z,...,1996-07-09,1996.0,7.0,1996-04-25,1996,4,1996-04-25,1996,4,1996
2109,https://www.gao.gov/fedrules/101499,Federal Rules: Promoting Wholesale Competition...,Independent Government Entities,Federal Energy Regulatory Commission,Major,PROMOTING WHOLESALE COMPETITION THROUGH OPEN A...,Significant/Substantive,1996-07-08T13:00:00Z,1996-04-24T13:00:00Z,1996-04-24T13:00:00Z,...,1996-07-08,1996.0,7.0,1996-04-24,1996,4,1996-04-24,1996,4,1996
2110,https://www.gao.gov/fedrules/103066,Federal Rules: Light Truck Average Fuel Econom...,Department of Transportation,National Highway Traffic Safety Administration,Major,"LIGHT TRUCK AVERAGE FUEL ECONOMY STANDARD, MOD...",Significant/Substantive,1996-05-03T13:00:00Z,1996-04-22T13:00:00Z,1996-04-22T13:00:00Z,...,1996-05-03,1996.0,5.0,1996-04-22,1996,4,1996-04-22,1996,4,1996


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2112 entries, 0 to 2111
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   url                2112 non-null   object 
 1   title              2112 non-null   object 
 2   agency             2112 non-null   object 
 3   subagency          2101 non-null   object 
 4   type               2112 non-null   object 
 5   description        2109 non-null   object 
 6   priority           2111 non-null   object 
 7   effective          2109 non-null   object 
 8   received           2112 non-null   object 
 9   published          2112 non-null   object 
 10  control_number     2112 non-null   object 
 11  fed_reg_number     2111 non-null   object 
 12  identifier         1947 non-null   object 
 13  major_rule_report  2103 non-null   object 
 14  effective_dt       2109 non-null   object 
 15  effective_year     2109 non-null   float64
 16  effective_month    2109 

In [4]:
df['control_number'].duplicated

<bound method Series.duplicated of 0       208127
1       208130
2       208129
3       208125
4       208118
         ...  
2107    103071
2108    101498
2109    101499
2110    103066
2111    103065
Name: control_number, Length: 2112, dtype: object>

In [5]:
df['control_number'].duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2107    False
2108    False
2109    False
2110    False
2111    False
Name: control_number, Length: 2112, dtype: bool

In [6]:
df['control_number'].duplicated().value_counts()

control_number
False    2101
True       11
Name: count, dtype: int64

In [7]:
df['control_number'].duplicated(keep=None).value_counts()

ValueError: keep must be either "first", "last" or False

In [8]:
df['control_number'].duplicated(keep=False).value_counts()

control_number
False    2090
True       22
Name: count, dtype: int64

In [9]:
dup = df['control_number'].duplicated(keep=False)

In [10]:
df.loc[dup, 'control_number']

299     201392
300     201392
439     196661
440     196661
459     195521
460     195521
739     182326
740     182326
979     172843
980     172843
1058    168538
1060    168538
1279    156957
1280    156957
1478    143406
1480    143406
1519    139813
1523    139813
1778    119466
1780    119466
1898    113212
1900    113212
Name: control_number, dtype: object

In [11]:
df.loc[dup, :]

Unnamed: 0,url,title,agency,subagency,type,description,priority,effective,received,published,...,effective_dt,effective_year,effective_month,received_dt,received_year,received_month,published_dt,published_year,published_month,presidential_year
299,https://www.gao.gov/fedrules/201392,Consolidated Net Operating Losses,Department of the Treasury,Internal Revenue Service,Major,Final Rule,Significant/Substantive,2020-12-28T13:00:00Z,2020-10-23T13:00:00Z,2020-10-27T13:00:00Z,...,2020-12-28,2020.0,12.0,2020-10-23,2020,10,2020-10-27,2020,10,2020
300,https://www.gao.gov/fedrules/201392,Consolidated Net Operating Losses,Department of the Treasury,Internal Revenue Service,Major,Final Rule,Significant/Substantive,2020-12-28T13:00:00Z,2020-10-23T13:00:00Z,2020-10-27T13:00:00Z,...,2020-12-28,2020.0,12.0,2020-10-23,2020,10,2020-10-27,2020,10,2020
439,https://www.gao.gov/fedrules/196661,"Student Assistance General Provisions, The Sec...",Department of Education,Department of Education,Major,Final Rule,Significant/Substantive,2020-07-01T13:00:00Z,2019-10-23T13:00:00Z,2019-11-01T13:00:00Z,...,2020-07-01,2020.0,7.0,2019-10-23,2019,10,2019-11-01,2019,11,2019
440,https://www.gao.gov/fedrules/196661,"Student Assistance General Provisions, The Sec...",Department of Education,Department of Education,Major,Final Rule,Significant/Substantive,2020-07-01T13:00:00Z,2019-10-23T13:00:00Z,2019-11-01T13:00:00Z,...,2020-07-01,2020.0,7.0,2019-10-23,2019,10,2019-11-01,2019,11,2019
459,https://www.gao.gov/fedrules/195521,Validation And Approval Of Credit Score Models,Independent Government Entities,Federal Housing Finance Agency,Major,Final Rule,Significant/Substantive,2019-10-15T13:00:00Z,2019-08-13T13:00:00Z,2019-08-16T13:00:00Z,...,2019-10-15,2019.0,10.0,2019-08-13,2019,8,2019-08-16,2019,8,2019
460,https://www.gao.gov/fedrules/195521,Validation And Approval Of Credit Score Models,Independent Government Entities,Federal Housing Finance Agency,Major,Final Rule,Significant/Substantive,2019-10-15T13:00:00Z,2019-08-13T13:00:00Z,2019-08-16T13:00:00Z,...,2019-10-15,2019.0,10.0,2019-08-13,2019,8,2019-08-16,2019,8,2019
739,https://www.gao.gov/fedrules/182326,"Department Of Health And Human Services, Cente...",Department of Health and Human Services,Centers for Medicare & Medicaid Services,Major,Final Rule,Significant/Substantive,2016-01-01T13:00:00Z,2015-10-30T13:00:00Z,2015-11-16T13:00:00Z,...,2016-01-01,2016.0,1.0,2015-10-30,2015,10,2015-11-16,2015,11,2015
740,https://www.gao.gov/fedrules/182326,"Department Of Health And Human Services, Cente...",Department of Health and Human Services,Centers for Medicare & Medicaid Services,Major,Final Rule,Significant/Substantive,2016-01-01T13:00:00Z,2015-10-30T13:00:00Z,2015-11-16T13:00:00Z,...,2016-01-01,2016.0,1.0,2015-10-30,2015,10,2015-11-16,2015,11,2015
979,https://www.gao.gov/fedrules/172843,"Department Of Health And Human Services, Cente...",Department of Health and Human Services,Centers for Medicare & Medicaid Services,Major,Final Rule,Significant/Substantive,2012-11-05T13:00:00Z,2012-09-04T13:00:00Z,2012-09-05T13:00:00Z,...,2012-11-05,2012.0,11.0,2012-09-04,2012,9,2012-09-05,2012,9,2012
980,https://www.gao.gov/fedrules/172843,"Department Of Health And Human Services, Cente...",Department of Health and Human Services,Centers for Medicare & Medicaid Services,Major,Final Rule,Significant/Substantive,2012-11-05T13:00:00Z,2012-09-04T13:00:00Z,2012-09-05T13:00:00Z,...,2012-11-05,2012.0,11.0,2012-09-04,2012,9,2012-09-05,2012,9,2012
