## UKCP18 error checking

- Author: Sam Hardy
- Check that the csv files produced by the code in this repository contain the correct number of values for each month

In [1]:
import pandas as pd

In [8]:
proj_id = 3 # 2021-2040 time slice
var = 'Rainfall_bin_counts_1h'
member_id = 1
all_errors = pd.DataFrame()
#input_file_path = f'/mnt/metdata/2024s1475/UKCP18_Processing_2024/precip_profiles/proj{proj_id}/output_mem{id:02d}/{var}_ens{id:02d}_proj{proj_id}.csv'

### Check for any missing data for the water companies

- Read in data from csv file 
- Generate a list of all expected year-month combinations
- Months with fewer than 13 rows (indicates CEDA connection issues), more than 13 rows (double counted) or no data (also indicates connection issues)
- Reformat the `group_counts` index to pad single numbers (1,2,etc) with a leading zero
- Loop over all ensemble members and year-month combinations

In [9]:
FirstTime = True 

input_file_path = f'/mnt/metdata/2024s1475/UKCP18_Processing_2024/precip_profiles/proj{proj_id}/output_mem{member_id:02d}/{var}_ens{member_id:02d}_proj{proj_id}.csv'
df = pd.read_csv(input_file_path)
print(f"Working on {var} for ensemble member {member_id:02d}...")

if FirstTime:
    if proj_id == 1:
        start_year, start_month = 1980, 12
        end_year, end_month = 2000, 11 
    elif proj_id == 2:
        start_year, start_month = 2000, 12 
        end_year, end_month = 2020, 11
    elif proj_id == 3: 
        start_year, start_month = 2060, 12 #2040, 12 
        end_year, end_month = 2080, 11 #2060, 11

    expected_year_months = pd.date_range(
        start=f"{start_year}-{start_month:02d}",
        end=f"{end_year}-{end_month:02d}",
        freq="MS"
    ).strftime("%Y-%m").tolist()
else:
    dates_df = pd.read_csv(f'./proj{proj_id}_all_months_original.txt', names=['date_str'], dtype=str) # TODO: update to input different txt files
    expected_year_months = pd.to_datetime(dates_df['date_str'],format="%Y%m").dt.strftime("%Y-%m").tolist()

print(df)

# df['Year-Month'] = df['Year'].astype(str) + '-' + df['Month'].astype(int).apply(lambda x: f'{x:02d}')
# group_counts = df.groupby('Year-Month').size()
# group_counts.index = group_counts.index.map(lambda x: f"{x.split('-')[0]}-{x.split('-')[1].zfill(2)}")

# # identify errors in the processing 
# missing_months = sorted(set(expected_year_months) - set(group_counts.index))
# incomplete_months = group_counts[group_counts < 13].index.to_list()
# overcounted_months = group_counts[group_counts > 13].index.to_list()

# # flag consecutive months where values for all 13 water companies are the same 
# copied_months = []
# previous_values = None
# grouped = df.groupby('Year-Month')

# # account for different column names for different functions 
# if var == 'Total_rainfall':
#     value = 'Mean total rainfall'
# elif var == 'Dry_days_counts':
#     value = 'Mean dry day counts'
# elif var == 'Rainfall_bin_counts_1h' or var == 'Rainfall_bin_counts_3h' or var == 'Rainfall_bin_counts_6h':
#     value = 'Bin counts'

# for year_month, group in grouped:
#     group_sorted = group.sort_values(by='WCID')
#     current_values = tuple(group_sorted[value])

#     if previous_values == current_values:
#         copied_months.append(year_month)

#     previous_values = current_values    

# # combine all months into a single df 
# errors = []
# for value in missing_months:
#     errors.append({"Ensemble Member": str(member_id).zfill(2), "Year-Month": value, "Function": var, "Issue": "Missing"})
# for value in incomplete_months:
#     errors.append({"Ensemble Member": str(member_id).zfill(2), "Year-Month": value, "Function": var, "Issue": "Incomplete"})
# for value in overcounted_months:
#     errors.append({"Ensemble Member": str(member_id).zfill(2), "Year-Month": value, "Function": var, "Issue": "Over-counted"})
# for value in copied_months:
#     errors.append({"Ensemble Member": str(member_id).zfill(2), "Year-Month": value, "Function": var, "Issue": "Copied"})

# errors_df = pd.DataFrame(errors)

# # Print summary to the console
# if errors:
#     print(f"Errors found for ens{member_id:02d}. Details saved to csv file.")
#     print(errors_df)
# else:
#     print(f"No anomalies found for ens{member_id:02d}. The dataset is complete and consistent.")

Working on Rainfall_bin_counts_1h for ensemble member 01...
      Projection_slice_ID  Member  Year  Month  WCID  \
0                       3       1  2060     12     0   
1                       3       1  2060     12     1   
2                       3       1  2060     12     2   
3                       3       1  2060     12     3   
4                       3       1  2060     12     4   
...                   ...     ...   ...    ...   ...   
1608                    3       1  2080     11     8   
1609                    3       1  2080     11     9   
1610                    3       1  2080     11    10   
1611                    3       1  2080     11    11   
1612                    3       1  2080     11    12   

                                             Bin counts  
0     [65610, 10054, 801, 75, 8, 0, 0, 0, 0, 0, 0, 0...  
1     [90487, 16822, 3156, 579, 80, 0, 0, 0, 0, 0, 0...  
2     [27198, 7764, 1068, 83, 0, 0, 0, 0, 0, 0, 0, 0...  
3     [31603, 9225, 1775, 485, 120,

### Loop over all ensemble members and functions (`Total_rainfall`, `Dry_days`, `get_bin_counts`)

- Output to a single csv file for each ensemble member 
- Deal with duplicate values so that we only output one row per year-month combination
- TODO: build in functionality to account for running the code for the first time (`v1`, etc)

In [None]:
all_errors = pd.DataFrame()
for var in variables:
    input_file_path = f'/mnt/metdata/2024s1475/UKCP18_Processing_2024/precip_profiles/proj{proj_id}/output_mem{id:02d}/{var}_ens{id:02d}_proj{proj_id}.csv'
    monthly_calc_errors = error_check_ukcp18_data(input_file_path, 
                                                proj_id, 
                                                id, 
                                                var,
                                                FirstTime=False)
    all_errors = pd.concat([all_errors, monthly_calc_errors], ignore_index=True)

# remove any duplicate rows 
duplicates_removed = (
    all_errors.groupby('Year-Month')
    .agg({
        'Issue': lambda x: ', '.join(sorted(set(x))),
        'Function': lambda x: ', '.join(sorted(set(x)))
        })
    .reset_index()
)

output_file_path = f'/mnt/metdata/2024s1475/UKCP18_Processing_2024/precip_profiles/proj{proj_id}/output_mem{id:02d}/error_checking_proj{proj_id}_ens{id:02d}.csv'
duplicates_removed.to_csv(output_file_path, index=False)
print(f'Combined error report saved to {output_file_path}.')

Working on Rainfall_bin_counts_1h for ensemble member 01...
Errors found for ens01. Details saved to csv file.
  Ensemble Member Year-Month                Function    Issue
0              01    1981-06  Rainfall_bin_counts_1h  Missing
1              01    1994-05  Rainfall_bin_counts_1h  Missing
2              01    1994-07  Rainfall_bin_counts_1h  Missing
3              01    1995-07  Rainfall_bin_counts_1h  Missing
4              01    1996-01  Rainfall_bin_counts_1h  Missing
5              01    1997-09  Rainfall_bin_counts_1h  Missing
6              01    1999-06  Rainfall_bin_counts_1h  Missing
7              01    1999-08  Rainfall_bin_counts_1h  Missing
8              01    1999-09  Rainfall_bin_counts_1h  Missing
Working on Rainfall_bin_counts_3h for ensemble member 01...
Errors found for ens01. Details saved to csv file.
  Ensemble Member Year-Month                Function    Issue
0              01    1981-06  Rainfall_bin_counts_3h  Missing
1              01    1994-05  Rain

ValueError: invalid literal for int() with base 10: ' 0]"'

### Create single-column text files for each ensemble member 

- Extract the list of dates 
- Change the format slightly 
- Output list can be fed directly into the `ukcp18_wrapper_script.sh` shell script to rerun the months that we've identified

In [7]:
for proj_id in proj_ids:
    for id in member_ids:
        input_csv_path = f'/mnt/metdata/2024s1475/UKCP18_Processing_2024/precip_profiles/proj{proj_id}/output_mem{id:02d}/v1/error_checking_proj{proj_id}_ens{id:02d}.csv'
        dates_for_processing = pd.read_csv(input_csv_path)
        dates_for_processing['Year-Month'] = dates_for_processing['Year-Month'].str.replace("-","")
        error_months = dates_for_processing['Year-Month'].unique()
        with open(f'proj{proj_id}_{id:02d}_error_months_v1.txt', 'w') as f:
            f.write('\n'.join(error_months))

### Filter out specific years that we know we don't want to process again

In [50]:
def filter_specific_years_ukcp18_processing(ignored_years: list[str],
                                            proj_id: str,
                                            member_id: str,
                                            input_file_path: str) -> list[str]:
    """ 
    Filter specific years that we know we don't want to process any further 
    """

    dates_df = pd.read_csv(input_file_path, names=['date_str'], dtype=str)

    # format the dates to add a hyphen between the YYYY and MM parts of the string 
    expected_year_months = pd.to_datetime(dates_df['date_str'],format="%Y%m").dt.strftime("%Y-%m").tolist()
    # create a DataFrame from the list and name the column "date"
    df = pd.DataFrame(expected_year_months,columns=["date"])

    # parse the dates and retrieve the year from each 
    df["year"] = pd.to_datetime(df["date"]).dt.year
    # filter the df to remove any years that match our list above 
    filtered_df = df[~df["year"].isin(ignored_years)]
    filtered_df["date"] = filtered_df["date"].str.replace("-","")
    filtered_dates = filtered_df['date'].unique().tolist()

    # convert any elements back to a list 
    with open(f'proj{proj_id}_{member_id:02d}_error_months_v1_filtered.txt', 'w') as f:
        f.write('\n'.join(filtered_dates))

### Call the function

In [51]:
ignored_years = [2020, 2060]

for proj_id in proj_ids:
    for id in member_ids:
        input_txt_path = f'./proj{proj_id}_{id:02d}_error_months_v1.txt'
        filter_specific_years_ukcp18_processing(ignored_years,
                                                proj_id,
                                                id,
                                                input_txt_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["date"] = filtered_df["date"].str.replace("-","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["date"] = filtered_df["date"].str.replace("-","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["date"] = filtered_df["date"].str.replace("-","")
A value is trying t