# Merge Data Files
Created by [Author name removed] on 2024-12-06.  
Last edited by [Name removed] on 2024-12-06.

This notebook is used to merge multiple pickle files (currently three) into a single pickle file, where each pickle file represents a pandas DataFrame.

To run the notebook, use Python 3.10 (Python 3.12 does not work), and
- on linux: use `ficc_python/requirements_py310_linux_jupyter.txt`
- on mac: use `ficc_python/requirements_py310_mac_jupyter.txt`

In [None]:
import warnings
from datetime import datetime, timedelta

import pandas as pd

In [None]:
# Input pickle files; each one is loaded with `pd.read_pickle` by `get_df_between_datetimes`
file1_path = '~/ficc/ficc_python/notebooks/merge_data_files/files/processed_data_yield_spread_with_similar_trades.pkl'
file2_path = '~/ficc/ficc_python/notebooks/compare_datasets/files/new_data.pkl'
file3_path = '~/ficc/ficc_python/notebooks/compare_datasets/files/new_data_2024-11-01_2024-12-05.pkl'
# Destination that the merged DataFrame is pickled to at the end of the notebook
output_file_path = '~/ficc/ficc_python/notebooks/merge_data_files/files/processed_data_yield_spread_with_similar_trades_v2.pkl'

In [None]:
# Default column that `get_df_between_datetimes` compares against the datetime bounds
DATETIME_COLUMN_NAME = 'trade_datetime'

In [None]:
# Removed from the combined DataFrame after the files are concatenated
COLUMNS_TO_DROP = ['sp_stand_alone', 'moodys_long']    # columns that are no longer used

In [None]:
# Time-of-day suffixes appended to a `YYYY-MM-DD` date to build the datetime bounds passed to `get_df_between_datetimes`
# NOTE(review): `END_OF_DAY_TIME` has whole-second resolution, so (assuming the datetime column holds real datetimes) a value 
# with fractional seconds after 23:59:59 would fall outside an end bound built from it -- confirm that no such rows exist
START_OF_DAY_TIME = '00:00:00'
END_OF_DAY_TIME = '23:59:59'

In [None]:
def get_df_between_datetimes(file_path: str,
                             start_datetime: str | None = None,
                             end_datetime: str | None = None,
                             datetime_column_name: str = DATETIME_COLUMN_NAME) -> pd.DataFrame:
    """Load a pickled DataFrame and keep only the rows inside a datetime window.

    Both bounds are inclusive. A bound that is `None` is not applied, so passing neither bound 
    returns every row (a warning is emitted to make that explicit).

    NOTE: `pd.read_pickle` unpickles the file, which can execute arbitrary code; only pass files 
    that come from a trusted source.

    Args:
        file_path: Path of the pickle file that holds the DataFrame.
        start_datetime: Lower bound compared with `>=` against the datetime column, or `None`.
        end_datetime: Upper bound compared with `<=` against the datetime column, or `None`.
        datetime_column_name: Name of the column that the bounds are compared against.

    Returns:
        The rows of the loaded DataFrame that satisfy every bound that was given.

    Raises:
        ValueError: If `datetime_column_name` is not a column of the loaded DataFrame.
    """
    if start_datetime is None and end_datetime is None:
        # `stacklevel=2` attributes the warning to the line that called this function
        warnings.warn('Both `start_datetime` and `end_datetime` are `None`, and so all rows returned', stacklevel=2)

    df = pd.read_pickle(file_path)
    if datetime_column_name not in df.columns:
        raise ValueError(f'Column `{datetime_column_name}` not found in the DataFrame from {file_path}')

    if start_datetime is not None:
        df = df[df[datetime_column_name] >= start_datetime]
    if end_datetime is not None:
        df = df[df[datetime_column_name] <= end_datetime]
    return df

In [None]:
# The first file contributes rows up to the end of June 30 and the second file rows from the start of July 1, 
# so the date ranges taken from these two files do not overlap
df1_to_june_30 = get_df_between_datetimes(file1_path, end_datetime=f'2024-06-30 {END_OF_DAY_TIME}')
df2_from_july_1 = get_df_between_datetimes(file2_path, start_datetime=f'2024-07-01 {START_OF_DAY_TIME}')
df3 = get_df_between_datetimes(file3_path)    # no bounds given, so every row is kept (and the function warns about it)
combined_df = pd.concat([df1_to_june_30, df2_from_july_1, df3])
combined_df = combined_df.sort_values(by=DATETIME_COLUMN_NAME, ascending=False)    # most recent trades first; uses the same column constant as the filtering
combined_df = combined_df.drop(columns=COLUMNS_TO_DROP, errors='ignore')    # will ignore the error when any of the columns in `COLUMNS_TO_DROP` do not exist in `combined_df`
combined_df

In [None]:
# Display the datetime column to eyeball the ordering; the column name comes from the shared constant rather than a repeated literal
combined_df[DATETIME_COLUMN_NAME]

In [None]:
# Report the datetime range covered by the merged data; the column name comes from the shared constant rather than a repeated literal
print(f'Earliest trade datetime: {combined_df[DATETIME_COLUMN_NAME].min()}')
print(f'Latest trade datetime: {combined_df[DATETIME_COLUMN_NAME].max()}')

In [None]:
# Every RTRS control number must appear only once in the merged data. The check raises explicitly (instead of 
# using an `assert` statement) so that it still runs when Python is started with `-O`; the exception type is unchanged.
duplicated_rtrs_control_number_mask = combined_df['rtrs_control_number'].duplicated()
if duplicated_rtrs_control_number_mask.any():
    raise AssertionError(f'Duplicate RTRS control numbers present: {combined_df["rtrs_control_number"][duplicated_rtrs_control_number_mask].unique()}')

In [None]:
# `strptime` / `strftime` format used for date strings such as `2024-12-06`
YEAR_MONTH_DAY = '%Y-%m-%d'

In [None]:
def get_weekdays_between_dates(start_date: str, end_date: str) -> list:
    """Return every Monday-to-Friday date from `start_date` to `end_date`, both ends included.

    Args:
        start_date: First date of the range, formatted as `YEAR_MONTH_DAY`.
        end_date: Last date of the range, formatted as `YEAR_MONTH_DAY`.

    Returns:
        The weekday dates in chronological order, each formatted as `YEAR_MONTH_DAY`.

    Raises:
        ValueError: If `start_date` is after `end_date`, or if either string does not match `YEAR_MONTH_DAY`.
    """
    start_date = datetime.strptime(start_date, YEAR_MONTH_DAY)
    end_date = datetime.strptime(end_date, YEAR_MONTH_DAY)
    if start_date > end_date:
        raise ValueError(f'Start date: {start_date} must be before or equal to the end date: {end_date}')

    num_days = (end_date - start_date).days + 1    # +1 so that the end date itself is included
    all_days = (start_date + timedelta(days=offset) for offset in range(num_days))
    # `weekday()` is 0 for Monday through 6 for Sunday, so values below 5 are the weekdays
    return [day.strftime(YEAR_MONTH_DAY) for day in all_days if day.weekday() < 5]

In [None]:
# Dates that actually appear in the combined data, as `YEAR_MONTH_DAY` strings. Each value is cast to millisecond 
# precision before being converted to `datetime` (presumably because numpy cannot turn nanosecond precision into a `datetime`).
unique_trade_dates = [
    trade_date.astype('datetime64[ms]').astype(datetime).strftime(YEAR_MONTH_DAY)
    for trade_date in combined_df['trade_date'].unique()
]
earliest_trade_date = combined_df['trade_date'].min().strftime(YEAR_MONTH_DAY)
latest_trade_date = combined_df['trade_date'].max().strftime(YEAR_MONTH_DAY)

# Any weekday inside the covered range that has no trades at all is reported
expected_weekdays = get_weekdays_between_dates(earliest_trade_date, latest_trade_date)
missing_weekdays = sorted(set(expected_weekdays) - set(unique_trade_dates))
print(f'The following dates are missing from the combined dataframe. These should be holidays.\n{missing_weekdays}')

In [None]:
# Write the merged DataFrame to `output_file_path` as a pickle file
combined_df.to_pickle(output_file_path)