In [1]:
import os
import json
import glob

folder_path = r"\\cph3\academics\Parsed"

# Get all JSON files in the folder
json_files = glob.glob(os.path.join(folder_path, "*.json"))

# Load JSON data into a list
data = []
for file in json_files:
    with open(file, "r", encoding="utf-8") as f:
        data += json.load(f)
        #data_list.append(data)

print(f"Loaded {len(data)} JSON files.")


Loaded 1884049 JSON files.


In [2]:
from datetime import datetime

validation_start = datetime(2023, 10, 31)
test_start = datetime(2024, 3, 1)

# Convert filing_date to datetime and split
train, validation, test = [], [], []

for item in data:
    filing_date = datetime.strptime(item['filing_date'], '%Y%m%d%H%M%S')
    if filing_date < validation_start:
        train.append(item)
    elif validation_start <= filing_date < test_start:
        validation.append(item)
    else:
        test.append(item)

# Output results
print(f"Training set size: {len(train)}")
print(f"Validation set size: {len(validation)}")
print(f"Test set size: {len(test)}")

Training set size: 1523481
Validation set size: 208542
Test set size: 152026


In [4]:
import pandas as pd
import random

df = pd.DataFrame((validation + test))

df['filing_date'] = pd.to_datetime(df['filing_date'])

company_groups = df.groupby('company_name')

val_set = []
test_set = []

val_proportion = 0.5  # 50% for validation, 50% for testing

for company_name, group in company_groups:
    if len(group) > 1:
        # Sort records by date within the group
        group = group.sort_values(by='filing_date')
        
        # this means that the earlier filings are still in the val set instead of the test set.
        split_index = int(len(group) * val_proportion)
        
        # Assign earlier records to validation and later records to test
        val_sample = group.iloc[:split_index]
        test_sample = group.iloc[split_index:]
    else:
        # Randomly assign the single record to either validation or test
        if random.random() < 0.5:
            val_sample = group
            test_sample = pd.DataFrame(columns=group.columns)  # Empty test sample
        else:
            test_sample = group
            val_sample = pd.DataFrame(columns=group.columns)  # Empty validation sample

    val_set.append(val_sample)
    test_set.append(test_sample)

val_df = pd.concat(val_set)
test_df = pd.concat(test_set)


  val_df = pd.concat(val_set)
  test_df = pd.concat(test_set)


In [5]:
import pandas as pd
from datetime import datetime

train_df = pd.DataFrame(train)
all_data = pd.concat([train_df, val_df, test_df], axis=0)
val_test = pd.concat([val_df, test_df], axis=0)
all_data['filing_date'] = pd.to_datetime(all_data['filing_date'])

first_appearance = all_data.groupby('company_name')['filing_date'].min()

# Define the threshold date
threshold_date = datetime(2023, 10, 31)

# Step 3: Identify companies introduced after the threshold date
new_companies = first_appearance[first_appearance > threshold_date].index.tolist()

# Step 4: Filter `val_test` to include only entries from the new companies
filtered_data = val_test[val_test['company_name'].isin(new_companies)]

# Step 4: Build the new validation and test sets
# Validation set excludes entries from new companies
val_dataset = val_df[~val_df['company_name'].isin(new_companies)].copy()

# Test set includes entries from new companies
test_dataset = pd.concat([test_df, val_df[val_df['company_name'].isin(new_companies)].copy()])

# Display the results
print(f"Updated val_dataset has {len(val_dataset)} entries.")
print(f"Updated test_dataset has {len(test_dataset)} entries.")


Updated val_dataset has 171397 entries.
Updated test_dataset has 189171 entries.


In [6]:
test_df['text'] = test_df['text'].apply(lambda t: t.replace('\xa0', ' '))
val_df['text'] = val_df['text'].apply(lambda t: t.replace('\xa0', ' '))
train_df['text'] = train_df['text'].apply(lambda t: t.replace('\xa0', ' '))

In [8]:
# Optionally, save the splits to JSON files
train_df.to_json('train_rescraped.json', orient='records', indent=4, force_ascii=False)

# Save the validation set
val_dataset.to_json('validation_rescraped.json', orient='records', indent=4, force_ascii=False)

# Save the test set
test_dataset.to_json('test_rescraped.json', orient='records', indent=4, force_ascii=False)

In [None]:
# fix timestamps to be ISO formatted.

import json
from datetime import datetime, timezone

def convert_timestamp_to_iso(data, date_key="filing_date"):
    """
    Recursively finds and converts various date formats (millisecond Unix 
    timestamps and 'YYYYMMDDHHMMSS' strings) to the ISO 8601 format.

    Args:
        data (dict or list): The JSON data to process.
        date_key (str): The key of the date field to convert.

    Returns:
        The processed data with converted dates.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == date_key:
                iso_date = None
                if isinstance(value, (int, float)):
                    # Handle Unix timestamp in milliseconds
                    try:
                        timestamp_sec = value / 1000.0
                        iso_date = datetime.fromtimestamp(timestamp_sec, tz=timezone.utc).isoformat()
                    except (ValueError, OSError): # Handles out of range timestamps
                        pass # Keep original value if conversion fails
                elif isinstance(value, str):
                    # Handle string-based dates
                    try:
                        # Attempt to parse 'YYYYMMDDHHMMSS' format
                        if len(value) >= 14 and value.isdigit():
                            dt_object = datetime.strptime(value[:14], '%Y%m%d%H%M%S')
                            # Make it timezone-aware (assuming UTC) before formatting
                            iso_date = dt_object.replace(tzinfo=timezone.utc).isoformat()
                    except ValueError:
                        # If parsing fails, it's not the expected string format.
                        # We'll leave the original value as is.
                        pass
                
                if iso_date:
                    data[key] = iso_date
            else:
                # Recurse into nested dictionaries or lists
                data[key] = convert_timestamp_to_iso(value, date_key)
    elif isinstance(data, list):
        # If the data is a list, iterate through its items
        return [convert_timestamp_to_iso(item, date_key) for item in data]
    
    return data

def process_json_file(input_path, output_path):
    """
    Reads a JSON file, converts the 'filing_date' to ISO format,
    and saves it to a new file.
    """
    try:
        print(f"Processing '{input_path}'...")
        # Open and load the original JSON file
        with open(input_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        # Convert the dates in the loaded data
        converted_data = convert_timestamp_to_iso(json_data)
        # Save the updated data to the new file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(converted_data, f, indent=4, ensure_ascii=False)
        
        print(f"Successfully converted and saved to '{output_path}'\n")

    except FileNotFoundError:
        print(f"Error: The file '{input_path}' was not found.")
    except Exception as e:
        print(f"An unexpected error occurred while processing {input_path}: {e}")

if __name__ == "__main__":
    # --- Configuration ---
    # Define the base directory where your files are located
    base_dir = r"D:\PhD\Data\HIFi-paper\final_unit_converted"

    # List of filenames to process
    filenames = ["train.json", "validation.json", "test.json"]
    
    # --- Execution ---
    for filename in filenames:
        input_file = f"{base_dir}\\{filename}"
        # Create a new name for the output file, e.g., 'train_iso.json'
        output_file = f"{base_dir}\\{filename.replace('.json', '_iso.json')}"
        process_json_file(input_file, output_file)

    print("All files have been processed.")