In [3]:
import json
import os
from datetime import datetime
from collections import Counter

# Folder that holds the exported chat JSON files
directory_path = "/Users/anastasyarussu/Downloads/Telegram Desktop/ChatExport_2024-11-22"

# Bounds of the date window; both are parsed from a plain day string and are
# therefore datetimes at 00:00:00 of that day.
_day_format = "%Y-%m-%d"
include_start_date, include_end_date = (
    datetime.strptime(day_text, _day_format)
    for day_text in ("2023-10-01", "2024-11-15")
)

# Accumulators filled while scanning the files
filtered_messages = list()   # one entry per message inside the date window
date_counter = Counter()     # messages per day

# Scan every ".json" file in the export directory and collect the messages
# whose date falls inside the configured range.
#
# File names are processed in sorted order: os.listdir() returns entries in
# arbitrary order, and the collection order decides how messages with identical
# timestamps are ordered after a stable sort, so a fixed order keeps runs
# reproducible.
#
# The range is compared by calendar day so that the end date is inclusive.
# include_end_date carries no time component (it is midnight), so comparing
# full datetimes would drop every message sent later on that final day.
range_start_day = include_start_date.date()
range_end_day = include_end_date.date()

# Number of messages skipped because their "date" value could not be parsed
skipped_message_count = 0

for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith(".json"):
        continue
    file_path = os.path.join(directory_path, file_name)

    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            # Load the JSON content; a file without a "messages" key contributes nothing
            data = json.load(file)
            messages = data.get("messages", [])

            for message in messages:
                message_date_str = message.get("date")
                if not message_date_str:
                    continue

                # A malformed date skips only this one message. Without this
                # inner handler the error would reach the per-file handler
                # below and discard all remaining messages of the file.
                try:
                    # The format has no %z, so the result is already offset-naive
                    message_date = datetime.strptime(message_date_str, "%Y-%m-%dT%H:%M:%S")
                except (TypeError, ValueError):
                    skipped_message_count += 1
                    continue

                # Include messages within the specified date range (both days inclusive)
                if range_start_day <= message_date.date() <= range_end_day:
                    filtered_messages.append({
                        "message": message,
                        "date": message_date
                    })
                    # Increment the count for the message date
                    date_counter[message_date.date()] += 1
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file_name}: {e}")
        except Exception as e:
            # Best-effort: any other problem (e.g. an unexpected JSON structure)
            # is reported and the scan moves on to the next file.
            print(f"Error processing file {file_name}: {e}")

if skipped_message_count:
    print(f"Skipped {skipped_message_count} message(s) with an unparseable date")

# Order the collected entries newest-first by their parsed timestamp (in place)
filtered_messages.sort(key=lambda entry: entry["date"], reverse=True)

# Take the first 30,000 entries (the most recent ones) and keep only the
# original message payloads
newest_entries = filtered_messages[:30000]
latest_30000_messages = [entry["message"] for entry in newest_entries]

# Persist the selection as indented, non-ASCII-escaped JSON
output_file = "match_30k_filtered_messages.json"
with open(output_file, 'w', encoding='utf-8') as out_file:
    json.dump(
        latest_30000_messages,
        out_file,
        ensure_ascii=False,
        indent=4,
    )

print(f"Filtered messages have been saved to {output_file}")


Filtered messages have been saved to match_30k_filtered_messages.json


In [4]:
# Sanity check: number of messages in the selected list
len(latest_30000_messages)

30000