In [1]:
import json

def parse_chat_to_jsonl(chat_file, output_file, query_name, response_name):
    """Convert an exported chat transcript into query/response pairs (JSONL).

    A line counts as the start of a message when it contains both ``'] '`` and
    ``': '``; it is split as ``<timestamp>] <sender>: <message>``. Any other
    non-blank line is a continuation of the message opened by the most recent
    header line.

    Pairing rules:
      * A message from ``query_name`` starts a new pair. If the previous pair
        has both a query and a response it is emitted first; a query that
        never received a response is replaced by the new one.
      * Messages from ``response_name`` are joined with single spaces to form
        the response of the current pair. They are ignored until a query
        exists.
      * Messages from any other sender, and their continuation lines, are
        ignored.

    Args:
        chat_file: Path of the UTF-8 transcript to read.
        output_file: Path of the JSONL file to write (overwritten). Each line
            is an object with the keys ``"query"`` and ``"response"``.
        query_name: Sender whose messages become queries.
        response_name: Sender whose messages become responses.
    """
    with open(chat_file, 'r', encoding='utf-8') as file:
        conversations = _pair_messages(file, query_name, response_name)

    # Write to JSONL
    with open(output_file, 'w', encoding='utf-8') as jsonl_file:
        for convo in conversations:
            jsonl_file.write(json.dumps(convo, ensure_ascii=False) + '\n')


def _append_text(existing, extra):
    """Join two message fragments with a single space, skipping an empty head."""
    return f"{existing} {extra}" if existing else extra


def _pair_messages(lines, query_name, response_name):
    """Group transcript lines into a list of ``{"query", "response"}`` dicts."""
    conversations = []
    current_query = None
    current_response = None
    # Which message the latest header line opened: 'query', 'response', or
    # None when that message is being ignored. Continuation lines follow it.
    open_message = None

    for line in lines:
        text = line.strip()
        # Skip empty lines
        if not text:
            continue

        if '] ' in line and ': ' in line:
            try:
                _timestamp, content = line.split('] ', 1)
                sender, message = content.split(': ', 1)
            except ValueError:
                # ': ' only occurred inside the timestamp part; not a usable header.
                continue

            sender = sender.strip()
            message = message.strip()

            if sender == query_name:
                if current_query and current_response:
                    conversations.append({"query": current_query, "response": current_response})
                current_query = message
                current_response = None  # a new query starts a new pair
                open_message = 'query'
            elif sender == response_name and current_query:
                current_response = _append_text(current_response, message)
                open_message = 'response'
            else:
                # Third-party sender, or a response with no query to attach to.
                open_message = None
        elif open_message == 'query':
            # Continuation of a multi-line query (previously dropped).
            current_query = _append_text(current_query, text)
        elif open_message == 'response':
            current_response = _append_text(current_response, text)

    # Append the last conversation
    if current_query and current_response:
        conversations.append({"query": current_query, "response": current_response})

    return conversations

In [2]:
# Input transcript and the JSONL file to generate
chat_file = '_chat.txt'
output_file = 'chat_data.jsonl'

# Sender names, compared for exact equality with the names in the transcript:
# query_name's messages become queries, response_name's become responses
query_name = 'Recpient Name'
response_name = 'Your Name'

# Run the conversion
parse_chat_to_jsonl(
    chat_file=chat_file,
    output_file=output_file,
    query_name=query_name,
    response_name=response_name,
)

print(f"Conversion complete. JSONL file saved as {output_file}.")

Conversion complete. JSONL file saved as chat_data.jsonl.


In [8]:
import json

# Function to filter out unwanted responses
def is_valid_response(response):
    """Return True when ``response`` contains none of the placeholder phrases.

    Matching is a case-insensitive substring check: the response is lowercased
    and compared against lowercase phrases.
    """
    lowered = response.lower()
    for unwanted in ("sticker omitted", "image omitted", "you deleted this message"):
        if unwanted in lowered:
            return False
    return True

# Function to convert your chat_data.jsonl to the desired format
def convert_chat_data(input_file, output_file):
    """Rewrite query/response JSONL as one two-turn conversation per line.

    Each non-blank input line must be a JSON object with the keys ``"query"``
    and ``"response"``. Records whose response fails ``is_valid_response`` are
    dropped. Every kept record is written as a JSON array
    ``[{"from": "human", "value": query}, {"from": "gpt", "value": response}]``
    on its own line.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"query"`` or ``"response"``.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            record = line.strip()
            if not record:
                # A blank line carries no record; json.loads("") would raise
                # and abort the whole conversion.
                continue

            # Parse the JSON line
            chat = json.loads(record)
            query = chat["query"]
            response = chat["response"]

            # Filter out invalid responses
            if is_valid_response(response):
                # Create the conversation in the required format
                conversation = [
                    {"from": "human", "value": query},
                    {"from": "gpt", "value": response}
                ]
                # Write the conversation to the output file
                json.dump(conversation, outfile, ensure_ascii=False)
                outfile.write("\n")  # Add a newline for each conversation

# Source: the query/response pairs; destination: the reformatted conversations
input_file = 'chat_data.jsonl'
output_file = 'formatted_chat_data.jsonl'

# Run the conversion
convert_chat_data(input_file=input_file, output_file=output_file)

print("Conversion complete. The formatted data is saved to", output_file)

Conversion complete. The formatted data is saved to formatted_chat_data.jsonl


In [9]:
import json

def transform_chat_data(input_file, output_file):
    """Collect all query/response records into a single conversation document.

    Each input line is parsed as a JSON object with the keys ``"query"`` and
    ``"response"``. Lines that are not valid JSON are reported on stdout and
    skipped. Records whose response contains one of the placeholder phrases
    (case-insensitive) are skipped. The remaining records are flattened, in
    order, into alternating ``{"from": "human"}`` / ``{"from": "gpt"}`` turns.

    The output is one pretty-printed JSON document: a list containing a single
    list of all turns. It is not line-delimited.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSON file to write (overwritten).
    """
    # Initialize list to store conversation blocks
    current_block = []

    # Phrases to filter out. They are matched against the lowercased response,
    # so each phrase is lowercased too; a phrase with capitals could never match.
    filter_words = tuple(
        phrase.lower()
        for phrase in ("image omitted", "sticker omitted", "You deleted this message")
    )

    with open(input_file, 'r', encoding='utf-8') as f:
        # Read line by line
        for line in f:
            try:
                # Parse JSON from each line
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line}")
                continue

            # Skip responses containing filter words
            response_lower = data['response'].lower()
            if any(word in response_lower for word in filter_words):
                continue

            # Add to current block
            current_block.extend([
                {"from": "human", "value": data["query"]},
                {"from": "gpt", "value": data["response"]}
            ])

    # Write transformed data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump([current_block], f, ensure_ascii=False, indent=2)

# Build the single-document transcript from the query/response pairs
transform_chat_data(input_file='chat_data.jsonl', output_file='transformed_chat.jsonl')