In [1]:
import json

In [2]:
import json

# Renumber the "ID" field of every object in a JSON array so the IDs run
# consecutively from 1 to len(data), then save the result to a new file.
input_file = "my_data.json"
output_file = "my_data_fixed.json"

with open(input_file, "r", encoding="utf-8") as f:
    data_str = f.read()

# Attempt to parse JSON
try:
    data = json.loads(data_str)
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
    # If the JSON is invalid, you can try removing trailing commas or
    # checking brackets/braces manually before re-running.
    raise

# Ensure 'data' is a list of dictionaries
if not isinstance(data, list):
    raise ValueError("JSON root element is not a list. Expected a list of objects.")

# Reassign IDs from 1 to len(data)
for i, item in enumerate(data, start=1):
    if not isinstance(item, dict):
        # `i` is the 1-based ID, so the element's actual list index is i - 1.
        raise ValueError(f"Item at index {i - 1} is not a dictionary: {item}")
    item["ID"] = i

# Write updated data to a new file (the input file is left untouched)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"Fixed JSON saved to {output_file}")


Fixed JSON saved to my_data_fixed.json


In [1]:
import re

def process_brick_tokens(input_file, output_file):
    """Split each line of a text file on underscores and save the unique pieces.

    Every line is stripped of surrounding whitespace and split on "_".
    The resulting pieces are de-duplicated, keeping the first occurrence of
    each one in reading order, and written one per line to ``output_file``.
    Empty pieces (from blank lines or adjacent underscores) are kept like any
    other piece.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    # A dict keeps its keys in insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    ordered_pieces = dict.fromkeys(
        piece
        for raw_line in raw_lines
        for piece in raw_line.strip().split("_")
    )

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{piece}\n" for piece in ordered_pieces)

    print(f"Processed words saved to {output_file}")

# Input and output paths for this run, then the call itself.
input_file, output_file = "bricktokensv1.txt", "processed_bricktokensv2.txt"
process_brick_tokens(input_file, output_file)


Processed words saved to processed_bricktokensv2.txt


In [2]:
import re

def process_brick_tokens(input_file, output_file):
    """Split each line on underscores and hyphens and save the unique pieces.

    Each stripped line is split wherever a "_" or "-" occurs. Pieces are
    written to ``output_file`` one per line, in order of first appearance,
    with later repeats dropped. Empty pieces are treated like any other.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    separator = re.compile(r'[_-]')

    with open(input_file, 'r', encoding='utf-8') as reader:
        content_lines = reader.readlines()

    already_emitted = set()
    kept_fragments = []
    for content_line in content_lines:
        for fragment in separator.split(content_line.strip()):
            if fragment in already_emitted:
                continue
            already_emitted.add(fragment)
            kept_fragments.append(fragment)

    with open(output_file, 'w', encoding='utf-8') as writer:
        writer.write("".join(f"{fragment}\n" for fragment in kept_fragments))

    print(f"Processed words saved to {output_file}")

# Input and output paths for this run, then the call itself.
input_file, output_file = "processed_bricktokensv2.txt", "processed_bricktokensv3.txt"
process_brick_tokens(input_file, output_file)


Processed words saved to processed_bricktokensv3.txt


In [3]:
import re

def process_brick_tokens(input_file, output_file):
    """Split lines on "_"/"-", drop noise pieces, and save the unique rest.

    A piece is discarded when it is empty, is exactly one ASCII letter, or
    consists only of digits. The surviving pieces are de-duplicated (first
    occurrence wins, order preserved) and written one per line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    single_letter = re.compile(r'[a-zA-Z]')
    digits_only = re.compile(r'\d+')

    def is_noise(piece):
        # Empty string, a lone letter, or a pure number.
        if not piece:
            return True
        if single_letter.fullmatch(piece):
            return True
        return digits_only.fullmatch(piece) is not None

    with open(input_file, 'r', encoding='utf-8') as reader:
        content_lines = reader.readlines()

    accepted = set()
    kept_pieces = []
    for content_line in content_lines:
        for piece in re.split(r'[_-]', content_line.strip()):
            if is_noise(piece) or piece in accepted:
                continue
            accepted.add(piece)
            kept_pieces.append(piece)

    with open(output_file, 'w', encoding='utf-8') as writer:
        writer.writelines(f"{piece}\n" for piece in kept_pieces)

    print(f"Processed words saved to {output_file}")

# Input and output paths for this run, then the call itself.
input_file, output_file = "processed_bricktokensv3.txt", "processed_bricktokens4.txt"
process_brick_tokens(input_file, output_file)


Processed words saved to processed_bricktokens4.txt


In [None]:
def clean_token_file(input_file, output_file):
    """Keep only unique, purely alphanumeric lines of a token file.

    Each line is stripped of surrounding whitespace. A line survives when
    ``str.isalnum()`` is true for it (which also rules out empty lines) and
    it has not been seen before. Survivors are written one per line, in
    their original order.

    Args:
        input_file: Path of the UTF-8 token file to read.
        output_file: Path of the UTF-8 file to (over)write.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        candidates = [raw_line.strip() for raw_line in source.readlines()]

    # "".isalnum() is False, so blank lines are filtered by the same test.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_tokens = dict.fromkeys(
        candidate for candidate in candidates if candidate.isalnum()
    )

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{token}\n" for token in unique_tokens)

    print(f"Cleaned token file saved to {output_file}")

# Run the function
# The filtering cell earlier in this notebook writes its result to
# "processed_bricktokens4.txt" (no "v" before the 4), so that exact name
# must be read here; "processed_bricktokensv4.txt" is never produced and
# would raise FileNotFoundError on a clean run.
input_file = "processed_bricktokens4.txt"
output_file = "final_tokenizer_tokens.txt"
clean_token_file(input_file, output_file)


Cleaned token file saved to final_tokenizer_tokens.txt


In [1]:
import re

def extract_tokens(input_file, output_file):
    """Extract unique alphabetic words (2+ letters) from a text file.

    Each stripped line is split on every character that is not an ASCII
    letter, so digits and punctuation both act as separators. Pieces shorter
    than two characters are dropped; the rest are de-duplicated in order of
    first appearance and written one per line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to (over)write.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        source_lines = source.readlines()

    first_seen = {}
    for source_line in source_lines:
        for chunk in re.split(r'[^a-zA-Z]', source_line.strip()):
            # Skips empty strings and single letters alike.
            if len(chunk) > 1:
                first_seen.setdefault(chunk, None)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write("".join(f"{chunk}\n" for chunk in first_seen))

    print(f"Processed words saved to {output_file}")

# Input and output paths for this run, then the call itself.
input_file, output_file = "bldg1.ttl", "bldg1_tokens.txt"
extract_tokens(input_file, output_file)


Processed words saved to bldg1_tokens.txt


In [None]:
import re

def is_uuid_like(token):
    """Return True when ``token`` is made up solely of 4 or more hex characters.

    Used as a heuristic for UUID fragments. Note that the pattern accepts
    any run of ``0-9``/``a-f``/``A-F``, so ordinary words spelled only with
    the letters a-f (for example "face") are reported as UUID-like too.
    """
    hex_run = re.match(r'^[a-fA-F0-9]{4,}$', token)
    return hex_run is not None

def extract_tokens(input_file, output_file):
    """Extract a sorted list of unique alphanumeric tokens from a text file.

    Every run of characters other than ASCII letters and digits is treated
    as a separator. From the resulting tokens, duplicates, all-digit tokens
    and tokens flagged by ``is_uuid_like`` are removed. The remainder is
    sorted and written one token per line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to (over)write.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Collapse every separator run to a single space, then split on whitespace.
    candidates = re.sub(r'[^a-zA-Z0-9]+', ' ', raw_text).split()

    visited = set()
    kept_tokens = []
    for candidate in candidates:
        if candidate in visited:
            continue
        # Record the token before filtering so repeats are skipped cheaply.
        visited.add(candidate)
        if candidate.isdigit() or is_uuid_like(candidate):
            continue
        kept_tokens.append(candidate)

    kept_tokens.sort()

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{token}\n" for token in kept_tokens)

    print(f"Extracted {len(kept_tokens)} unique tokens and saved to {output_file}")

# TTL file to read and the token file to write, then the extraction call.
input_file, output_file = "bldg3.ttl", "bldg3tokens.txt"
extract_tokens(input_file, output_file)


Extracted 138 unique tokens and saved to bldg2tokens2.txt
