# In [None]:
import json
import re
import csv

# Define the preprocessing function
def preprocess_text(text):
    """Clean a raw post body into lowercase, lightly punctuated plain text.

    The steps run in a fixed order, each working on the previous result:

    1. Lowercase, then strip URLs, HTML tags, ``@mentions`` and HTML
       entities (plus any literal ``x200b;``).
    2. Unwrap hashtags (``#topic`` -> ``topic``) and replace every character
       other than letters, digits, whitespace and ``. ' ! ? , ; -`` with a
       space.
    3. Normalise spacing around punctuation and collapse whitespace.
    4. Split into sentences at ``. ! ? ;``, drop sentences of at most one
       character, and rejoin the survivors *with* their terminators.
    5. Collapse repeated punctuation and make sure the result ends with a
       sentence-ending symbol.

    Args:
        text: Raw text of one post (a ``str``).

    Returns:
        The cleaned text. It always ends with ``.``, ``!`` or ``?``; input
        with no usable content yields ``"."``.
    """
    text = text.lower()

    # Strip links, markup and mentions.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)

    # Named entities such as "&amp;" become a space.  "&#x200b;" is not
    # matched by this pattern ('#' is not a word character), so its "x200b;"
    # part is removed separately; the leftover "&#" is blanked further down.
    text = re.sub(r'&\w+;', ' ', text)
    text = re.sub(r'x200b;', '', text)

    # Keep the hashtag word, drop the '#'.
    text = re.sub(r'#(\w+)', r'\1', text)

    # Anything outside the allowed character set turns into a space.
    text = re.sub(r'[^a-zA-Z0-9\s\.\'\!\?\,\;\-]', ' ', text)

    # Put a space before every period that has no digit on either side
    # (so "4.89" is left alone) ...
    text = re.sub(r'(?<!\d)\.(?!\d)', ' .', text)
    # ... then remove whitespace in front of punctuation again, except when
    # that whitespace follows a one-character word.
    text = re.sub(r'(?<!\b\w)\s+([.,!?;])', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Split into sentences.  A '.' is a boundary only when it is not preceded
    # by a digit and not preceded by the shape "<word char>.<word char><any>".
    # The capturing group makes re.split return the terminators as well, so
    # the result alternates: body, terminator, body, ..., body.
    pieces = re.split(r'((?<!\w\.\w.)(?<!\d)\.|\!|\?|\;)', text)
    bodies = pieces[0::2]
    terminators = pieces[1::2] + ['']  # the final body has no terminator

    # Drop sentences of length <= 1 together with their terminator and glue
    # each remaining sentence back onto its own terminator.
    sentences = [
        body.strip() + terminator
        for body, terminator in zip(bodies, terminators)
        if len(body.strip()) > 1
    ]
    text = ' '.join(sentences)

    # Collapse runs of one repeated punctuation mark ("??" -> "?").
    text = re.sub(r'(\.|\!|\?|\,|\;)\1+', r'\1', text)

    # Guarantee a sentence-ending symbol.  A trailing ';' is replaced rather
    # than followed by the full stop, so "hello;" ends as "hello.".
    if not re.search(r'[.!?]$', text):
        text = text.rstrip(';') + '.'

    # Collapse mixed runs of sentence-ending punctuation with no words in
    # between (e.g. "5.!" or ". ?") down to the first symbol.
    text = re.sub(r'([.!?])(\s*[.!?])+', r'\1', text)

    return text

# Define the input and output file paths
input_file_path = '/Users/macbookpro/Desktop/data/mindset_diagnosed_sfw.jl'
output_file_path = '/Users/macbookpro/Desktop/data/diagnosed_preprocessed.csv'

# Convert the JSONL dump (one JSON object per line, each with a 'username'
# and a list of 'posts') into a two-column CSV of (TID, preprocessed text).
# A TID is "<username>|<post id>".  Rows are skipped when their TID was
# already written or when their text equals the previously written text.
with open(input_file_path, 'r', encoding='utf-8') as jsonl_file, \
        open(output_file_path, 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['TID', 'text'])  # Write CSV header

    prev_text = None  # text of the most recently written row
    dupe_count = 0    # rows skipped because the text repeated the previous row
    too_short = 0     # NOTE(review): never updated in this block; kept in case other code reads it
    dupe_tid = 0      # rows skipped because their TID was already written
    tids = set()      # TIDs written so far

    n = 0  # number of rows written

    # Process each line in the JSONL file
    for line in jsonl_file:
        # Blank lines are not JSON; skip them instead of letting json.loads
        # abort the whole conversion.
        if not line.strip():
            continue

        # Parse the JSON object (user data) from the line
        user_obj = json.loads(line)

        # Skip records that lack the fields needed to build a row: the
        # object must be a dict with a 'username' and a list of 'posts'.
        if not isinstance(user_obj, dict) or 'username' not in user_obj:
            continue
        if not isinstance(user_obj.get('posts'), list):
            continue

        # Process each post in the 'posts' property
        for post in user_obj['posts']:
            # A usable post is a dict with an 'id' and a string 'selftext'
            # (preprocess_text calls str methods on it).
            if not isinstance(post, dict) or 'id' not in post:
                continue
            raw_text = post.get('selftext')
            if not isinstance(raw_text, str):
                continue

            combined_tid = f"{user_obj['username']}|{post['id']}"

            # Check the TID first so duplicate posts are not preprocessed.
            if combined_tid in tids:
                print(f"Duplicate TID: {combined_tid}")
                dupe_tid += 1
                continue

            # Preprocess the text field
            processed_text = preprocess_text(raw_text)

            # Only the immediately preceding written text is compared.
            if processed_text == prev_text:
                dupe_count += 1
                continue

            n += 1
            prev_text = processed_text

            # Write the TID and processed text to the CSV file
            writer.writerow([combined_tid, processed_text])

            # Track the TID to avoid duplicates
            tids.add(combined_tid)

print(f"Duplicate TIDs: {dupe_tid}")
print(f"Duplicate texts: {dupe_count}")
