In [1]:
import pandas as pd
import re

In [2]:
# Load the Reddit stories CSV (path is relative to the notebook's working
# directory). Later cells read and clean its 'selftext' and 'title' columns.
df = pd.read_csv("../../data/reddit/reddit_stories.csv")

In [3]:
def add_space_around_parentheses(text):
    """Separate parentheses from the words that touch them.

    A space is inserted before ``(`` when it directly follows a word-like
    character, and after ``)`` when it is directly followed by one. No space
    is inserted:

    * at the very start or end of the string (avoids stray leading/trailing
      whitespace),
    * next to existing whitespace of any kind (space, tab, newline),
    * between ``)`` and sentence punctuation, so ``"(note)."`` stays intact
      instead of becoming ``"(note) ."``,
    * between adjacent brackets, so nested groups like ``"((a))"`` are kept
      together.

    Args:
        text: The value to process. Only ``str`` instances are modified.

    Returns:
        The re-spaced string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    # "(" needs a space only when glued to a preceding non-space character
    # that is not itself an opening bracket. The lookbehind requires a
    # character to exist, so a "(" at position 0 is left alone.
    text = re.sub(r'(?<=[^\s(\[{])\(', ' (', text)
    # ")" needs a space only when glued to a following character that is not
    # whitespace, sentence punctuation, or another closing bracket. The
    # lookahead requires a character to exist, so a trailing ")" is left alone.
    text = re.sub(r'\)(?=[^\s.,;:!?)\]}])', ') ', text)
    return text

def clean_text(text):
    """Normalize line breaks and spacing in a piece of free text.

    Steps, applied in order:

    1. Replace every line-break character (line feed and carriage return)
       with a space, so CRLF-terminated text does not keep stray carriage
       returns.
    2. Insert a space after a period that is immediately followed by a
       letter.
    3. Collapse runs of spaces into a single space.
    4. Adjust spacing around parentheses via
       ``add_space_around_parentheses``.

    Args:
        text: The value to clean. Only ``str`` instances are modified.

    Returns:
        The cleaned string, or ``text`` itself (unchanged) when it is not a
        ``str``.
    """
    # Non-string values pass through untouched so the function can be applied
    # to a column that mixes strings with other values.
    if not isinstance(text, str):
        return text
    # Each "\r" and "\n" becomes one space; the doubled space produced by
    # "\r\n" is removed by the collapse step below.
    text = re.sub(r'[\r\n]', ' ', text)
    # NOTE(review): this matches any period followed by a letter, so dotted
    # tokens such as URLs or "e.g" are split as well ("e. g"). Kept as-is to
    # preserve existing output.
    text = re.sub(r'\.([A-Za-z])', r'. \1', text)
    # Collapse runs of spaces (only the space character, not tabs).
    text = re.sub(' +', ' ', text)
    return add_space_around_parentheses(text)

# Run the same cleaning pipeline over both free-text columns ('selftext' and
# 'title'). `clean_text` is passed to `apply` directly; no lambda wrapper is
# needed for a one-argument function.
for column in ('selftext', 'title'):
    df[column] = df[column].apply(clean_text)

In [4]:
# Spot-check: show the 'selftext' value of the row at position 4.
df['selftext'].iloc[4]

'This happened to me about a month ago but it still boggles me. Two points of background to make this story make sense: 1) I live in Tokyo and commute via those famously crazy crowded trains daily. There exists on them this kind of unspoken agreement that everyone works together to make this suck as little as possible. People for the most part hold their backpacks in front of them, men don\'t manspread etc. But thats when the trains are full. About two stops before mine, the train goes from SARDINE CAN to everyone on this train could lay on the seats and have room left over. So usually I can sit down at this point. 2) I\'m not a Japanese woman. Very obviously so, even when I\'m in my white dress shirt and pencil skirt like all the other office drones. I\'m often the only obviously non-Japanese woman on my train in the morning. But despite my appearance, I\'m fluent in Japanese. So one day I sit down when the train empties out, headphones in, mobile game going, ready to enjoy the 10 min