In [1]:
import pandas as pd
import spacy

In [2]:
# Location of the posts CSV: read into `df` below and later overwritten
# in place with the cleaned result.
file_path = "data/adhd-posts.csv"

In [3]:
# Read the posts into a DataFrame, then bring up the small English spaCy pipeline
df = pd.read_csv(filepath_or_buffer=file_path)
nlp = spacy.load("en_core_web_sm")

In [4]:
# Negation markers that must survive stop-word removal
important_words = {"n't", "never", "nor", "no", "not"}
# The pipeline's default stop words with the negation markers taken out
adjusted_stop_words = set(nlp.Defaults.stop_words).difference(important_words)

In [5]:
# Define the smarter clean function
def smart_clean(text):
    """Lemmatize ``text`` and strip stop words while keeping negations.

    Every token whose lower-cased form is in ``important_words`` is emitted
    as the single marker ``"not"`` (so "no", "nor", "never" and "n't" all
    collapse to "not"). Any other token is kept, as its lower-cased lemma,
    when it is alphabetic and either is not flagged as a stop word or its
    lemma is absent from ``adjusted_stop_words``.

    Args:
        text: Raw post text. Anything that is not a ``str`` (for example the
            float NaN that pandas uses for a missing cell) is treated as
            empty instead of being handed to spaCy.

    Returns:
        The surviving tokens joined by single spaces; ``""`` when nothing
        survives or when ``text`` is not a string.
    """
    # Missing CSV cells arrive as NaN (a float); spaCy only accepts str,
    # so short-circuit rather than crash in the middle of a long apply().
    if not isinstance(text, str):
        return ""

    doc = nlp(text)
    filtered_tokens = []

    for token in doc:
        # Fold typographic apostrophes to the ASCII one so that a contraction
        # typed as "don’t" yields "n't" and is recognised as a negation;
        # otherwise "n’t" is non-alphabetic and would be dropped below.
        lowered = token.lower_.replace("\u2019", "'").replace("\u2018", "'")

        # Keep negations
        if lowered in important_words:
            filtered_tokens.append("not")  # Normalize every negation marker to "not"
        elif not token.is_stop or token.lemma_.lower() not in adjusted_stop_words:
            if token.is_alpha:  # Keep only words
                filtered_tokens.append(token.lemma_.lower())

    return ' '.join(filtered_tokens)

In [6]:
# Apply the function
# Missing bodies are read from CSV as NaN; hand spaCy an empty string instead
# so a single empty post cannot abort the whole run.
bodies = df['body'].fillna('').astype(str)

# smart_clean only reads token text, stop-word flags and lemmas, so the
# dependency parser and entity recognizer are switched off for the duration
# of the apply to cut per-post processing time. The pipeline is restored
# when the block exits. Names are filtered against the active pipeline so
# this is a no-op if those components are absent or already disabled.
unused_pipes = [name for name in ("parser", "ner") if name in nlp.pipe_names]
with nlp.select_pipes(disable=unused_pipes):
    df['cleaned_body'] = bodies.apply(smart_clean)

KeyboardInterrupt: 

In [None]:
# Show the raw text side by side with its cleaned version
print(df.loc[:, ['body', 'cleaned_body']])

                                                text  \
0  This movie is not very good, but the soundtrac...   
1  I don't think the acting was particularly impr...   
2        The plot was simple, yet not boring at all.   

                               cleaned_text  
0         movie not good soundtrack not bad  
1  not think acting particularly impressive  
2                    plot simple not boring  


In [None]:
# Persist the frame (without the index column) to the same path it was
# loaded from, replacing the original file.
df.to_csv(path_or_buf=file_path, index=False)