In [9]:
import pandas as pd
import spacy

# Small English spaCy pipeline; supplies tokens, lemmas and stop-word flags
nlp = spacy.load('en_core_web_sm')

# Toy corpus: three short reviews that each contain a negation
df = pd.DataFrame(
    data=[
        "This movie is not very good, but the soundtrack was not bad at all.",
        "I don't think the acting was particularly impressive.",
        "The plot was simple, yet not boring at all.",
    ],
    columns=['text'],
)

# Negation markers that must survive stop-word removal
important_words = {"n't", "never", "nor", "no", "not"}
# spaCy's default stop list with the negation markers taken out
adjusted_stop_words = set(nlp.Defaults.stop_words).difference(important_words)

# Define the smarter clean function
def smart_clean(text):
    """Reduce ``text`` to lowercase lemmas while preserving negations.

    The text is parsed with the module-level ``nlp`` pipeline and each token
    is handled as follows:

    * a token whose lowercase form is in ``important_words`` is emitted as
      the single marker ``"not"`` (so "n't", "no", "nor" and "never" all
      collapse to ``"not"``);
    * a token flagged as a stop word whose lemma is also in
      ``adjusted_stop_words`` is dropped;
    * any other token is kept as its lowercase lemma, provided it is purely
      alphabetic (punctuation, numbers and mixed tokens are discarded).

    Args:
        text: Raw text to clean. ``None`` and float NaN (pandas missing
            values) are treated as empty input.

    Returns:
        str: The kept tokens joined by single spaces; ``''`` when nothing
        is kept or the input is missing.
    """
    # Series.apply hands missing cells over as None/NaN; nlp() cannot parse
    # those, so treat them as empty text instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    doc = nlp(text)
    filtered_tokens = []

    for token in doc:
        # Fold typographic apostrophes to ASCII before the negation lookup.
        # A contraction typed as "don’t" may surface as the token "n’t",
        # which would miss the ASCII "n't" entry and then be thrown away by
        # the is_alpha filter below, silently losing the negation.
        surface = token.lower_.replace("’", "'").replace("‘", "'")
        if surface in important_words:
            filtered_tokens.append("not")  # Normalize every negation -> not
            continue

        lemma = token.lemma_.lower()
        # Drop only when both the token and its lemma are stop words.
        if token.is_stop and lemma in adjusted_stop_words:
            continue

        if token.is_alpha:  # Keep only words
            filtered_tokens.append(lemma)

    return ' '.join(filtered_tokens)

# Clean every review and store the result alongside the raw text
df['cleaned_text'] = df['text'].map(smart_clean)

# Show the raw and cleaned columns side by side
columns_to_show = ['text', 'cleaned_text']
print(df[columns_to_show])


                                                text  \
0  This movie is not very good, but the soundtrac...   
1  I don't think the acting was particularly impr...   
2        The plot was simple, yet not boring at all.   

                               cleaned_text  
0         movie not good soundtrack not bad  
1  not think acting particularly impressive  
2                    plot simple not boring  


In [10]:
# Inspect how spaCy analyses one review: surface form, lemma,
# coarse part-of-speech tag and stop-word flag for every token
text = "I don't think the acting was particularly impressive."
doc = nlp(text)
for token in doc:
    print(f"{token.text} {token.lemma_} {token.pos_} {token.is_stop}")

I I PRON True
do do AUX True
n't not PART True
think think VERB False
the the DET True
acting acting NOUN False
was be AUX True
particularly particularly ADV False
impressive impressive ADJ False
. . PUNCT False
