This code cleans up raw text from an Excel file by removing stopwords, URLs, usernames, and short words, and by converting each remaining word to its root form.

In [None]:
#code 1
import nltk
nltk.download('wordnet')

import spacy
import pandas as pd
from spacy.lang.en import English
import os


# Load spaCy English model
# `sp` is the 'en_core_web_sm' pipeline; in the code visible here it is only
# used to read the default stop-word collection below.
sp = spacy.load('en_core_web_sm')
# `parser` is a separate English language object; tokenize() calls it to
# split raw text into tokens.
parser = English()
# Stop words consulted by prepare_text_for_lda() when filtering tokens.
sp_stop = sp.Defaults.stop_words

# Tokenization
def tokenize(text):
    """Split *text* into a list of lowercase token strings.

    The text is tokenized with the module-level ``parser``. Tokens that
    consist only of whitespace, that look like URLs, or that start with
    ``@`` are dropped; every remaining token is returned in lowercase,
    in its original order.
    """
    return [
        tok.lower_
        for tok in parser(text)
        if not (
            tok.orth_.isspace()
            or tok.like_url
            or tok.orth_.startswith('@')
        )
    ]

# Lemmatization
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the WordNet base form of *word*, or *word* itself if none exists."""
    base_form = wn.morphy(word)
    if base_form is not None:
        return base_form
    # morphy() found no base form, so keep the word as it was given.
    return word

# Text cleaning
def prepare_text_for_lda(text):
    """Turn raw *text* into a list of cleaned, lemmatized tokens.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``sp_stop`` are discarded; each survivor is
    passed through ``get_lemma``.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        # Length check first, then stop-word check, before lemmatizing.
        if len(tok) > 4 and tok not in sp_stop
    ]

# Read the input workbook (replace the filename with your own file)
df = pd.read_excel('data_processed_v1.xlsx')

# Clean each entry of the 'text' column and join its tokens with spaces
cleaned_texts = [
    ' '.join(prepare_text_for_lda(str(entry)))
    for entry in df['text']
]
df['cleaned_text'] = cleaned_texts

# Write the dataframe, including the new 'cleaned_text' column, to a new workbook
df.to_excel('cleaned_output.xlsx', index=False)
