In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Read the pre-cleaned dataset and replace missing 'cleaned_text' entries with
# an empty string, so that missing values are not handed to the tokenizer as NaN
data = pd.read_csv("cleaned_data.csv").assign(
    cleaned_text=lambda frame: frame['cleaned_text'].fillna('')
)

# Shared NLP helpers used by the lemmatization step: a single lemmatizer
# instance and the English stopword list held as a set for membership tests
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

# Function for tokenization and lemmatization
def tokenize_and_lemmatize(text):
    """Tokenize ``text``, drop stopwords, and return the lemmatized tokens.

    The text is split with ``word_tokenize``. Tokens whose lowercase form is
    in the module-level ``stop_words`` set are discarded; every remaining
    token is passed, in its original case, to ``lemmatizer.lemmatize`` with
    no part-of-speech argument.

    Args:
        text: The string to process.

    Returns:
        A single string with the lemmatized tokens joined by one space.
    """
    kept_lemmas = []
    for word in word_tokenize(text):
        # The stopword check is case-insensitive; the token keeps its case.
        if word.lower() in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_lemmas)
    

# Lemmatize every entry of 'cleaned_text' into a new 'lemmatized_text' column
data['lemmatized_text'] = data['cleaned_text'].map(tokenize_and_lemmatize)

# Write the processed frame (without the index) to a new CSV file so the result
# is available on disk; note that this cell itself always recomputes the column
data.to_csv("lemmatized_data.csv", index=False)

# Preview the first rows of the original and lemmatized text side by side
print(data.loc[:, ['cleaned_text', 'lemmatized_text']].head())



                                        cleaned_text  \
0  awww thats bummer shoulda got david carr third...   
1  upset cant update facebook texting might cry r...   
2  dived many times ball managed save rest go bounds   
3                   whole body feels itchy like fire   
4                           behaving im mad cant see   

                                     lemmatized_text  
0  awww thats bummer shoulda got david carr third...  
1  upset cant update facebook texting might cry r...  
2    dived many time ball managed save rest go bound  
3                    whole body feel itchy like fire  
4                           behaving im mad cant see  


In [3]:
# Flag, row by row, whether lemmatization left the text unchanged
data['is_same'] = data['cleaned_text'].eq(data['lemmatized_text'])

# Select the rows whose text was altered by lemmatization
different_rows = data.loc[~data['is_same']]

# Report how many rows fall on each side of the comparison
print(f"Number of rows where the columns are the same: {data['is_same'].sum()}")
print(f"Number of rows where the columns are different: {len(different_rows)}")

# Print the altered rows, original text next to lemmatized text
print(different_rows[['cleaned_text', 'lemmatized_text']])

Number of rows where the columns are the same: 964328
Number of rows where the columns are different: 635672
                                              cleaned_text  \
2        dived many times ball managed save rest go bounds   
3                         whole body feels itchy like fire   
7        hey long time see yes rains bit bit lol im fin...   
11                                          repierced ears   
13                    counts idk either never talk anymore   
...                                                    ...   
1599986                                      much ads blog   
1599988  ha good job thats right gotta throw bigrun tag...   
1599991  mmmm sounds absolutely perfect schedule full w...   
1599996            thewdbcom cool hear old walt interviews   
1599997                    ready mojo makeover ask details   

                                           lemmatized_text  
2          dived many time ball managed save rest go bound  
3                       