In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Fetch every NLTK data package the cells below rely on.
# Both the legacy and the current package names are requested for the
# tokenizer and the POS tagger: newer NLTK releases (3.9+) look up
# 'punkt_tab' / 'averaged_perceptron_tagger_eng', while older releases
# look up 'punkt' / 'averaged_perceptron_tagger'.
NLTK_RESOURCES = (
    'stopwords',                       # stop-word lists used during cleaning
    'punkt',                           # word_tokenize models (older NLTK)
    'punkt_tab',                       # word_tokenize models (newer NLTK)
    'averaged_perceptron_tagger',      # nltk.pos_tag model (older NLTK)
    'averaged_perceptron_tagger_eng',  # nltk.pos_tag model (newer NLTK)
    'wordnet',                         # lexical database behind WordNetLemmatizer
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


In [None]:
# Read the CSV file
df = pd.read_csv('reviews.csv')

# Build the text corpus from the 'comments' column as a list of strings.
# Missing comments are replaced with '' before the cast: astype(str) on its
# own turns NaN into the literal string 'nan', which would then be tokenized
# and survive cleaning as if it were a real word. Using fillna (rather than
# dropping rows) keeps one corpus entry per row of df.
corpus = df['comments'].fillna('').astype(str).tolist()

# Tokenize the text corpus (one token list per comment)
corpus_tokens = [word_tokenize(comment) for comment in corpus]

# Print the first few tokenized comments
for tokens in corpus_tokens[:5]:
    print(tokens)

In [None]:
# Clean every tokenized comment: lowercase each token, keep only the purely
# alphabetic ones (this discards punctuation and numbers), and drop English
# stop words.
stop_words = set(stopwords.words('english'))

cleaned_corpus = [
    [
        word
        for word in (raw_token.lower() for raw_token in comment_tokens)
        if word.isalpha() and word not in stop_words
    ]
    for comment_tokens in corpus_tokens
]

cleaned_corpus


In [None]:
# Normalize the corpus using lemmatization
lemmatizer = WordNetLemmatizer()

# First letter of the POS tag returned by nltk.pos_tag -> WordNet part of
# speech. Any tag not listed here falls back to noun.
TAG_TO_WORDNET = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}

normalized_corpus = []
for tokens in cleaned_corpus:
    # Tag the whole comment in a single call instead of one call per word:
    # the tagger can then use neighbouring tokens as context, and the
    # per-call overhead is paid once per comment rather than once per token.
    # An empty token list simply yields an empty result.
    tagged_tokens = nltk.pos_tag(tokens)
    normalized_tokens = [
        lemmatizer.lemmatize(token, pos=TAG_TO_WORDNET.get(tag[:1], wordnet.NOUN))
        for token, tag in tagged_tokens
    ]
    normalized_corpus.append(normalized_tokens)

# Print the normalized corpus
for tokens in normalized_corpus:
    print(tokens)
