In [2]:
import pandas as pd
import spacy

# Install spaCy model if not installed
# !python -m spacy download en_core_web_sm

# Load the small English spaCy pipeline once, at module level, so that
# process_review() can reuse the same `nlp` object for every review.
nlp = spacy.load("en_core_web_sm")

# 1 & 4: Read the reviews CSV (it must already be downloaded and reachable)
def load_data(filepath, n=10000):
    """Return the first ``n`` non-missing values of the ``Text`` column.

    Args:
        filepath: Path of the CSV file to read.
        n: Maximum number of reviews to keep (default 10000).

    Returns:
        A pandas Series holding the selected ``Text`` values; the index
        labels are the row positions from the parsed file.

    Raises:
        KeyError: If the CSV has no ``Text`` column.
    """
    # Malformed rows are skipped rather than aborting the whole load.
    frame = pd.read_csv(filepath, engine='python', on_bad_lines='skip')
    text_column = frame['Text']
    # Missing values are dropped first, so up to n real reviews come back.
    non_missing = text_column.dropna()
    return non_missing.head(n)

# 5: Preprocess text - lowercase only; punctuation is left in place here
def preprocess_text(text):
    """Return ``text`` converted to lowercase.

    No characters are removed: punctuation and whitespace pass through
    unchanged.
    """
    lowered = text.lower()
    return lowered

# 6-8: Process text with spaCy for tokenization, stopword removal, POS tagging, NER
def process_review(text):
    """Run one review through the spaCy pipeline and extract its features.

    Args:
        text: The review text to analyse.

    Returns:
        A tuple ``(tokens, pos_tags, entities)`` where
        - ``tokens`` is a list of token texts that are alphabetic and not
          stop words,
        - ``pos_tags`` is a list of ``(token_text, pos)`` pairs for those
          same tokens, in document order,
        - ``entities`` is a list of ``(entity_text, entity_label)`` pairs
          for every entity in ``doc.ents``.
    """
    doc = nlp(text)
    # Filter once and derive both lists from the kept tokens. Selecting POS
    # tags with `token.text in tokens` scans the whole list for every token,
    # which is quadratic in the review length.
    kept = [token for token in doc if token.is_alpha and not token.is_stop]
    # Tokenization and cleaning: alphabetic, non-stopword token texts
    tokens = [token.text for token in kept]
    # POS tagging: list of (token, pos)
    pos_tags = [(token.text, token.pos_) for token in kept]
    # NER: list of (entity_text, entity_label)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return tokens, pos_tags, entities

if __name__ == "__main__":
    # Location of the reviews CSV; change this to wherever the file lives.
    filepath = "/content/Reviews.csv"
    reviews = load_data(filepath)

    # Lowercase every review before it is handed to spaCy.
    # NOTE(review): lowercasing ahead of NER may hide capitalised names from
    # the model -- confirm whether entities should be extracted from the raw text.
    reviews_clean = reviews.apply(preprocess_text)

    # Demo output: only the first five reviews are processed and printed.
    demo_reviews = reviews_clean.head(5)
    for i, review in enumerate(demo_reviews, start=1):
        tokens, pos_tags, entities = process_review(review)
        print(f"\nReview {i}:")
        print(f"Tokens (cleaned): {tokens}")
        print(f"POS tags: {pos_tags}")
        print("Named Entities:")
        # One indented line per entity; nothing is printed when none are found.
        for ent_text, ent_label in entities:
            print(f"  {ent_text}: {ent_label}")


Review 1:
Tokens (cleaned): ['bought', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']
POS tags: [('bought', 'VERB'), ('vitality', 'NOUN'), ('canned', 'VERB'), ('dog', 'NOUN'), ('food', 'NOUN'), ('products', 'NOUN'), ('found', 'VERB'), ('good', 'ADJ'), ('quality', 'NOUN'), ('product', 'NOUN'), ('looks', 'VERB'), ('like', 'ADP'), ('stew', 'NOUN'), ('processed', 'VERB'), ('meat', 'NOUN'), ('smells', 'VERB'), ('better', 'ADV'), ('labrador', 'NOUN'), ('finicky', 'ADJ'), ('appreciates', 'VERB'), ('product', 'NOUN'), ('better', 'ADV')]
Named Entities:

Review 2:
Tokens (cleaned): ['product', 'arrived', 'labeled', 'jumbo', 'salted', 'peanuts', 'peanuts', 'actually', 'small', 'sized', 'unsalted', 'sure', 'error', 'vendor', 'intended', 'represent', 'product', 'jumbo']
POS tags: [('product', 'NOUN'), ('arrived', 'AUX'), ('labeled', 