In [None]:
import pandas as pd
import numpy as np

In [None]:
# Raw Reddit CSV exports (posts + comments); each path is handed to clean_data().
# NOTE(review): these are absolute Colab paths — presumably the files must be
# uploaded to /content/sample_data in the current runtime first; confirm.
file_paths = [
    "/content/sample_data/reddit_costco_alcohol_posts_and_comments.csv",
    "/content/sample_data/reddit_costco_posts_and_comments_1000.csv",
    "/content/sample_data/reddit_costco_wholesale_posts_and_comments_1000.csv"
]

In [None]:
def clean_data(file_path):
    """Load one scraped Reddit CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. drop exact duplicate rows;
      2. drop rows whose ``post_title`` or ``comment_body`` is missing and
         fill missing ``comment_author`` with ``"Anonymous"``;
      3. convert ``post_created`` / ``comment_created`` from UNIX seconds to
         datetimes;
      4. lowercase and strip both text columns;
      5. drop placeholder comments (``[deleted]`` / ``[removed]``);
      6. remove every character that is not a letter, digit or whitespace;
      7. tag every row with ``source = "Reddit"``.

    Parameters
    ----------
    file_path : str or path-like
        CSV containing at least the columns ``post_title``, ``comment_body``,
        ``comment_author``, ``post_created`` and ``comment_created``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows; the original index labels are kept (not reset).
    """
    df = pd.read_csv(file_path)

    # Remove duplicate rows
    df = df.drop_duplicates()

    # Handle missing values. `.copy()` gives an independent frame so the column
    # assignments below never write into a slice of the raw data.
    df = df.dropna(subset=["post_title", "comment_body"]).copy()
    df["comment_author"] = df["comment_author"].fillna("Anonymous")

    # Convert UNIX timestamps to readable datetime
    df["post_created"] = pd.to_datetime(df["post_created"], unit="s")
    df["comment_created"] = pd.to_datetime(df["comment_created"], unit="s")

    # Standardize text: convert to lowercase and strip extra whitespace
    df["post_title"] = df["post_title"].str.lower().str.strip()
    df["comment_body"] = df["comment_body"].str.lower().str.strip()

    # Filter out deleted comments. This has to happen BEFORE punctuation is
    # stripped: once the brackets are gone "[deleted]" becomes "deleted" and
    # the placeholder rows would slip through the filter.
    df = df[~df["comment_body"].isin(["[deleted]", "[removed]"])].copy()

    # Remove special characters (optional)
    df["post_title"] = df["post_title"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
    df["comment_body"] = df["comment_body"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)

    df["source"] = "Reddit"

    return df

In [None]:
# Run the cleaning routine over every raw export, keeping the input order
cleaned_dataframes = list(map(clean_data, file_paths))

In [None]:
# Stack the per-file frames row-wise into one dataset with a fresh 0..n-1 index
final_df = pd.concat(objs=cleaned_dataframes, axis=0, ignore_index=True)

In [None]:
# Persist the combined, cleaned dataset (row index is not written)
final_cleaned_file = "/content/sample_data/cleaned_combined_reddit_costco_data.csv"
final_df.to_csv(path_or_buf=final_cleaned_file, index=False)

In [None]:
# Confirm completion and report where the combined CSV was written
print(f"Data cleaning completed. Combined cleaned data saved to {final_cleaned_file}")

Data cleaning completed. Combined cleaned data saved to /content/sample_data/cleaned_combined_reddit_costco_data.csv


In [None]:
## Cleaning after categorization

In [None]:
import pandas as pd

# Input: categorized posts/comments; output: the slimmed-down, analysis-ready table
file_path = "/content/drive/MyDrive/Colab Notebooks/DatasetsCapstone/Cleaned Dataset/categorized_reddit_costco_data.csv"  # Update with your actual file path
final_file_path = "/content/drive/MyDrive/Colab Notebooks/DatasetsCapstone/Cleaned Dataset/transformed_reddit_costco_data.csv"

df = (
    pd.read_csv(file_path)
    # Drop the post identifier
    .drop(columns=["post_id"])
    # Score columns are exposed as "likes"
    .rename(columns={"post_score": "likes", "comment_score": "comment_likes"})
    # Parse both timestamp columns so the .dt accessor is available below
    .assign(
        post_created=lambda frame: pd.to_datetime(frame["post_created"]),
        comment_created=lambda frame: pd.to_datetime(frame["comment_created"]),
    )
    # Derive year/month parts and tag the data source; keyword order fixes
    # the order in which new columns are appended
    .assign(
        post_year=lambda frame: frame["post_created"].dt.year,
        post_month=lambda frame: frame["post_created"].dt.month,
        comment_year=lambda frame: frame["comment_created"].dt.year,
        comment_month=lambda frame: frame["comment_created"].dt.month,
        source="Reddit",
    )
    # The raw timestamps and comment identifiers are no longer needed
    .drop(columns=["post_created", "comment_created", "comment_id", "comment_author"])
)

# Display the first few rows to verify the changes
print(df.head())

# Save the modified dataset to a new file
df.to_csv(final_file_path, index=False)
print(f"Data transformation completed. Transformed data saved to {final_file_path}")


                  post_title                                post_url  likes  \
0     morena san diego  noon    https://i.redd.it/7yqih97z7mde1.jpeg     28   
1     morena san diego  noon    https://i.redd.it/7yqih97z7mde1.jpeg     28   
2     morena san diego  noon    https://i.redd.it/7yqih97z7mde1.jpeg     28   
3     morena san diego  noon    https://i.redd.it/7yqih97z7mde1.jpeg     28   
4  greenville south carolina  https://www.reddit.com/gallery/1i3v11d     11   

                                        comment_body  comment_likes  \
0                                  anything else new              1   
1                         ahh moreno the blessed one              1   
2                                           sold out              1   
3  nope was hoping something wouldve been put out...              2   
4  thats 2997 right now at costco rohnert park in...              3   

             category  post_year  post_month  comment_year  comment_month  \
0  General Discussion

In [None]:
## Preprocessing steps (stop-word and selected punctuation removal, lemmatization, tokenization)

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
import pandas as pd
import re  # For regular expressions

# Load the en_core_web_md pipeline fetched by the download command above;
# `nlp` is the parser that preprocess_text() calls on every text.
nlp = spacy.load("en_core_web_md")

In [None]:

# Same path the "Cleaning after categorization" cell writes its transformed CSV to
cleaned_file_path = "/content/drive/MyDrive/Colab Notebooks/DatasetsCapstone/Cleaned Dataset/transformed_reddit_costco_data.csv"
df = pd.read_csv(filepath_or_buffer=cleaned_file_path)

# Costco-related terms that preprocess_text() keeps even when spaCy flags the
# token as a stop word (despite the name, this is a keep-list, not a drop-list).
# NOTE(review): "price match" contains a space but is compared against single
# token texts, so it presumably never matches — confirm.
custom_stopwords = {
    "costco", "customer", "membership", "return", "refund", "kirkland", "delivery", "order",
    "id", "its", "price", "coupon", "discount", "sale", "offer", "online", "store", "product",
    "purchase", "shipping", "review", "availability", "pack", "bulk", "cart", "checkout",
    "service", "item", "manager", "location", "warehouse", "brand", "department", "shopping",
    "receipt", "cashier", "staff", "member", "club", "barcode", "pickup", "rewards", "card",
    "cost", "line", "experience", "returning", "queue", "section", "selection", "helpdesk",
    "support", "phone", "app", "counter", "employee", "promotion", "policy", "gas", "station",
    "parking", "hours", "aisle", "price match",
}

# Common contractions and misspellings
misspelling_mapping = {
    "thats": "that is", "its": "it is", "cant": "cannot", "im": "i am", "dont": "do not",
    "wont": "will not", "didnt": "did not", "couldnt": "could not", "wouldnt": "would not",
    "shouldnt": "should not", "havent": "have not", "hasnt": "has not", "aint": "is not",
    "arent": "are not", "werent": "were not", "wasnt": "was not", "wanna": "want to",
    "gonna": "going to", "gotta": "got to", "lemme": "let me", "gimme": "give me",
    "yall": "you all", "idk": "i do not know", "tbh": "to be honest", "ikr": "i know right",
    "omg": "oh my god", "u": "you", "r": "are", "ur": "your", "cuz": "because", "pls": "please",
    "ppl": "people", "tho": "though", "nite": "night", "luv": "love", "bday": "birthday",
    "gr8": "great", "thx": "thanks", "sry": "sorry", "lol": "laughing out loud",
    "btw": "by the way", "imo": "in my opinion", "brb": "be right back", "lmk": "let me know",
    "smh": "shaking my head", "fyi": "for your information"
}

# Function to correct common misspellings and contractions
def correct_misspellings(text):
    for misspelling, full_form in misspelling_mapping.items():
        text = re.sub(rf"\b{misspelling}\b", full_form, text)
    return text

# Preprocessing function
def preprocess_text(text):
    """Normalise one text and return its lemmas joined by single spaces.

    The text is lowercased, run through correct_misspellings(), stripped of
    every character other than letters, digits, whitespace, ``!`` and ``?``,
    and then parsed with the module-level spaCy pipeline ``nlp``. A token's
    lemma is kept unless the token is a spaCy stop word whose text is not
    listed in ``custom_stopwords``.

    NOTE(review): whitespace tokens are not filtered out here, so runs of
    spaces in the input presumably survive into the output — confirm.
    """
    normalized = correct_misspellings(text.lower())
    normalized = re.sub(r"[^a-zA-Z0-9\s!?]", "", normalized)

    kept_lemmas = []
    for token in nlp(normalized):
        # Stop words are dropped unless explicitly whitelisted
        if token.is_stop and token.text not in custom_stopwords:
            continue
        kept_lemmas.append(token.lemma_)

    return " ".join(kept_lemmas)

# Run the preprocessing over both text columns; missing values become ""
# first so preprocess_text() always receives a string
for text_column in ("post_title", "comment_body"):
    df[text_column] = df[text_column].fillna("").apply(preprocess_text)

# Write the processed table next to the other cleaned datasets
preprocessed_file_path = "/content/drive/MyDrive/Colab Notebooks/DatasetsCapstone/Cleaned Dataset/preprocessed_reddit_costco_data.csv"
df.to_csv(path_or_buf=preprocessed_file_path, index=False)

print(f"Preprocessing completed. Data saved to {preprocessed_file_path}")

Preprocessing completed. Data saved to /content/drive/MyDrive/Colab Notebooks/DatasetsCapstone/Cleaned Dataset/preprocessed_reddit_costco_data.csv


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): earlier cells already read from and write to /content/drive,
# so on a fresh runtime this mount presumably has to run before them — confirm
# and consider moving this cell to the top of the notebook.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Categorization method 1.

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import spacy
import pandas as pd
import numpy as np

In [None]:
# spaCy pipeline used by semantic_similarity() to parse texts and keywords
nlp = spacy.load("en_core_web_md")

# Define keywords for categorization
#
# Both lists are consumed by categorize_post(): first as whole-phrase regex
# matches, then as targets for the embedding-similarity fallback.
# NOTE(review): the text being matched is read from cleaned_file_path, and
# clean_data() strips every non-alphanumeric character before writing that
# file, so hyphenated entries such as "plant-based" or "gluten-free"
# presumably can never match verbatim — confirm.

# Phrases that mark a row as "Product"
product_keywords = [
    # Kirkland Signature Products
    "kirkland", "kirkland signature", "kirkland water", "kirkland shampoo", "kirkland conditioner",
    "kirkland dog treats", "kirkland diapers", "kirkland protein bars", "kirkland almond milk",
    "kirkland chocolate", "kirkland peanut butter", "kirkland sparkling wine", "kirkland beer",

    # Electronics & Appliances
    "smart tv", "gaming laptop", "desktop computer", "wireless earbuds", "bluetooth speaker",
    "4k tv", "oled tv", "gaming pc", "wifi router", "hard drive", "ssd", "fitness tracker",
    "keurig", "instant pot", "air fryer", "vacuum cleaner", "robot vacuum", "home theater",

    # Food & Grocery
    "organic produce", "costco pizza", "costco muffins", "rotisserie chicken", "plant-based",
    "vegan snacks", "gluten-free", "cold cuts", "frozen meals", "energy drinks", "costco pies",
    "kirkland ice cream", "pasta sauce", "granola", "nut butter", "trail mix", "frozen desserts",

    # Furniture & Home
    "living room set", "coffee table", "bar stools", "sectional sofa", "ottoman", "kitchen island",
    "costco mattresses", "adjustable bed", "bunk bed", "outdoor furniture", "patio umbrella",
    "garden decor", "lighting fixtures", "home organization", "storage bins", "closet organizers",

    # Clothing & Apparel
    "activewear", "sneakers", "boots", "winter jackets", "raincoat", "sports bras", "base layers",
    "costco jeans", "costco socks", "swim trunks", "beachwear", "costco hats", "scarves", "gloves",

    # Health & Wellness
    "immune support", "probiotics", "vitamin c", "collagen", "fish oil", "protein shakes",
    "kirkland vitamins", "essential oils", "sleep aids", "pain relief", "costco pharmacy rx",
    "hand sanitizing wipes", "home medical devices", "hearing aid batteries", "compression socks",

    # Household & Cleaning
    "all-purpose cleaner", "window cleaner", "vacuum bags", "dryer sheets", "costco mop",
    "swiffer", "garbage disposal", "air freshener spray", "costco bleach", "antibacterial wipes",
    "rechargeable batteries", "led bulbs", "smart home devices", "air filters", "costco solar lights",

    # Outdoor & Auto
    "snow tires", "all-season tires", "grilling accessories", "propane tanks", "outdoor heaters",
    "garden tools", "lawn chairs", "hammock", "camping tents", "sleeping bags", "portable cooler",
    "electric scooter", "bike accessories", "outdoor speakers", "costco kayaks", "golf equipment",
    "pool supplies", "garden hose", "power tools"
]


# Phrases that mark a row as "Service" (checked only after the product list)
service_keywords = [
    # Membership & Policies
    "executive member", "gold star membership", "auto renewal", "membership refund",
    "corporate membership", "membership benefits", "price guarantee", "product recall",
    "membership downgrade", "student discount", "military discount",

    # Customer Experience
    "friendly staff", "customer satisfaction", "waiting time", "self-checkout",
    "greeter", "in-store experience", "costco support", "feedback", "customer feedback",
    "store layout", "crowd management", "complaint resolution", "in-store wifi", "cleanliness",

    # Online Shopping & Delivery
    "express delivery", "membership online", "order history", "subscription services",
    "auto-ship", "product availability", "out of stock", "online chat support",
    "costco digital", "home delivery", "scheduled delivery", "delivery slots",
    "shipping policy", "refund status",

    # Pricing & Discounts
    "seasonal sale", "holiday deals", "weekly specials", "employee discount",
    "bundle offers", "buy one get one", "cashback rewards", "reward points", "gift voucher",
    "sale event", "exclusive discount", "birthday rewards", "anniversary offer",

    # Costco Travel & Financial Services
    "vacation packages", "flight discounts", "travel insurance policy", "roadside assistance",
    "travel booking", "travel refund", "costco visa cashback", "auto insurance",
    "credit card rewards", "personal loan", "home insurance", "costco mortgage",

    # Gas Station & Automotive Services
    "fuel efficiency", "premium fuel", "diesel", "car tire installation", "alignment check",
    "emission testing", "car battery", "brake service", "oil filter", "auto parts",
    "road trip essentials", "car cleaning products", "engine diagnostic", "windshield wipers"
]


# Earlier, shorter keyword lists (disabled; superseded by the lists above):
#product_keywords = ["kirkland", "laptop", "pizza", "wine", "electronics", "furniture", "tvs", "groceries", "clothing"]
#service_keywords = ["membership", "customer service", "refund", "return policy", "delivery", "checkout", "discounts"]

# Path to the combined cleaned dataset (the file written after clean_data() ran)
cleaned_file_path = "/content/sample_data/cleaned_combined_reddit_costco_data.csv"


In [None]:
# Parsed keyword Docs, memoised by keyword string so each keyword is run
# through the spaCy pipeline once instead of once per categorised row.
# Clear this dict if `nlp` is ever replaced by a different model.
_KEYWORD_DOCS = {}


def semantic_similarity(text, keywords):
    """
    Return the highest spaCy similarity between ``text`` and any keyword.

    Handles empty vectors to avoid warnings: a blank text, a text whose Doc
    has no vector, or a keyword list with no vector-bearing entries all
    yield 0.

    NOTE(review): a later cell defines another ``semantic_similarity`` that
    expects pre-parsed Docs instead of keyword strings; whichever cell ran
    last is the one in effect.

    Parameters
    ----------
    text : str
        Text to compare.
    keywords : iterable of str
        Keyword phrases; each is parsed with the module-level ``nlp`` on first
        use and cached afterwards.

    Returns
    -------
    float or int
        Maximum similarity score, or 0 when nothing could be compared.
    """
    if not text.strip():
        return 0

    doc = nlp(text)
    # Loop-invariant: without a vector on the text side nothing is comparable
    if not doc.has_vector:
        return 0

    similarities = []
    for word in keywords:
        word_doc = _KEYWORD_DOCS.get(word)
        if word_doc is None:
            word_doc = _KEYWORD_DOCS[word] = nlp(word)
        if word_doc.has_vector:
            similarities.append(doc.similarity(word_doc))

    return max(similarities) if similarities else 0


def categorize_post(post_title, comment_body, threshold=0.7):
    """
    Categorizes the post as 'Product', 'Service', or 'General Discussion' based on keywords & semantic similarity.

    Step 1 looks for any keyword as a whole phrase in the lowercased
    "title + comment" text; product keywords are checked before service
    keywords. Step 2 falls back to semantic_similarity() and, again, gives
    Product precedence over Service.

    NOTE(review): a later cell defines another ``categorize_post`` with a
    different signature; whichever cell ran last is the one in effect.

    Parameters
    ----------
    post_title : str
    comment_body : str
    threshold : float, default 0.7
        Similarity a category must exceed in the fallback step.

    Returns
    -------
    str
        "Product", "Service" or "General Discussion".
    """
    text = f"{post_title} {comment_body}".lower()

    # Step 1: Keyword Matching. Keywords are escaped so they are always
    # treated as literal phrases, never as regex syntax.
    for label, keywords in (("Product", product_keywords), ("Service", service_keywords)):
        if any(re.search(rf"\b{re.escape(kw)}\b", text) for kw in keywords):
            return label

    # Step 2: Semantic Similarity. The service score is only computed when
    # the product score has not already decided the outcome.
    if semantic_similarity(text, product_keywords) > threshold:
        return "Product"
    if semantic_similarity(text, service_keywords) > threshold:
        return "Service"

    return "General Discussion"


In [None]:
# Load the combined cleaned dataset produced by the cleaning step
df = pd.read_csv(cleaned_file_path)

# Missing text becomes "" so categorize_post() always formats real strings
for text_column in ("post_title", "comment_body"):
    df[text_column] = df[text_column].fillna("")

# Label every row from its title and comment text
df["category"] = df.apply(
    lambda record: categorize_post(record["post_title"], record["comment_body"]),
    axis=1,
)



In [None]:
# Write the labelled rows out (row index is not written)
categorized_file_path = "/content/sample_data/categorized_reddit_costco_data.csv"
df.to_csv(path_or_buf=categorized_file_path, index=False)

# Report the output location and how many rows landed in each category
print(f"Data categorization completed. Categorized data saved to {categorized_file_path}")
print(df["category"].value_counts())

In [None]:
## Categorization method 2.

In [None]:
import re
import spacy
import pandas as pd
import numpy as np

In [None]:
# spaCy pipeline used to parse texts and to precompute the keyword Docs below
nlp = spacy.load("en_core_web_md")

# Keyword lists for categorization method 2. Their contents repeat the lists
# defined for categorization method 1 earlier in the notebook.
# NOTE(review): keyword_partial_match() compares whitespace-split keyword words
# against \w+ tokens of the text, so hyphenated entries such as "plant-based"
# presumably can never match — confirm.

# Phrases that mark a row as "Product"
product_keywords = [
    # Kirkland Signature Products
    "kirkland", "kirkland signature", "kirkland water", "kirkland shampoo", "kirkland conditioner",
    "kirkland dog treats", "kirkland diapers", "kirkland protein bars", "kirkland almond milk",
    "kirkland chocolate", "kirkland peanut butter", "kirkland sparkling wine", "kirkland beer",

    # Electronics & Appliances
    "smart tv", "gaming laptop", "desktop computer", "wireless earbuds", "bluetooth speaker",
    "4k tv", "oled tv", "gaming pc", "wifi router", "hard drive", "ssd", "fitness tracker",
    "keurig", "instant pot", "air fryer", "vacuum cleaner", "robot vacuum", "home theater",

    # Food & Grocery
    "organic produce", "costco pizza", "costco muffins", "rotisserie chicken", "plant-based",
    "vegan snacks", "gluten-free", "cold cuts", "frozen meals", "energy drinks", "costco pies",
    "kirkland ice cream", "pasta sauce", "granola", "nut butter", "trail mix", "frozen desserts",

    # Furniture & Home
    "living room set", "coffee table", "bar stools", "sectional sofa", "ottoman", "kitchen island",
    "costco mattresses", "adjustable bed", "bunk bed", "outdoor furniture", "patio umbrella",
    "garden decor", "lighting fixtures", "home organization", "storage bins", "closet organizers",

    # Clothing & Apparel
    "activewear", "sneakers", "boots", "winter jackets", "raincoat", "sports bras", "base layers",
    "costco jeans", "costco socks", "swim trunks", "beachwear", "costco hats", "scarves", "gloves",

    # Health & Wellness
    "immune support", "probiotics", "vitamin c", "collagen", "fish oil", "protein shakes",
    "kirkland vitamins", "essential oils", "sleep aids", "pain relief", "costco pharmacy rx",
    "hand sanitizing wipes", "home medical devices", "hearing aid batteries", "compression socks",

    # Household & Cleaning
    "all-purpose cleaner", "window cleaner", "vacuum bags", "dryer sheets", "costco mop",
    "swiffer", "garbage disposal", "air freshener spray", "costco bleach", "antibacterial wipes",
    "rechargeable batteries", "led bulbs", "smart home devices", "air filters", "costco solar lights",

    # Outdoor & Auto
    "snow tires", "all-season tires", "grilling accessories", "propane tanks", "outdoor heaters",
    "garden tools", "lawn chairs", "hammock", "camping tents", "sleeping bags", "portable cooler",
    "electric scooter", "bike accessories", "outdoor speakers", "costco kayaks", "golf equipment",
    "pool supplies", "garden hose", "power tools"
]


# Phrases that mark a row as "Service" (checked only after the product list)
service_keywords = [
    # Membership & Policies
    "executive member", "gold star membership", "auto renewal", "membership refund",
    "corporate membership", "membership benefits", "price guarantee", "product recall",
    "membership downgrade", "student discount", "military discount",

    # Customer Experience
    "friendly staff", "customer satisfaction", "waiting time", "self-checkout",
    "greeter", "in-store experience", "costco support", "feedback", "customer feedback",
    "store layout", "crowd management", "complaint resolution", "in-store wifi", "cleanliness",

    # Online Shopping & Delivery
    "express delivery", "membership online", "order history", "subscription services",
    "auto-ship", "product availability", "out of stock", "online chat support",
    "costco digital", "home delivery", "scheduled delivery", "delivery slots",
    "shipping policy", "refund status",

    # Pricing & Discounts
    "seasonal sale", "holiday deals", "weekly specials", "employee discount",
    "bundle offers", "buy one get one", "cashback rewards", "reward points", "gift voucher",
    "sale event", "exclusive discount", "birthday rewards", "anniversary offer",

    # Costco Travel & Financial Services
    "vacation packages", "flight discounts", "travel insurance policy", "roadside assistance",
    "travel booking", "travel refund", "costco visa cashback", "auto insurance",
    "credit card rewards", "personal loan", "home insurance", "costco mortgage",

    # Gas Station & Automotive Services
    "fuel efficiency", "premium fuel", "diesel", "car tire installation", "alignment check",
    "emission testing", "car battery", "brake service", "oil filter", "auto parts",
    "road trip essentials", "car cleaning products", "engine diagnostic", "windshield wipers"
]


In [None]:
# Combined cleaned dataset (same file the cleaning step saves); read further down
cleaned_file_path = "/content/sample_data/cleaned_combined_reddit_costco_data.csv"

def semantic_similarity(text, keyword_vectors):
    """
    Return the best spaCy similarity between ``text`` and pre-parsed keywords.

    ``keyword_vectors`` holds the objects produced by calling ``nlp`` on each
    keyword; entries whose ``has_vector`` is false are skipped. A blank text,
    or a list with no usable entries, gives 0.

    NOTE(review): unlike the method-1 variant defined earlier, this version
    does not check ``has_vector`` on the text's own Doc — confirm that is
    intended.
    """
    if not text.strip():
        return 0

    parsed_text = nlp(text)

    scores = []
    for keyword_doc in keyword_vectors:
        if keyword_doc.has_vector:
            scores.append(parsed_text.similarity(keyword_doc))

    if not scores:
        return 0
    return max(scores)

def keyword_partial_match(text, keywords):
    """
    Report whether the text shares at least one word with any keyword.

    The text is tokenised into ``\\w+`` runs and each keyword is split on
    whitespace; a single shared word is enough for a match.

    NOTE(review): because every word of a multi-word keyword counts on its
    own, very common fragments (e.g. "of" from "out of stock", "one" from
    "buy one get one") also trigger a match — confirm this is intended.
    """
    words_in_text = set(re.findall(r'\b\w+\b', text))
    # isdisjoint() is False exactly when the keyword and the text share a word
    return any(not words_in_text.isdisjoint(kw.split()) for kw in keywords)

def categorize_post(post_title, comment_body, product_vectors, service_vectors, threshold=0.7):
    """
    Categorizes the post as 'Product', 'Service', or 'General Discussion' based on keywords & semantic similarity.

    Step 1 uses keyword_partial_match() on the lowercased "title + comment"
    text, checking product keywords before service keywords. Step 2 falls
    back to semantic_similarity() against the precomputed keyword Docs, again
    giving Product precedence over Service.

    Parameters
    ----------
    post_title : str
    comment_body : str
    product_vectors, service_vectors : list
        Results of calling ``nlp`` on every product / service keyword.
    threshold : float, default 0.7
        Similarity a category must exceed in the fallback step.

    Returns
    -------
    str
        "Product", "Service" or "General Discussion".
    """
    text = f"{post_title} {comment_body}".lower()

    # Step 1: Keyword Matching (Exact or Partial)
    if keyword_partial_match(text, product_keywords):
        return "Product"
    if keyword_partial_match(text, service_keywords):
        return "Service"

    # Step 2: Semantic Similarity. The service score (which re-parses the
    # text) is only computed when the product score has not already decided.
    if semantic_similarity(text, product_vectors) > threshold:
        return "Product"
    if semantic_similarity(text, service_vectors) > threshold:
        return "Service"

    return "General Discussion"

# Parse every keyword once up front so categorize_post() can reuse the
# results for each row instead of re-running the pipeline on the keywords
product_vectors = list(map(nlp, product_keywords))
service_vectors = list(map(nlp, service_keywords))

In [None]:
# Load the combined cleaned dataset
df = pd.read_csv(cleaned_file_path)

# Missing text becomes "" so categorize_post() always formats real strings
for text_column in ("post_title", "comment_body"):
    df[text_column] = df[text_column].fillna("")

# Label every row, reusing the precomputed keyword Docs
df["category"] = df.apply(
    lambda record: categorize_post(
        record["post_title"],
        record["comment_body"],
        product_vectors,
        service_vectors,
    ),
    axis=1,
)

# Write the labelled rows out (row index is not written)
categorized_file_path = "/content/sample_data/categorized_reddit_costco_data.csv"
df.to_csv(path_or_buf=categorized_file_path, index=False)

# Report the output location and the per-category row counts
print(f"Data categorization completed. Categorized data saved to {categorized_file_path}")
print(df["category"].value_counts())

Data categorization completed. Categorized data saved to /content/sample_data/categorized_reddit_costco_data.csv
category
General Discussion    2560
Product                645
Service                537
Name: count, dtype: int64
