# Documentation for `negative-complaint-words.ipynb`

This notebook extracts, processes, and classifies words from consumer product incident reports to identify complaint-related words.
We use a locally served LLM (the `mixtral` model, queried over HTTP at `http://localhost:11434/api/chat`) to classify each token as complaint-related (`1`) or neutral/positive (`0`).


## Input Files

- `../Data/cpsc_data/incident_reports/Toysandchildren_ArtsandCrafts.csv`
- `../Data/cpsc_data/incident_reports/Toysandchildren_Riding_Toys.csv`
- `../Data/cpsc_data/incident_reports/Toysandchildren_Toys.csv`

## Output Files

- `token_labels_individual.pkl` — Pickle file mapping tokens to their complaint/neutral label.
- `filtered_word_lists.pkl` — Pickle file containing a dict with two lists: `complaint_words_filtered` (complaint words that survived the manual review) and `neutral_words` (tokens the LLM labeled `0`).



In [None]:
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root and put
# its src/ folder first on sys.path so modules stored there can be imported.
project_root = Path.cwd().resolve().parent
sys.path[:0] = [str(project_root.joinpath("src"))]

In [8]:
import nltk
# Download the NLTK data packages used for tokenizing, stop-word removal,
# lemmatizing and POS tagging (same packages, same order as before).
for _nltk_resource in (
    'punkt',
    'stopwords',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
import requests
import json
import pickle
from pathlib import Path
from tqdm import tqdm
from helper_functions import *

In [None]:
# The three CPSC incident-report exports that feed this notebook
recall_files = [
    "../Data/cpsc_data/incident_reports/Toysandchildren_ArtsandCrafts.csv",
    "../Data/cpsc_data/incident_reports/Toysandchildren_Riding_Toys.csv",
    "../Data/cpsc_data/incident_reports/Toysandchildren_Toys.csv",
]

# Load each file through load_clean_csv, then stack the frames with a fresh index
recall_dfs = list(map(load_clean_csv, recall_files))
recalls_df = pd.concat(recall_dfs, ignore_index=True)


In [6]:
# Embed the incident descriptions: all non-null descriptions are joined into one
# space-separated string, encoded, and reshaped into a single-row 2-D array.
# NOTE(review): `model` is not defined in any visible cell — presumably a
# SentenceTransformer created elsewhere; confirm it exists after Restart & Run All.
incident_texts = recalls_df['Incident Description'].dropna().tolist()
combined_indicent_text = " ".join(incident_texts)
incident_desc_embedding = np.array(model.encode(combined_indicent_text)).reshape(1, -1)


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [9]:
# Resources for preprocessing the incident text: the English stop-word set and a
# WordNet lemmatizer used to reduce words to their lemmas.
# NOTE(review): presumably consumed by `preprocess` (not defined in the visible cells) — confirm.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [11]:
# Run `preprocess` over every non-null incident description (cast to str first)
incidents = recalls_df['Incident Description'].dropna().astype(str)
all_tokens = incidents.apply(preprocess)

# Collapse the per-incident token lists into one flat list
flattened_tokens = []
for token_list in all_tokens:
    flattened_tokens.extend(token_list)


In [12]:
# Token frequency table, plus a quick look at the 20 most common tokens
word_freq = Counter()
word_freq.update(flattened_tokens)
top_words = word_freq.most_common(20)
print(top_words)

[('toy', 2782), ('child', 1810), ('product', 1228), ('old', 1184), ('battery', 1149), ('get', 1129), ('play', 957), ('son', 917), ('one', 908), ('daughter', 893), ('year', 833), ('could', 815), ('come', 748), ('use', 716), ('consumer', 632), ('take', 608), ('small', 601), ('purchase', 595), ('would', 582), ('piece', 580)]


### LLM

In [41]:
# Prompt template for batch classification. `{token_list}` is filled in with
# str.format; the model is asked to reply with a JSON list of "1"/"0" strings.
TOKEN_BATCH_PROMPT = """
You are an expert at analyzing customer reviews for toys.

For each token below, classify it as:

- "1" → if it frequently appears in reviews describing problems, defects, malfunctions, poor quality, user dissatisfaction, safety concerns, or any kind of negative experience.
- "0" → if it is typically used in descriptive, generic, or positive contexts and is not strongly related to complaints.

Return only the answer as a JSON list of 1s and 0s, without explanation or extra text.
Example: ["1", "0", "0", "1"]

Tokens: {token_list}
Answer:
"""


In [43]:
def _parse_binary_labels(text):
    """Extract the "0"/"1" labels from a raw model reply, always as strings."""
    text = text.strip()
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None
    if isinstance(parsed, list):
        # Clean JSON reply: normalise ints (1) and strings ("1") to one type so
        # callers always receive strings, matching the fallback below.
        return [str(item).strip() for item in parsed]

    # Lenient fallback for replies wrapped in prose. Separators are replaced
    # with spaces (not deleted) so a compact '["1","0"]' does not fuse into "10".
    spaced = text.translate(str.maketrans({",": " ", "[": " ", "]": " ", '"': " "}))
    return [word for word in spaced.split() if word in {"0", "1"}]


def classify_tokens_batch_http(tokens, model="mixtral", host="http://localhost:11434"):
    """Classify a batch of tokens as complaint-related or not with one chat request.

    The tokens are inserted into TOKEN_BATCH_PROMPT and posted to
    ``{host}/api/chat``. The reply is read as a stream of newline-delimited JSON
    chunks whose ``message.content`` pieces are concatenated and then parsed.

    Args:
        tokens: Iterable of token strings to classify.
        model: Model name sent in the request body.
        host: Base URL of the chat server.

    Returns:
        A list of label strings. When the reply is a JSON list its items are
        returned as strings; otherwise only the standalone "0"/"1" words found
        in the text are returned. The list may be shorter or longer than
        ``tokens`` if the model misbehaves, so callers should check the length
        before pairing labels with tokens. An empty ``tokens`` returns ``[]``
        without contacting the server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    tokens = list(tokens)
    if not tokens:
        return []

    prompt = TOKEN_BATCH_PROMPT.format(token_list=", ".join(tokens))

    # The context manager releases the streamed connection even if parsing fails.
    with requests.post(
        f"{host}/api/chat",
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
        },
        stream=True,
        timeout=120,
    ) as r:
        # Surface HTTP failures instead of silently returning no labels.
        r.raise_for_status()

        chunks = []
        for line in r.iter_lines():
            if line:
                piece = json.loads(line.decode())
                if "message" in piece and "content" in piece["message"]:
                    chunks.append(piece["message"]["content"])

    return _parse_binary_labels("".join(chunks))



In [51]:

# Every distinct token counted in word_freq
vocab = list(word_freq)

# File the token -> label mapping is pickled to
OUT_FILE = Path("token_labels_individual.pkl")

# token -> label mapping, filled in by the classification loop
labels = dict()

# Function to classify a single token
def classify_token(token, model="mixtral", host="http://localhost:11434"):
    """Ask the chat model whether a single word is complaint-related.

    The word is embedded in a fixed prompt and posted to ``{host}/api/chat``.
    The reply is read as newline-delimited JSON chunks whose
    ``message.content`` pieces are concatenated.

    Args:
        token: The word to classify.
        model: Model name sent in the request body.
        host: Base URL of the chat server.

    Returns:
        1 or 0 taken from the reply, or None when the reply contains neither
        digit. A standalone digit (not touching another letter or digit) is
        preferred, so "10/10 ... 0" yields 0; if there is none, the first "0"
        or "1" character anywhere in the reply is used.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    prompt = f"""
You are an expert at analyzing customer reviews for toys and a binary classifier.

If the word is commonly associated with **complaints or problems** in toy reviews, respond with **1**.

Otherwise, respond with **0**.

Respond ONLY with a single digit: 0 or 1. Do NOT include explanation.

Word: {token}
Answer:
    """.strip()

    # The context manager releases the streamed connection even if parsing fails.
    with requests.post(
        f"{host}/api/chat",
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
        },
        stream=True,
        timeout=60,
    ) as response:
        # Surface HTTP failures instead of treating them as "no label".
        response.raise_for_status()

        chunks = []
        for line in response.iter_lines():
            if line:
                piece = json.loads(line.decode())
                if "message" in piece and "content" in piece["message"]:
                    chunks.append(piece["message"]["content"])
    full = "".join(chunks)

    # First choice: a 0/1 that stands alone, so digits inside larger numbers or
    # words (e.g. "10", "2021", "1st") are not mistaken for the answer.
    for i, c in enumerate(full):
        if c in {"0", "1"}:
            before = full[i - 1] if i > 0 else ""
            after = full[i + 1] if i + 1 < len(full) else ""
            if not (before.isalnum() or after.isalnum()):
                return int(c)

    # Last resort: the first 0 or 1 character anywhere in the reply.
    for c in full:
        if c in {"0", "1"}:
            return int(c)
    return None  # nothing usable in the reply

# Main loop — classify each token individually.
# The full run takes hours, so resume from an existing checkpoint instead of
# starting over: labels already stored in OUT_FILE for tokens in the current
# vocabulary are reused. Delete OUT_FILE to force a completely fresh run.
if OUT_FILE.exists():
    # NOTE: unpickling can execute arbitrary code — only reuse a checkpoint
    # that this notebook wrote itself.
    _vocab_set = set(vocab)
    _saved_labels = pickle.loads(OUT_FILE.read_bytes())
    labels.update({tok: lab for tok, lab in _saved_labels.items() if tok in _vocab_set})

pending_tokens = [token for token in vocab if token not in labels]

for token in tqdm(pending_tokens, desc="Classifying tokens one by one"):
    try:
        label = classify_token(token)
        if label is not None:
            labels[token] = label
            # Checkpoint every 50 labels so an interruption loses little work
            if len(labels) % 50 == 0:
                OUT_FILE.write_bytes(pickle.dumps(labels))
        else:
            print(f"❌ No label found for token: {token}")
    except Exception as e:
        # Best effort: report the failure and move on. The token stays
        # unlabeled and is retried the next time this cell runs.
        print(f"❌ Error for token '{token}': {e}")

# Final save
OUT_FILE.write_bytes(pickle.dumps(labels))
print(f"\n✅ Done! Saved {len(labels)} labels to {OUT_FILE}")

Classifying tokens one by one: 100%|████████████████████████████████████████████| 9149/9149 [11:34:31<00:00,  4.55s/it]


✅ Done! Saved 9149 labels to token_labels_individual.pkl





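## ChatGPT-Filtered Complaint Words and Neutral Words

The words the LLM labeled as complaint-related still contain false positives that need to be eliminated. Review them with ChatGPT in batches of 300 words: print one batch, ask ChatGPT which words are unrelated to complaints, and paste those words into `unrelated_words` below.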

In [57]:
# Split the labeled vocabulary in one pass:
#   label 1 -> complaint-related, label 0 -> neutral or positive
complaint_words, neutral_words = [], []
for word, label in labels.items():
    if label == 1:
        complaint_words.append(word)
    elif label == 0:
        neutral_words.append(word)


In [None]:
# Index window into complaint_words for the batch currently under manual review.
# Batches are 300 words wide; change begin_ind to move to another batch.
begin_ind = 6150
end_ind = begin_ind + 300
print(', '.join(complaint_words[begin_ind:end_ind]))


graduation, nearer, tinnitus, privateer, heath, lingered, squad, scabby, burried, splatr, microhyphema, sandy, sepsis, beating, posed, ignatius, itt, twine, widen, ferrite


In [315]:

# Words from the current batch judged to be unrelated or neutral
# (curated by hand for this batch).
unrelated_words = [
    "graduation",
    "nearer",
    "privateer",
    "heath",
    "lingered",
    "squad",
    "posed",
    "ignatius",
    "itt",
    "twine",
    "widen",
    "ferrite"
]

# The batch under review, in its original order
batch_words = complaint_words[begin_ind:end_ind]

# Convert to sets
complaint_set = set(batch_words)
unrelated_set = set(unrelated_words)

# A hand-typed word that is not in the batch (typo, wrong batch) would
# otherwise be ignored silently, so report it.
stray_words = sorted(unrelated_set - complaint_set)
if stray_words:
    print(f"Warning: unrelated words not found in this batch: {stray_words}")

# Keep only complaint-related words (those not in the unrelated set). Filtering
# the ordered batch instead of subtracting sets makes the result order
# reproducible across runs; dict.fromkeys drops any duplicates.
filtered_complaint_words = list(
    dict.fromkeys(word for word in batch_words if word not in unrelated_set)
)

# Show result
print("Complaint-related words:")
print(filtered_complaint_words)

Complaint-related words:
['sepsis', 'scabby', 'sandy', 'tinnitus', 'burried', 'splatr', 'beating', 'microhyphema']


In [317]:
# Running totals across all reviewed batches. Create the accumulators on first
# use so this cell also works on a fresh kernel; existing lists are left alone.
globals().setdefault("filtered_complaint_words_all", [])
globals().setdefault("filtered_unrelated_words_all", [])

# Add only words that are not already recorded, so re-running this cell for the
# same batch does not inflate the lists with duplicates.
for _accumulator, _batch in (
    (filtered_complaint_words_all, filtered_complaint_words),
    (filtered_unrelated_words_all, unrelated_words),
):
    _already_seen = set(_accumulator)
    _accumulator.extend(word for word in dict.fromkeys(_batch) if word not in _already_seen)

print(f'Length of complaint {len(filtered_complaint_words_all)}')
print(f'Length of unrelated {len(filtered_unrelated_words_all)}')

Length of complaint 3044
Length of unrelated 3185


In [327]:
# Persist the reviewed complaint words together with the LLM-labeled neutral words.
# NOTE(review): filtered_unrelated_words_all is not written to the file — confirm that is intended.
with open("filtered_word_lists.pkl", "wb") as f:
    word_lists = dict(
        complaint_words_filtered=filtered_complaint_words_all,
        neutral_words=neutral_words,
    )
    pickle.dump(word_lists, f)

In [331]:
# Read the saved word lists back to verify the round trip
with open("filtered_word_lists.pkl", "rb") as f:
    data = pickle.load(f)

# Unpack the two stored lists
complaint_words3, neutral_words3 = (
    data[key] for key in ("complaint_words_filtered", "neutral_words")
)

In [345]:
 # Sanity check: list the keys stored in the reloaded pickle
 data.keys()

dict_keys(['complaint_words_filtered', 'neutral_words'])

In [335]:
# Sanity check: number of neutral words read back from the pickle
len(neutral_words3)

6164

In [333]:
# Sanity check: number of filtered complaint words read back from the pickle
len(complaint_words3)


3044