# Small Pipeline Demo (Laptop / CPU)
This notebook runs the entire pipeline on a **small sample** of the Amazon Electronics reviews.
Place your dataset at `data/Electronics_5.json.gz`. The code writes all artifacts locally.

**Note**: Charts use matplotlib (no seaborn). Each chart is on its own figure.

In [None]:

# Optional: if running in a fresh env, install deps (uncomment).
# %pip targets the environment of the running kernel.
# %pip install -r ../requirements.txt

import json, os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from cfpipe.config import load_config, get_paths, ROOT
# Load the pipeline configuration and derive the artifact paths from it;
# `cfg` and `paths` are read by every later cell in this notebook.
cfg = load_config()
paths = get_paths(cfg)
# Show the project root and the resolved paths as this cell's output.
ROOT, paths


## 1) Preprocess (sample size set by `sample_rows` in the config)

In [None]:

from cfpipe.preprocess import run as preprocess_run

# Sample size is taken from the shared config rather than hard-coded here.
sample_rows = cfg["defaults"]["sample_rows"]

# Run preprocessing on the raw file. The remaining positional arguments are
# the output locations (cleaned CSV, removed-rows CSV, language stats).
preprocess_run(
    paths.raw_path,
    paths.cleaned_csv,
    paths.removed_csv,
    paths.language_stats,
    max_rows=100000,
    sample=sample_rows,
    infer_cols=True,
)

# Preview the first rows of the cleaned output.
pd.read_csv(paths.cleaned_csv).head()


## 2) Chunking (RoBERTa tokenizer)

In [None]:

from cfpipe.chunking import run as chunk_run

# Chunk size and overlap both come from the config defaults.
defaults = cfg["defaults"]
chunk_run(
    paths.cleaned_csv,
    paths.chunked_csv,
    chunk_tokens=defaults["chunk_tokens"],
    overlap=defaults["chunk_overlap"],
)

# Preview the first rows of the chunked output.
pd.read_csv(paths.chunked_csv).head()


## 3) ABSA (zero-shot aspects + RoBERTa sentiment)

In [None]:

from cfpipe.absa import run as absa_run

# Aspect labels come from the shared config.
aspects = cfg["defaults"]["absa_aspects"]
absa_run(paths.chunked_csv, paths.aspect_sentiment, aspects=aspects, batch_size=8, device="cpu")

# Sanity check: show the first ABSA record. The context manager closes the
# file deterministically instead of leaking the handle until garbage collection.
with open(paths.aspect_sentiment) as fh:
    absa_records = json.load(fh)

# Guard against an empty result so the cell reports it instead of raising IndexError.
print(absa_records[0] if absa_records else "No ABSA records were written.")


## 4) Embeddings (distilroberta) + TF-IDF

In [None]:

from cfpipe.embed import run as embed_run
from cfpipe.tfidf import run as tfidf_run

# Dense embeddings: the model name is the laptop-sized one from the config.
laptop_model = cfg["defaults"]["embedding_model_laptop"]
embed_run(
    paths.chunked_csv,
    paths.id_to_text_map,
    paths.dense_memmap,
    model_name=laptop_model,
    batch_size=64,
    device="cpu",
)

# Sparse features over the same chunked CSV.
tfidf_run(
    paths.chunked_csv,
    paths.tfidf_vectorizer,
    paths.tfidf_matrix,
    backend="tfidf",
    max_features=200000,
)


## 5) FAISS (CPU HNSW) + KG

In [None]:

from cfpipe.build_faiss import run as faiss_run
from cfpipe.kg import run as kg_run

# Embedding width passed to the index builder (distilroberta dim=768).
embedding_dim = 768

# Build the CPU HNSW index from the dense memmap.
faiss_run(
    paths.dense_memmap,
    dim=embedding_dim,
    out_path=paths.faiss_index,
    index_type="hnsw",
    gpu=False,
)

# Build the KG artifact from the chunked CSV and the ABSA output.
kg_run(paths.chunked_csv, paths.aspect_sentiment, paths.kg_pickle)


## 6) Retrieval sanity check

In [None]:

from cfpipe.retriever import HybridRetriever

# The retriever is constructed from the artifacts written by the earlier cells,
# using the same embedding model name that was passed to the embedding step.
embedding_model = cfg["defaults"]["embedding_model_laptop"]
r = HybridRetriever(
    paths.id_to_text_map,
    embedding_model,
    paths.faiss_index,
    paths.tfidf_vectorizer,
    paths.tfidf_matrix,
    paths.kg_pickle,
)

# Example query; the five results are shown as the cell output.
q = "battery life of these headphones with noise cancelling"
r.search(q, top_k=5)


## 7) Visualizations (matplotlib)
We will plot: ratings histogram, sentiment pie, top negative aspects bar, and sentiment over time.

In [None]:

# Load the artifacts used by the visualizations. The context manager closes
# the ABSA JSON file instead of leaking the handle.
df_chunks = pd.read_csv(paths.chunked_csv)
with open(paths.aspect_sentiment) as fh:
    aspect = json.load(fh)
df_absa = pd.DataFrame(aspect)

# Ratings histogram.
# Ratings are assumed to be discrete star values (TODO confirm against the
# cleaned CSV). One bar per distinct value keeps each bar centred on its star
# count; ten equal-width bins would place the bars off the star values.
df_clean = pd.read_csv(paths.cleaned_csv)
rating_counts = df_clean['rating'].dropna().value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(rating_counts.index, rating_counts.values)
ax.set_xticks(list(rating_counts.index))
ax.set_title("Ratings distribution")
ax.set_xlabel("Stars")
ax.set_ylabel("Count")
plt.show()


In [None]:

# Sentiment pie: share of ABSA records per sentiment label.
# Reload the ABSA output so this cell does not depend on earlier plot cells;
# the context manager closes the file instead of leaking the handle.
with open(paths.aspect_sentiment) as fh:
    df_absa = pd.DataFrame(json.load(fh))

sent_counts = df_absa['sentiment'].value_counts()

fig, ax = plt.subplots()
ax.pie(sent_counts.values, labels=sent_counts.index, autopct="%1.1f%%")
ax.set_title("Sentiment distribution")
plt.show()


In [None]:

# Top negative aspects bar: the 15 aspects mentioned most often in records
# labelled 'negative'.
# The context manager closes the ABSA JSON file instead of leaking the handle.
with open(paths.aspect_sentiment) as fh:
    df_absa = pd.DataFrame(json.load(fh))

neg = df_absa[df_absa['sentiment'] == 'negative']

# Each 'aspects' entry is iterated as a collection of aspect names (assumed to
# be a list per record; TODO confirm). explode() yields one row per aspect, and
# value_counts() sorts by descending frequency and ignores empty entries.
aspect_counts = neg['aspects'].explode().value_counts()
top = list(aspect_counts.head(15).items())

fig, ax = plt.subplots()
ax.bar([a for a, _ in top], [c for _, c in top])
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Count")
ax.set_title("Top negative aspects")
plt.show()


In [None]:

# Sentiment over time (avg signed score per calendar month).
df_chunks = pd.read_csv(paths.chunked_csv, usecols=["chunk_id", "unix_time"])
# The context manager closes the ABSA JSON file instead of leaking the handle.
with open(paths.aspect_sentiment) as fh:
    df_absa = pd.DataFrame(json.load(fh))

# Only explicit polarities carry a sign. Any other label (e.g. a neutral class)
# contributes 0 instead of being counted as negative.
sign_by_label = {"positive": 1.0, "negative": -1.0}

df = (
    df_absa.merge(df_chunks, on="chunk_id", how="left")
    .assign(ts=lambda d: pd.to_datetime(d["unix_time"], unit="s", errors="coerce"))
    # Rows without a parseable timestamp cannot be placed on the time axis;
    # dropping them avoids a bogus "NaT" month in the trend.
    .dropna(subset=["ts"])
    .assign(
        # Month start as a real timestamp so the x-axis is a proper time axis.
        month=lambda d: d["ts"].dt.to_period("M").dt.to_timestamp(),
        signed=lambda d: d["sentiment"].map(sign_by_label).fillna(0.0)
        * d["sentiment_score"].astype(float),
    )
)
trend = df.groupby("month")["signed"].mean().reset_index()

fig, ax = plt.subplots()
ax.plot(trend["month"], trend["signed"])
fig.autofmt_xdate(rotation=45, ha="right")
ax.set_xlabel("Month")
ax.set_ylabel("Avg sentiment score")
ax.set_title("Trend over time")
plt.show()


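## 8) Start API + Dashboard (run in terminal)
Both commands keep running until stopped, so start each in its own terminal. The dashboard path is relative, so run it from the directory that contains `src/` (presumably the repository root).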
```bash
uvicorn cfpipe.api_server:app --port 8000

streamlit run src/cfpipe/dashboard.py
```