<a href="https://colab.research.google.com/github/sreevanimtcs2502/sreevanimtcs2502/blob/main/gibbs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset


# Load a small slice of the Yelp review corpus (2% of the training split).
print("Loading Yelp dataset...")
dataset = load_dataset("yelp_review_full", split="train[:2%]")  # Use small subset for speed
texts = dataset["text"]
labels = dataset["label"]  # 0-4 (1 to 5 stars)

# Map star labels to binary sentiment: 0, 1, 2 -> Negative; 3, 4 -> Positive.
# The earlier cut-off (`l < 2`) placed label 2 in the Positive class, which
# contradicted this documented mapping; the comparison now includes label 2.
sentiments = ["Negative" if label <= 2 else "Positive" for label in labels]


# Bag-of-words counts: English stop words removed, vocabulary capped at 1000 features.
print("Vectorizing text...")
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(texts)
vocab = vectorizer.get_feature_names_out()


# Corpus dimensions taken from the document-term matrix.
n_docs = X.shape[0]
n_words = X.shape[1]

# LDA settings.
n_topics, n_iters = 100, 20   # number of topics / Gibbs sweeps (kept low for a faster run)
alpha = beta = 0.1            # Dirichlet priors: alpha for doc-topic, beta for topic-word


# ---------------------------------------------------------------------------
# Collapsed Gibbs sampling for LDA
# ---------------------------------------------------------------------------

# Distinct vocabulary ids present in each document. This list is reused by the
# lexicon-based sentiment code later in the script, so it is left unchanged.
word_indices = [X[i].nonzero()[1] for i in range(n_docs)]

# The sampler needs one entry per *token occurrence*, not per distinct word:
# a word that appears three times in a review must receive three topic
# assignments. `nonzero()` alone discards the counts, so expand each row of
# the count matrix by its term frequencies.
X_csr = X.tocsr()
token_indices = [
    np.repeat(
        X_csr.indices[X_csr.indptr[d]:X_csr.indptr[d + 1]],
        X_csr.data[X_csr.indptr[d]:X_csr.indptr[d + 1]],
    )
    for d in range(n_docs)
]

# Seeded generator so that repeated runs give the same topic assignments.
rng = np.random.default_rng(42)

# Random initial topic for every token.
z = [rng.integers(0, n_topics, size=len(tokens)) for tokens in token_indices]

# Count tables with the Dirichlet pseudo-counts folded in, so the sampling
# formula below needs no extra "+ alpha" / "+ beta" terms.
doc_topic = np.full((n_docs, n_topics), alpha, dtype=float)
topic_word = np.full((n_topics, n_words), beta, dtype=float)
topic_count = np.full(n_topics, n_words * beta, dtype=float)

# Initialize counts from the random assignments.
for d, (tokens, topics) in enumerate(zip(token_indices, z)):
    per_topic = np.bincount(topics, minlength=n_topics)
    doc_topic[d] += per_topic
    topic_count += per_topic
    # np.add.at accumulates correctly when the same (topic, word) pair repeats.
    np.add.at(topic_word, (topics, tokens), 1)


print(f"Running Gibbs Sampling for {n_topics} topics...")
for it in range(n_iters):
    for d, tokens in enumerate(token_indices):
        doc_z = z[d]
        doc_counts = doc_topic[d]  # view: in-place updates write through to doc_topic
        for i, w in enumerate(tokens):
            topic = doc_z[i]

            # Remove the current token from all counts.
            doc_counts[topic] -= 1
            topic_word[topic, w] -= 1
            topic_count[topic] -= 1

            # Conditional distribution over topics for this token:
            # p(z = k) is proportional to (n_kw + beta) / (n_k + V * beta) * (n_dk + alpha)
            p_z = (topic_word[:, w] / topic_count) * doc_counts
            p_z /= np.sum(p_z)

            # Sample new topic
            new_topic = rng.choice(n_topics, p=p_z)

            # Add the token back under its new topic.
            doc_z[i] = new_topic
            doc_counts[new_topic] += 1
            topic_word[new_topic, w] += 1
            topic_count[new_topic] += 1

    print(f"Iteration {it+1}/{n_iters} complete")

print(f"\nâœ… Gibbs sampling complete for {n_topics} topics!")


# Lexicon baseline: predict a review's sentiment by comparing how many of its
# vocabulary words are positive cue words versus negative cue words, then
# compare against the star-derived labels.
# NOTE: this prediction uses only `vocab` and `word_indices`; the topic
# assignments produced by the Gibbs sampler are not consulted here.
positive_words = {"good", "great", "amazing", "love", "excellent", "awesome", "nice", "fantastic"}
negative_words = {"bad", "worst", "terrible", "poor", "awful", "boring", "disappointing"}

pred_sentiments = []
for wi in word_indices:
    doc_words = vocab[wi]
    pos = sum(w in positive_words for w in doc_words)
    neg = sum(w in negative_words for w in doc_words)
    # Ties (including reviews with no cue words at all) default to Positive.
    pred_sentiments.append("Positive" if pos >= neg else "Negative")


n_reviews = len(sentiments)
correct = sum(p == t for p, t in zip(pred_sentiments, sentiments))
# Guard against an empty corpus instead of raising ZeroDivisionError.
accuracy = correct / n_reviews if n_reviews else 0.0

print(f"\nModel Accuracy (approx): {accuracy:.2f}")
print("\nSample Results:\n")
# Show at most five examples; fewer when the corpus is smaller than that.
for i in range(min(5, n_reviews)):
    print(f"Review: {texts[i][:80]}...")
    print(f"â†’ True: {sentiments[i]} | Predicted: {pred_sentiments[i]}\n")


Loading Yelp dataset...
Vectorizing text...
Running Gibbs Sampling for 100 topics...
Iteration 1/20 complete
Iteration 2/20 complete
Iteration 3/20 complete
Iteration 4/20 complete
Iteration 5/20 complete
Iteration 6/20 complete
Iteration 7/20 complete
Iteration 8/20 complete
Iteration 9/20 complete
Iteration 10/20 complete
Iteration 11/20 complete
Iteration 12/20 complete
Iteration 13/20 complete
Iteration 14/20 complete
Iteration 15/20 complete
Iteration 16/20 complete
Iteration 17/20 complete
Iteration 18/20 complete
Iteration 19/20 complete
Iteration 20/20 complete

âœ… Gibbs sampling complete for 100 topics!

Model Accuracy (approx): 0.66

Sample Results:

Review: dr. goldberg offers everything i look for in a general practitioner.  he's nice ...
â†’ True: Positive | Predicted: Positive

Review: Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of th...
â†’ True: Negative | Predicted: Positive

Review: Been going to Dr. Goldberg for over 10 years. I think 