In [None]:
import re
import pickle
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import spacy

from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import TfidfModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


In [3]:
# Only the tagger/lemmatizer components are needed downstream, so the
# dependency parser and NER are switched off when loading the pipeline.
nlp = spacy.load("en_core_web_sm", disable=['parser','ner'])

# NLTK's English stop words plus review/e-commerce boilerplate terms.
stop_words = stopwords.words('english') + [
    "product", "amazon", "review", "reviews",
    "purchase", "purchased", "buy", "bought",
    "item", "items", "order", "ordered",
    "shipping", "delivery", "packaging", "seller",
    "customer", "service", "price", "prices",
    "brand", "brands", "store", "stores",
]

In [None]:
def clean_text(text):
    """Normalise a raw review into lowercase, letters-only text.

    Steps: strip ``<...>`` markup, replace every non-ASCII-letter character
    with a space, collapse whitespace runs, trim, and lowercase.

    Args:
        text: Raw review text. Non-string values are stringified first.

    Returns:
        The cleaned string. Missing values (``None`` or a float NaN) yield
        ``''`` instead of the literal words "none"/"nan".
    """
    # Without this guard str(None)/str(nan) would survive the regexes and
    # inject the tokens "none"/"nan" into the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'<.*?>',' ',str(text))
    text = re.sub(r'[^a-zA-Z]',' ',text)
    return re.sub(r'\s+',' ',text).strip().lower()


In [5]:
def tokenize(text,min_len=3):
    """Split cleaned text into lowercase, de-accented tokens.

    Args:
        text: Text to tokenize.
        min_len: Minimum token length to keep.

    Returns:
        List of tokens that are at least ``min_len`` characters long and are
        not in the notebook-level ``stop_words`` collection.
    """
    # simple_preprocess applies its own minimum length (2 by default), so the
    # threshold is forwarded; otherwise min_len < 2 would be silently ignored.
    return [token for token in simple_preprocess(text,deacc=True,min_len=min_len)
            if token not in stop_words]

In [6]:
def build_phrasers(token_docs, bigram_min_count=10, bigram_threshold=50, trigram_threshold=50,
                   trigram_min_count=5):
    """Train bigram and trigram phrase detectors on tokenized documents.

    Args:
        token_docs: Re-iterable collection of token lists (it is traversed
            twice: once per phrase model).
        bigram_min_count: ``min_count`` for the bigram ``Phrases`` model.
        bigram_threshold: ``threshold`` for the bigram ``Phrases`` model.
        trigram_threshold: ``threshold`` for the trigram ``Phrases`` model.
        trigram_min_count: ``min_count`` for the trigram ``Phrases`` model.
            Defaults to 5, the value gensim used implicitly before this
            parameter was exposed.

    Returns:
        Tuple ``(bigram_mod, trigram_mod)`` of ``Phraser`` objects.
    """
    bigram = Phrases(token_docs, min_count=bigram_min_count, threshold=bigram_threshold)
    bigram_mod = Phraser(bigram)
    # The trigram model is trained on bigram-merged documents so it can
    # extend already-detected pairs by one more token.
    trigram = Phrases((bigram_mod[doc] for doc in token_docs),
                      min_count=trigram_min_count, threshold=trigram_threshold)
    trigram_mod = Phraser(trigram)
    return bigram_mod, trigram_mod


In [7]:
def apply_phrases(token_docs, bigram_mod, trigram_mod):
    """Run every document through the bigram model, then the trigram model.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in token_docs:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


In [8]:
def lemmatization(token_docs, allowed_postags=('NOUN','ADJ','VERB','ADV'), batch_size=1000):
    """Lemmatize documents, keeping only selected parts of speech.

    Each token list is joined with spaces and processed by the notebook-level
    spaCy pipeline ``nlp``.

    Args:
        token_docs: Iterable of token lists.
        allowed_postags: Coarse POS tags (``token.pos_``) to keep.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        List of lemma lists, one per input document and in the same order;
        lemmas found in the notebook-level ``stop_words`` are dropped.
    """
    # Sets give constant-time membership tests inside the per-token loop.
    allowed = set(allowed_postags)
    blocked = set(stop_words)
    joined_docs = (" ".join(doc_tokens) for doc_tokens in token_docs)
    texts_out = []
    # nlp.pipe streams documents through the pipeline in batches, which avoids
    # the per-call overhead of invoking nlp() once per document.
    for doc in nlp.pipe(joined_docs, batch_size=batch_size):
        lemmas = [token.lemma_ for token in doc
                  if token.pos_ in allowed and token.lemma_ not in blocked]
        texts_out.append(lemmas)
    return texts_out


In [9]:
def build_dictionary(texts, no_below=10, no_above=0.4):
    """Create a gensim ``Dictionary`` from ``texts`` and prune it.

    ``no_below`` and ``no_above`` are forwarded unchanged to
    ``Dictionary.filter_extremes``; the pruned dictionary is returned.
    """
    vocabulary = Dictionary(documents=texts)
    vocabulary.filter_extremes(no_below=no_below, no_above=no_above)
    return vocabulary

In [10]:
def build_corpus(texts,id2word):
    """Convert each token list in ``texts`` with ``id2word.doc2bow``.

    Returns a list holding one converted document per input document.
    """
    bows = []
    for tokens in texts:
        bows.append(id2word.doc2bow(tokens))
    return bows


In [11]:
def tfidf_filter_corpus(corpus,id2word,low_thresh=0.03):
    """Drop low-TF-IDF terms from every bag-of-words document.

    Args:
        corpus: List of bag-of-words documents (lists of ``(id, freq)``).
        id2word: Dictionary passed to ``TfidfModel``.
        low_thresh: Terms whose TF-IDF weight is below this value are removed.

    Returns:
        A list with one document per input document. Raw frequencies are
        preserved for the surviving terms; a document that would end up empty
        is kept unfiltered instead.
    """
    tfidf = TfidfModel(corpus,id2word=id2word)
    filtered = []
    for bow in corpus:
        # Build the id -> weight lookup once per document rather than once
        # per term; ids absent from the TF-IDF output count as weight 0.0.
        weights = dict(tfidf[bow])
        filtered_bow = [(wid,freq) for (wid,freq) in bow
                        if weights.get(wid,0.0) >= low_thresh]
        filtered.append(filtered_bow if filtered_bow else bow)
    return filtered


In [12]:
# Forward slash works on Windows as well as POSIX; the previous backslash
# separator only resolved on Windows.
df = pd.read_csv("dataset/Reviews.csv")


In [13]:
# Preview the first five rows of the raw review table.
df.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [14]:
# Remove metadata columns that are not used below; errors='ignore' skips any
# that are already gone, so the cell can be re-run safely.
df = df.drop(
    columns=[
        'Id', 'ProductId', 'UserId', 'ProfileName',
        'HelpfulnessNumerator', 'HelpfulnessDenominator',
        'Time', 'Summary',
    ],
    errors='ignore',
)


In [17]:
# Number of reviews per star rating.
df['Score'].value_counts()

Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64

In [18]:
# Raw text of the review at index label 0.
df.loc[0, 'Text']

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [19]:
# Apply clean_text to each review and store the result in a new column.
df['clean_text'] = df['Text'].apply(clean_text)


In [20]:
# Cleaned text of the review at index label 0, for comparison with the raw text.
df.loc[0, 'clean_text']

'i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than most'

In [21]:
# One token list per cleaned review, in row order.
token_docs = [tokenize(text) for text in df['clean_text']]

In [22]:
# Spot-check the tokenizer on the first review (first 20 tokens only).
print("Tokenized sample:", token_docs[0][:20])

Tokenized sample: ['several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'better']


In [23]:
# Train the phrase detectors on the tokenized reviews, then apply them to the
# same documents.
bigram_mod, trigram_mod = build_phrasers(token_docs=token_docs)
phrased_docs = apply_phrases(
    token_docs=token_docs,
    bigram_mod=bigram_mod,
    trigram_mod=trigram_mod,
)

In [28]:
# First 20 tokens of the first review after phrase detection; compare with
# the tokenized sample to see whether any phrases were merged.
phrased_docs[0][:20]

['several',
 'vitality',
 'canned',
 'dog',
 'food',
 'products',
 'found',
 'good',
 'quality',
 'looks',
 'like',
 'stew',
 'processed',
 'meat',
 'smells',
 'better',
 'labrador',
 'finicky',
 'appreciates',
 'better']

In [29]:
# Lemmatize the phrase-merged reviews with the default POS whitelist.
data_lemmatized = lemmatization(token_docs=phrased_docs)
print("Lemmatized sample:", data_lemmatized[0][:20])

Lemmatized sample: ['several', 'vitality', 'dog', 'food', 'find', 'good', 'quality', 'look', 'stew', 'process', 'meat', 'smell', 'well', 'labrador', 'finicky', 'appreciate', 'well']


In [33]:
# Build the pruned vocabulary, then encode every lemmatized review against it.
id2word = build_dictionary(texts=data_lemmatized)
corpus = build_corpus(texts=data_lemmatized, id2word=id2word)

In [34]:
# Vocabulary size after pruning, plus the first 20 (id, token) pairs.
print("Dictionary size:", len(id2word))
print("Sample tokens:", list(id2word.items())[:20])

Dictionary size: 27421
Sample tokens: [(0, 'appreciate'), (1, 'dog'), (2, 'find'), (3, 'finicky'), (4, 'food'), (5, 'good'), (6, 'labrador'), (7, 'look'), (8, 'meat'), (9, 'process'), (10, 'quality'), (11, 'several'), (12, 'smell'), (13, 'stew'), (14, 'vitality'), (15, 'well'), (16, 'actually'), (17, 'arrive'), (18, 'error'), (19, 'intend')]


In [35]:
# Show the first document as raw (id, count) pairs and with ids mapped back
# to their tokens.
print("First document BOW:", corpus[0])
print("Decoded:", [(id2word[token_id], count) for token_id, count in corpus[0]])

First document BOW: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2)]
Decoded: [('appreciate', 1), ('dog', 1), ('find', 1), ('finicky', 1), ('food', 1), ('good', 1), ('labrador', 1), ('look', 1), ('meat', 1), ('process', 1), ('quality', 1), ('several', 1), ('smell', 1), ('stew', 1), ('vitality', 1), ('well', 2)]


In [None]:
corpus_filtered = tfidf_filter_corpus(corpus,id2word)

# tfidf_filter_corpus emits exactly one document per input document, so both
# lengths should match; printing them reproduces the recorded cell output.
print("Corpus length:", len(corpus))
print("Corpus filtered length:", len(corpus_filtered))


Corpus length: 568454
Corpus filtered length: 568454


In [None]:
def train_lda(corpus, id2word,num_topics,passes = 12,iterations = 300,alpha = 'auto',eta = 'auto',
              random_state = 42,chunksize = 2000):
    """Train a gensim ``LdaModel`` on a bag-of-words corpus.

    Args:
        corpus: Bag-of-words corpus to train on.
        id2word: Dictionary mapping ids to tokens.
        num_topics: Number of topics to fit.
        passes: Forwarded to ``LdaModel(passes=...)``.
        iterations: Forwarded to ``LdaModel(iterations=...)``.
        alpha: Forwarded to ``LdaModel(alpha=...)``.
        eta: Forwarded to ``LdaModel(eta=...)``.
        random_state: Seed forwarded to ``LdaModel``; defaults to the
            previously hard-coded 42.
        chunksize: Forwarded to ``LdaModel(chunksize=...)``; defaults to the
            previously hard-coded 2000.

    Returns:
        The trained ``LdaModel`` (created with ``update_every=1`` and
        ``per_word_topics=True``).
    """
    return LdaModel(
        corpus = corpus,
        id2word = id2word,
        num_topics = num_topics,
        random_state = random_state,
        update_every = 1,
        chunksize = chunksize,
        passes = passes,
        iterations = iterations,
        alpha = alpha,
        eta = eta,
        per_word_topics = True
    )


In [None]:
def compute_coherence(lda, texts, dictionary, metric='c_v'):
    """Return the coherence score of ``lda`` under the given ``metric``.

    ``texts`` and ``dictionary`` are handed to gensim's ``CoherenceModel``
    together with the model.
    """
    scorer = CoherenceModel(
        model=lda,
        texts=texts,
        dictionary=dictionary,
        coherence=metric,
    )
    return scorer.get_coherence()


In [None]:
def grid_search_topics(corpus, texts, id2word, topic_range = (5,21,5)):
    """Train and score one LDA model per topic count in ``range(*topic_range)``.

    Each candidate's c_v coherence is printed as it is computed.

    Returns:
        List of dicts with keys ``"num_topics"``, ``"model"`` and ``"c_v"``,
        in the order the topic counts were tried.
    """
    def _fit_and_score(k):
        # Train one candidate, score it, and report progress.
        model = train_lda(corpus, id2word, num_topics = k)
        c_v = compute_coherence(model, texts, id2word, 'c_v')
        print(f"Topics={k} | c_v={c_v:.4f}")
        return {"num_topics":k,"model":model,"c_v":c_v}

    return [_fit_and_score(k) for k in range(*topic_range)]


In [None]:
# Fit the candidate models on the TF-IDF-filtered corpus and keep the one
# with the highest c_v coherence.
results = grid_search_topics(
    corpus=corpus_filtered,
    texts=data_lemmatized,
    id2word=id2word,
)
best = max(results, key=lambda r: r["c_v"])
lda = best["model"]
print(f"Best num_topics={best['num_topics']} with c_v={best['c_v']:.4f}")


Topics=5 | c_v=0.5625
Topics=10 | c_v=0.5979
Topics=15 | c_v=0.5581
Topics=20 | c_v=0.5377
Best num_topics=10 with c_v=0.5979


In [None]:
# Print the ten highest-weighted words of every topic returned by
# print_topics for the selected model.
for i, topic in lda.print_topics(num_words = 10):
    print(f"Topic {i}: {topic}")

Topic 0: 0.135*"food" + 0.048*"cat" + 0.044*"eat" + 0.023*"dry" + 0.020*"ingredient" + 0.019*"baby" + 0.015*"feed" + 0.014*"grain" + 0.011*"healthy" + 0.011*"chicken"
Topic 1: 0.050*"taste" + 0.048*"good" + 0.042*"flavor" + 0.034*"try" + 0.025*"well" + 0.021*"really" + 0.017*"much" + 0.016*"little" + 0.014*"think" + 0.013*"give"
Topic 2: 0.267*"coffee" + 0.129*"cup" + 0.037*"strong" + 0.031*"blend" + 0.030*"bean" + 0.024*"bitter" + 0.024*"smooth" + 0.023*"brew" + 0.015*"espresso" + 0.013*"morning"
Topic 3: 0.058*"hot" + 0.046*"salt" + 0.046*"sauce" + 0.034*"cook" + 0.024*"recipe" + 0.023*"add" + 0.021*"rice" + 0.020*"spicy" + 0.019*"heat" + 0.018*"cheese"
Topic 4: 0.052*"love" + 0.052*"get" + 0.049*"great" + 0.039*"make" + 0.036*"find" + 0.028*"time" + 0.017*"day" + 0.013*"go" + 0.013*"keep" + 0.012*"long"
Topic 5: 0.321*"tea" + 0.045*"green" + 0.033*"black" + 0.031*"drink" + 0.024*"ice" + 0.017*"ginger" + 0.017*"hot" + 0.016*"lemon" + 0.012*"bag" + 0.012*"mint"
Topic 6: 0.033*"bag" + 

In [49]:
# Prepare the pyLDAvis data from the selected model, the corpus it was
# trained on (corpus_filtered) and the dictionary, then render it inline.
vis_data = gensimvis.prepare(lda, corpus_filtered, id2word)
pyLDAvis.display(vis_data)

In [None]:
# Save the selected LDA model via its own save() method
lda.save("ldaModel.gensim")

# Save the dictionary the model was trained with
id2word.save("ldaDictionary.gensim")

# Save both phrase models together in one pickle, keyed by name.
# NOTE(review): this file has to be read back with pickle.load, so only load
# it from a trusted location.
with open("Phrasers.pkl","wb") as f: 
    pickle.dump({"bigram_mod":bigram_mod, "trigram_mod":trigram_mod}, f)
