In [40]:
import pandas as pd
import spacy
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")
data = pd.read_csv("train.csv")
print(data["review"].head())

0    Bromwell High is a cartoon comedy. It ran at t...
1    Homelessness (or Houselessness as George Carli...
2    Brilliant over-acting by Lesley Ann Warren. Be...
3    This is easily the most underrated film inn th...
4    This is not the typical Mel Brooks film. It wa...
Name: review, dtype: object


In [41]:
def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

In [42]:
tf_vectorizer = CountVectorizer(
    # set up your CountVectorizer
    tokenizer=spacy_tokenizer, max_df=0.95, min_df=2, max_features=1000
)

In [43]:
with tqdm(total=len(data)) as pbar:
    tf = tf_vectorizer.fit_transform(data["review"])
    pbar.update(len(data))

100%|██████████| 25000/25000 [15:35<00:00, 26.73it/s]


In [44]:
print(tf_vectorizer.get_feature_names_out()[:10])

['ability' 'able' 'absolutely' 'accent' 'accept' 'act' 'acting' 'action'
 'actor' 'actress']


In [45]:
lda = LatentDirichletAllocation(
    # set up your LatentDirichletAllocation
    n_components=20, learning_method='online', learning_offset=50, random_state=69
)

In [46]:
with tqdm(total=100) as pbar:
    lda.fit(tf)
    pbar.update(100)

100%|██████████| 100/100 [01:53<00:00,  1.13s/it]


In [47]:
def show_topic(model, feature_names, top):
    for index, distribution in enumerate(model.components_):
        sorted_word_indices = distribution.argsort()[::-1][:top]
        print(f"Topic {index}:")
        print(" ".join([feature_names[i] for i in sorted_word_indices]))

tf_feature_names = tf_vectorizer.get_feature_names_out()
top = 10
show_topic(lda, tf_feature_names, top)

Topic 0:
voice john mr tom dog joe michael dr david mary
Topic 1:
love story beautiful life time wonderful great young heart old
Topic 2:
horror budget zombie killer gore low blood kill effect film
Topic 3:
family kid child school old young boy year girl parent
Topic 4:
people life movie character like way know feel story think
Topic 5:
man woman wife find house go father come get home
Topic 6:
black white sex sequence film woman director gay camera sexual
Topic 7:
man murder kill police city gun cop crime new western
Topic 8:
movie bad like watch think time know go thing people
Topic 9:
film like good see scene story work character director watch
Topic 10:
like scene look guy bad get pretty good thing lot
Topic 11:
war american world documentary film soldier history footage country people
Topic 12:
book original version story character read novel time animation king
Topic 13:
film character play good role performance actor cast story give
Topic 14:
movie see time song watch like year 