# Topic Modeling

## Spacy + Sklearn

In [15]:
import os
import sys
from pathlib import Path
from os import PathLike

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string

# serialization
import json

# sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# visualization
import spacy.displacy
import spacy.lang.en
# jupyter
from tqdm import tqdm

In [2]:
wine_reviews_file = Path.cwd() / 'data/winemag-data_first150k.csv'
reviews = pd.read_csv(str(wine_reviews_file))
reviews.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
doc = nlp(reviews['description'][3])
spacy.displacy.render(doc, style='ent', jupyter=True)

In [5]:
punctuations = string.punctuation
stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)

In [6]:
r = str(" ".join([i.lemma_ for i in doc]))
doc = nlp(r)
spacy.displacy.render(doc, style='ent', jupyter=True)

In [7]:
# parsing reviews
parser = spacy.lang.en.English()


def spacy_tokenize(sentence: str):
    mytokens = parser(sentence)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    mytokens = " ".join([i for i in mytokens])
    return mytokens


tqdm.pandas()
reviews['processed_description'] = reviews['description'].progress_apply(spacy_tokenize)

 14%|█▍        | 21074/150930 [00:04<00:26, 4966.00it/s]


KeyboardInterrupt: 

In [None]:
reviews['processed_description']

In [None]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase='True', token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
vectors = vectorizer.fit_transform(reviews['description'])

In [None]:
# Functions for printing keywords for each topic
num_topics = 15
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(vectors)
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer)

## Reddit

In [24]:
from tqdm import tqdm

data_dir = Path.cwd() / 'data/PushShiftAndRedditAPICrawler-output'
data_files = list(data_dir.glob('*.json'))

submissions = []
comments = []
for f in tqdm(data_files):
    f: Path
    submission, comment = json.load(
        f.open(mode='r', encoding='utf-8')
    )
    print(submission, comment)
    break
    submissions.append(submission)
    comments.append(comment)

  0%|          | 0/20167 [00:00<?, ?it/s]

submission comments





In [22]:
submissions = pd.DataFrame(submissions)
submissions.head()

Unnamed: 0,0
0,submission
1,submission
2,submission
3,submission
4,submission
