Explore Latent Dirichlet Allocation (LDA) and Latent Semantic Indexing (LSI) on keywords to see if coherent personas emerge. 

Apply to both Advanced Contextual and Keyword API keywords.

Evaluate against known audience names.

## Imports, etc.

In [1]:
import collections
import json
import nltk
import numpy as np
import os
import pandas as pd
import string
import sys

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pmccarthy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Collection

### Advanced Contextual

In [3]:
JOBNAME = 'MLE-4600 - Topic Modeling on Keywords'

# dist_archives = 'hdfs:///user/pmccarthy/conda/webtools37.zip#webtools37'

# os.environ['SPARK_HOME'] = '/home/pmccarthy/nas/opt/spark-2.4.7-bin-hadoop2.7'
os.environ['SPARK_HOME'] = '/opt/spark-hadrs2'
os.environ['PYSPARK_PYTHON'] = './webtools37/webtools37/bin/python'

sys.path.insert(0,os.path.join(os.environ['SPARK_HOME'],'python','lib','py4j-src.zip'))
sys.path.append(os.path.join(os.environ['SPARK_HOME'],'python'))

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import window as W


spark = (SparkSession
         .builder
         .enableHiveSupport()
         .appName(f"{os.uname()[1].split('.')[0]}: {JOBNAME}")
         .config('spark.master','yarn')
         .config('spark.yarn.deployMode','client')
         .config('spark.executor.cores',1)
         .config('spark.executor.memory','2g')
         .config('spark.driver.memory','5g')
         .config('spark.dynamicAllocation.maxExecutors',1500)
         
#          .config('spark.yarn.dist.archives', dist_archives)
         
         .config('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version','2')
         .config('spark.hadoop.parquet.enable.summary-metadata','false')
         .config('spark.sql.parquet.mergeSchema','false')
         .config('spark.sql.parquet.filterPushdown','true')
         .config('spark.sql.hive.metastorePartitionPruning','true')
         
         .config('spark.sql.execution.arrow.enabled','true')
#          .config('spark.sql.shuffle.partitions','4096')
         
         .getOrCreate())

In [4]:
spark.catalog.refreshTable("pmccarthy.mle4592_content_keywords")

spark.table("pmccarthy.mle4592_content_keywords").show(n=3)

+----------------+-----------+---------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+
|interest_type_id|interest_id|entity_id|               descr|             summary|             domains|          content_id|n_url|       keywords_json|        keywords_map|
+----------------+-----------+---------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+
|             110|      17744|    46444|Vegans Q1 2018 (O...|Vegans Q1 2018 (O...|[blissfulbasil.co...|[1hv6i, 1lyy4, 1m...| 3297|{"vegan": 344, "s...|[vegan -> 344, sa...|
|             110|      56010|   627953|Wedding Content R...|Wedding Content R...|[mywedding.com, t...|[mjkp, 1l0k1, 1nc...|21466|{"wedding": 3418,...|[wedding -> 3418,...|
|             112|      11655|    40981|Assorted Topics (...|People in this au...|[usingenglish.com...|[d7x, my3s, u5u2,...| 1426|{"cel

In [5]:
adv_cont_keywords_df = (spark.table("pmccarthy.mle4592_content_keywords")
               .drop('keywords_map')
              ).toPandas()



In [6]:
adv_cont_keywords_df.to_parquet('mle4600_adv_contextual_keywords.parquet')

In [7]:
adv_cont_keywords_df = pd.read_parquet('mle4600_adv_contextual_keywords.parquet')

In [8]:
adv_cont_keywords_df.head(n=3)

Unnamed: 0,interest_type_id,interest_id,entity_id,descr,summary,domains,content_id,n_url,keywords_json
0,110,17744,46444,Vegans Q1 2018 (Oatly),Vegans Q1 2018 (Oatly),"[blissfulbasil.com, rabbitandwolves.com, emili...","[1hv6i, 1lyy4, 1mj5v, 1mp53, 1qxr8, 1jc3a, 1ih...",3297,"{""vegan"": 344, ""sauce"": 306, ""recipe"": 254, ""t..."
1,110,56010,627953,Wedding Content Researchers DNU,Wedding Content Researchers DNU,"[mywedding.com, travelfashiongirl.com, wedding...","[mjkp, 1l0k1, 1ncn8, 1dbyf, fhmi, 2xg, qwd1, 1...",21466,"{""wedding"": 3418, ""dress"": 702, ""venue"": 692, ..."
2,112,11655,40981,Assorted Topics (This Audience Are Interested ...,People in this audience are interested in abbr...,"[usingenglish.com, militaryfactory.com, online...","[d7x, my3s, u5u2, 1o3d4, kre2, 15d3i, unb1, u7...",1426,"{""cell"": 87, ""formula"": 72, ""cell range"": 48, ..."


In [9]:
spark.stop()

### Keywords API

In [10]:
! head interests.tsv

36864	110	Vegetarians
77825	110	Small Business Administration Cares Act Researchers
1	110	Fitness Enthusiasts
77824	110	Remdesivir Information Researchers
2	110	Gamers
61442	140	Popeye
3	110	Car Enthusiasts
36867	110	Blender Shoppers
77826	110	Financially Aware (COVID-19 related)
61443	140	Raising Canes


In [11]:
! head keywords.tsv

36864	desserts	13
36864	gluten	5
36864	quick	5
36864	foods	14
36864	wholesome	5
36864	happy	4
36864	whole	14
36864	these	8
36864	veg	5
36864	blogging	4


In [12]:
interests_df = pd.read_table('interests.tsv',
                             names=['interest_id','interest_type_id','descr'])

interests_df.head(n=3)

Unnamed: 0,interest_id,interest_type_id,descr
0,36864,110,Vegetarians
1,77825,110,Small Business Administration Cares Act Resear...
2,1,110,Fitness Enthusiasts


In [13]:
keywords_from_api_df = pd.read_table('keywords.tsv',
                             names=['interest_id','keyword','n'])

keywords_from_api_df.head(n=3)

Unnamed: 0,interest_id,keyword,n
0,36864,desserts,13
1,36864,gluten,5
2,36864,quick,5


In [14]:
keyword_api_df = pd.merge(
    interests_df,
    keywords_from_api_df, on='interest_id', how='left')

keyword_api_df.head()

Unnamed: 0,interest_id,interest_type_id,descr,keyword,n
0,36864,110,Vegetarians,desserts,13
1,36864,110,Vegetarians,gluten,5
2,36864,110,Vegetarians,quick,5
3,36864,110,Vegetarians,foods,14
4,36864,110,Vegetarians,wholesome,5


In [15]:
keyword_api_dict = collections.defaultdict(dict)

for i, r in keyword_api_df.iterrows():
    keyword_api_dict[r['descr']][r['keyword']] = r['n']

In [16]:
keyword_api_json_df = (
    pd.merge(
        
        pd.DataFrame([(k, json.dumps(keyword_api_dict[k])) for k in keyword_api_dict.keys()])
        .rename(columns={0:'descr',1:'keywords_json'}),
        
        keyword_api_df[['interest_id','descr']].drop_duplicates(),
        
        on='descr',
        how='left')
)

keyword_api_json_df.head(n=3)

Unnamed: 0,descr,keywords_json,interest_id
0,Vegetarians,"{""desserts"": 13, ""gluten"": 5, ""quick"": 5, ""foo...",36864
1,Small Business Administration Cares Act Resear...,"{""capital"": 7, ""education"": 4, ""lawyers"": 4, ""...",77825
2,Fitness Enthusiasts,"{""celebrity"": 3, ""foods"": 5, ""beauty"": 5, ""str...",1


In [17]:
common_audiences = list(
    set(keyword_api_json_df['descr'].tolist())
        .intersection(set(adv_cont_keywords_df['descr'].tolist()))
)[0:10]

common_audiences

['eSports Enthusiasts',
 'Los Angeles Clippers Fans',
 'Gastroenterologists',
 'Networking Marketing',
 'School Curriculum',
 'Holiday Season Department Store Deal Shoppers',
 'Military Servicemembers & Veterans',
 'Streaming Linear TV Watchers (National Channels)',
 'Job Interview Process',
 'Colombia Trip Planners']

In [18]:
prediction_list = keyword_api_json_df.query('descr in @common_audiences')['interest_id'].tolist()
prediction_list

[123, 104164, 22415, 6264, 39333, 11272, 35964, 11825, 11847, 57155]

## Helper Functions

In [19]:
class JsonReader(BaseEstimator, TransformerMixin):
    
    def __init__(self, stemming=False):
        self.stemming = stemming
        
        super(JsonReader).__init__()
    
    
    @staticmethod
    def _stem_and_add(json_str):
        """
        Transform each token into its stemmed substring,
        and build the dict as {stem: (sum of counts of words with this stem)}
        """
        stemmed_counts = collections.defaultdict(int)        
        stem = nltk.stem.SnowballStemmer('english')
        
        json_dict = json.loads(json_str)
        
        for k in json_dict.keys():
            stemmed_counts[stem.stem(k)] += json_dict[k]
        
        return stemmed_counts
    
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):        
        if self.stemming:
            return X.apply(self._stem_and_add)
        else:
            return X.apply(json.loads)

In [20]:
j_reader = JsonReader()
print("without stemming\n")
print(j_reader.fit_transform(keyword_api_json_df['keywords_json'].head()))
print('\n')

j_reader = JsonReader(stemming=True)
print("with stemming\n")
print(j_reader.fit_transform(keyword_api_json_df['keywords_json'].head()))

without stemming

0    {'desserts': 13, 'gluten': 5, 'quick': 5, 'foo...
1    {'capital': 7, 'education': 4, 'lawyers': 4, '...
2    {'celebrity': 3, 'foods': 5, 'beauty': 5, 'str...
3    {'medical': 4, 'movie': 8, 'channel': 4, 'poli...
4    {'3ds': 5, 'mods': 4, 'achievements': 5, 'movi...
Name: keywords_json, dtype: object


with stemming

0    {'dessert': 13, 'gluten': 5, 'quick': 5, 'food...
1    {'capit': 7, 'educ': 4, 'lawyer': 4, 'softwar'...
2    {'celebr': 3, 'food': 5, 'beauti': 5, 'strengt...
3    {'medic': 4, 'movi': 8, 'channel': 4, 'polit':...
4    {'3ds': 5, 'mod': 4, 'achiev': 5, 'movi': 4, '...
Name: keywords_json, dtype: object


In [21]:
class LatentPipeline():
    
    def __init__(self, data, **kwargs):
        
        self.data = data.sort_values('interest_id')
        self.pipeline = self._get_pipeline(**kwargs)
        self.predict_data = (data.query('interest_id in @prediction_list')
                .sort_values('descr')
                .reset_index())
        

    def _get_pipeline(self):
        raise NotImplemented
        
        
    def _get_topics(self, n_keywords=10):
        
        names = self.pipeline['dict_vectorizer'].get_feature_names()
        self.topics = {}
        
        for idx, topic in enumerate(self.pipeline['model'].components_):
            features = topic.argsort()[:-(n_keywords-1):-1]
            tokens = [names[i] for i in features]
            self.topics[idx] = ",".join(tokens)
        
        
    def fit_transform(self):
        self.predicted = self.pipeline.fit_transform(self.data['keywords_json'])
        self._get_topics()
        self.predicted_subset = (
            self.pipeline.transform(self.predict_data['keywords_json']))                     
    
    
    def __repr__(self):
        return("\n".join(list(self.topics.values())[0:10]))
    
    
    def demonstrate(self, n_top_topics=5):                        

        # For each row, get the column indices for the largest, second largest... elements
        top_topics_idx = np.flip(
            np.apply_along_axis(np.argsort, 1, self.predicted_subset)[:,-n_top_topics:], axis=1)

        
        for aud_idx in range(self.predicted_subset.shape[0]):
            
            descr = self.predict_data['descr'][aud_idx]
            interest = self.predict_data['interest_id'][aud_idx]
            
            print(f"{descr} [{interest}]\n==============================")
            
            for topic_idx in range(n_top_topics):
                target_idx = top_topics_idx[aud_idx, topic_idx]
                
                weight_val = np.round(self.predicted_subset[aud_idx, target_idx],3)
                print_topics = self.topics[target_idx]
                
                print(f"Topic {target_idx}: Weight:{weight_val}:\n\t{print_topics}...")        
            print('\n')

In [22]:
class LDAPipeline(LatentPipeline):
    
    def _get_pipeline(self, **kwargs):        
        
        var_dict = {
            'stemming':False,
            'n_components':10
        }
        
        var_dict.update(**kwargs)
            
        return Pipeline([

            # Reads json strings into word:count dicts
            ('json_reader', JsonReader(stemming=var_dict['stemming'])),

            # Composes the dicts into vectors, where the cardinality
            # of the vector is that of all dict keys in all rows
            ('dict_vectorizer', DictVectorizer()),

            # word:count row-column pairs are normalized via TF-IDF,
            # producing a matrix
            ('tfidf_transformer', TfidfTransformer()),

            # Latent Dirichlet Analysis is performed on the TF-IDF matrix
            ('model', LatentDirichletAllocation(n_components=var_dict['n_components']))    
        ])

In [23]:
class LSIPipeline(LatentPipeline):
    
    def _get_pipeline(self, **kwargs):
        
        var_dict = {
            'stemming':False,
            'n_components':10
        }
        
        var_dict.update(**kwargs)
        
        return Pipeline([

            # Reads json strings into word:count dicts
            ('json_reader', JsonReader(stemming=var_dict['stemming'])),

            # Composes the dicts into vectors, where the cardinality
            # of the vector is that of all dict keys in all rows
            ('dict_vectorizer', DictVectorizer()),

            # word:count row-column pairs are normalized via TF-IDF,
            # producing a matrix
            ('tfidf_transformer', TfidfTransformer()),

            # Latent Semantic Indexing is performed on the TF-IDF matrix
            ('model', TruncatedSVD(n_components=var_dict['n_components']))    
        ])

## Run Pipelines - Advanced Contextual

### Latent Dirichlet Allocation

#### Unstemmed

In [24]:
## n = 10
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=10)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 1: Weight:0.418:
	venezuela,por,para,financial samurai,arthritis,nike,menopause,cuba...
Topic 5: Weight:0.282:
	game,wedding,engine,ford,app store,paint,likes,truck...
Topic 0: Weight:0.153:
	trump,covid-19,capitol,donald trump,movie online,senate,vaccine,joe biden...
Topic 4: Weight:0.096:
	recipe,chicken,sauce,baby,keto,milk,students,cake...
Topic 3: Weight:0.008:
	pokémon,horse,aol mail,warzone,mcdonough,horses,stab,black ops cold war...


Gastroenterologists [6264]
Topic 2: Weight:0.418:
	amd,cpu,intel,gpu,les,nvidia,sony,asus...
Topic 4: Weight:0.231:
	recipe,chicken,sauce,baby,keto,milk,students,cake...
Topic 5: Weight:0.2:
	game,wedding,engine,ford,app store,paint,likes,truck...
Topic 8: Weight:0.102:
	nba,lakers,list of our partners,new orleans,rite aid,stocks,saints,alberta...
Topic 1: Weight:0.008:
	venezuela,por,para,financial samurai,arthritis,nike,menopause,cuba...


Holiday Season Department Store Deal Shoppers [104164]
Topic 2: Weight

In [25]:
## n = 25
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=25)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 15: Weight:0.477:
	list of our partners,por,venezuela,para,pokémon,job,rite aid,madrid...
Topic 7: Weight:0.281:
	paint,wood,kitchen,stocks,financial samurai,cabinets,wall,fabric...
Topic 18: Weight:0.138:
	goats,pr,jst,airplane,golden eagles,dentist,tooth,teeth...
Topic 5: Weight:0.021:
	tasb,aol mail,rifle,barrel,purchases,dealam.com,bullet,instarimages...
Topic 8: Weight:0.012:
	bts,netizens,korea,blackpink,k-pop,nct,tesla,jimin...


Gastroenterologists [6264]
Topic 11: Weight:0.303:
	medical school,mcat,prompts,landmarks,sunsets,medical schools,real housewives,canon...
Topic 3: Weight:0.205:
	coffee,wisconsin,espresso,brew,nie,cooking made easy bundle,more energy,lac county...
Topic 17: Weight:0.181:
	nfl,nba,indiana,powershell,maryland,draft,boat,nfl draft...
Topic 13: Weight:0.132:
	wedding,dow jones,indices,cable news network,morningstar,cornell,patient,patients...
Topic 0: Weight:0.113:
	calories,pain,symptoms,weight,patients,alberta,adverti

In [26]:
## n = 50
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=50)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 20: Weight:0.592:
	terps,lens,rutgers,northwestern,spartans,morsell,wolverines,número de suerte...
Topic 19: Weight:0.204:
	philadelphia,venezuela,philadelphia recovery residences,recovery,madrid,por,addiction,sobriety...
Topic 37: Weight:0.063:
	skin,hair,rite aid,joe,terms of use,trader joe,job,covid-19...
Topic 41: Weight:0.046:
	para,tu juego,por,streaming format,por ejemplo,más,embarazo,python...
Topic 11: Weight:0.013:
	stocks,stock market,income,investors,retirement,financial samurai,market,fund...


Gastroenterologists [6264]
Topic 1: Weight:0.329:
	revision,member details,j-14,volunteers of america,aasld,consultant id,us top,edvisors...
Topic 45: Weight:0.225:
	baby,pregnancy,milk,babies,breastfeeding,birth,padres,child...
Topic 9: Weight:0.173:
	liverpool,manchester,dutchnews,dutchnews .nl,patients,chelsea,arsenal,premier league...
Topic 15: Weight:0.114:
	recipe,chicken,sauce,keto,cake,dough,soup,butter...
Topic 25: Weight:0.064:
	cruise,

In [27]:
## n = 100
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=100)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 66: Weight:0.296:
	por,para,venezuela,madrid,más,ella,wwe,lo...
Topic 53: Weight:0.181:
	bikes,bike,cycling,trucking,truck,everyday makeup,cdl,truckers...
Topic 41: Weight:0.1:
	número de suerte,poker,solteros,amor,cambios,parejas,nuevas,mujer...
Topic 69: Weight:0.096:
	venezuela,patria,dólares,sc,sitio,estas cookies,puerto rico,gobierno...
Topic 16: Weight:0.06:
	acrobat,reader,embarazo,pdf,nombres,tampa water street,first jw marriott hotel,adobe...


Gastroenterologists [6264]
Topic 49: Weight:0.341:
	marketing,customers,brand,content,social media,aasld,customer,audience...
Topic 44: Weight:0.326:
	guitar,meghan,chords,archtoolbox,aspen,meghan markle,harry,royal family...
Topic 34: Weight:0.146:
	patients,patient,nursing,nurses,healthcare,care,nurse,theory...
Topic 50: Weight:0.081:
	tu juego,itstillworks,por ejemplo,animal testing,classify brands,cruelty-free,luminate,blackbaud client...
Topic 48: Weight:0.02:
	cornell,joe,trader joe,patients,co

In [28]:
## n = 250
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=250)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 61: Weight:0.347:
	por,para,venezuela,más,ella,dólares,cuba,estados unidos...
Topic 211: Weight:0.29:
	plants,soil,plant,garden,seeds,compost,leaves,seed...
Topic 55: Weight:0.076:
	alvarito,dac,cook county,fiio,con el,iem,ruales,sound...
Topic 91: Weight:0.067:
	madrid,uruguay,fc barcelona,ha,barça,chiste de periodista,uno,ronald koeman...
Topic 224: Weight:0.062:
	matrix,column,sergio ramos,motagua,fichajes,pyspark,olimpia,cristiano...


Gastroenterologists [6264]
Topic 141: Weight:0.347:
	aasld,arlington,liver disease,arlington county,hcv,hepatology,northern virginia,virginia...
Topic 229: Weight:0.312:
	aspen,scrabble,la antigua,contexto,parenteral nutrition,silver spring,meriden,jpen...
Topic 103: Weight:0.18:
	a360,grand rapids,sueños,full price,walmart,huron capital,slavica,sheila johnson...
Topic 20: Weight:0.061:
	patients,nutrition,clinical guidelines,task force,guidance,hospitalized patients,consensus,uln...
Topic 116: Weight:0.02:
	covid

#### Stemmed

In [29]:
## n = 10
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=10, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 3: Weight:0.862:
	movie onlin,wed,barbi,download,paint,actress,coin mast,por...
Topic 8: Weight:0.071:
	babi,pregnanc,milk,pump,kid,toddler,breastfeed,letter...
Topic 5: Weight:0.008:
	recip,chicken,sauc,skin,engin,keto,cake,oil...
Topic 4: Weight:0.008:
	trail,disney,hike,rite aid,boat,terms of us,walt disney world,hawaii...
Topic 6: Weight:0.008:
	nba,list of our partn,laker,new orlean,nfl,philadelphia,saint,advertis...


Gastroenterologists [6264]
Topic 2: Weight:0.413:
	covid-19,trump,capitol,vaccin,donald trump,congress,senat,united st...
Topic 5: Weight:0.324:
	recip,chicken,sauc,skin,engin,keto,cake,oil...
Topic 0: Weight:0.205:
	cat,les,vos,du,mysav,nfc,afc,hors...
Topic 3: Weight:0.008:
	movie onlin,wed,barbi,download,paint,actress,coin mast,por...
Topic 1: Weight:0.008:
	dog,game,bts,dow jon,cable news network,morningstar,indic,app stor...


Holiday Season Department Store Deal Shoppers [104164]
Topic 1: Weight:0.505:
	dog,game,bts,dow jon

In [30]:
## n = 25
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=25, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 3: Weight:0.39:
	por,venezuela,para,medical school,madrid,más,ella,dólare...
Topic 11: Weight:0.354:
	philadelphia,philadelphia recovery resid,recoveri,addict,sobrieti,12-step program,drug,hawaii...
Topic 14: Weight:0.128:
	les,du,le,padr,marlin,pas,ell,oriol...
Topic 2: Weight:0.055:
	acrobat,pdf,len,reader,niña,first jw marriott hotel,tampa water street,el embarazo...
Topic 6: Weight:0.005:
	engin,ford,like,truck,fuel,car,batteri,gm...


Gastroenterologists [6264]
Topic 3: Weight:0.344:
	por,venezuela,para,medical school,madrid,más,ella,dólare...
Topic 0: Weight:0.184:
	list of our partn,dow jon,cable news network,indic,morningstar,terms of us,job,walt disney world...
Topic 16: Weight:0.167:
	new orlean,rite aid,laker,saint,alberta,nba,times-picayun,patient...
Topic 13: Weight:0.101:
	cornel,cornell engin,aol mail,golf,golfer,engin,college of engin,pga...
Topic 21: Weight:0.083:
	nurs,patient,pain,arthriti,yelp,symptom,rental hous,rentpath...


Ho

In [31]:
## n = 50
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=50, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 49: Weight:0.629:
	venezuela,por,para,thought catalog,madrid,ella,dólare,cuba...
Topic 42: Weight:0.125:
	hemi,michelle seidel,jayne thompson,f150,food truck,tapatalk,llb,truck...
Topic 16: Weight:0.081:
	never miss,número de suert,soltero,prospectus,freeerisa,valid email,valid email address email,customer servic...
Topic 10: Weight:0.032:
	alabama,ncaa,nfl,nfl draft,maryland,indiana,tennesse,tu juego...
Topic 33: Weight:0.026:
	vos,hors,nfc,afc,tus,zu,daten,sie...


Gastroenterologists [6264]
Topic 20: Weight:0.404:
	derri,rifl,bullet,barrel,longford,ireland,bitcoin,samsung...
Topic 1: Weight:0.309:
	game,dow jon,cable news network,morningstar,indic,app stor,quest,skin...
Topic 4: Weight:0.125:
	patient,canada,nurs,covid-19,vaccin,student,tasb,harvard...
Topic 3: Weight:0.075:
	disney,walt disney world,disney world,wwe,magic kingdom,aew,bay area,newser...
Topic 2: Weight:0.012:
	terms of us,neogov,term,patient,servic,menopaus,clinical guidelin,job 

In [32]:
## n = 100
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=100, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 1: Weight:0.496:
	arthriti,gout,gout attack,ra,nsaid,oa,rheumatoid arthr,uric acid...
Topic 30: Weight:0.185:
	hair,skin,por,hollywood,para,venezuela,madrid,covid-19...
Topic 53: Weight:0.177:
	venezuela,patria,pitch,sitio,estas cooki,gobierno,cuba,número de suert...
Topic 85: Weight:0.033:
	early bird brief,navi,defense industri,air forc,ship,pentagon,capabl,jazz...
Topic 32: Weight:0.017:
	stock,canada,incom,covid-19,invest,compani,ir,retir...


Gastroenterologists [6264]
Topic 60: Weight:0.338:
	engin,ford,like,truck,fuel,batteri,car,boat...
Topic 37: Weight:0.288:
	indiana,hoosier,iowa,big ten,nebraska,michigan st,michigan,illinoi...
Topic 56: Weight:0.164:
	shutterstock,e-mail,readmor,picture courtesi,allhiphop,wayn,tracey boaky,afia schwar...
Topic 0: Weight:0.086:
	patient,india,clinical guidelin,pga,golf,dutchnews .nl,dutchnew,task forc...
Topic 49: Weight:0.025:
	covid-19,trump,capitol,vaccin,congress,donald trump,senat,joe biden...


Holid

In [33]:
## n = 250
lda_pipe = LDAPipeline(data=adv_cont_keywords_df, n_components=250, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 177: Weight:0.351:
	introvert,emcc,seterra,seterra app,nashvill,lebanon,middletown,visite su zona de usuario...
Topic 93: Weight:0.221:
	por,para,disney,venezuela,walt disney world,más,ella,disney world...
Topic 200: Weight:0.172:
	cheapflight,venezuela,patria,gobierno,nonstop flight,sitio,estas cooki,cuba...
Topic 13: Weight:0.061:
	instarimag,kate,duchess,new orlean,loveland,laplac,city park,larimer counti...
Topic 137: Weight:0.041:
	engin,ford,like,batteri,car,truck,fuel,gm...


Gastroenterologists [6264]
Topic 191: Weight:0.35:
	job,interview,co-work,resum,recruit,boss,gaslight,infographic resum...
Topic 128: Weight:0.292:
	letters in order filt,order filt,houghton mifflin harcourt publishing compani,houghton mifflin compani,word,filter,letter,sew...
Topic 249: Weight:0.161:
	gator,auburn,sec,tiger,florida,clemson,cox media group,acc...
Topic 194: Weight:0.081:
	patient,clinical guidelin,nutrit,guidanc,task forc,consensus,hospitalized pati,uln.

### Latent Semantic Indexing

#### Unstemmed

In [34]:
## n = 10
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=10)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 1: Weight:0.025:
	trump,capitol,covid-19,donald trump,senate,congress,vaccine,joe biden...
Topic 0: Weight:0.014:
	recipe,chicken,sauce,dough,cake,soup,keto,butter...
Topic 5: Weight:0.001:
	bts,netizens,korea,blackpink,k-pop,nct,jimin,jungkook...
Topic 9: Weight:0.001:
	engine,ford,likes,truck,fuel,car,gm,battery...
Topic 4: Weight:-0.001:
	dow jones,indices,cable news network,morningstar,cnn,boeing,deul,gainbridge lpga...


Gastroenterologists [6264]
Topic 1: Weight:0.041:
	trump,capitol,covid-19,donald trump,senate,congress,vaccine,joe biden...
Topic 0: Weight:0.025:
	recipe,chicken,sauce,dough,cake,soup,keto,butter...
Topic 6: Weight:0.012:
	dog,baby,dogs,pregnancy,skin,breed,milk,puppy...
Topic 7: Weight:0.002:
	nba,lakers,new orleans,list of our partners,nfl,saints,rite aid,los angeles...
Topic 5: Weight:-0.001:
	bts,netizens,korea,blackpink,k-pop,nct,jimin,jungkook...


Holiday Season Department Store Deal Shoppers [104164]
Topic 0: Weight:0.

In [35]:
## n = 25
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=25)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.354:
	venezuela,por,para,madrid,ella,más,dólares,cuba...
Topic 19: Weight:0.043:
	philadelphia,philadelphia recovery residences,recovery,sobriety,addiction,tasb,students,canada...
Topic 1: Weight:0.025:
	trump,capitol,covid-19,donald trump,senate,congress,vaccine,joe biden...
Topic 18: Weight:0.023:
	advertisement,canada,alberta,les,dion,heart will go on,angélil,céline dion...
Topic 15: Weight:0.022:
	nba,stocks,canada,nfl,draft,income,philadelphia,nfl draft...


Gastroenterologists [6264]
Topic 23: Weight:0.052:
	students,school,pokémon,early bird brief,teachers,many teachers,patients,united states...
Topic 1: Weight:0.041:
	trump,capitol,covid-19,donald trump,senate,congress,vaccine,joe biden...
Topic 20: Weight:0.04:
	trail,hike,students,park,georgia,list of our partners,trails,canada...
Topic 12: Weight:0.032:
	list of our partners,stocks,income,canada,stock market,retirement,investors,financial samurai...
Topic 19: Weight:0.03:
	ph

In [36]:
## n = 50
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=50)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.359:
	venezuela,por,para,madrid,ella,más,dólares,cuba...
Topic 19: Weight:0.054:
	philadelphia,philadelphia recovery residences,recovery,sobriety,addiction,tasb,canada,12-step groups...
Topic 18: Weight:0.031:
	advertisement,canada,alberta,les,vos,dion,heart will go on,angélil...
Topic 1: Weight:0.025:
	trump,capitol,covid-19,donald trump,senate,congress,vaccine,joe biden...
Topic 15: Weight:0.022:
	nba,stocks,canada,nfl,philadelphia,draft,income,kings...


Gastroenterologists [6264]
Topic 45: Weight:0.069:
	nursing,patient,nurses,order filter,letters in order filter,patients,houghton mifflin harcourt publishing company,houghton mifflin company...
Topic 29: Weight:0.069:
	disney,walt disney world,disney world,magic kingdom,patients,star wars,epcot,disneyland...
Topic 23: Weight:0.048:
	students,school,teachers,early bird brief,many teachers,patients,united states,letter...
Topic 32: Weight:0.043:
	terms of use,neogov,terms,services,cat,

In [37]:
## n = 100
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=100)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.36:
	venezuela,por,para,madrid,ella,más,dólares,cuba...
Topic 19: Weight:0.055:
	philadelphia,philadelphia recovery residences,recovery,sobriety,addiction,tasb,canada,12-step groups...
Topic 65: Weight:0.046:
	mysavings,next week,guide2free,guide2free .com,real free samples,other stores,venezuela,rifle...
Topic 18: Weight:0.031:
	advertisement,canada,alberta,les,vos,dion,heart will go on,angélil...
Topic 1: Weight:0.025:
	trump,capitol,covid-19,donald trump,senate,congress,vaccine,joe biden...


Gastroenterologists [6264]
Topic 29: Weight:0.074:
	disney,walt disney world,disney world,patients,magic kingdom,epcot,star wars,patient...
Topic 39: Weight:0.049:
	letters in order filter,order filter,letters,words,houghton mifflin harcourt publishing company,houghton mifflin company,filter,kids...
Topic 23: Weight:0.048:
	students,school,teachers,early bird brief,many teachers,patients,united states,letter...
Topic 32: Weight:0.044:
	terms of 

In [38]:
## n = 250
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=250)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.36:
	venezuela,por,para,madrid,ella,más,dólares,cuba...
Topic 64: Weight:0.059:
	mysavings,next week,guide2free,real free samples,guide2free .com,venezuela,other stores,protests...
Topic 19: Weight:0.055:
	philadelphia,philadelphia recovery residences,recovery,sobriety,addiction,tasb,canada,12-step groups...
Topic 248: Weight:0.037:
	unregistered player,quiz,touch typing,typing,oregon,kentucky,vaping,e-liquid...
Topic 242: Weight:0.034:
	names,florida state parks,nhra,us top,guide2free,guide2free .com,real free samples,egpu...


Gastroenterologists [6264]
Topic 181: Weight:0.079:
	pharmacists,harvard,marvel,american pharmacists association,apha,diabetes,michelle seidel,jayne thompson...
Topic 29: Weight:0.074:
	disney,walt disney world,disney world,patients,magic kingdom,epcot,star wars,patient...
Topic 182: Weight:0.05:
	hy-vee,revision,connecticut,florida state parks,bellevue university,goats,minecraft,hartford...
Topic 234: Weight:0.

#### Stemmed

In [39]:
## n = 10
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=10, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 1: Weight:0.025:
	trump,capitol,covid-19,donald trump,vaccin,senat,congress,joe biden...
Topic 0: Weight:0.015:
	recip,chicken,sauc,dough,cake,trump,soup,keto...
Topic 8: Weight:0.014:
	babi,pregnanc,trump,pump,milk,breastfeed,joe biden,toddler...
Topic 7: Weight:0.006:
	dog,puppi,trump,breed,cat,capitol,joe biden,donald trump...
Topic 6: Weight:0.002:
	bts,netizen,korea,blackpink,k-pop,nct,jimin,jungkook...


Gastroenterologists [6264]
Topic 1: Weight:0.052:
	trump,capitol,covid-19,donald trump,vaccin,senat,congress,joe biden...
Topic 0: Weight:0.032:
	recip,chicken,sauc,dough,cake,trump,soup,keto...
Topic 5: Weight:0.012:
	dog,babi,puppi,cat,breed,pregnanc,skin,pump...
Topic 8: Weight:0.012:
	babi,pregnanc,trump,pump,milk,breastfeed,joe biden,toddler...
Topic 6: Weight:-0.002:
	bts,netizen,korea,blackpink,k-pop,nct,jimin,jungkook...


Holiday Season Department Store Deal Shoppers [104164]
Topic 0: Weight:0.027:
	recip,chicken,sauc,dough,cake,trump

In [40]:
## n = 25
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=25, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 23: Weight:0.284:
	venezuela,list of our partn,por,para,madrid,ella,más,dólare...
Topic 22: Weight:0.246:
	venezuela,por,para,student,early bird brief,madrid,patient,nurs...
Topic 20: Weight:0.086:
	philadelphia,philadelphia recovery resid,recoveri,addict,student,sobrieti,12-step program,canada...
Topic 24: Weight:0.043:
	keto,patient,keto diet,carb,nurs,ketogenic diet,recip,low carb...
Topic 19: Weight:0.04:
	advertis,canada,submarin,les,alberta,vos,covid-19,du...


Gastroenterologists [6264]
Topic 24: Weight:0.1:
	keto,patient,keto diet,carb,nurs,ketogenic diet,recip,low carb...
Topic 1: Weight:0.052:
	trump,capitol,covid-19,donald trump,vaccin,senat,congress,joe biden...
Topic 22: Weight:0.042:
	venezuela,por,para,student,early bird brief,madrid,patient,nurs...
Topic 13: Weight:0.036:
	calori,weight,exercis,muscl,skin,fat,weight loss,protein...
Topic 16: Weight:0.036:
	philadelphia,philadelphia recovery resid,counter-argu,recoveri,studi,addict,pe

In [41]:
## n = 50
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=50, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.384:
	venezuela,por,para,madrid,ella,más,dólare,cuba...
Topic 23: Weight:0.078:
	list of our partn,terms of us,canada,credit scor,neogov,servic,down pay,car...
Topic 20: Weight:0.064:
	philadelphia,philadelphia recovery resid,recoveri,addict,12-step program,sobrieti,drug,canada...
Topic 19: Weight:0.031:
	canada,advertis,alberta,les,submarin,vos,covid-19,ontario...
Topic 28: Weight:0.026:
	letter,skin,word,student,kid,school,hair,activ...


Gastroenterologists [6264]
Topic 25: Weight:0.109:
	skin,patient,nurs,keto,hair,nfl,alberta,keto diet...
Topic 27: Weight:0.073:
	patient,nba,nurs,clipper,pelican,hornet,blake griffin,hawk...
Topic 1: Weight:0.052:
	trump,capitol,covid-19,donald trump,vaccin,senat,congress,joe biden...
Topic 34: Weight:0.05:
	les,du,vos,le,nurs,patient,pas,nfc...
Topic 36: Weight:0.039:
	pokémon,list of our partn,letter,stab,word,letters in order filt,order filt,patient...


Holiday Season Department Store Deal Shopp

In [42]:
## n = 100
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=100, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.385:
	venezuela,por,para,madrid,ella,más,dólare,cuba...
Topic 23: Weight:0.072:
	list of our partn,terms of us,canada,credit scor,neogov,les,servic,down pay...
Topic 20: Weight:0.063:
	philadelphia,philadelphia recovery resid,recoveri,addict,12-step program,sobrieti,drug,canada...
Topic 65: Weight:0.054:
	mysav,next week,guide2fre,real free sampl,guide2free .com,venezuela,other stor,coupon...
Topic 19: Weight:0.031:
	canada,advertis,alberta,les,submarin,vos,covid-19,ontario...


Gastroenterologists [6264]
Topic 25: Weight:0.111:
	skin,patient,nurs,keto,hair,nfl,alberta,keto diet...
Topic 27: Weight:0.069:
	patient,nba,nurs,clipper,pelican,hornet,blake griffin,hawk...
Topic 82: Weight:0.053:
	cooki,cheapflight,menopaus,hors,recip,maps and skin,bonk .io,bonk...
Topic 1: Weight:0.052:
	trump,capitol,covid-19,donald trump,vaccin,senat,congress,joe biden...
Topic 34: Weight:0.048:
	les,du,vos,nurs,le,patient,pas,ell...


Holiday Season Depar

In [43]:
## n = 250
lsi_pipe = LSIPipeline(data=adv_cont_keywords_df, n_components=250, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.385:
	venezuela,por,para,madrid,ella,más,dólare,cuba...
Topic 23: Weight:0.072:
	list of our partn,terms of us,canada,credit scor,neogov,les,servic,down pay...
Topic 20: Weight:0.063:
	philadelphia,philadelphia recovery resid,recoveri,addict,12-step program,sobrieti,drug,canada...
Topic 65: Weight:0.049:
	mysav,next week,guide2fre,guide2free .com,real free sampl,other stor,venezuela,coupon...
Topic 129: Weight:0.041:
	venezuela,tu juego,san diego,jst,acrobat,pr,pdf,reader...


Gastroenterologists [6264]
Topic 25: Weight:0.111:
	skin,patient,nurs,keto,hair,nfl,alberta,keto diet...
Topic 83: Weight:0.081:
	rental hous,rentals.com,rentpath,rental,patient,nike,clinical guidelin,seat...
Topic 175: Weight:0.072:
	websit,opt-out,pharmacist,patient,las vega,american pharmacists associ,apha,diabet...
Topic 27: Weight:0.069:
	patient,nba,nurs,clipper,pelican,hornet,blake griffin,hawk...
Topic 241: Weight:0.061:
	cooking made easy bundl,oregon,rea

## Run Pipelines - Keywords API

### Latent Dirichlet Allocation

#### Unstemmed

In [44]:
## n = 10
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=10)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 9: Weight:0.87:
	mundo,sobre,deportes,informaci,ltimas,diario,firearms,gun...
Topic 4: Weight:0.014:
	care,medical,health,american,education,medicine,healthcare,patient...
Topic 6: Weight:0.014:
	sports,entertainment,weather,politics,celebrity,headlines,health,advice...
Topic 1: Weight:0.014:
	sports,weather,obituaries,iowa,michigan,minnesota,wisconsin,chevron-right...
Topic 0: Weight:0.014:
	institute,policy,economic,military,political,law,american,freedom...


Gastroenterologists [6264]
Topic 4: Weight:0.877:
	care,medical,health,american,education,medicine,healthcare,patient...
Topic 2: Weight:0.014:
	games,software,gaming,technology,music,tutorials,financial,movie...
Topic 7: Weight:0.014:
	fan,horizontal,vox,more-arrow,yes,perspective,clock,follow...
Topic 3: Weight:0.014:
	women,family,healthy,delicious,furniture,kitchen,canada,care...
Topic 6: Weight:0.014:
	sports,entertainment,weather,politics,celebrity,headlines,health,advice...


Holiday 

In [45]:
## n = 25
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=25)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 22: Weight:0.817:
	mundo,sobre,deportes,informaci,ltimas,diario,espa,actualidad...
Topic 5: Weight:0.027:
	restaurants,hotels,vacation,travel,attractions,trip,york,florida...
Topic 23: Weight:0.018:
	education,students,university,library,oxford,college,science,academic...
Topic 11: Weight:0.017:
	music,guitar,oregon,rock,washington,metal,artists,seattle...
Topic 19: Weight:0.006:
	women,accessories,american,fashion,shoes,dresses,discover,express...


Gastroenterologists [6264]
Topic 2: Weight:0.69:
	care,health,medical,american,patient,healthcare,medicine,patients...
Topic 23: Weight:0.184:
	education,students,university,library,oxford,college,science,academic...
Topic 4: Weight:0.005:
	fan,horizontal,vox,more-arrow,yes,perspective,clock,follow...
Topic 10: Weight:0.005:
	movie,comics,movies,games,farewell,topix,series,wrestling...
Topic 0: Weight:0.005:
	technology,software,science,tutorials,politics,financial,policy,economic...


Holiday Season De

In [46]:
## n = 50
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=50)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 9: Weight:0.801:
	mundo,sobre,deportes,informaci,ltimas,diario,espa,actualidad...
Topic 33: Weight:0.037:
	cias,jeans,mais,brasil,smartphone,deos,canal,tudo...
Topic 22: Weight:0.015:
	speakers,sound,high-end,headphone,clemson,oil,trade,benchmark...
Topic 17: Weight:0.014:
	homes,properties,realtors,mls,houses,cable,agents,selling...
Topic 12: Weight:0.003:
	fallout,listen-solid,harvard,singles,dating,meet,chevron-down,housing...


Gastroenterologists [6264]
Topic 44: Weight:0.797:
	medical,care,health,american,education,medicine,healthcare,patient...
Topic 37: Weight:0.064:
	politics,technology,entertainment,sports,science,sport,english,military...
Topic 46: Weight:0.011:
	fan,horizontal,vox,more-arrow,yes,perspective,clock,follow...
Topic 49: Weight:0.003:
	firearms,gun,guns,shooting,tactical,firearm,hunting,accessories...
Topic 11: Weight:0.003:
	hidden,ibm,led,manufacturing,manufacturers,hotmail,rvs,distributor...


Holiday Season Department Sto

In [47]:
## n = 100
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=100)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 29: Weight:0.78:
	mundo,sobre,deportes,informaci,ltimas,diario,espa,actualidad...
Topic 13: Weight:0.045:
	louisiana,medio,orleans,rouge,baton,inicio,lauderdale,canal...
Topic 34: Weight:0.013:
	voices,lifestyles,obituaries,flyers,region,toronto,sun,entertainment...
Topic 31: Weight:0.013:
	backyard,chickens,merch,chicken,raising,homestead,tees,giants...
Topic 37: Weight:0.011:
	oregon,washington,seattle,portland,northwest,idaho,reporter,spokane...


Gastroenterologists [6264]
Topic 10: Weight:0.758:
	care,medical,health,american,medicine,healthcare,patient,education...
Topic 99: Weight:0.039:
	students,education,teachers,university,math,study,college,oxford...
Topic 38: Weight:0.034:
	bible,cycling,bike,christian,bikes,running,church,bicycle...
Topic 5: Weight:0.027:
	general,lot,spanish,friends,believe,translation,myrtle,greenville...
Topic 83: Weight:0.012:
	tutorials,software,developers,security,programming,linux,technologies,java...


Holiday S

In [48]:
## n = 250
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=250)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 230: Weight:0.725:
	mundo,sobre,deportes,informaci,ltimas,diario,espa,actualidad...
Topic 142: Weight:0.084:
	moda,vida,belleza,vivo,tendencias,lisis,prensa,consejos...
Topic 25: Weight:0.046:
	hockey,canal,ecuador,deos,informado,mucho,medios,written...
Topic 109: Weight:0.001:
	resultados,miss,deportivo,entrevistas,mucho,hometown,panam,cougars...
Topic 34: Weight:0.001:
	india,bollywood,indian,telugu,cricket,cinema,movie,telangana...


Gastroenterologists [6264]
Topic 36: Weight:0.777:
	care,health,medical,american,healthcare,medicine,education,patient...
Topic 120: Weight:0.037:
	sciences,publisher,presentations,scientists,standards,rewards,internal,chemistry...
Topic 112: Weight:0.02:
	tutorials,programming,developers,linux,java,languages,software,python...
Topic 166: Weight:0.012:
	allergy,tests,asthma,procedures,stuff,frozen,relevant,residents...
Topic 243: Weight:0.011:
	oxford,education,change,policy,american,rights,environment,books...


Hol

#### Stemmed

In [49]:
## n = 10
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=10, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 5: Weight:0.867:
	mundo,ltima,deport,sobr,informaci,diario,espa,actualidad...
Topic 0: Weight:0.015:
	educ,financi,softwar,student,invest,job,law,american...
Topic 9: Weight:0.015:
	gun,firearm,india,rifl,shoot,tactic,farewel,topix...
Topic 7: Weight:0.015:
	sport,entertain,weather,polit,celebr,health,headlin,technolog...
Topic 3: Weight:0.015:
	fan,horizont,vox,more-arrow,yes,perspect,clock,follow...


Gastroenterologists [6264]
Topic 6: Weight:0.865:
	health,sport,restaur,entertain,hotel,vacat,care,york...
Topic 0: Weight:0.015:
	educ,financi,softwar,student,invest,job,law,american...
Topic 3: Weight:0.015:
	fan,horizont,vox,more-arrow,yes,perspect,clock,follow...
Topic 8: Weight:0.015:
	women,healthi,famili,delici,cook,kitchen,garden,craft...
Topic 7: Weight:0.015:
	sport,entertain,weather,polit,celebr,health,headlin,technolog...


Holiday Season Department Store Deal Shoppers [104164]
Topic 8: Weight:0.797:
	women,healthi,famili,delici,cook,kitc

In [50]:
## n = 25
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=25, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 17: Weight:0.835:
	mundo,patient,ltima,medic,deport,care,sobr,health...
Topic 7: Weight:0.018:
	fan,horizont,vox,more-arrow,yes,clock,perspect,game...
Topic 10: Weight:0.017:
	gun,firearm,rifl,shoot,accessori,tactic,hunt,ammo...
Topic 2: Weight:0.006:
	invest,financi,investor,chart,stock,retir,insight,financ...
Topic 5: Weight:0.006:
	educ,american,scienc,student,polit,univers,english,polici...


Gastroenterologists [6264]
Topic 17: Weight:0.824:
	mundo,patient,ltima,medic,deport,care,sobr,health...
Topic 5: Weight:0.038:
	educ,american,scienc,student,polit,univers,english,polici...
Topic 9: Weight:0.006:
	tutori,java,python,program,linux,develop,code,champion...
Topic 18: Weight:0.006:
	project,pattern,firm,softwar,garden,craft,client,crochet...
Topic 13: Weight:0.006:
	job,michigan,colorado,veteran,career,employe,benefit,hire...


Holiday Season Department Store Deal Shoppers [104164]
Topic 1: Weight:0.417:
	women,accessori,fashion,shoe,dress,styl

In [51]:
## n = 50
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=50, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 12: Weight:0.855:
	mundo,ltima,deport,sobr,informaci,diario,espa,actualidad...
Topic 49: Weight:0.003:
	polari,aew,utv,receta,funer,frozen,cocina,cremat...
Topic 22: Weight:0.003:
	tin,24h,nam,resid,relev,trong,simco,trang...
Topic 21: Weight:0.003:
	coin,collector,antiqu,gold,hobbi,collect,silver,ebay...
Topic 20: Weight:0.003:
	food,plant-bas,gluten-fre,nutrit,paleo,authent,tradit,high-qual...


Gastroenterologists [6264]
Topic 8: Weight:0.813:
	health,patient,educ,medic,care,american,student,medicin...
Topic 30: Weight:0.032:
	bibl,christian,church,cathol,diabet,ministri,god,literari...
Topic 46: Weight:0.014:
	fan,horizont,vox,more-arrow,yes,perspect,clock,follow...
Topic 49: Weight:0.003:
	polari,aew,utv,receta,funer,frozen,cocina,cremat...
Topic 12: Weight:0.003:
	mundo,ltima,deport,sobr,informaci,diario,espa,actualidad...


Holiday Season Department Store Deal Shoppers [104164]
Topic 41: Weight:0.479:
	women,restaur,hotel,vacat,accessori,disc

In [52]:
## n = 100
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=100, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 15: Weight:0.811:
	mundo,ltima,deport,sobr,informaci,diario,espa,actualidad...
Topic 47: Weight:0.023:
	truck,construct,tractor,farm,accessori,use,repair,manufactur...
Topic 85: Weight:0.013:
	reward,metaphys,mind,conscious,holist,china,symbol,pure...
Topic 81: Weight:0.011:
	informado,sin,empresa,know,possibl,deep,costa,mankind...
Topic 26: Weight:0.001:
	game,nintendo,card,xbox,gamer,magic,list,comic...


Gastroenterologists [6264]
Topic 82: Weight:0.789:
	patient,care,health,medic,american,diseas,medicin,healthcar...
Topic 74: Weight:0.027:
	educ,church,york,oxford,bibl,book,polici,american...
Topic 29: Weight:0.019:
	review,may,two,holi,menus,prayer,religi,pediatr...
Topic 92: Weight:0.012:
	texa,oregon,washington,seattl,austin,houston,portland,dalla...
Topic 38: Weight:0.01:
	athlet,fan,horizont,univers,vox,more-arrow,yes,basketbal...


Holiday Season Department Store Deal Shoppers [104164]
Topic 18: Weight:0.366:
	women,accessori,express,dress

In [53]:
## n = 250
lda_pipe = LDAPipeline(data=keyword_api_json_df, n_components=250, stemming=True)
lda_pipe.fit_transform()
lda_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 151: Weight:0.765:
	mundo,ltima,deport,sobr,informaci,diario,espa,actualidad...
Topic 211: Weight:0.076:
	fan,horizont,vox,more-arrow,yes,perspect,clock,follow...
Topic 196: Weight:0.013:
	berita,terkini,dan,politik,antiqu,terbaru,hari,ini...
Topic 249: Weight:0.001:
	sport,wisconsin,weather,calendar,illinoi,warn,obituari,play-button...
Topic 77: Weight:0.001:
	footwear,bag,leather,good,shirt,togeth,tale,entir...


Gastroenterologists [6264]
Topic 197: Weight:0.802:
	patient,medic,care,health,american,medicin,diseas,healthcar...
Topic 184: Weight:0.02:
	job,softwar,financi,legal,career,technolog,insight,advic...
Topic 181: Weight:0.012:
	rapid,may,council,amaz,creek,battl,relief,holland...
Topic 20: Weight:0.01:
	job,becom,career,hire,employ,employe,talent,bing...
Topic 62: Weight:0.009:
	athlet,img,learfield,univers,sidearm,colleg,phoenix,ncaa...


Holiday Season Department Store Deal Shoppers [104164]
Topic 179: Weight:0.372:
	women,express,access

### Latent Semantic Indexing

#### Unstemmed

In [54]:
## n = 10
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=10)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 6: Weight:0.915:
	mundo,deportes,sobre,informaci,diario,ltimas,actualidad,espa...
Topic 7: Weight:0.181:
	delicious,healthy,kitchen,cooking,family,dinner,recipe,desserts...
Topic 5: Weight:0.045:
	care,american,health,family,women,medical,education,delicious...
Topic 3: Weight:0.014:
	accessories,car,enthusiasts,discussion,owners,truck,auto,cars...
Topic 4: Weight:0.011:
	enthusiasts,discussion,owners,accessories,truck,car,award-winning,cnn...


Gastroenterologists [6264]
Topic 5: Weight:0.419:
	care,american,health,family,women,medical,education,delicious...
Topic 4: Weight:0.139:
	enthusiasts,discussion,owners,accessories,truck,car,award-winning,cnn...
Topic 0: Weight:0.125:
	sports,entertainment,weather,politics,health,celebrity,technology,advice...
Topic 6: Weight:0.015:
	mundo,deportes,sobre,informaci,diario,ltimas,actualidad,espa...
Topic 3: Weight:-0.012:
	accessories,car,enthusiasts,discussion,owners,truck,auto,cars...


Holiday Season Depar

In [55]:
## n = 25
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=25)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 6: Weight:0.915:
	mundo,deportes,sobre,informaci,diario,ltimas,actualidad,espa...
Topic 7: Weight:0.181:
	delicious,healthy,kitchen,cooking,family,dinner,recipe,desserts...
Topic 5: Weight:0.045:
	care,american,health,family,women,medical,education,delicious...
Topic 11: Weight:0.023:
	games,yahoo,gaming,accessories,globe,dresses,women,pay...
Topic 12: Weight:0.017:
	canada,canadian,software,financial,women,toronto,accessories,ontario...


Gastroenterologists [6264]
Topic 5: Weight:0.419:
	care,american,health,family,women,medical,education,delicious...
Topic 4: Weight:0.139:
	enthusiasts,discussion,owners,accessories,truck,car,award-winning,cnn...
Topic 0: Weight:0.125:
	sports,entertainment,weather,politics,health,celebrity,technology,advice...
Topic 22: Weight:0.121:
	music,texas,colorado,guitar,science,cooking,electronics,technology...
Topic 13: Weight:0.116:
	canada,canadian,toronto,ontario,voices,games,health,medical...


Holiday Season Depart

In [56]:
## n = 50
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=50)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 6: Weight:0.915:
	mundo,deportes,sobre,informaci,diario,ltimas,actualidad,espa...
Topic 7: Weight:0.181:
	delicious,healthy,kitchen,cooking,family,dinner,recipe,desserts...
Topic 5: Weight:0.045:
	care,american,health,family,women,medical,education,delicious...
Topic 11: Weight:0.023:
	games,yahoo,gaming,accessories,globe,dresses,women,pay...
Topic 12: Weight:0.017:
	canada,canadian,software,financial,women,toronto,accessories,ontario...


Gastroenterologists [6264]
Topic 5: Weight:0.419:
	care,american,health,family,women,medical,education,delicious...
Topic 4: Weight:0.139:
	enthusiasts,discussion,owners,accessories,truck,car,award-winning,cnn...
Topic 22: Weight:0.127:
	music,guitar,cooking,electronics,science,audio,software,technology...
Topic 0: Weight:0.125:
	sports,entertainment,weather,politics,health,celebrity,technology,advice...
Topic 13: Weight:0.117:
	canada,canadian,toronto,ontario,voices,games,health,medical...


Holiday Season Depart

In [57]:
## n = 100
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=100)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 6: Weight:0.915:
	mundo,deportes,sobre,informaci,diario,ltimas,actualidad,espa...
Topic 7: Weight:0.181:
	delicious,healthy,kitchen,cooking,family,dinner,recipe,desserts...
Topic 5: Weight:0.045:
	care,american,health,family,women,medical,education,delicious...
Topic 11: Weight:0.023:
	games,yahoo,gaming,accessories,globe,dresses,women,pay...
Topic 12: Weight:0.017:
	canada,canadian,software,financial,women,toronto,accessories,ontario...


Gastroenterologists [6264]
Topic 5: Weight:0.419:
	care,american,health,family,women,medical,education,delicious...
Topic 4: Weight:0.139:
	enthusiasts,discussion,owners,accessories,truck,car,award-winning,cnn...
Topic 22: Weight:0.126:
	music,guitar,cooking,electronics,science,audio,software,technology...
Topic 0: Weight:0.125:
	sports,entertainment,weather,politics,health,celebrity,technology,advice...
Topic 13: Weight:0.117:
	canada,canadian,toronto,ontario,voices,games,health,medical...


Holiday Season Depart

In [58]:
## n = 250
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=250)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 6: Weight:0.915:
	mundo,deportes,sobre,informaci,diario,ltimas,actualidad,espa...
Topic 7: Weight:0.181:
	delicious,healthy,kitchen,cooking,family,dinner,recipe,desserts...
Topic 5: Weight:0.045:
	care,american,health,family,women,medical,education,delicious...
Topic 11: Weight:0.023:
	games,yahoo,gaming,accessories,globe,dresses,women,pay...
Topic 233: Weight:0.02:
	portal,brings,interviews,smart,hottest,najnovije,jewelry,question...


Gastroenterologists [6264]
Topic 5: Weight:0.419:
	care,american,health,family,women,medical,education,delicious...
Topic 4: Weight:0.139:
	enthusiasts,discussion,owners,accessories,truck,car,award-winning,cnn...
Topic 22: Weight:0.126:
	music,guitar,cooking,electronics,science,audio,software,technology...
Topic 0: Weight:0.125:
	sports,entertainment,weather,politics,health,celebrity,technology,advice...
Topic 13: Weight:0.117:
	canada,canadian,toronto,ontario,voices,games,health,medical...


Holiday Season Departmen

#### Stemmed

In [59]:
## n = 10
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=10, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 7: Weight:0.735:
	mundo,ltima,deport,sobr,informaci,cook,diario,delici...
Topic 6: Weight:0.578:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 5: Weight:0.046:
	care,famili,american,health,women,patient,educ,delici...
Topic 8: Weight:0.03:
	cnn,usatoday,deliv,usa,subscrib,countri,investig,journalist...
Topic 4: Weight:0.017:
	truck,car,enthusiast,technolog,discuss,owner,award-win,cnn...


Gastroenterologists [6264]
Topic 5: Weight:0.416:
	care,famili,american,health,women,patient,educ,delici...
Topic 4: Weight:0.192:
	truck,car,enthusiast,technolog,discuss,owner,award-win,cnn...
Topic 6: Weight:0.149:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 0: Weight:0.131:
	sport,entertain,weather,polit,celebr,health,technolog,advic...
Topic 3: Weight:-0.086:
	car,truck,enthusiast,discuss,accessori,owner,weather,auto...


Holiday Season Department Store Deal Shoppers [104164]
Topic 5: Weight:0.148:
	care,famili,american,health,wo

In [60]:
## n = 25
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=25, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 7: Weight:0.735:
	mundo,ltima,deport,sobr,informaci,cook,diario,delici...
Topic 6: Weight:0.578:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 5: Weight:0.046:
	care,famili,american,health,women,patient,educ,delici...
Topic 11: Weight:0.038:
	game,mortgag,yahoo,accessori,women,globe,dress,weather...
Topic 8: Weight:0.03:
	cnn,usatoday,deliv,usa,subscrib,countri,investig,journalist...


Gastroenterologists [6264]
Topic 5: Weight:0.416:
	care,famili,american,health,women,patient,educ,delici...
Topic 12: Weight:0.223:
	vacat,restaur,hotel,game,medic,health,patient,attract...
Topic 4: Weight:0.192:
	truck,car,enthusiast,technolog,discuss,owner,award-win,cnn...
Topic 6: Weight:0.149:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 0: Weight:0.131:
	sport,entertain,weather,polit,celebr,health,technolog,advic...


Holiday Season Department Store Deal Shoppers [104164]
Topic 11: Weight:0.173:
	game,mortgag,yahoo,accessori,women,g

In [61]:
## n = 50
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=50, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 7: Weight:0.735:
	mundo,ltima,deport,sobr,informaci,cook,diario,delici...
Topic 6: Weight:0.578:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 5: Weight:0.046:
	care,famili,american,health,women,patient,educ,delici...
Topic 11: Weight:0.038:
	game,mortgag,yahoo,accessori,women,globe,dress,weather...
Topic 8: Weight:0.03:
	cnn,usatoday,deliv,usa,subscrib,countri,investig,journalist...


Gastroenterologists [6264]
Topic 5: Weight:0.416:
	care,famili,american,health,women,patient,educ,delici...
Topic 12: Weight:0.223:
	vacat,restaur,hotel,game,medic,health,patient,attract...
Topic 4: Weight:0.192:
	truck,car,enthusiast,technolog,discuss,owner,award-win,cnn...
Topic 6: Weight:0.149:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 0: Weight:0.131:
	sport,entertain,weather,polit,celebr,health,technolog,advic...


Holiday Season Department Store Deal Shoppers [104164]
Topic 11: Weight:0.173:
	game,mortgag,yahoo,accessori,women,g

In [62]:
## n = 100
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=100, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 7: Weight:0.735:
	mundo,ltima,deport,sobr,informaci,cook,diario,delici...
Topic 6: Weight:0.578:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 5: Weight:0.046:
	care,famili,american,health,women,patient,educ,delici...
Topic 11: Weight:0.038:
	game,mortgag,yahoo,accessori,women,globe,dress,weather...
Topic 8: Weight:0.03:
	cnn,usatoday,deliv,usa,subscrib,countri,investig,journalist...


Gastroenterologists [6264]
Topic 5: Weight:0.416:
	care,famili,american,health,women,patient,educ,delici...
Topic 12: Weight:0.223:
	vacat,restaur,hotel,game,medic,health,patient,attract...
Topic 4: Weight:0.192:
	truck,car,enthusiast,technolog,discuss,owner,award-win,cnn...
Topic 6: Weight:0.149:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 0: Weight:0.131:
	sport,entertain,weather,polit,celebr,health,technolog,advic...


Holiday Season Department Store Deal Shoppers [104164]
Topic 11: Weight:0.173:
	game,mortgag,yahoo,accessori,women,g

In [63]:
## n = 250
lsi_pipe = LSIPipeline(data=keyword_api_json_df, n_components=250, stemming=True)
lsi_pipe.fit_transform()
lsi_pipe.demonstrate()

Colombia Trip Planners [22415]
Topic 7: Weight:0.735:
	mundo,ltima,deport,sobr,informaci,cook,diario,delici...
Topic 6: Weight:0.578:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 5: Weight:0.046:
	care,famili,american,health,women,patient,educ,delici...
Topic 11: Weight:0.038:
	game,mortgag,yahoo,accessori,women,globe,dress,weather...
Topic 8: Weight:0.03:
	cnn,usatoday,deliv,usa,subscrib,countri,investig,journalist...


Gastroenterologists [6264]
Topic 5: Weight:0.416:
	care,famili,american,health,women,patient,educ,delici...
Topic 12: Weight:0.223:
	vacat,restaur,hotel,game,medic,health,patient,attract...
Topic 4: Weight:0.192:
	truck,car,enthusiast,technolog,discuss,owner,award-win,cnn...
Topic 6: Weight:0.149:
	mundo,game,ltima,deport,sobr,informaci,diario,movi...
Topic 0: Weight:0.131:
	sport,entertain,weather,polit,celebr,health,technolog,advic...


Holiday Season Department Store Deal Shoppers [104164]
Topic 11: Weight:0.173:
	game,mortgag,yahoo,accessori,women,g