## Load Files and Vectorize 

In [2]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [3]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [4]:
df_corpus2.head()

Unnamed: 0,content,score
0,"Nature Reviews Nephrology (before 2009, Nature...",20.6
1,Food blogging represents a complex interweavin...,13.6
2,The International Motorcycle Shows are a serie...,13.3
3,"Education in Ancient Greece was vastly ""democr...",11.3
4,Rural economics is the study of rural economie...,12.5


## Data Preprocessing

In [9]:
#! pip install gensim

In [10]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sherzyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [12]:
processed_docs = df_corpus2['content'].map(preprocess)
#processed_docs[:10]

In [13]:
# import pickle
# with open('data/processed_docs_lda.pkl', 'wb') as f:
#     pickle.dump(processed_docs, f)

In [None]:
with open(r"data/processed_docs_lda.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [14]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 adult
1 apply
2 cancer
3 child
4 chronic
5 citation
6 clinical
7 coverage
8 diagnosis
9 dialysis
10 disorder


In [15]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [17]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.11463756621225898),
 (1, 0.05765586532617539),
 (2, 0.13892712182502004),
 (3, 0.09102312571345948),
 (4, 0.1579255683226295),
 (5, 0.15342950456468332),
 (6, 0.12687624510705903),
 (7, 0.13979034449587607),
 (8, 0.15698604788106313),
 (9, 0.11687015539050881),
 (10, 0.19033509306520321),
 (11, 0.0714941413459945),
 (12, 0.10949456875642996),
 (13, 0.15330801836500396),
 (14, 0.2324912851948214),
 (15, 0.2429152820827657),
 (16, 0.07326354021919622),
 (17, 0.10778082117698098),
 (18, 0.1918594902864103),
 (19, 0.24513212262921247),
 (20, 0.13877712452498228),
 (21, 0.20848419321557235),
 (22, 0.09049542779379403),
 (23, 0.27958124452493754),
 (24, 0.18772634012563447),
 (25, 0.10850653149179024),
 (26, 0.16206489016436754),
 (27, 0.05663309758786264),
 (28, 0.1465052227617137),
 (29, 0.05518954915829813),
 (30, 0.2653244279611047),
 (31, 0.11564191336679187),
 (32, 0.2139591889419809),
 (33, 0.32931380435014446),
 (34, 0.17846492643608908),
 (35, 0.09913014119814909)]


In [18]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [19]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"century" + 0.007*"literature" + 0.006*"write" + 0.004*"book" + 0.004*"displaystyle" + 0.004*"bear" + 0.003*"ancient" + 0.003*"roman" + 0.003*"form" + 0.003*"museum"
Topic: 1 
Words: 0.006*"self" + 0.006*"music" + 0.006*"culture" + 0.006*"people" + 0.005*"health" + 0.004*"psychology" + 0.004*"group" + 0.003*"study" + 0.003*"social" + 0.003*"form"
Topic: 2 
Words: 0.007*"social" + 0.006*"study" + 0.005*"theory" + 0.004*"science" + 0.004*"human" + 0.004*"philosophy" + 0.004*"self" + 0.004*"research" + 0.003*"society" + 0.003*"people"
Topic: 3 
Words: 0.006*"film" + 0.006*"series" + 0.005*"star" + 0.004*"television" + 0.004*"american" + 0.003*"award" + 0.003*"music" + 0.003*"show" + 0.003*"release" + 0.003*"season"
Topic: 4 
Words: 0.009*"university" + 0.007*"program" + 0.007*"engineer" + 0.005*"school" + 0.005*"technology" + 0.005*"science" + 0.004*"college" + 0.004*"education" + 0.004*"research" + 0.004*"institute"
Topic: 5 
Words: 0.008*"food" + 0.006*"company" +

In [20]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"film" + 0.001*"language" + 0.001*"actors" + 0.001*"nepal" + 0.001*"list" + 0.001*"drink" + 0.001*"movies" + 0.001*"bear" + 0.001*"music" + 0.001*"hindi"
Topic: 1 Word: 0.003*"music" + 0.002*"finance" + 0.002*"university" + 0.001*"journal" + 0.001*"engineer" + 0.001*"sciences" + 0.001*"global" + 0.001*"program" + 0.001*"science" + 0.001*"technology"
Topic: 2 Word: 0.002*"self" + 0.002*"social" + 0.002*"behavior" + 0.002*"theory" + 0.002*"psychology" + 0.002*"economics" + 0.002*"displaystyle" + 0.001*"model" + 0.001*"market" + 0.001*"study"
Topic: 3 Word: 0.001*"music" + 0.001*"food" + 0.001*"social" + 0.001*"museum" + 0.001*"science" + 0.001*"cities" + 0.001*"university" + 0.001*"anthropology" + 0.001*"album" + 0.001*"sciences"
Topic: 4 Word: 0.003*"journal" + 0.002*"transportation" + 0.002*"environment" + 0.002*"psychology" + 0.002*"science" + 0.002*"department" + 0.002*"index" + 0.002*"finance" + 0.002*"philosophy" + 0.001*"nature"
Topic: 5 Word: 0.005*"music" + 

In [21]:
#processed_docs[4310]


In [22]:
#performance evaluation for BOW
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5158300399780273	 
Topic: 0.007*"social" + 0.006*"study" + 0.005*"theory" + 0.004*"science" + 0.004*"human" + 0.004*"philosophy" + 0.004*"self" + 0.004*"research" + 0.003*"society" + 0.003*"people"

Score: 0.31248170137405396	 
Topic: 0.004*"government" + 0.004*"century" + 0.004*"unite" + 0.003*"people" + 0.003*"world" + 0.003*"city" + 0.003*"economic" + 0.003*"empire" + 0.003*"south" + 0.003*"force"

Score: 0.0725770965218544	 
Topic: 0.009*"university" + 0.007*"program" + 0.007*"engineer" + 0.005*"school" + 0.005*"technology" + 0.005*"science" + 0.004*"college" + 0.004*"education" + 0.004*"research" + 0.004*"institute"

Score: 0.06361857056617737	 
Topic: 0.008*"food" + 0.006*"company" + 0.005*"market" + 0.004*"drink" + 0.004*"finance" + 0.004*"service" + 0.003*"world" + 0.003*"unite" + 0.003*"financial" + 0.003*"bank"

Score: 0.03517885133624077	 
Topic: 0.006*"self" + 0.006*"music" + 0.006*"culture" + 0.006*"people" + 0.005*"health" + 0.004*"psychology" + 0.004*"group" + 

In [23]:
#performance evaluation for TFIDF
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.756618857383728	 
Topic: 0.002*"self" + 0.002*"social" + 0.002*"behavior" + 0.002*"theory" + 0.002*"psychology" + 0.002*"economics" + 0.002*"displaystyle" + 0.001*"model" + 0.001*"market" + 0.001*"study"

Score: 0.1328658163547516	 
Topic: 0.002*"food" + 0.002*"university" + 0.002*"engineer" + 0.002*"science" + 0.002*"literature" + 0.002*"sciences" + 0.001*"anthropology" + 0.001*"economics" + 0.001*"technology" + 0.001*"ministry"

Score: 0.11007410287857056	 
Topic: 0.002*"museum" + 0.002*"cities" + 0.002*"century" + 0.002*"city" + 0.001*"ancient" + 0.001*"anthropology" + 0.001*"university" + 0.001*"culture" + 0.001*"population" + 0.001*"south"


In [35]:
unseen_document = 'Glossier is a makeup brand that every girl loves.'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.819762110710144	 Topic: 0.006*"film" + 0.006*"series" + 0.005*"star" + 0.004*"television" + 0.004*"american"
Score: 0.020032105967402458	 Topic: 0.008*"food" + 0.006*"company" + 0.005*"market" + 0.004*"drink" + 0.004*"finance"
Score: 0.020026806741952896	 Topic: 0.016*"music" + 0.004*"record" + 0.004*"number" + 0.003*"release" + 0.003*"group"
Score: 0.020026497542858124	 Topic: 0.008*"road" + 0.007*"county" + 0.006*"city" + 0.005*"highway" + 0.004*"north"
Score: 0.02002619579434395	 Topic: 0.019*"language" + 0.016*"film" + 0.007*"languages" + 0.005*"write" + 0.005*"word"
Score: 0.02002609707415104	 Topic: 0.007*"century" + 0.007*"literature" + 0.006*"write" + 0.004*"book" + 0.004*"displaystyle"
Score: 0.020025933161377907	 Topic: 0.006*"self" + 0.006*"music" + 0.006*"culture" + 0.006*"people" + 0.005*"health"
Score: 0.02002512291073799	 Topic: 0.007*"social" + 0.006*"study" + 0.005*"theory" + 0.004*"science" + 0.004*"human"
Score: 0.02002478763461113	 Topic: 0.009*"university"

In [36]:
lda_model

[(0, 0.02002611),
 (1, 0.020025944),
 (2, 0.020025136),
 (3, 0.8197604),
 (4, 0.020024799),
 (5, 0.020033775),
 (6, 0.020024315),
 (7, 0.020026209),
 (8, 0.02002651),
 (9, 0.02002682)]

In [37]:
# with open("data/lda_model_1.pkl", 'wb') as f:
#      pickle.dump(lda_model, f)

In [39]:
with open(r"data/lda_model_1.pkl", "rb") as input_file:
    lda_model_1 = pickle.load(input_file)

In [None]:
unseen_document = 'Glossier is a makeup brand that every girl loves.'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

In [68]:
def get_aggregate_score(lda_model, bow_corpus, i=0):
    """
    Get the score for topics from wikipedia
    """

    lda_all_docs = []   
    
    for i in range(len(bow_corpus)):
        article = lda_model[bow_corpus[i]]
    
        if len(article) != 10:
            indices = []
            for item in article:
                indices.append(item[0])
            for index in list(range(10)):
                if index not in indices:
                    article.append((index, 0))
        
        lda_all_docs.append([i,article])


    return lda_all_docs

In [69]:
aggregate_scores_mod1 = get_aggregate_score(lda_model, bow_corpus, i=0)

In [70]:
topics_df = pd.DataFrame(aggregate_scores_mod1)

In [71]:
topics_df.head()

Unnamed: 0,0,1
0,0,"[(1, 0.34407565), (2, 0.6388408), (0, 0), (3, ..."
1,1,"[(2, 0.04269844), (3, 0.14525636), (5, 0.80569..."
2,2,"[(3, 0.23120733), (5, 0.7549764), (0, 0), (1, ..."
3,3,"[(0, 0.21230577), (1, 0.03643345), (2, 0.34926..."
4,4,"[(2, 0.27511144), (4, 0.040314987), (5, 0.0807..."


In [99]:
list_of_dicts = []
for i in range(len(topics_df[1])):
    one_doc = dict(topics_df[1][i])
    list_of_dicts.append(one_doc)

In [100]:
list_of_dicts

[{1: 0.34407565, 2: 0.6388408, 0: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0},
 {2: 0.04269844,
  3: 0.14525636,
  5: 0.8056983,
  0: 0,
  1: 0,
  4: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0},
 {3: 0.23120733, 5: 0.7549764, 0: 0, 1: 0, 2: 0, 4: 0, 6: 0, 7: 0, 8: 0, 9: 0},
 {0: 0.21230577,
  1: 0.03643345,
  2: 0.34926352,
  4: 0.39875394,
  3: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0},
 {2: 0.27511144,
  4: 0.040314987,
  5: 0.080787964,
  6: 0.5966076,
  0: 0,
  1: 0,
  3: 0,
  7: 0,
  8: 0,
  9: 0},
 {3: 0.47189677, 9: 0.5206837, 0: 0, 1: 0, 2: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
 {2: 0.8696632,
  4: 0.024326645,
  5: 0.09666677,
  0: 0,
  1: 0,
  3: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0},
 {2: 0.047885485,
  4: 0.9510327,
  0: 0,
  1: 0,
  3: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0},
 {2: 0.4127268, 5: 0.58005595, 0: 0, 1: 0, 3: 0, 4: 0, 6: 0, 7: 0, 8: 0, 9: 0},
 {3: 0.06688596, 7: 0.9279092, 0: 0, 1: 0, 2: 0, 4: 0, 5: 0, 6: 0, 8: 0, 9: 0},
 {0: 0.1531832,
  2: 0.27070153,
  5: 0.57336986,
 

In [102]:
topic_matrix_all_wiki = pd.DataFrame(list_of_dicts)

In [103]:
# with open("data/topic_matrix_all_wiki.pkl", 'wb') as f:
#      pickle.dump(topic_matrix_all_wiki, f)

In [None]:
with open(r"data/topic_matrix_all_wiki.pkl", 'rb') as input_file:
      topic_matrix_all_wiki = pickle.load(input_file)

In [145]:
best_category = []
for i in range(len(list_of_dicts)):
    one_dict = list_of_dicts[i]
    max_value = max(one_dict.values())
    max_keys = [k for k, v in one_dict.items() if v == max_value]
    best_category.append(max_keys)

In [169]:
#[float(str(i).strip('[]')) for i in best_category]

In [162]:
df_best_category = pd.DataFrame()

In [166]:
df_best_category['category'] = [int(str(i).strip('[]')) for i in best_category]

In [167]:
df_best_category['article_index'] = range(len(category))

In [168]:
df_best_category.head()

Unnamed: 0,category,article_index
0,2,0
1,5,1
2,5,2
3,4,3
4,6,4


In [174]:
grouped_df = df_best_category.groupby(['category'])

In [175]:
for key, item in grouped_df:
    print(grouped_df.get_group(key), "\n\n")

       category  article_index
55            0             55
61            0             61
83            0             83
87            0             87
95            0             95
104           0            104
108           0            108
122           0            122
123           0            123
131           0            131
142           0            142
145           0            145
148           0            148
150           0            150
154           0            154
159           0            159
196           0            196
236           0            236
263           0            263
269           0            269
275           0            275
334           0            334
349           0            349
352           0            352
368           0            368
416           0            416
419           0            419
434           0            434
491           0            491
492           0            492
...         ...            ...
13823   