## Load Files and Vectorize 

In [29]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [30]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [31]:
df_corpus2.head()

Unnamed: 0,content,score
0,"Nature Reviews Nephrology (before 2009, Nature...",20.6
1,Food blogging represents a complex interweavin...,13.6
2,The International Motorcycle Shows are a serie...,13.3
3,"Education in Ancient Greece was vastly ""democr...",11.3
4,Rural economics is the study of rural economie...,12.5


## Data Preprocessing

In [33]:
! pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/b4/fb/c0cefcecf82b445ff2a714935db5b475a25202d6b63241c7e95ca004136a/gensim-3.7.3-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.7MB)
[K     |████████████████████████████████| 24.7MB 1.1MB/s eta 0:00:01    |██▌                             | 1.9MB 1.1MB/s eta 0:00:22
[?25hCollecting smart-open>=1.7.0 (from gensim)
  Using cached https://files.pythonhosted.org/packages/37/c0/25d19badc495428dec6a4bf7782de617ee0246a9211af75b302a2681dea7/smart_open-1.8.4.tar.gz
Collecting boto3 (from smart-open>=1.7.0->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/f8/b2/af1c6efe6fc0242a7acbc69e50e592d25c5d5c18e05dd9afaf69e9317dc9/boto3-1.9.181-py2.py3-none-any.whl (128kB)
[K     |████████████████████████████████| 133kB 954kB/s eta 0:00:01
Collecting s3transfer<0.3.0,>=0.2.0 (from boto3->smart-open>=1.7.0->gensim)
  Using cached https://files.p

In [34]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sherzyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [37]:
processed_docs = df_corpus2['content'].map(preprocess)
#processed_docs[:10]

In [39]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 adult
1 apply
2 cancer
3 child
4 chronic
5 citation
6 clinical
7 coverage
8 diagnosis
9 dialysis
10 disorder


In [40]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [41]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [42]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.11463756621225898),
 (1, 0.05765586532617539),
 (2, 0.13892712182502004),
 (3, 0.09102312571345948),
 (4, 0.1579255683226295),
 (5, 0.15342950456468332),
 (6, 0.12687624510705903),
 (7, 0.13979034449587607),
 (8, 0.15698604788106313),
 (9, 0.11687015539050881),
 (10, 0.19033509306520321),
 (11, 0.0714941413459945),
 (12, 0.10949456875642996),
 (13, 0.15330801836500396),
 (14, 0.2324912851948214),
 (15, 0.2429152820827657),
 (16, 0.07326354021919622),
 (17, 0.10778082117698098),
 (18, 0.1918594902864103),
 (19, 0.24513212262921247),
 (20, 0.13877712452498228),
 (21, 0.20848419321557235),
 (22, 0.09049542779379403),
 (23, 0.27958124452493754),
 (24, 0.18772634012563447),
 (25, 0.10850653149179024),
 (26, 0.16206489016436754),
 (27, 0.05663309758786264),
 (28, 0.1465052227617137),
 (29, 0.05518954915829813),
 (30, 0.2653244279611047),
 (31, 0.11564191336679187),
 (32, 0.2139591889419809),
 (33, 0.32931380435014446),
 (34, 0.17846492643608908),
 (35, 0.09913014119814909)]


In [43]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [44]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.008*"century" + 0.007*"literature" + 0.006*"write" + 0.004*"book" + 0.004*"bear" + 0.004*"ancient" + 0.004*"roman" + 0.004*"museum" + 0.003*"form" + 0.003*"period"
Topic: 1 
Words: 0.007*"self" + 0.007*"music" + 0.006*"people" + 0.006*"culture" + 0.005*"health" + 0.004*"psychology" + 0.004*"group" + 0.003*"study" + 0.003*"social" + 0.003*"form"
Topic: 2 
Words: 0.007*"social" + 0.006*"study" + 0.005*"theory" + 0.005*"science" + 0.004*"human" + 0.004*"philosophy" + 0.004*"research" + 0.004*"self" + 0.003*"society" + 0.003*"university"
Topic: 3 
Words: 0.006*"film" + 0.006*"series" + 0.005*"star" + 0.004*"american" + 0.004*"television" + 0.004*"music" + 0.004*"award" + 0.003*"release" + 0.003*"show" + 0.003*"season"
Topic: 4 
Words: 0.009*"university" + 0.007*"program" + 0.007*"engineer" + 0.006*"school" + 0.006*"technology" + 0.006*"science" + 0.005*"education" + 0.005*"college" + 0.005*"research" + 0.004*"institute"
Topic: 5 
Words: 0.009*"food" + 0.007*"company" + 0

In [45]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"film" + 0.001*"language" + 0.001*"actors" + 0.001*"nepal" + 0.001*"list" + 0.001*"drink" + 0.001*"movies" + 0.001*"bear" + 0.001*"music" + 0.001*"hindi"
Topic: 1 Word: 0.003*"music" + 0.002*"finance" + 0.002*"university" + 0.001*"journal" + 0.001*"engineer" + 0.001*"sciences" + 0.001*"global" + 0.001*"program" + 0.001*"science" + 0.001*"technology"
Topic: 2 Word: 0.002*"self" + 0.002*"social" + 0.002*"behavior" + 0.002*"theory" + 0.002*"psychology" + 0.002*"economics" + 0.002*"displaystyle" + 0.001*"model" + 0.001*"market" + 0.001*"study"
Topic: 3 Word: 0.001*"music" + 0.001*"food" + 0.001*"social" + 0.001*"museum" + 0.001*"science" + 0.001*"cities" + 0.001*"university" + 0.001*"anthropology" + 0.001*"album" + 0.001*"sciences"
Topic: 4 Word: 0.003*"journal" + 0.002*"transportation" + 0.002*"environment" + 0.002*"psychology" + 0.002*"science" + 0.002*"department" + 0.002*"index" + 0.002*"finance" + 0.002*"philosophy" + 0.001*"nature"
Topic: 5 Word: 0.005*"music" + 

In [46]:
processed_docs[4310]


['society',
 'group',
 'individuals',
 'involve',
 'persistent',
 'social',
 'interaction',
 'large',
 'social',
 'group',
 'share',
 'geographical',
 'social',
 'territory',
 'typically',
 'subject',
 'political',
 'authority',
 'dominant',
 'cultural',
 'expectations',
 'societies',
 'characterize',
 'pattern',
 'relationships',
 'social',
 'relations',
 'individuals',
 'share',
 'distinctive',
 'culture',
 'institutions',
 'give',
 'society',
 'describe',
 'total',
 'relationships',
 'constituent',
 'members',
 'social',
 'sciences',
 'larger',
 'society',
 'exhibit',
 'stratification',
 'dominance',
 'pattern',
 'subgroups',
 'insofar',
 'collaborative',
 'society',
 'enable',
 'members',
 'benefit',
 'ways',
 'possible',
 'individual',
 'basis',
 'individual',
 'social',
 'common',
 'benefit',
 'distinguish',
 'case',
 'overlap',
 'society',
 'consist',
 'like',
 'mind',
 'people',
 'govern',
 'norms',
 'value',
 'dominant',
 'larger',
 'society',
 'refer',
 'subculture',
 'term',

In [47]:
#performance evaluation for BOW
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5150673389434814	 
Topic: 0.007*"social" + 0.006*"study" + 0.005*"theory" + 0.005*"science" + 0.004*"human" + 0.004*"philosophy" + 0.004*"research" + 0.004*"self" + 0.003*"society" + 0.003*"university"

Score: 0.3230469226837158	 
Topic: 0.004*"government" + 0.004*"unite" + 0.004*"century" + 0.004*"world" + 0.004*"people" + 0.003*"city" + 0.003*"economic" + 0.003*"south" + 0.003*"empire" + 0.003*"force"

Score: 0.07118179649114609	 
Topic: 0.009*"university" + 0.007*"program" + 0.007*"engineer" + 0.006*"school" + 0.006*"technology" + 0.006*"science" + 0.005*"education" + 0.005*"college" + 0.005*"research" + 0.004*"institute"

Score: 0.06176217272877693	 
Topic: 0.009*"food" + 0.007*"company" + 0.005*"market" + 0.004*"drink" + 0.004*"finance" + 0.004*"service" + 0.003*"unite" + 0.003*"world" + 0.003*"financial" + 0.003*"bank"

Score: 0.02862800471484661	 
Topic: 0.007*"self" + 0.007*"music" + 0.006*"people" + 0.006*"culture" + 0.005*"health" + 0.004*"psychology" + 0.004*"group

In [48]:
#performance evaluation for TFIDF
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.756618857383728	 
Topic: 0.002*"self" + 0.002*"social" + 0.002*"behavior" + 0.002*"theory" + 0.002*"psychology" + 0.002*"economics" + 0.002*"displaystyle" + 0.001*"model" + 0.001*"market" + 0.001*"study"

Score: 0.1328658163547516	 
Topic: 0.002*"food" + 0.002*"university" + 0.002*"engineer" + 0.002*"science" + 0.002*"literature" + 0.002*"sciences" + 0.001*"anthropology" + 0.001*"economics" + 0.001*"technology" + 0.001*"ministry"

Score: 0.11007410287857056	 
Topic: 0.002*"museum" + 0.002*"cities" + 0.002*"century" + 0.002*"city" + 0.001*"ancient" + 0.001*"anthropology" + 0.001*"university" + 0.001*"culture" + 0.001*"population" + 0.001*"south"


In [49]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8498205542564392	 Topic: 0.014*"music" + 0.006*"displaystyle" + 0.005*"number" + 0.004*"record" + 0.003*"group"
Score: 0.016689682379364967	 Topic: 0.004*"government" + 0.004*"unite" + 0.004*"century" + 0.004*"world" + 0.004*"people"
Score: 0.01668865606188774	 Topic: 0.007*"self" + 0.007*"music" + 0.006*"people" + 0.006*"culture" + 0.005*"health"
Score: 0.01668822579085827	 Topic: 0.009*"food" + 0.007*"company" + 0.005*"market" + 0.004*"drink" + 0.004*"finance"
Score: 0.01668785884976387	 Topic: 0.007*"social" + 0.006*"study" + 0.005*"theory" + 0.005*"science" + 0.004*"human"
Score: 0.016685713082551956	 Topic: 0.006*"film" + 0.006*"series" + 0.005*"star" + 0.004*"american" + 0.004*"television"
Score: 0.016685357317328453	 Topic: 0.008*"century" + 0.007*"literature" + 0.006*"write" + 0.004*"book" + 0.004*"bear"
Score: 0.01668495684862137	 Topic: 0.019*"language" + 0.017*"film" + 0.007*"languages" + 0.005*"write" + 0.005*"word"
Score: 0.01668478734791279	 Topic: 0.008*"road" +