### Article Selection
API calls are made using the NY Times Developer platform. These calls extract metadata and article snippets from the NY Times archives based on the search conditions used. The API calls primarily use the news_desk attribute to import a specific number of articles from each section. The sectional topics to be used are listed on the developer website, with over 100 topics available. Peripheral topics based on city/regional news, obituaries, job advertisements, classifieds, booming, crosswords etc. are excluded so that only general topics are used.

In [3]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Table of news-desk sections; its 'Section' column feeds the API queries further down.
news_desk = pd.read_csv("data/news_desk.csv")

# Notebook preview of the first rows.
news_desk.head()

Unnamed: 0,Section
0,Business
1,Culture
2,Editorial
3,Education
4,Environment


### API Calls
The API-calling code builds a basic framework for extracting articles in bulk from the NY Times website. Due to the rate limit imposed by NY Times, a decorator is used to throttle function calls so that the API requests do not exceed 1 call/second. There is also a daily limit of 1000 calls. An additional feature parallelizes the requests across multiple threads, which speeds things up a bit. A brief description of the Lock() function used for rate limiting can be found here.

In [13]:
import time, threading

def rate_limited(max_per_second):
    """Return a decorator that throttles a function to ``max_per_second`` calls per second.

    All functions decorated by the same ``rate_limited(...)`` result share one
    lock; each decorated function keeps its own "last call" timestamp.  Before
    a call is forwarded, the wrapper sleeps until at least
    ``1 / max_per_second`` seconds have passed since the previous call started.

    Args:
        max_per_second: Maximum call rate; may be fractional (e.g. ``0.9``).

    Returns:
        A decorator; the wrapped function accepts any positional and keyword
        arguments and returns whatever the wrapped function returns.
    """
    import functools

    lock = threading.Lock()
    min_interval = 1.0 / float(max_per_second)

    def decorate(func):
        # One-element list so the nested function can update the value.
        # None means "never called yet", so the first call is never delayed.
        last_time_called = [None]

        @functools.wraps(func)
        def rate_limited_function(*args, **kwargs):
            # The wait and the timestamp update both happen under the lock, so
            # two threads can never observe the same stale timestamp and fire
            # back-to-back.
            with lock:
                if last_time_called[0] is not None:
                    # time.monotonic() replaces time.clock(), which was
                    # removed in Python 3.8.
                    elapsed = time.monotonic() - last_time_called[0]
                    left_to_wait = min_interval - elapsed
                    if left_to_wait > 0:
                        time.sleep(left_to_wait)
                last_time_called[0] = time.monotonic()
            # The call itself runs outside the lock so slow calls do not
            # serialise the other threads more than the rate limit requires.
            return func(*args, **kwargs)

        return rate_limited_function

    return decorate


from threading import Thread
import requests

@rate_limited(0.9)
def process_id(id):
    """Fetch one result page and return its decoded JSON payload.

    Reads two module-level names that the calling script sets before the
    crawl starts: ``url`` (a format string with one ``%s`` slot for the page
    number) and ``page_index`` (the list of pages, used only for the progress
    message).

    Args:
        id: Page number substituted into ``url``.

    Returns:
        The parsed JSON response, or ``''`` when the request, the JSON
        decoding or the progress lookup fails.
    """
    try:
        # A timeout keeps a stalled connection from blocking its worker
        # thread (and the join() waiting on it) forever.
        r = requests.get(url % id, timeout=30)
        json_data = r.json()
        print('Appended '+str(page_index.index(id))+ ' out of '+ str(len(page_index)))
        return json_data
    except Exception:
        # Best effort: skip the page, but let KeyboardInterrupt/SystemExit
        # through so the crawl can still be aborted.
        print('Skipping...')
        return ''

def process_range(id_range, store=None):
    """Run ``process_id`` on every id in ``id_range`` and collect the results.

    Args:
        id_range: Iterable of ids to process, in order.
        store: Optional dict to fill in place; a new dict is created when
            omitted.

    Returns:
        The dict mapping each id to the value returned by ``process_id``.
    """
    results = {} if store is None else store
    for page_id in id_range:
        results[page_id] = process_id(page_id)
    return results


def threaded_process_range(nthreads, id_range):
    """Split ``id_range`` across ``nthreads`` threads and merge their results.

    Thread ``k`` handles every ``nthreads``-th id starting at offset ``k``;
    all threads write into one shared dict.

    Args:
        nthreads: Number of worker threads to start.
        id_range: Sliceable sequence of ids.

    Returns:
        Dict mapping each processed id to its result, available once every
        thread has finished.
    """
    results = {}
    workers = [
        Thread(target=process_range, args=(id_range[offset::nthreads], results))
        for offset in range(nthreads)
    ]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return results

# Crawl the article-search API: for every news desk, fetch `num_pages` result
# pages and keep the URL, publication date and id of each article.
news_desk = list(news_desk['Section'])
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# the environment instead of committing it with the notebook.
base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=bdfac6731d1c4590908aa30babcd0c50'

num_pages = 50
article_raw = []
for nd in news_desk:
    param_url = '&fq=news_desk:'+str(nd)+'&sort=newest&page=%s'
    # `url` and `page_index` are module-level names read by process_id(),
    # so they must be (re)bound here before the threads start.
    url = base_url + param_url
    print(str(nd)+":")
    page_index = list(range(num_pages))
    try:
        articles_1 = threaded_process_range(2, page_index)
        # Keep only pages that came back as a JSON object with a 'response'
        # entry; failed pages are stored as ''.
        articles_2 = [articles_1[k]['response']['docs'] for k in page_index
                      if isinstance(articles_1[k], dict) and ('response' in articles_1[k])]
        # Flatten the per-page lists into one list of article records.
        articles_3 = [item for sublist in articles_2 for item in sublist]
        articles_4 = [{key:item[key] for key in ['web_url','pub_date','_id']} for item in articles_3]
        articles_5 = pd.DataFrame(articles_4)
        articles_5['news_desk'] = str(nd)
        article_raw.append(articles_5)
    except Exception:
        # Best effort per section; `except Exception` (instead of a bare
        # except) still lets Ctrl-C abort the whole crawl.
        print('Skipping...')

url_data = pd.concat(article_raw).reset_index(drop = True)
url_data.to_csv('data/url_data.csv', index = False)

### Web Scraping
Using the metadata obtained from the API calls, the article text was scraped from the specific URLs with the BeautifulSoup package. A decorator was used along with the scrape function to limit the request rate. URLs with videos/slideshows are excluded from the dataset as they do not have text data. The article content is enclosed within 'p' tags, and a loop was used to extract all article content. The frequency distribution of article length is examined to exclude articles for which scraping failed to return meaningful content. Such articles can be identified by high bars in the graph at small article lengths.

In [14]:
from bs4 import BeautifulSoup

@rate_limited(1)
def extract_content(url):
    """Download ``url`` and return the text of its paragraphs joined by spaces.

    Paragraphs carrying the class ``story-body-text story-content`` are tried
    first; when they yield no text at all, every ``<p>`` tag on the page is
    used instead.

    Args:
        url: Address of the article page.

    Returns:
        The concatenated paragraph text (possibly an empty string).
    """
    page = BeautifulSoup(requests.get(url).text, 'html.parser')

    def joined_text(tags):
        # Text of each tag, separated by single spaces.
        return ' '.join([tag.text for tag in tags])

    body = joined_text(page.findAll('p', attrs={'class': 'story-body-text story-content'}))
    if body == '':
        body = joined_text(page.findAll('p'))

    return body

# Scrape the article text for every non-video, non-slideshow URL collected above.
url_data = pd.read_csv('data/url_data.csv')
url_data['video_flag'] = url_data['web_url'].str.contains('/video/')
url_data['slideshow_flag'] = url_data['web_url'].str.contains('/slideshow/')
# `== False` (rather than `~`) also drops rows whose flag is NaN.
url_data = url_data.loc[(url_data['video_flag'] == False) & (url_data['slideshow_flag'] == False),]
url_data = url_data.drop(['video_flag','slideshow_flag'], axis = 1)

content = []
for index, i in enumerate(url_data['web_url']):
    print(index)
    try:
        a = extract_content(i)
    except Exception:
        # Keep one entry per URL so `content` stays aligned with `url_data`.
        # `except Exception` (not a bare except) lets Ctrl-C stop the loop.
        a = ''
        print('Skipping...')
    content.append(a)

content_data = url_data
content_data['content'] = content
content_data.to_csv('data/content_data.csv', index = False)
##
import pandas as pd
# Reload the scraped articles and record the length of each stripped text.
content_data = pd.read_csv('data/content_data.csv')
stripped_content = content_data['content'].str.strip()
content_data['length'] = stripped_content.str.len()

import bokeh.plotting as bp
from bokeh.io import show, output_notebook
from bokeh.models import HoverTool
import numpy as np

# Histogram (50 bins) of article lengths below 1000 characters.
lengths = content_data['length']
array = list(lengths[lengths.values < 1000])
hist, edges = np.histogram(array, bins=50)

# One row per bar: left/right bin edges, bar height and the hover value.
hist_table = pd.DataFrame({
    'left' : edges[:-1],
    'top' : hist,
    'right' : edges[1:],
    'bottom' : 0,
    'data_value' : hist
})
source = bp.ColumnDataSource(hist_table)

hover = HoverTool(tooltips= [("Value", "@data_value")])

p = bp.figure(width = 550, height = 450)
p.quad(top='top', bottom='bottom', left='left', right='right', line_color="white", source = source)
p.add_tools(hover)
p.xaxis.axis_label = 'Length of Article'
p.yaxis.axis_label = 'Frequency'
output_notebook()
show(p)

Looking at the frequency distribution of article lengths below 1000, there is a spike in the first three bars, which correspond to articles containing only advertisement content or filler text with no relevant information. These articles will be filtered from the corpus.

In [6]:
# Keep articles longer than 60 characters (drops the first three histogram
# bars) and shorter than 20000 characters (drops outliers).
length_ok = (content_data['length'] > 60) & (content_data['length'] < 20000)
content_data = content_data.loc[length_ok,]

# Remove duplicate URLs, then renumber the rows from zero.
content_data = content_data.drop_duplicates('web_url').reset_index(drop = True)

content_data.to_csv('data/content_data.csv', index = False)
#content_data = pd.read_csv('data/content_data.csv')

# Article length distribution over the entire corpus (100 bins).
array = content_data['length'].values
hist, edges = np.histogram(array, bins=100)

# One row per bar: left/right bin edges, bar height and the hover value.
hist_table = pd.DataFrame({
    'left' : edges[:-1],
    'top' : hist,
    'right' : edges[1:],
    'bottom' : 0,
    'data_value' : hist
})
source = bp.ColumnDataSource(hist_table)

hover = HoverTool(tooltips= [("Value", "@data_value")])

p = bp.figure(width = 550, height = 450)
p.quad(top='top', bottom='bottom', left='left', right='right', line_color="white", source = source)
p.add_tools(hover)
p.xaxis.axis_label = 'Length of Article'
p.yaxis.axis_label = 'Frequency'
output_notebook()
show(p)

### Topic Modelling (LDA)
The first step after cleaning the article corpus is to use a generative model to find the topic distribution across all articles. We will use the Latent Dirichlet Allocation (LDA) model to generate this distribution. The document-term matrix is created using CountVectorizer to tokenize words of at least 3 characters (alphanumeric) and to remove tokens which appear in fewer than 10 documents or in more than 60% of documents. The text is pre-processed by removing stopwords and punctuation and by lemmatizing words according to their part of speech.

In [15]:
import gensim
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import string

def pre_process(text):
    """Turn raw article text into a list of lemmatized tokens.

    Steps: keep alphanumeric runs only, lowercase, drop English stopwords,
    strip punctuation characters, tokenize with NLTK, POS-tag, and lemmatize
    each token with the WordNet tag derived from its POS tag.

    Args:
        text: Raw article text.

    Returns:
        List of lemmatized token strings.
    """
    stop_set = set(nltk.corpus.stopwords.words('english'))
    punct_chars = set(string.punctuation)
    wn_lemmatizer = WordNetLemmatizer()
    alnum_tokenizer = RegexpTokenizer(r'[0-9a-zA-Z]+')
    wordnet_pos = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}

    def convert_tag(tag):
        """
        Map an nltk.pos_tag tag to the WordNet tag by its first letter;
        tags with an unknown first letter map to 'a'.
        """
        return wordnet_pos.get(tag[0], 'a')

    alnum_text = (" ").join(alnum_tokenizer.tokenize(text))
    kept_words = [word for word in alnum_text.lower().split() if word not in stop_set]
    no_punct = ("").join([ch for ch in (" ").join(kept_words) if ch not in punct_chars])
    tokens = nltk.word_tokenize(no_punct)
    tagged = nltk.pos_tag(tokens)
    return [wn_lemmatizer.lemmatize(word, convert_tag(tag)) for word, tag in tagged]

# Tokenize/lemmatize every article and persist the result.
content_data['content_clean'] = content_data['content'].apply(pre_process)
content_data.to_csv('data/content_data.csv', index = False)

# CountVectorizer keeps tokens of three or more word characters, removes
# English stop words, drops tokens found in fewer than 10 documents and
# tokens found in more than 60% of the documents.
vect = CountVectorizer(
    min_df=10,
    max_df=0.6,
    stop_words='english',
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
)

# The vectorizer expects strings, so the token lists are re-joined first.
joined_docs = content_data['content_clean'].apply(lambda x: (" ").join(x))
X = vect.fit_transform(joined_docs)

# Sparse document-term matrix (documents as rows) -> gensim corpus.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Word id -> word lookup (used as LdaModel's id2word parameter).
id_map = {word_id: word for word, word_id in vect.vocabulary_.items()}

# gensim Dictionary built from the corpus and the id -> word lookup.
dct = gensim.corpora.Dictionary.from_corpus(corpus, id_map)

def n_topic_opt_lda():
    """Plot mean topic similarity against the number of LDA topics.

    Trains one ``LdaModel`` on the module-level ``corpus`` / ``id_map`` for
    each of ten topic counts evenly spaced between 1 and 100, computes the
    mean pairwise cosine similarity between the topic-word rows of each model,
    and shows the resulting curve in the notebook.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    def topic_sim(lda_model, num_topic):
        # One call yields the full topic-by-topic similarity matrix; its mean
        # equals the mean of the per-topic means the pairwise loop produced
        # (the diagonal, i.e. self-similarity, is included as before).
        topic_word = lda_model.get_topics()[:num_topic]
        return np.mean(cosine_similarity(topic_word))

    x = np.linspace(1, 100, 10).astype(int)
    topic_overlap = []
    for i in x:
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = i,
                                                   id2word = id_map, passes = 25, random_state = 34)
        topic_overlap.append(topic_sim(ldamodel, i))

    # width/height match the other figures in this file.
    p = bp.figure(width=400, height=400)
    p.line(list(x), topic_overlap, line_width=2)
    p.circle(list(x), topic_overlap, fill_color="white", size=8)
    p.xaxis.axis_label = 'Number of Topics'
    p.yaxis.axis_label = 'Topic Similarity'
    output_notebook()
    show(p)

n_topic_opt_lda()

#### Number of Topics
The above similarity vs number of topics graph can be used to determine the number of topics for LDA modelling. The optimal number of topics will be the minimal number of topics which produces maximum diversity among constituent topics. Using the elbow method, the number of topics used will be **30**. Topic similarity is calculated using cosine similarity between word-level probability distributions.

In [18]:
# Generating topic distribution for the entire corpus

num_topics = 30
# `ldamodel` is otherwise only assigned as a local inside n_topic_opt_lda(),
# so the model has to be trained here before it can be saved or queried;
# with the training call commented out, the save below raised NameError on a
# fresh run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word = id_map, passes = 40)
ldamodel.save('data/lda.model')

def topic_corpus():
    """Return the LDA topic distribution of every article as a DataFrame.

    Uses the module-level ``dct``, ``ldamodel``, ``content_data`` and
    ``num_topics``.

    Returns:
        DataFrame with one row per article and exactly ``num_topics`` columns
        labelled ``0 .. num_topics-1``; topics the model did not report for an
        article are 0.
    """
    bow_corpus = [dct.doc2bow(tokens) for tokens in content_data['content_clean']]
    # The model returns (topic_id, probability) pairs for the topics it
    # reports, so each article becomes a sparse dict.
    topic_rows = [dict(ldamodel[bow]) for bow in bow_corpus]
    # Without the reindex the columns follow dict key order and may omit
    # topics; consumers compare these rows positionally with other
    # topic vectors, so the column order must be fixed.
    return pd.DataFrame(topic_rows).reindex(columns=range(num_topics)).fillna(0)

# NOTE: this rebinds the name `topic_corpus` from the function to its result;
# later code uses `topic_corpus` as the DataFrame.
topic_corpus = topic_corpus()
topic_corpus.to_csv('data/topic_corpus.csv', index = False)

def query_article_topic(list):
    """Return the LDA topic distribution for each article URL in ``list``.

    Each URL is scraped with ``extract_content`` and cleaned with
    ``pre_process``; a URL that fails is replaced by the single empty token
    ``['']``.

    Args:
        list: Iterable of article URLs (parameter name kept for callers).

    Returns:
        DataFrame with one row per URL and exactly ``num_topics`` columns
        labelled ``0 .. num_topics-1``; unreported topics are 0.
    """
    query_content = []
    for at in list:
        try:
            query_content.append(pre_process(extract_content(at)))
        except Exception:
            # Best effort: keep a placeholder so rows stay aligned with the
            # input URLs, but do not swallow KeyboardInterrupt.
            query_content.append([''])

    topic_rows = [dict(ldamodel[dct.doc2bow(tokens)]) for tokens in query_content]
    # reindex guarantees both the presence and the order of all topic
    # columns; appending a dummy all-topics row only guaranteed presence.
    query_corpus = pd.DataFrame(topic_rows)
    return query_corpus.reindex(columns=range(num_topics)).fillna(0)

### Topic Modelling (Non-Negative Matrix Factorization)
Non-negative matrix factorization is another topic modelling technique, which makes use of matrix decomposition to break down the document-term matrix into two low-rank matrices. One matrix represents the document-topic distribution, whereas the other represents the topic-word distribution. Unlike the LDA technique, which can only take a word-occurrence matrix as input, this technique can also take a TF-IDF weighted document-term matrix. This helps to improve the topic distribution by giving more weight to rare terms.

However, after extensive trials, it is observed that LDA does a much better job of finding contextually similar articles and provides a sense of topic continuity in its recommendations. NMF assigns more similarity to articles with similar topics but not necessarily similar context.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Passed to TfidfVectorizer as max_features below.
n_features = 1000
# NOTE(review): n_top_words is not referenced anywhere in the visible source.
n_top_words = 20

# Same document-frequency bounds, stop-word list and 3+ character token
# pattern as the CountVectorizer used for LDA.
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=10,
                                   max_features=n_features,
                                   stop_words='english', 
                                   token_pattern='(?u)\\b\\w\\w\\w+\\b')

# The vectorizer expects strings, so each token list is re-joined with spaces.
tfidf = tfidf_vectorizer.fit_transform(content_data['content_clean'].apply(lambda x: (" ").join(x)))

def n_topic_opt():
    """Plot mean topic similarity against the number of NMF topics.

    Fits one ``NMF`` model on the module-level ``tfidf`` matrix for each of
    ten component counts evenly spaced between 1 and 100, computes the mean
    pairwise cosine similarity between the rows of ``components_``, and shows
    the resulting curve in the notebook.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    def topic_sim(nmfmodel, num_topic):
        # One call yields the full topic-by-topic similarity matrix; its mean
        # equals the mean of the per-topic means the pairwise loop produced
        # (the diagonal, i.e. self-similarity, is included as before).
        topic_word = nmfmodel.components_[:num_topic]
        return np.mean(cosine_similarity(topic_word))

    x = np.linspace(1, 100, 10).astype(int)
    topic_overlap = []
    for i in x:
        # NOTE(review): newer scikit-learn releases appear to have replaced
        # `alpha` with alpha_W / alpha_H -- confirm against the installed version.
        nmfmodel = NMF(n_components = i, random_state= 1, alpha = .1 , l1_ratio = .5).fit(tfidf)
        topic_overlap.append(topic_sim(nmfmodel, i))

    # width/height match the other figures in this file.
    p = bp.figure(width=400, height=400)
    p.line(list(x), topic_overlap, line_width=2)
    p.circle(list(x), topic_overlap, fill_color="white", size=8)
    p.xaxis.axis_label = 'Number of Topics'
    p.yaxis.axis_label = 'Topic Similarity'
    output_notebook()
    show(p)

n_topic_opt()

In [86]:
# Generating topic distribution for the entire corpus

num_topics = 30

# Fit the final NMF model on the TF-IDF matrix with the chosen topic count.
# NOTE(review): newer scikit-learn releases appear to have replaced `alpha`
# with alpha_W / alpha_H -- confirm against the installed version.
nmf = NMF(n_components=num_topics, random_state = 1, alpha = 0.1, l1_ratio = 0.5).fit(tfidf)

def topic_corpus():
    """Return the row-normalised NMF document-topic matrix as a DataFrame.

    Uses the module-level ``nmf`` model and ``tfidf`` matrix.

    Returns:
        DataFrame with one row per document; each row sums to 1, except rows
        whose topic weights are all zero, which stay all zero.
    """
    doctopic = nmf.transform(tfidf)
    row_sums = np.sum(doctopic, axis = 1, keepdims = True)
    # A document with no topic weight would otherwise become a NaN row (0/0).
    row_sums[row_sums == 0] = 1.0
    return pd.DataFrame(doctopic / row_sums)

# NOTE: this rebinds the name `topic_corpus` from the function to its result;
# later code uses `topic_corpus` as the DataFrame.
topic_corpus = topic_corpus()

def query_article_topic(list):
    """Return the row-normalised NMF topic distribution for each URL in ``list``.

    Each URL is scraped with ``extract_content`` and cleaned with
    ``pre_process``; a URL that fails is replaced by the single empty token
    ``['']``.

    Args:
        list: Iterable of article URLs (parameter name kept for callers).

    Returns:
        DataFrame with one row per URL; each row sums to 1, except rows with
        no topic weight at all, which stay all zero.
    """
    query_content = []
    for at in list:
        try:
            query_content.append(pre_process(extract_content(at)))
        except Exception:
            # Best effort: keep a placeholder so rows stay aligned with the
            # input URLs, but do not swallow KeyboardInterrupt.
            query_content.append([''])

    query_content = pd.Series([(" ").join(x) for x in query_content])
    tfidf_query_content = tfidf_vectorizer.transform(query_content)
    topic_dist = nmf.transform(tfidf_query_content)
    row_sums = np.sum(topic_dist, axis = 1, keepdims = True)
    # The failure placeholder has no vocabulary terms, so its topic weights
    # are all zero; dividing by 1 keeps that row at zero instead of NaN.
    row_sums[row_sums == 0] = 1.0
    return pd.DataFrame(topic_dist / row_sums)

### Semantic Similarity
**LDA** ([read here](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/)) provides a good way of clustering documents with similar topic distributions. Since this method relies only on bag of words model, there is no focus on the context/meaning of the document. As a result, two highly similar documents by topics may be very different in terms of article context and may not work well for a recommender system where the user will generally prefer new articles with similar context. Thus, topic similarity should be combined with a model which focusses on the semantic similarity of the articles as well.

**WordNet** is a taxonomy of hypernym relationships and synonym sets. It is a good resource for finding semantic similarity between words/sentences but has some limitations. It does not work great at identifying nuances between sentences. Also, it has some limitations in comparing adjectives and adverbs since the taxonomies are very short and words from different taxonomies cannot be compared. This method, though good at finding similarity, is not very accurate and does not take into account changing/new word meanings. Detailed description of WordNet can be found [here](https://www.codeproject.com/Articles/11835/WordNet-based-semantic-similarity-measurement).

**Word-Embeddings** is a relatively new domain in text analytics with some good algorithms on semantic similarity. Word2Vec and GloVe are two most popular examples. Word2Vec operates on two learning algorithms i.e. skip-gram and Continuous Bag of Words. Basic premise of this model is to represent meaning of the word by understanding the context in which it appears. Neural Network Word embedding is a predictive model which aims to predict between a centre word and context words in terms of word vectors. This model is very effective in comparing word/sentence similarities and also trains on the corpus quickly making it ideal to implement here.

In [11]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

def pre_process_doc2vec(text):
    """Tokenize article text for doc2vec.

    Lowercases the text, drops English stopwords, removes punctuation
    characters (ASCII punctuation plus curly quotes and the em dash, but not
    the hyphen) and tokenizes the remainder with NLTK.

    Args:
        text: Raw article text.

    Returns:
        List of token strings.
    """
    stop_set = set(nltk.corpus.stopwords.words('english'))
    # The hyphen is kept so hyphenated words survive as single tokens.
    punct_chars = (set(string.punctuation) | {'“','”','—','’','‘'}) - {'-'}

    kept_words = [word for word in text.lower().split() if word not in stop_set]
    no_punct = ("").join([ch for ch in (" ").join(kept_words) if ch not in punct_chars])
    return nltk.word_tokenize(no_punct)

class LabeledLineSentence(object):
    """Re-iterable view of raw documents as tagged sentences for doc2vec."""

    def __init__(self, doc_list):
        # Raw document texts; each is tokenized lazily during iteration.
        self.doc_list = doc_list

    def __iter__(self):
        # Yield one LabeledSentence per document, tagged with its position.
        position = 0
        for raw_doc in self.doc_list:
            yield LabeledSentence(words = pre_process_doc2vec(raw_doc), tags = [position])
            position += 1
            
# Re-iterable stream of tagged documents built from the raw article text.
it = LabeledLineSentence(list(content_data['content']))

# alpha == min_alpha here, and the loop below lowers both together.
# NOTE(review): `size=` and `model.iter` look like an older gensim API --
# confirm against the installed gensim version.
model = Doc2Vec(size=50, min_count=5, alpha=0.025, min_alpha=0.025, workers=8)
model.build_vocab(it)

# Ten training rounds; each round runs `model.iter` epochs over the corpus.
for epoch in range(10):
    model.train(it, total_examples = model.corpus_count, epochs = model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    
# Persist the trained model, then reload it from disk.
model.save("data/doc2vec.model")
model = Doc2Vec.load("data/doc2vec.model")

# Collect every document vector into one (n_documents x vector_size) frame.
# Stacking once replaces the row-by-row pd.concat, which copied the growing
# frame on every iteration (quadratic in the number of documents).
doc_vectors = np.vstack([model.docvecs[i] for i in range(len(model.docvecs))])
pca_data = pd.DataFrame(doc_vectors, columns = list(range(doc_vectors.shape[1])))

from sklearn.decomposition import PCA
# Project the document vectors onto their first two principal components.
pca = PCA(n_components = 2)
pca_fln = pca.fit_transform(pca_data)
pca_fln = pd.DataFrame(pca_fln)
# Attach each article's news desk (aligned by row index), then rename the
# three columns to col1/col2 (components) and col3 (news desk).
pca_fln['news_desk'] = content_data['news_desk']
pca_fln.columns = ['col1','col2','col3']

# Plot only the ten listed news desks.
pca_fln = pca_fln.loc[pca_fln['col3'].isin(['Politics', 'Business', 'Sports','Culture','World','Health','Technology','Retail','Wealth','Travel']), ]

source = bp.ColumnDataSource.from_df(pca_fln)

from bokeh.palettes import d3
import bokeh
# One colour per news desk present after filtering.
# NOTE(review): the palette is looked up by the number of distinct desks;
# presumably only a limited range of sizes exists -- verify.
palette = d3['Category10'][len(pca_fln['col3'].unique())]
color_map = bokeh.models.CategoricalColorMapper(factors=list(pca_fln['col3'].unique()), palette=palette)

p = bp.figure(title = "Visualization of Article Semantics using PCA (doc2vec)")
# NOTE(review): `legend=` with a field dict may be deprecated in newer bokeh
# releases -- confirm against the installed version.
p.scatter('col1','col2',source = source, color = {'field': 'col3', 'transform': color_map},
          legend = {'field':'col3'}, alpha = 1, size = 10)
p.border_fill_color = "whitesmoke"
p.add_tools(HoverTool(tooltips= [("Article","@col3")]))
p.xaxis.axis_label = 'Dimension 1'
p.yaxis.axis_label = 'Dimension 2'
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
show(p)

In the previous section, an enhanced version of word2vec called doc2vec is implemented, which provides vector representations of documents: the higher the similarity score, the closer two documents are in terms of meaning/context. Since a 50-dimensional vector is generated for every article, PCA is used to reduce the vectors to two dimensions so that a visual representation can be generated. In the above plot, each point is an article, with the colour representing the actual domain of the article. Since points of the same colour are mostly clustered together, doc2vec has done a good job of grouping similar articles. There are some overlaps, such as world-sports and technology-business, which seem plausible. A similar plot is shown in the next section, but using the LDA technique. Though articles from similar domains are clustered together, there is high overlap among the various domains, making it difficult to filter out similar articles.

In [12]:
# LDA based article segregation
# Copy the topic matrix in one step, relabelling the columns 0..num_topics-1
# and renumbering the rows from zero; the row-by-row pd.concat did the same
# but re-copied the growing frame on every iteration.
pca_data = pd.DataFrame(topic_corpus.values, columns = list(range(num_topics)), copy = True)

# Project the topic distributions onto their first two principal components.
pca = PCA(n_components = 2)
pca_fln = pca.fit_transform(pca_data)
pca_fln = pd.DataFrame(pca_fln)
# Attach each article's news desk (aligned by row index), then rename the
# three columns to col1/col2 (components) and col3 (news desk).
pca_fln['news_desk'] = content_data['news_desk']
pca_fln.columns = ['col1','col2','col3']

# Plot only the ten listed news desks.
pca_fln = pca_fln.loc[pca_fln['col3'].isin(['Politics', 'Business', 'Sports','Culture','World','Health','Technology','Retail','Wealth','Travel']), ]

source = bp.ColumnDataSource.from_df(pca_fln)

from bokeh.palettes import d3
import bokeh
# One colour per news desk present after filtering.
# NOTE(review): the palette is looked up by the number of distinct desks;
# presumably only a limited range of sizes exists -- verify.
palette = d3['Category10'][len(pca_fln['col3'].unique())]
color_map = bokeh.models.CategoricalColorMapper(factors=list(pca_fln['col3'].unique()), palette=palette)

p = bp.figure(title = "Visualization of Article Topic Similarity using PCA (LDA)")
# NOTE(review): `legend=` with a field dict may be deprecated in newer bokeh
# releases -- confirm against the installed version.
p.scatter('col1','col2',source = source, color = {'field': 'col3', 'transform': color_map},
          legend = {'field':'col3'}, alpha = 0.8, size = 10)
p.border_fill_color = "whitesmoke"
p.add_tools(HoverTool(tooltips= [("Article","@col3")]))
p.xaxis.axis_label = 'Dimension 1'
p.yaxis.axis_label = 'Dimension 2'
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
show(p)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from bokeh.io import curdoc, show, output_notebook
from bokeh.layouts import column, row
from bokeh.models.widgets import TextInput, Button, Paragraph, Div, RadioButtonGroup
from bokeh.models import CustomJS
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.server.server import Server
from tornado.ioloop import IOLoop
from functools import partial
from datetime import datetime
import webbrowser
import requests
import os

def default_recs():
    """Fetch the most-viewed NY Times articles to use as default recommendations.

    Takes the first ten results of the most-popular endpoint, computes their
    topic distributions with ``query_article_topic`` and writes them to
    ``data/article_topics.csv``.

    Returns:
        Tuple ``(headlines, urls, snippets)`` of three parallel lists.
    """
    endpoint = 'https://api.nytimes.com/svc/mostpopular/v2/mostviewed/all-sections/1.json?api-key=c77ddf1d1b594f76b2773928f324615f'

    payload = requests.get(endpoint).json()
    top_articles = payload['results'][:10]

    headlines = [article['title'] for article in top_articles]
    urls = [article['url'] for article in top_articles]
    snippets = [article['abstract'] for article in top_articles]

    # Persist the topic distributions of the default articles.
    article_topics = query_article_topic(urls)
    article_topics.to_csv('data/article_topics.csv', index = False)

    return (headlines, urls, snippets)

headlines_ls, urls_ls, snippets_ls = default_recs()


def modify_doc(doc):
    
    def reset_pref():
        """Forget the stored preferences of the user named in ``text_user``.

        Drops the user's row from ``data/df.csv`` and deletes
        ``data/user_pref/<name>.csv``. Does nothing when the name box is
        blank, matching the guard used by the other callbacks.
        """
        usr = text_user.value
        if usr == '':
            return

        df = pd.read_csv('data/df.csv')
        df = df.loc[df['name'] != usr,]
        df.to_csv('data/df.csv', index = False)

        # Best effort: the per-user file may never have been created.
        # Only filesystem errors are ignored; anything else should surface.
        try:
            os.remove('data/user_pref/%s.csv' %usr)
        except OSError:
            pass

    def rec_generate():
        """Refresh the article list with recommendations for the current user.

        Does nothing when the name box is blank. For a user that already has
        a row in ``data/df.csv`` the corpus is ranked by

            0.3 * cos_sim (from the per-user CSV) + 0.7 * lda_sim

        where ``lda_sim`` is the cosine similarity between each row of
        ``topic_corpus`` and the user's stored topic vector. The rank is then
        damped by article age in hours and the ten best articles are shown.
        Any other user gets the NY Times most-viewed list.
        """
        if(text_user.value == ''):
            return

        bar.text = "<p style='color:red;font-size:120%;text-align:center'><b>Loading...</b></p>"

        pref_data = pd.read_csv('data/df.csv')
        usr = text_user.value
        user_list = list(pref_data['name'].values)

        def get_headline_and_snippet(url):
            """Download `url` once and return ``(headline, snippet)``.

            The headline is the text of the first <h1>; the snippet is the
            first <p> longer than 150 characters. Placeholders are returned
            when the page has neither, so callers always receive strings.
            """
            r = requests.get(url)
            soup = BeautifulSoup(r.text, 'html.parser')

            h1_tags = soup.findAll('h1')
            headline = h1_tags[0].text if h1_tags else "No Headline Available"

            snippet = "No Snippet Available"
            for para in soup.findAll('p'):
                if len(para.text) > 150:
                    snippet = para.text
                    break
            return headline, snippet

        if(usr in user_list):
            usr_data = pref_data.loc[pref_data['name'] == usr,]
            # Everything after the leading `name` column is the topic vector.
            usr_arr = usr_data.iloc[0].values[1:]

            from sklearn.metrics.pairwise import cosine_similarity
            # One vectorised call: similarity of every corpus row to the user.
            score = cosine_similarity(topic_corpus.values, usr_arr.reshape(1,-1))[:, 0]

            id_score = pd.DataFrame(list(zip(list(content_data['_id']), score)), columns = ['doc_id','lda_sim'])
            doc_data = pd.read_csv('data/user_pref/%s.csv' % usr)
            similarity_df = pd.merge(left = doc_data[['doc_id','cos_sim']], right = id_score, how = 'left', on = 'doc_id')

            # NOTE(review): `age` is aligned to similarity_df by index, and the
            # positions in `sorted_index` are later used to index content_data.
            # Both assume the per-user CSV keeps content_data's row order -- confirm.
            similarity_df['age'] = content_data['pub_date'].apply(lambda x:
                                        datetime.today()-datetime.strptime(x[:19], '%Y-%m-%dT%H:%M:%S')).apply(lambda y:
                                        int(y.total_seconds()/3600))
            similarity_df['age_string'] = similarity_df['age'].apply(lambda x:
                                                              str(int(x/24))+' day(s)' if x>24 else str(x)+' hour(s)')
            similarity_df['rank'] = 0.3*similarity_df['cos_sim']+0.7*similarity_df['lda_sim']
            # Older articles are gently penalised: divide by sqrt(1.00001 ** age_in_hours).
            churn_rank = list((1000*similarity_df['rank'])/(np.sqrt(1.00001**similarity_df['age'])))
            sorted_index = sorted(range(len(churn_rank)), key=lambda k: churn_rank[k], reverse = True)[:10]

            url = [content_data['web_url'][i] for i in sorted_index]
            time_stamp = [similarity_df['age_string'][i] for i in sorted_index]

            # A single download per article yields both headline and snippet.
            pages = [get_headline_and_snippet(u) for u in url]
            hdls = [headline for headline, _ in pages]
            snipps = ['['+age+' old] '+snippet for age, (_, snippet) in zip(time_stamp, pages)]
            ui_fill(hdls, snipps, url)
        else:
            # No stored profile yet: fall back to the most-viewed articles.
            base_url = 'https://api.nytimes.com/svc/mostpopular/v2/mostviewed/all-sections/1.json?api-key=c77ddf1d1b594f76b2773928f324615f'

            r = requests.get(base_url)
            article_meta_data = r.json()['results'][:10]

            headlines = [artc['title'] for artc in article_meta_data]
            urls = [artc['url'] for artc in article_meta_data]
            snippets = [artc['abstract'] for artc in article_meta_data]

            ui_fill(headlines, snippets, urls)


    def on_value_change(attr, old, new, foo):
        """Record a 'Less' / 'More' rating for the article shown in row `foo`.

        Registered as the ``'active'`` change callback of each rating widget,
        with `foo` bound to the row index. `old` and `new` are the previous
        and current ``active`` values of that widget.

        Two stores are updated for the current user:
          * ``data/df.csv``               -- per-topic preference totals
          * ``data/user_pref/<name>.csv`` -- running average similarity of
            every corpus document to the articles rated so far

        Nothing happens when the name box is blank or the widget was reset
        to "no selection" (``new == -1``).
        """
        if(text_user.value == '' or new == -1):
            return
        
        # Building user preference based on liked article content
        # Widget state -> score: 0 ('Less') -> -1, 1 ('More') -> +1, -1 (none) -> 0.
        scale = {0:-1,-1:0,1:1}
        pref_data = pd.read_csv('data/df.csv')
        user_list = list(pref_data['name'].values)
        usr = text_user.value

        if(usr not in user_list):
            # Adding user profile to the topic Similarity Data
            # (a new row of zeros, one column per topic)
            df = pd.DataFrame(columns = ['name'] + [str(i) for i in range(num_topics)])
            df.loc[0] = [usr] + [0 for i in range(num_topics)]
            pref_data = pd.concat([pref_data, df], axis = 0)
            pref_data = pref_data.fillna(0)
            pref_data.to_csv('data/df.csv', index = False)

            # Adding user profile to the semantic similarity Data
            # (an empty per-user file with just the header row)
            df_sem = pd.DataFrame(columns = ['doc_id','cos_sim','count'])
            df_sem.to_csv('data/user_pref/%s.csv' %usr, index = False)

        article_topics = pd.read_csv('data/article_topics.csv')
        pref_data = pd.read_csv('data/df.csv')
        # Apply only the delta, so switching Less -> More moves the score by +2
        # instead of stacking a second vote on top of the first.
        pref_change = scale[new] - scale[old]
        for i in range(num_topics):
            pref_data.loc[pref_data['name'] == usr, str(i)] += pref_change*article_topics.iloc[foo, i]
        pref_data.to_csv('data/df.csv', index = False)

        # Building document similarity data utilizing semantics
        # NOTE(review): `model` looks like a doc2vec-style model (infer_vector /
        # docvecs) defined elsewhere in the file -- confirm.
        docvec = model.infer_vector(pre_process_doc2vec(extract_content(urls_ls[foo])))
        doc_sim = []
        for i in range(len(model.docvecs)):
            x = cosine_similarity(model.docvecs[i].reshape(1,-1), docvec.reshape(1,-1))
            doc_sim.append(x[0][0])
        # Fresh observation: one similarity per corpus document, each with count 1.
        # NOTE(review): column_stack yields a single-dtype array, so if `_id` holds
        # strings the numeric columns become strings too -- presumably why the
        # `_y` columns are cast with astype(float) below.
        doc_data = pd.DataFrame(np.column_stack([list(content_data['_id']), doc_sim, 
                                                 [1 for i in range(len(model.docvecs))]]), 
                                columns=['doc_id', 'cos_sim', 'count'])

        # Outer-merge stored values (`_x`) with the fresh observation (`_y`).
        doc_data_tmp = pd.read_csv('data/user_pref/%s.csv' %usr)
        doc_data = pd.merge(left = doc_data_tmp, right = doc_data, on = 'doc_id', how = 'outer')
        doc_data = doc_data.drop_duplicates('doc_id')
        doc_data = doc_data.loc[pd.notnull(doc_data['doc_id']),]
        # Keep only documents that received a fresh similarity in this call.
        doc_data = doc_data.loc[pd.notnull(doc_data['cos_sim_y']),]

        # Case 1: nothing stored for this document yet -> take the fresh values.
        doc_data['cos_sim'] = np.where(pd.isnull(doc_data['cos_sim_x']) & pd.isnull(doc_data['count_x']), 
                                       doc_data['cos_sim_y'], doc_data['cos_sim_x'])
        doc_data['count'] = np.where(pd.isnull(doc_data['cos_sim_x']) & pd.isnull(doc_data['count_x']), 
                                       doc_data['count_y'], doc_data['count_x'])

        # Case 2: a stored value exists -> count-weighted running average,
        # (cos_x*count_x + cos_y*count_y) / (count_x + count_y), and summed counts.
        doc_data['cos_sim'] = np.where(pd.notnull(doc_data['cos_sim_x']) & pd.notnull(doc_data['count_x']), 
                                       (doc_data['cos_sim_x']*doc_data['count_x']+
                                        doc_data.fillna(0)['cos_sim_y'].astype(float)*doc_data.fillna(0)['count_y'].astype(float))/
                                       (doc_data.fillna(0)['count_x'].astype(float) + doc_data.fillna(0)['count_y'].astype(float)),
                                       doc_data['cos_sim'])
        doc_data['count'] = np.where(pd.notnull(doc_data['cos_sim_x']) & pd.notnull(doc_data['count_x']), 
                                       doc_data['count_x'].astype(float)+doc_data.fillna(0)['count_y'].astype(float), doc_data['count'])

        # Drop the merge scaffolding and persist the updated per-user table.
        doc_data = doc_data.drop(['cos_sim_x','cos_sim_y','count_x','count_y'], axis = 1)
        doc_data.to_csv('data/user_pref/%s.csv' %usr, index = False)


    def ui_fill(headlines, snippets, urls):
        """Show a new set of articles in the existing widget rows.

        Passes `urls` to ``query_article_topic`` and stores the result in
        ``data/article_topics.csv``, clears the "Loading..." banner, rewrites
        every row's headline and snippet, resets every rating to "no
        selection", and finally publishes `urls` as the module-level
        ``urls_ls`` read by the click and rating callbacks.

        The number of rows is fixed when the UI is built. Results beyond
        that number are not displayed, and rows without a result are blanked
        so they never keep showing an article from a previous list.

        Args:
            headlines: headline strings, one per article.
            snippets: snippet strings, parallel to `headlines`.
            urls: article URLs, parallel to `headlines`.
        """
        global urls_ls

        article_topics = query_article_topic(urls)
        article_topics.to_csv('data/article_topics.csv', index = False)

        bar.text = "<p></p>"

        articles = list(zip(headlines, snippets))
        # Iterate over the widgets, not the results, so a result list that is
        # longer or shorter than the widget list can never raise IndexError.
        for i, (hl_button, sn_paragraph, rating) in enumerate(zip(items_hl, items_sn, items_rate)):
            if i < len(articles):
                hl_button.label, sn_paragraph.text = articles[i]
            else:
                hl_button.label = ''
                sn_paragraph.text = ''
            rating.active = -1

        urls_ls = urls


    def onclick():
        """Search NY Times articles for the text in ``text_query`` and show them.

        Does nothing when the query box is blank. Otherwise shows the
        "Loading..." banner, requests page 0 of the Article Search API and
        hands the ``headline.main``, ``snippet`` and ``web_url`` fields of
        every returned document to ``ui_fill``.
        """
        if(text_query.value == ''):
            return

        bar.text = "<p style='color:red;font-size:120%;text-align:center'><b>Loading...</b></p>"

        base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=c77ddf1d1b594f76b2773928f324615f'

        # Let requests build the query string: the search text is user input
        # and must be percent-encoded, otherwise characters such as '&' or '#'
        # truncate the query or inject extra parameters.
        r = requests.get(base_url, params = {'q': str(text_query.value), 'page': 0})
        article_meta_data = r.json()['response']['docs']

        headlines = [artc['headline']['main'] for artc in article_meta_data]
        urls = [artc['web_url'] for artc in article_meta_data]
        snippets = [artc['snippet'] for artc in article_meta_data]

        ui_fill(headlines, snippets, urls)



    # ---- Widget construction for the recommender page ----
    # The names created here are captured by the nested callbacks defined
    # earlier in this function, so they must not be renamed.

    # NOTE(review): `output` is not referenced again in this function.
    output = Paragraph()
    text_query = TextInput(value = "", width=220, height=10, title = "Enter article query")

    # Browser-side check: pop an alert when the search box is empty.
    get_alert2 = CustomJS(args=dict(text_query=text_query), code = """

    var str = text_query.value;
    if (str == ""){
        alert("Input cannot be blank")
    }
    """
                        )

    button_query = Button(label = 'Search articles', width=220, 
                          height=10, button_type='success', callback = get_alert2)
    
    # Small spacer reused in several layouts below.
    empty_div = Div(text="", width=70, height=1)
    query_grid = row(text_query, column(empty_div, button_query), name = 'query_grid')
    
    text_user = TextInput(placeholder='Enter your name', value = 'saket', width = 340)
    username = Div(text="Username:", width=70, height=10)

    # Browser-side check shared by both user buttons and every rating widget:
    # when the name box is empty, reset the triggering widget's `active` to -1
    # (both branches of the inner if/else do the same) and pop an alert.
    get_alert = CustomJS(args=dict(text_user=text_user), code = """
    
    var usr = text_user.value;
    var val = cb_obj.active
    if (usr == ""){
        if (val == 1) 
            {cb_obj.active = -1;} 
        else 
            {cb_obj.active = -1;}
        alert("Enter your name before giving preferences")
    }"""
                        )
    
    rec_button = Button(label = 'Get recommendations', width=110, 
                        height=5, button_type='warning', callback = get_alert)
    
    reset_button = Button(label = 'Reset Preferences', width = 150, 
                          height = 5, button_type = 'warning', callback = get_alert)
    
    # Layout: [query box + search button] | spacer | [name box over the two user buttons]
    user_grid1 = row(column(empty_div, username), text_user, name = 'user_grid1')
    user_grid2 = row(row(Div(text="", width = 70), rec_button), 
                     row(Div(text="", width = 5), reset_button),
                     name = 'user_grid2')
    user_grid = column(user_grid1, user_grid2, name = 'user_grid')
    input_grid = row(query_grid, Div(text="", width = 60), user_grid, name = 'input_grid')
    # Placeholder occupying children[2] and children[3] of `grid`; both slots
    # are overwritten further down once the banner and article rows exist.
    listing_grid = empty_div
    grid = column(input_grid,
                  Div(text=
                      "<p style='color:#808080'>Rate the article 'Less' or 'More' to personalize recommendations</p>",
                        height = 20), 
                  listing_grid,
                  listing_grid,
                  name = 'grid')
    
    # One headline button, one snippet paragraph and one Less/More selector per
    # default article; the row count is fixed by len(headlines_ls) at build time.
    items_hl = [Button(label=w, width = 800, height = 40, button_type = 'primary') 
                       for w in headlines_ls]

    items_sn = [Paragraph(text=w, width = 800, height = 61, style = {'background-color':'#F2F3F4'}) 
                       for w in snippets_ls]

    # active = -1 means "no rating selected".
    items_rate = [RadioButtonGroup(labels=['Less', 'More'], active = -1, callback = get_alert) 
                         for i in range(len(headlines_ls))]

    # Each article row: headline button beside its rating selector, snippet underneath.
    items = column([column(row(items_hl[i], items_rate[i]),
                            items_sn[i]) for i in range(len(headlines_ls))])
        
    def redirect_link(foo):
        """Open the article currently listed at row index `foo` in a new browser tab."""
        target_url = urls_ls[foo]
        webbrowser.open_new_tab(target_url)

    # Headline buttons open their article; `foo` pins the row index per button.
    for row_idx, headline_button in enumerate(items_hl):
        open_article = partial(redirect_link, foo = row_idx)
        headline_button.on_click(open_article)

    # Rating selectors record a preference whenever their `active` value changes.
    for row_idx, rating_group in enumerate(items_rate):
        record_rating = partial(on_value_change, foo = row_idx)
        rating_group.on_change('active', record_rating)

    # Status banner ("Loading...") centred between two fixed-width spacers.
    bar = Div(text = "<p></p>", height = 20, width = 700)
    load_grid = row(
        Div(text="", width=100, height=1),
        bar,
        Div(text="", width=100, height=1),
    )

    # Swap the two placeholder slots of `grid` for the banner and the article rows.
    grid.children[2] = load_grid
    grid.children[3] = items

    doc.add_root(grid)

    # Server-side handlers for the three action buttons.
    button_query.on_click(onclick)
    rec_button.on_click(rec_generate)
    reset_button.on_click(reset_pref)

# Render Bokeh output inline in the notebook, then pass the `modify_doc`
# function itself (not a layout) to show(); modify_doc adds its widgets to
# the document it receives.
# NOTE(review): presumably this runs as an embedded Bokeh server app so the
# Python callbacks registered in modify_doc can fire -- confirm against the
# Bokeh version in use.
output_notebook()
show(modify_doc)