# Code for running topic modeling on the NY TIMES most shared articles on Facebook in the last 7 days

# In [2]:
import streamlit as st
import string
import requests
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import nltk
import pymongo
from pymongo import MongoClient
from datetime import datetime, timedelta

# Shared MongoDB client for the whole script. No host or port is passed, so
# pymongo's default connection settings apply.
client = MongoClient()



# In [None]:
# Database and collection that hold the stored articles.
db = client['nytimes']
shared = db['shared']

# Page header: logo, title, and the control for the number of topics.
st.image('New-York-Times-Logo8x6_0.png.crdownload')
st.title('Topic Modeling')
num_topics = st.number_input(
    'Number of Topics',
    min_value=1,
    max_value=200,
    value=10,
)

# Query window: from midnight six days ago (inclusive) up to midnight
# tomorrow (exclusive), i.e. today plus the six preceding calendar days.
seven_days_ago = (datetime.today() - timedelta(days=6)).replace(
    hour=0, minute=0, second=0, microsecond=0
)
tomorrow = (datetime.today() + timedelta(days=1)).replace(
    hour=0, minute=0, second=0, microsecond=0
)

# Pull every article whose 'updated' timestamp falls inside the window,
# keeping only the listed fields (and dropping Mongo's '_id').
extraction = list(
    shared.find(
        {'updated': {'$lt': tomorrow, '$gte': seven_days_ago}},
        {
            '_id': 0,
            'abstract': 1,
            'published_date': 1,
            'title': 1,
            'adx_keywords': 1,
            'byline': 1,
            'section': 1,
        },
    )
)

# function for transforming articles into dataframe and text preprocessing

def transformation(extraction):
    """Turn raw article records into a Series of cleaned abstracts.

    Args:
        extraction: Iterable of article records (dicts) that may carry an
            'abstract' key; anything accepted by ``pd.DataFrame`` works.

    Returns:
        A pandas Series named 'abstract' with one entry per distinct,
        non-missing abstract. Each entry has every token containing a digit
        replaced by a space, is lower-cased, and has ASCII punctuation
        removed. An empty Series is returned when there are no abstracts.
    """
    df = pd.DataFrame(extraction)

    # An empty result set (or records without abstracts) yields a frame with
    # no 'abstract' column; return an empty Series instead of raising KeyError.
    if 'abstract' not in df.columns:
        return pd.Series([], dtype=object, name='abstract')

    # Missing abstracts would make re.sub raise TypeError, so drop them
    # before removing duplicates (the first occurrence is kept).
    abstracts = df['abstract'].dropna().drop_duplicates()

    # Raw string: '\w' and '\d' are invalid escapes in a normal str literal.
    digit_token = re.compile(r'\w*\d\w*')
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    def clean(text):
        # Tokens with digits become a space; punctuation is deleted outright.
        without_digits = digit_token.sub(' ', text)
        return punctuation.sub('', without_digits.lower())

    return abstracts.map(clean)

# Cleaned, de-duplicated abstracts that feed the vectorizer.
clean_abstract = transformation(extraction)

# Extra filler words to filter out on top of NLTK's English stop words.
newStopWords = [
    'much', 'people', 'one', 'thing', 'always', 'every', 'everyone', 'yes',
    'know', 'years', 'stop', 'let', 'need', 'something', 'find', 'others',
    'enough', 'seems', 'often', 'never', 'still', 'like', 'say', 'hope',
    'small', 'almost', 'take', 'important', 'said', 'turned', 'making',
    'like', 'also', 'need', 'get', 'way', 'got', 'came', 'would', 'could',
]
stopwords = nltk.corpus.stopwords.words('english') + newStopWords

def tfidf(docs, stopwords):
    """Build a TF-IDF document-term matrix for *docs*.

    Args:
        docs: Iterable of text documents to vectorize.
        stopwords: Words to exclude; passed to ``TfidfVectorizer`` as
            ``stop_words``.

    Returns:
        A ``(word_matrix, vocab)`` tuple: the matrix produced by
        ``fit_transform`` and the list of feature names, where ``vocab[i]``
        names column ``i`` of the matrix.
    """
    tf = TfidfVectorizer(stop_words=stopwords)
    word_matrix = tf.fit_transform(docs)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on older releases.
    if hasattr(tf, 'get_feature_names_out'):
        # The replacement returns an array; keep returning a list as before.
        vocab = list(tf.get_feature_names_out())
    else:
        vocab = tf.get_feature_names()
    return word_matrix, vocab

# Vectorize the abstracts and fit an NMF model with the requested number of
# topics.
word_matrix, vocab = tfidf(clean_abstract, stopwords)
nmf = NMF(n_components=num_topics)
fitted_nmf = nmf.fit(word_matrix)


st.write('Topics of the Week')
button = st.button('Analyze')
if button:
    # No custom labels are set here, so each topic is shown by its index.
    topic_names = None
    for topic_idx, weights in enumerate(fitted_nmf.components_):
        if topic_names and topic_names[topic_idx]:
            st.write("\nTopic: '", topic_names[topic_idx], "'")
        else:
            st.write("\nTopic ", topic_idx)
        # Indices of the ten highest-weighted terms, strongest first.
        strongest = weights.argsort()[::-1][:10]
        st.write(", ".join([vocab[i] for i in strongest]))
    