In [1]:
import simplejson as json
import urllib2
import pandas as pd
import nltk
import re
import numpy as np
import nltk
import string
import time
import pickle
import os

from pandas.io.json import json_normalize
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Returns URLs in the range of years given
def get_url(begin_yr, end_yr):
    apiURLroot = 'http://api.nytimes.com/svc/archive/v1/'
#     apiUrl='http://api.nytimes.com/svc/archive/v1/1945/1.json'
    key='api-key=ab651f166e104ed9b22f2cf34bdcdc9b'  # replace with api-kek here
    for year in range(begin_yr, end_yr + 1):
        for month in range(1,13):
            apiUrl = apiURLroot+str(year)+'/'+str(month)+'.json'
            link=[apiUrl, key]
            ReqUrl='?'.join(link)
            yield year,month, ReqUrl

In [3]:
def parse_news_content(df):
    df_news_json_body = json_normalize(df['response.docs'][0])
    df_news_frnt_pg = df_news_json_body[df_news_json_body['type_of_material'] == 'Front Page'][df_news_json_body['type_of_material'] == 'Front Page']
    df_news_body = df_news_frnt_pg[['pub_date','lead_paragraph']]
    df_news_body['para_len'] = df_news_body['lead_paragraph'].apply(lambda(x) : len(str(x)))
    df_news_body = df_news_body[df_news_body['para_len'] > 100]
    df_news_body['news_body'] =  pd.DataFrame(df_news_body['lead_paragraph'].apply(lambda x : pre_process_text(x)))
    return df_news_body

In [4]:
def save_to_pickle(year, month, df):
    if not os.path.exists('news_pickle'):
        os.makedirs('news_pickle')
    file_name = 'news_pickle/' + str(year) + '_' + str(month) + '_' + 'news.pickle'
    with open(file_name,'wb') as f:
        pickle.dump(df, f,pickle.HIGHEST_PROTOCOL)
        f.close()

In [5]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

with open('exception_words.txt','r') as f:
    exception_words = f.read()
    f.close()
exception_words = re.split('\n',exception_words)
# print(exception_words)
def pre_process_text(text):
#     remove punct
    text_no_punct = "".join([char for char in text if char not in string.punctuation])
#     tokenize
    text_tokens = re.split('\W+',text_no_punct.lower())
#     remove stop words
    text_no_stop_w = [word for word in text_tokens if word not in stopwords ]
#     clean text
    text_remv_excp = [word for word in text_no_stop_w if word not in exception_words ]
    text_clean = [word for word in text_remv_excp if word != 'nan']
#     Lemmatize
    text_lemmatized =  " ".join([wn.lemmatize(word) for word in text_clean])
    return text_lemmatized

In [6]:
# Use JSON to download the archive for one month as a pandas data-fram
def download_content(from_year=1941,to_year=1945):
    for year, month, url in get_url(from_year,to_year):
        
        jstr = urllib2.urlopen(url).read()
        ns_js = json.loads(jstr)
        
        df_news_arch = json_normalize(ns_js)
        
        df_vect = parse_news_content(df_news_arch)
#         Save only the post-processed body of text
        save_to_pickle(year, month, df_vect['news_body'])
        time.sleep(5)
#         break

In [8]:
def execute_lda(from_year=1941,to_year=1945):
    df_by_year = None
    for year in range(from_year, to_year + 1):
        for month in range(1,13):
            v_file = 'news_pickle/' + str(year) + '_' + str(month) + '_news.pickle'
            data = None
            no_of_words = 10
            with open(v_file,'rb') as f:
                data = pickle.load(f)
                f.close()
            df_data = pd.DataFrame(data)
            
            vector_tf = TfidfVectorizer(smooth_idf=True)
            X_tf = vector_tf.fit_transform(df_data['news_body'])
            
            lda = LatentDirichletAllocation(
                                n_components=1, 
                                #learning_decay=.6,
                                max_iter=5,
                                learning_method='batch',
                                #learning_offset=30.,
                                random_state=99
                                )
            lda.fit(X_tf)
            
            for topic_idx, topic in enumerate(lda.components_):
                topics =  " ".join([vector_tf.get_feature_names()[i]
                for i in topic.argsort()[:-no_of_words - 1:-1]])
                
            
            if df_by_year is None:
                df_by_year =  pd.DataFrame(columns = ['year_month','topics'])
                df_by_year = df_by_year.append ({ 'year_month' : str(year) + '_' + str(month), 'topics' : topics}, ignore_index=True)
            else:
                df_by_year = df_by_year.append ({ 'year_month' : str(year) + '_' + str(month), 'topics' : topics}, ignore_index=True)
    return df_by_year
# execute_lda(1950, 1950).head()

In [None]:
download_content(1941, 1950)

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [None]:
df_by_year = execute_lda(1941, 1950)

In [None]:
pd.set_option('display.max_colwidth',100)
df_by_year.head()