In [17]:
%%time

# article retrieval
import json
import requests
import os
from os import mkdir
from os.path import join, exists
from datetime import date, timedelta

# Query configuration for the Guardian content search endpoint.
# NOTE: retrieve_key() is defined in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
MY_API_KEY = retrieve_key()
# HTTPS, so the 'api-key' query parameter is not sent in clear text.
API_ENDPOINT = 'https://content.guardianapis.com/search'
# Template of query parameters; the date window and the page number are
# filled in for each request by retrieve_articles().
my_params = {
    'from-date': "",
    'to-date': "",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 50,
    'production-office': 'AUS',
    'lang': 'en',
    'page': 1,
    'api-key': MY_API_KEY
}

def retrieve_articles(start_date, end_date, articles_dir='articles', timeout=30):
    """Download search results day by day and cache each day as a JSON file.

    For every day in the inclusive range ``start_date``..``end_date`` a file
    ``<articles_dir>/<YYYY-MM-DD>.json`` is written, unless it already exists
    (days that are already cached are not downloaded again).

    Parameters
    ----------
    start_date, end_date : datetime.date
        First and last day to fetch (both included). An empty range
        (``end_date`` before ``start_date``) downloads nothing.
    articles_dir : str, optional
        Directory holding the per-day JSON files; created when missing.
    timeout : float, optional
        Timeout in seconds passed to every HTTP request.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status code.
    """
    # the original relied on the directory already existing; open() below
    # fails with FileNotFoundError on a fresh checkout otherwise
    os.makedirs(articles_dir, exist_ok=True)

    dayrange = range((end_date - start_date).days + 1)

    for daycount in dayrange:
        dt = start_date + timedelta(days=daycount)
        datestr = dt.strftime('%Y-%m-%d')
        fname = join(articles_dir, datestr + '.json')

        if exists(fname):
            # already cached
            continue

        print("Downloading", datestr)
        # work on a copy so the module-level query template is not mutated
        params = dict(my_params)
        params['from-date'] = datestr
        params['to-date'] = datestr

        all_results = []
        current_page = 1
        total_pages = 1  # corrected after the first response

        while current_page <= total_pages:
            print("...page", current_page)
            params['page'] = current_page
            resp = requests.get(API_ENDPOINT, params=params, timeout=timeout)
            # fail loudly on HTTP errors instead of a KeyError further down
            resp.raise_for_status()
            payload = resp.json()['response']
            all_results.extend(payload['results'])
            total_pages = payload['pages']
            current_page += 1

        # the file is only written once every page of the day was fetched
        with open(fname, 'w') as f:
            print("Writing to", fname)
            # indented for readability of the cached files
            json.dump(all_results, f, indent=2)

Downloading 2017-03-01
...page 1
Writing to articles/2017-03-01.json
Downloading 2017-03-02
...page 1
Writing to articles/2017-03-02.json
CPU times: user 42.6 ms, sys: 35.5 ms, total: 78.1 ms
Wall time: 3.91 s


In [16]:
import os
import pandas as pd
import json

def flatten(article):
    """Merge an article's nested ``'fields'`` dict into its top level.

    Returns a new dict containing the entries of ``article['fields']``
    overlaid with every other top-level entry of ``article`` (top-level keys
    win on a name clash). The input dict is left untouched, and a missing or
    empty ``'fields'`` entry is treated as "no nested fields".

    Returns ``None`` when ``article`` is not a dict.
    """
    if not isinstance(article, dict):
        return None

    # start from the nested fields, then let top-level entries override them
    flat = dict(article.get('fields') or {})
    flat.update((key, value) for key, value in article.items() if key != 'fields')
    return flat

def read_in(articles_dir='./articles'):
    """Load every article file in ``articles_dir`` into a single DataFrame.

    Each regular file in the directory is parsed as JSON, every entry is
    passed through ``flatten`` and the results are collected as rows.
    Files are read in sorted name order so the row order is reproducible.

    Loading stays best-effort: a file that cannot be read or parsed is
    skipped, but the reason is printed instead of being silently swallowed.
    A file is taken completely or not at all.

    Parameters
    ----------
    articles_dir : str, optional
        Directory containing the per-day JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per flattened article.
    """
    articles = []

    for each in sorted(os.listdir(articles_dir)):
        path = os.path.join(articles_dir, each)
        if not os.path.isfile(path):
            # sub-directories (e.g. checkpoint folders) are not article files
            continue
        try:
            with open(path, encoding='utf-8') as json_data:
                flattened = [flatten(article) for article in json.load(json_data)]
        except (OSError, ValueError, KeyError, TypeError) as err:
            # ValueError covers both JSON syntax errors and decoding errors
            print("Skipping", path, "-", repr(err))
            continue
        # flatten() yields None for non-dict entries; those are not rows
        articles.extend(row for row in flattened if row is not None)

    return pd.DataFrame(articles)


def scrub(df):
    """Reduce the raw article frame to the analysed columns and fix dtypes.

    Steps:
    1. keep only the columns listed below (others are dropped),
    2. drop rows in which every kept column is missing,
    3. parse the two publication timestamps,
    4. cast ``wordcount`` and ``charCount`` to ``int``,
    5. drop rows whose values are all ``0``.

    The input frame is not modified; a new frame is returned.
    """
    keep = ['type', 'sectionId', 'webPublicationDate', 'webTitle',
            'trailText', 'byline', 'wordcount', 'firstPublicationDate',
            'bodyText', 'charCount']

    # The original called df.dropna() without using the result (a no-op).
    # Dropping all-empty rows has to happen *before* the int casts, which
    # cannot handle missing values.
    df = df.filter(keep).dropna(how='all').copy()

    # cast date/time types
    df['webPublicationDate'] = pd.to_datetime(df['webPublicationDate'])
    df['firstPublicationDate'] = pd.to_datetime(df['firstPublicationDate'])

    # numeric meta information for each article
    df['wordcount'] = df['wordcount'].astype(int)
    df['charCount'] = df['charCount'].astype(int)

    # keep rows that have at least one non-zero value
    df = df.loc[(df != 0).any(axis=1)]

    return df

def export_frame(file_name, df):
    """Write ``df`` to ``file_name`` as a CSV file, leaving out the index column."""
    df.to_csv(path_or_buf=file_name, index=False)

def write_json(file_name, results):
    """Serialize ``results`` as JSON into ``file_name``.

    Values that the ``json`` module cannot encode natively but that expose a
    ``tolist()`` method (numpy scalars and arrays, as produced by pandas
    aggregations) are converted to plain Python objects first.

    Raises
    ------
    TypeError
        When ``results`` contains a value that cannot be converted.
    """
    def _to_builtin(value):
        # numpy scalars/arrays -> int/float/list
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(
            "Object of type {} is not JSON serializable".format(type(value).__name__))

    # serialize first so a failure does not leave a truncated file behind
    payload = json.dumps(results, default=_to_builtin)
    with open(file_name, 'w') as f:
        f.write(payload)

def retrieve_key(path='./keys/guardian_key.txt'):
    """Read the API key from a text file.

    Returns the first non-blank line of ``path`` with surrounding whitespace
    (including the trailing newline) removed, so the value can be used
    directly as a query parameter.

    Raises
    ------
    ValueError
        When the file contains no key.
    """
    with open(path) as f:
        for line in f:
            key = line.strip()
            if key:
                return key
    raise ValueError("no API key found in {!r}".format(path))

'<redacted-api-key>'

In [11]:
#NLP analysis
import string, time
import pandas as pd

import nltk
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

# #local modules
# from retrieve import *
# from clean import *

# shared NLP resources: punctuation set, lemmatizer and stopword set
punct = set(string.punctuation)
lemmatizer = WordNetLemmatizer()

# extra tokens to ignore, on top of NLTK's English stopword list
custom_stop_words = [
    '–', '\u2019', 'u', '\u201d', '\u201d.', '\u201c',
    'say', 'saying', 'sayings', 'says', 'us', 'un',
    '.\"', 'would', 'let', '.”', 'said', ',”',
]
stopwords = set(sw.words('english')).union(custom_stop_words)

def lemmatize(token, tag):
    """Return the lemma of ``token`` for the part of speech given by ``tag``.

    Only the first character of ``tag`` is inspected: ``N``, ``V``, ``R`` and
    ``J`` select the noun, verb, adverb and adjective constants of
    ``wordnet``; any other initial is treated as a noun.
    """
    pos_by_initial = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }
    initial = tag[0]
    if initial in pos_by_initial:
        wordnet_pos = pos_by_initial[initial]
    else:
        wordnet_pos = wordnet.NOUN

    return lemmatizer.lemmatize(token, wordnet_pos)

def cab_tokenizer(document):
    """Turn ``document`` into a flat list of lemmatized tokens.

    The text is split into sentences, each sentence into POS-tagged tokens.
    Every token is lower-cased and stripped of surrounding whitespace, then
    of ``_`` and finally of ``*``. Tokens made up only of punctuation
    (including tokens left empty by the stripping) and stopwords are dropped;
    the remaining ones are lemmatized using their POS tag.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        tagged = pos_tag(wordpunct_tokenize(sentence))
        for raw_token, pos in tagged:
            # normalisation order matters: whitespace, then '_', then '*'
            word = raw_token.lower().strip().strip('_').strip('*')

            only_punctuation = all(ch in punct for ch in word)
            if only_punctuation or word in stopwords:
                continue

            lemmas.append(lemmatize(word, pos))

    return lemmas

def format_topics(model,feature_names,no_top_words,time_elapsed):
    """Summarise a fitted decomposition model as a plain dict.

    For every row of ``model.components_`` the ``no_top_words`` features with
    the largest weights are looked up in ``feature_names`` and stored under
    ``"Topic <index>"``. ``time_elapsed`` is stored under ``'time_sec'``.
    """
    analysis = {
        "Topic {}".format(idx): [
            feature_names[i] for i in (-weights).argsort()[:no_top_words]
        ]
        for idx, weights in enumerate(model.components_)
    }
    analysis['time_sec'] = time_elapsed
    return analysis

def nmf(df,main,topics):
    """Fit an NMF topic model on ``df['bodyText']`` and record its topics.

    The text is vectorized with TF-IDF (uni- and bigrams, tokenized by
    ``cab_tokenizer``), an NMF model with ``topics`` components is fitted and
    the result of ``format_topics`` is stored in ``main['nmf']``. The elapsed
    time covers vectorization and fitting.

    Returns the same ``main`` dict that was passed in.
    """
    st = time.time()
    tfidf_vectorizer = TfidfVectorizer(tokenizer=cab_tokenizer,ngram_range=(1,2),
                                       min_df=0.1, max_df=0.90)
    tfidf = tfidf_vectorizer.fit_transform(df['bodyText'])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
                 or tfidf_vectorizer.get_feature_names)
    tfidf_feature_names = [str(name) for name in get_names()]

    # Non-Negative Matrix Factorization - fit model using tfidf vector
    nmf_kwargs = dict(n_components=topics, random_state=1, l1_ratio=0.5, init='nndsvd')
    try:
        estimator = NMF(alpha=0.1, **nmf_kwargs)
    except TypeError:
        # scikit-learn >= 1.2 no longer accepts `alpha`; alpha_W (with
        # alpha_H defaulting to the same value) is its replacement.
        # NOTE(review): the regularization is scaled differently there, so
        # topics may differ slightly from runs on older releases.
        estimator = NMF(alpha_W=0.1, **nmf_kwargs)
    model = estimator.fit(tfidf)
    et = time.time() - st

    main['nmf'] = format_topics(model, tfidf_feature_names, topics, et)
    return main

def lda(df,main,topics):
    """Fit an LDA topic model on ``df['bodyText']`` and record its topics.

    The text is vectorized into term counts (uni- and bigrams, tokenized by
    ``cab_tokenizer``), a Latent Dirichlet Allocation model with ``topics``
    components is fitted and the result of ``format_topics`` is stored in
    ``main['lda']``. The elapsed time covers vectorization and fitting.

    Returns the same ``main`` dict that was passed in.
    """
    st = time.time()
    tf_vectorizer = CountVectorizer(tokenizer=cab_tokenizer,ngram_range=(1,2),
                                    min_df=0.1, max_df=0.90)
    tf = tf_vectorizer.fit_transform(df['bodyText'])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    get_names = (getattr(tf_vectorizer, 'get_feature_names_out', None)
                 or tf_vectorizer.get_feature_names)
    tf_feature_names = [str(name) for name in get_names()]

    # Latent Dirichlet Allocation - fit the model using term frequency vector
    model = LatentDirichletAllocation(n_components=topics,max_iter=5,learning_method='online',
                                      learning_offset=50,random_state=0).fit(tf)
    et = time.time() - st

    main['lda'] = format_topics(model, tf_feature_names, topics, et)
    return main
    
def lsa(df,main,topics):
    """Fit an LSA topic model on ``df['bodyText']`` and record its topics.

    Truncated SVD applied to a TF-IDF matrix is Latent Semantic Analysis. The
    text is vectorized with TF-IDF (uni- and bigrams, tokenized by
    ``cab_tokenizer``), a ``TruncatedSVD`` with ``topics`` components is
    fitted and the result of ``format_topics`` is stored in ``main['lsa']``.
    The elapsed time covers vectorization and fitting.

    Returns the same ``main`` dict that was passed in.
    """
    st = time.time()
    tfidf_vectorizer = TfidfVectorizer(tokenizer=cab_tokenizer,ngram_range=(1,2),
                                       min_df=0.1, max_df=0.90)
    tfidf = tfidf_vectorizer.fit_transform(df['bodyText'])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
                 or tfidf_vectorizer.get_feature_names)
    tfidf_feature_names = [str(name) for name in get_names()]

    model = TruncatedSVD(n_components=topics, algorithm='randomized', n_iter=5, random_state=42).fit(tfidf)
    et = time.time() - st

    main['lsa'] = format_topics(model, tfidf_feature_names, topics, et)
    return main
    
def descriptive(df, main):
    """Add corpus-level counts for ``df`` to ``main`` and return ``main``.

    Stores the number of rows as ``'articleCount'`` and the column sums of
    ``charCount`` / ``wordcount`` as ``'totalChar'`` / ``'totalWord'``.
    The sums are converted to plain ``int`` because pandas returns numpy
    integers, which the ``json`` module refuses to serialize.
    """
    main['articleCount'] = len(df.index)
    main['totalChar'] = int(df['charCount'].sum())
    main['totalWord'] = int(df['wordcount'].sum())

    return main

def all_analysis(df, label, topics=10):
    """Run the descriptive statistics and all three decompositions on ``df``.

    Parameters
    ----------
    df : DataFrame handed unchanged to ``descriptive``, ``nmf``, ``lda`` and
        ``lsa``.
    label : value stored under ``'name'`` in the result.
    topics : int, optional
        Number of topics requested from each decomposition (default 10,
        the value that used to be hard-coded).

    Returns
    -------
    dict
        The accumulated analysis, in the order name, descriptive stats,
        nmf, lda, lsa.
    """
    main_analysis = {'name': label}
    main_analysis = descriptive(df, main_analysis)
    for decompose in (nmf, lda, lsa):
        main_analysis = decompose(df, main_analysis, topics)
    return main_analysis

#To Do
#segmentation => author, time period, topic tagging

In [12]:
def main():
    """Run the whole pipeline: retrieve, clean, analyse, write results.

    Articles for 2017-03-01 and 2017-03-02 are retrieved, loaded and
    scrubbed; the cleaned corpus goes to ``./output/corpus.csv`` and the
    printed analysis to ``./output/analysis.json``.
    """
    # 1.0 retrieve
    first_day, last_day = date(2017, 3, 1), date(2017, 3, 2)
    retrieve_articles(first_day, last_day)

    # 2.0 clean
    corpus = scrub(read_in())
    export_frame('./output/corpus.csv', corpus)

    # 3.0 analyse
    report = all_analysis(corpus, 'all_articles')
    print(report)

    # 4.0 write results
    write_json('./output/analysis.json', report)
    
# if __name__ == '__main__':
#     main()

main()

{'name': 'all_articles', 'articleCount': 61, 'totalChar': 341483, 'totalWord': 57682, 'nmf': {'Topic 0': ['one', 'go', 'get', 'time', 'work', 'play', 'people', 'year', 'need', 'come'], 'Topic 1': ['labor', 'party', 'suggest', 'nation', 'coalition', 'key', 'majority', 'liberal', 'enough', 'election'], 'Topic 2': ['growth', 'quarter', 'economy', 'price', 'wage', 'gdp', '1', 'figure', 'market', 'rate'], 'Topic 3': ['information', 'release', 'department', 'personal', 'security', 'social', 'detail', 'recipient', 'legal', 'minister'], 'Topic 4': ['18c', 'complaint', 'discrimination', 'section', 'speech', 'committee', 'right', 'act', 'australian', 'liberal'], 'Topic 5': ['government', 'power', 'coal', 'minister', 'project', 'party', 'agreement', 'fund', 'australia', 'policy'], 'Topic 6': ['woman', 'men', 'set', 'executive', 'news', 'figure', '‘', 'appear', 'see', 'pay'], 'Topic 7': ['worker', 'penalty', 'rate', 'cut', 'turnbull', 'pay', 'commission', 'labor', 'minister', 'support'], 'Topic 8'