In [17]:
#NLP analysis
import string
import pandas as pd

import nltk
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans

#local modules
from retrieve import *
from clean import *

# Shared NLP resources used by the tokenizer below:
# a single WordNet lemmatizer instance and the set of ASCII punctuation characters.
lemmatizer = WordNetLemmatizer()
punct = set(string.punctuation)

# Extra stop words layered on top of NLTK's English list: typographic
# quotes/dashes (alone or fused with a full stop or comma) and a few
# low-information words such as the inflections of "say".
custom_stop_words = [
    '–',
    '\u2019',
    'u',
    '\u201d',
    '\u201d.',
    '\u201c',
    'say',
    'saying',
    'sayings',
    'says',
    'us',
    'un',
    '.\"',
    'would',
    'let',
    '.”',
    'said',
    ',”',
]

# Final stop-word set: NLTK English stop words plus the custom entries.
stopwords = set(sw.words('english')).union(custom_stop_words)

def lemmatize(token, tag):
    """Return the WordNet lemma of ``token``.

    ``tag`` is a part-of-speech tag string (e.g. as produced by ``pos_tag``).
    Only its first character is inspected: ``V``, ``R`` and ``J`` select
    verb, adverb and adjective respectively; ``N`` and every other initial
    are treated as a noun.
    """
    initial = tag[0]

    if initial == 'V':
        wordnet_pos = wordnet.VERB
    elif initial == 'R':
        wordnet_pos = wordnet.ADV
    elif initial == 'J':
        wordnet_pos = wordnet.ADJ
    else:
        # 'N' and any unrecognised tag both fall back to noun
        wordnet_pos = wordnet.NOUN

    return lemmatizer.lemmatize(token, wordnet_pos)

def cab_tokenizer(document):
    """Turn ``document`` into a flat list of lemmatized, filtered tokens.

    The text is split into sentences, each sentence is word/punctuation
    tokenized and POS-tagged, and every token is lower-cased and stripped of
    surrounding whitespace, underscores and asterisks (in that order).
    Tokens made up only of punctuation (including tokens left empty by the
    stripping) and tokens found in ``stopwords`` are dropped; the rest are
    lemmatized with their POS tag.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        tagged_words = pos_tag(wordpunct_tokenize(sentence))

        for raw, pos in tagged_words:
            # normalise case, then peel off whitespace, '_' and '*' in turn
            word = raw.lower().strip().strip('_').strip('*')

            # all() over an empty string is True, so empty words are skipped too
            only_punctuation = all(ch in punct for ch in word)
            if only_punctuation or word in stopwords:
                continue

            lemmas.append(lemmatize(word, pos))

    return lemmas

def retrieveTopTDIDF(df, main):
    """Add term-frequency statistics for ``df['bodyText']`` to ``main``.

    A ``CountVectorizer`` (unigrams and bigrams, tokenized by
    ``cab_tokenizer``, keeping terms with 0.2 <= document frequency <= 0.8)
    is fitted on the ``bodyText`` column. Three keys are then written to
    ``main``:

    * ``totalTokens`` -- number of distinct terms kept in the vocabulary
    * ``tokenSum``    -- total number of occurrences of those terms
    * ``topTenTerms`` -- the ten terms with the highest ``(tf, df)``, each a
      dict ``{'term': str, 'tf': int, 'df': int}``

    ``main`` is updated in place and also returned.
    """
    texts = df['bodyText']  # text entries only

    # use count vectorizer to find TF and DF of each term
    count_vec = CountVectorizer(tokenizer=cab_tokenizer,
                                ngram_range=(1, 2), min_df=0.2, max_df=0.8)
    X_count = count_vec.fit_transform(texts)

    # The fitted vocabulary maps each kept term to its column index. Its size
    # is the number of features, which avoids get_feature_names() (removed
    # from recent scikit-learn releases).
    vocabulary = count_vec.vocabulary_
    totalTokens = len(vocabulary)

    # Per-term totals, cast from numpy integers back to python integers so
    # the result prints and serialises as plain ints.
    terms = []
    for term, col in vocabulary.items():
        column = X_count[:, col]
        terms.append({'term': term,
                      'tf': int(column.sum()),
                      'df': int(column.count_nonzero())})

    topTenTerms = sorted(terms, key=lambda k: (k['tf'], k['df']),
                         reverse=True)[:10]

    tokenSum = sum(term['tf'] for term in terms)

    # update main data object
    main.update({'totalTokens': totalTokens,
                 'tokenSum': tokenSum,
                 'topTenTerms': topTenTerms})

    return main

def createKMeans(df, main):
    """Cluster ``df['bodyText']`` with k-means and store the top terms per cluster.

    The texts are vectorized with TF-IDF (unigrams and bigrams, tokenized by
    ``cab_tokenizer``, keeping terms with 0.2 <= document frequency <= 0.8)
    and grouped into 7 clusters. The highest-weighted terms of every cluster
    centre, as formatted by ``visKMeans``, are written to
    ``main['kMeanClusters']``. ``main`` is updated in place and returned.
    """
    tfidf_vec = TfidfVectorizer(tokenizer=cab_tokenizer, ngram_range=(1, 2),
                                min_df=0.2, max_df=0.8)
    X = tfidf_vec.fit_transform(df['bodyText'])

    # n_init is pinned to the historical default (10) so results do not change
    # with scikit-learn releases whose default is 'auto'.
    kmeans = KMeans(n_clusters=7, random_state=42, n_init=10).fit(X)

    # Rebuild the feature-name list (terms ordered by column index) from the
    # fitted vocabulary; this avoids get_feature_names(), which was removed
    # from recent scikit-learn releases.
    vocabulary = tfidf_vec.vocabulary_
    terms = sorted(vocabulary, key=vocabulary.get)

    # update main data object
    main['kMeanClusters'] = visKMeans(kmeans.n_clusters,
                                      kmeans.cluster_centers_, terms)

    return main

def visKMeans(n_clusters, cluster_centers, terms, num_word=5):
    """Return the highest-weighted terms of each cluster centre.

    Args:
        n_clusters: number of clusters (rows of ``cluster_centers``) to report.
        cluster_centers: 2-D array with one row per cluster and one column
            per term; larger values mean the term weighs more in the centre.
        terms: sequence mapping a column index to its term.
        num_word: how many terms to list per cluster (default 5). Fewer are
            returned when there are fewer columns than ``num_word``.

    Returns:
        dict mapping each cluster index to its terms, ordered from the
        highest to the lowest centre weight.
    """
    # column indices of every centre, sorted by descending weight
    ordered_centroids = cluster_centers.argsort()[:, ::-1]

    # honour num_word instead of a hard-coded 5
    return {
        cluster: [terms[term_idx]
                  for term_idx in ordered_centroids[cluster, :num_word]]
        for cluster in range(n_clusters)
    }
    
def descriptive(df, main):
    """Record corpus-level totals from ``df`` in ``main`` and return ``main``.

    Writes ``articleCount`` (number of rows), ``totalChar`` (sum of the
    ``charCount`` column) and ``totalWord`` (sum of the ``wordcount``
    column). ``main`` is modified in place.
    """
    # total articles: one row per article
    main['articleCount'] = df.shape[0]

    # char tally, then word tally
    for key, column in (('totalChar', 'charCount'), ('totalWord', 'wordcount')):
        main[key] = df[column].sum()

    return main

#descriptive -> macros, tfidf
#specific -> LSA, K-means

def main():
    """Run the whole pipeline and print the collected results.

    Fetches the articles, loads and scrubs them into a data frame (helpers
    presumably provided by the local ``retrieve``/``clean`` modules), then
    runs the descriptive, term-frequency and k-means steps in order, each of
    which adds its keys to one shared results dict.
    """
    # retrieve and clean
    retrieve_articles()
    articles = scrub(read_in())

    # analyse: thread a single results dict through every step
    results = {}
    for analyse in (descriptive, retrieveTopTDIDF, createKMeans):
        results = analyse(articles, results)

    print(results)

# run the pipeline only when executed as a script / notebook cell
if __name__ == '__main__':
    main()


{'articleCount': 273, 'totalChar': 1732263, 'totalWord': 294321, 'totalTokens': 251, 'tokenSum': 50827, 'topTenTerms': [{'term': 'woman', 'tf': 1037, 'df': 76}, {'term': 'australia', 'tf': 1013, 'df': 192}, {'term': 'one', 'tf': 933, 'df': 207}, {'term': 'year', 'tf': 771, 'df': 208}, {'term': 'government', 'tf': 753, 'df': 151}, {'term': 'people', 'tf': 684, 'df': 176}, {'term': 'make', 'tf': 679, 'df': 200}, {'term': 'get', 'tf': 643, 'df': 149}, {'term': 'work', 'tf': 592, 'df': 154}, {'term': 'go', 'tf': 580, 'df': 158}], 'kMeanClusters': {0: ['child', 'report', 'government', 'go', 'ask'], 1: ['woman', 'work', 'year', 'make', 'one'], 2: ['report', 'system', 'year', 'claim', 'people'], 3: ['one', 'year', 'go', 'get', 'people'], 4: ['government', 'turnbull', 'minister', 'rate', 'service'], 5: ['party', 'nation', 'labor', 'one', 'election'], 6: ['government', 'market', 'policy', 'australia', 'company']}}
