### GIZ Initial Data Exploration
#### author: Emily Robitschek

Purpose: Before we build any model, we need to be able to take a look at the documents we have.

Some resources: 

#### papers mentioned in project proposal: 
https://medium.com/fiscalnoteworthy/citing-your-sources-79062248f468
https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/natural-language-processing-examples-in-government-data.html
https://documents1.worldbank.org/curated/en/634591516387264234/pdf/WPS8310.pdf

#### NLP related links: 
- https://spacy.io/usage/spacy-101
- https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a
- https://arunm8489.medium.com/getting-started-with-natural-language-processing-6e593e349675
- https://towardsdatascience.com/natural-language-processing-pipeline-decoded-f97a4da5dbb7

### Import libraries

In [None]:
import os
import glob
import time
import json

import scipy
import numpy as np
import pandas as pd

#set up packages for processing data types and for NLP analysis
from collections import OrderedDict
import contractions
import spacy
nlp=spacy.load('en_core_web_sm') #or the multi-language one: spacy.load('xx_ent_wiki_sm')

#graphing/visualization packages: 
import matplotlib.pyplot as plt
plt.style.use('ggplot')

### Define helper functions for preprocessing

In [None]:
#input data helper functions
def get_docs_df_from_folder(policy_doc_folder):
    """
    Builds a dataframe describing the policy .txt documents found under a folder.

    Input: a folder (subfolders are searched as well, see list_docs) with policy-related text documents.
    Output: a dataframe indexed by document file name ('policy_doc_names') with the columns 
    'policy_doc_paths' (path of each file) and 'policy_doc_name_clean' (file name cut at the 
    first '.txt' and then at the first '.pdf.ocr').
    
    NOTE: To keep the names and paths of the documents easily searchable, it might be useful 
    to also export/keep the underlying dictionary, for instance to add more summary information 
    about each document.
    """
    names, paths = list_docs(policy_doc_folder)
    #report how many docs were found and show the first few names
    print(("There are %d policy docs" % (len(names))),
          "Some of the policy docs include: ", names[:10])

    docs_df = pd.DataFrame(data={'policy_doc_names': names, 'policy_doc_paths': paths}, 
                           dtype='string')

    def _strip_extensions(file_name):
        #keep what precedes '.txt', then what precedes '.pdf.ocr'
        return file_name.split('.txt')[0].split('.pdf.ocr')[0]

    docs_df['policy_doc_name_clean'] = docs_df['policy_doc_names'].apply(_strip_extensions)
    #the file names become the index; pop() removes the then-duplicate column
    docs_df.index = docs_df.pop('policy_doc_names')
    return docs_df

def list_docs(folder):
    """
    Collects the names and paths of the .txt documents under a folder, for reference and tracking.

    Input: the parent folder to walk (all of its subfolders are visited).
    Output: two lists in matching order, the file names (doc_names) and their paths (doc_paths).
    Files named 'Source.txt', 'Source Link.txt' or 'Source Links.txt' are left out, since they 
    contain source information we might not want to use in our analysis.
    """
    excluded_names = ('Source.txt', 'Source Link.txt', 'Source Links.txt')
    doc_names = []
    doc_paths = []
    for current_dir, _subdirs, file_names in os.walk(folder):
        wanted = [name for name in file_names
                  if name.endswith('.txt') and name not in excluded_names]
        doc_names.extend(wanted)
        doc_paths.extend(os.path.join(current_dir, name) for name in wanted)
    return doc_names, doc_paths

#NLP related helper functions
#used these resources: 
#https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
#https://realpython.com/natural-language-processing-spacy-python/#how-to-download-models-and-data

def fix_contractions(document): 
    """
    Reads a text document and expands its contractions (like can't to cannot) so potentially 
    important words/pieces of words are not lost when punctuation is removed later.

    Input: the path of the text file to read (document).
    Output: the expanded text as one string. The text is split on whitespace and re-joined 
    with single spaces, so the original spacing/line breaks are not preserved.
    """
    #read the original contracted text; the with-block closes the file handle again
    #(opened with the platform default encoding)
    with open(document) as text_file:
        text = text_file.read()

    #using contractions.fix to expand the shortened words, one whitespace-separated word at a time
    expanded_words = [contractions.fix(word) for word in text.split()]
    return ' '.join(expanded_words)

def is_token_allowed(token):
    """
    Decides whether a token should be kept.

    Returns False for a falsy token, a token whose text is empty or only whitespace, 
    a stop word or a punctuation symbol; returns True otherwise.
    """
    if not token:
        return False
    if not token.text.strip():
        return False
    return not (token.is_stop or token.is_punct)

def preprocess_token(token):
    """
    Returns the lemma of a token (token.lemma_) with surrounding whitespace removed 
    and converted to lowercase.
    """
    lemma = token.lemma_
    return lemma.strip().lower()

def preprocess_doc(doc_path): 
    """
    Applies the NLP framework to one document after expanding its contractions.

    Input: the path of the text document (doc_path).
    Output: the object returned by nlp() for the expanded text (tokens), the same tokens 
    as a plain list (token_list) and the list of sentences found in it (sentences).
    NOTE: may want to return other objects too depending on the use case
    """
    #expand contracted words first, then tokenize the document
    expanded_text = fix_contractions(doc_path)
    tokens = nlp(expanded_text)

    #plain list of the word tokens
    token_list = list(tokens)

    #sentences found by the pipeline
    sentences = list(tokens.sents)
    return tokens, token_list, sentences

def filter_modify_tokens(tokens):
    """
    Filters and normalizes a collection of tokens produced by nlp().

    Tokens rejected by is_token_allowed are dropped; the remaining ones are replaced by the 
    output of preprocess_token, joined with single spaces into a filtered text, and that 
    text is passed through nlp() again to obtain the returned collection of filtered tokens.
    
    NOTE: still need to filter out super weird non words and may want to filter numbers and
    may want to find some important acronyms too (so maybe modify this function later)
    """
    kept_words = []
    for token in tokens:
        if is_token_allowed(token):
            kept_words.append(preprocess_token(token))

    return nlp(' '.join(kept_words))

def make_filtered_tokens_from_ndc(ndc_dict): 
    """
    Takes an NDC dictionary and processes the topics and keywords for searching within the documents, 
    assuming we are searching for individual words. The words are processed in the same way as 
    the document text to have the best chance of finding the keywords in the documents.

    Input: a dictionary mapping each topic (key) to a list of keyword strings (ndc_dict). 
    The dictionary and its lists are not modified.
    Output: a new dictionary mapping each topic to the list of unique processed words obtained 
    from its keywords plus the topic name itself, in order of first appearance.
    """
    ndc_dict_processed = dict()
    for topic, topic_keywords in ndc_dict.items():
        print(('Original words from the NDC related to %s : ' % (topic)), 
              len(topic_keywords), topic_keywords, '\n')
        #add the topic (key) to a copy of its keywords, so the caller's list stays untouched 
        #and repeated calls do not keep appending the topic, then tokenize those values: 
        keywords = list(topic_keywords) + [topic]
        keywords_tokens = nlp(' '.join(keywords))
        #generate a filtered list of keywords using the same token preprocessing we use in the documents
        keywords_tokens_list = [str(token) for token in filter_modify_tokens(keywords_tokens)]
        #filter non-unique words generated by splitting terms with shared words (e.g. two types of 'plan')
        unique_keywords = list(OrderedDict.fromkeys(keywords_tokens_list))
        print(('Filtered words from the NDC related to %s : ' % (topic)),
              len(unique_keywords), unique_keywords, '\n')
        ndc_dict_processed[topic] = unique_keywords
    return ndc_dict_processed

def calculate_word_freq_ndc(word_freq, ndc_dict, key): 
    """
    Looks up how often each NDC word of one topic occurs in a document, for graphing.

    Input: the word frequencies calculated by the Counter for the whole document (word_freq), 
    the dictionary of ndc key words organized by topic (ndc_dict), and the ndc topic (key).
    Output: the words stored under the topic (words) and, in the same order, their 
    frequencies looked up in word_freq (word_scores). The sum of the frequencies is printed.
    """
    words = ndc_dict[key]
    word_scores = [word_freq[word] for word in words]
    total_score = sum(word_scores)
    print(("This document has the following number of words related to %s NDCs: " % (key)),
            total_score, '\n')
    return words, word_scores 

def graph_word_freq_ndc(words, word_scores, ndc_name, doc_name, output_folder):
    """
    Draws, saves and shows a bar graph of the word frequencies of the NDC words of one topic/theme.

    Input: The word frequencies (word_scores) for the NDC words associated 
    with a topic (words) for graphing, including the associated NDC topic (ndc_name) and 
    document name (doc_name) to include in the graph title and file name, and the folder 
    the graph is written to (output_folder). 
    Output: Bar graph saved as 'bar_chart_<title>.pdf' in the output folder; nothing is returned.
    """
    #one tick position per word
    x_pos = list(range(len(words)))
    
    #set plot parameters (the figure width grows with the number of words)
    plt.rcParams["figure.figsize"] = ((len(words)/3),4)
    plt.bar(words, word_scores, color='mediumseagreen')
    plt.xlabel("NDC words: %s" % (ndc_name))
    plt.ylabel("Frequency")
    title = ("%s NDC words in: %s" % (ndc_name, doc_name))
    plt.title(title)
    plt.xticks(x_pos, words, rotation=90)
    #save into the folder passed by the caller (not into a notebook-level global)
    plt.savefig(os.path.join(output_folder, 'bar_chart_%s.pdf' % (title)), 
                bbox_inches='tight')
    plt.show()

### Import data: Keywords from NDCs

In [None]:
#example keywords by theme (just for testing purposes - from the policy proposal from GIZ)
#NOTE(review): these hand-written lists are not referenced by the cells shown below, 
#which use the keywords loaded from the json file instead
policy = ["policy", "integrate", "implement", "committee", "consultation"]
food = ["nutritions", "diets", "farm", "agriculture", "ecology"]
ndc_national_adaption_plan = ["nap", "sector plan", "nccrp", "vulnerable sector", 
                              "geographic vulnerability"]
ndc_climate_change = ["adaption", "program", "projects", "resilience", "institution",
                      "capacity", "response", "budget", "reprioritisation", "development", 
                      "planner", "regulator", "practitioners", "geographical", 
                      "circumstances", "land", "scheme", "authorisation", "system", 
                      "spluma"]
ndc_early_warning = ["system", "vulnerability", "needs", "assessment", "network", "weather",
   "earth", "observation", "academic", "community"]

#keywords from json file
json_keywords_SA_file = '../ndc_keywords/ndc_south_africa.json'
keywords_SA_dict = None
with open(json_keywords_SA_file, 'r') as f: 
    keywords_SA_dict = json.load(f)
keywords_SA_dict

#process the topics and keywords with the same token preprocessing used for the documents
ndc_dict = make_filtered_tokens_from_ndc(keywords_SA_dict)

In [None]:
#get words directly from NDCs (eventually want to get words from the NDC itself and group them by topics.)
#
#
#

### Import data: Policy-related documents to analyze

In [None]:
#here is the general folder with the different types of policy documents
policy_doc_folder = '../../giz-policy_tracking_docs/SouthAfrica/Data'
#get df of docs (indexed by file name, with the path and a cleaned name of each .txt document)
policy_doc_df = get_docs_df_from_folder(policy_doc_folder)
#show the first rows as a quick check
policy_doc_df.head()

### Preprocess data

In [None]:
##choose the document to analyze by its file name (the index of policy_doc_df)
##lets take a look at the NDC of South Africa:
doc_name = 'NDC_South_Africa.pdf.ocr.txt'
##other documents to try instead (only one doc_name is active at a time, so a file name 
##that is not in the dataframe cannot raise a KeyError for a selection that is never used): 
#doc_name = '2nd National Biodiversity Strategy Action Plan 2015-2025.pdf_ocr.txt' #example in the proposal
#doc_name = 'InSessionSpecialEditionBudget2021.pdf.ocr.txt'
doc_path = policy_doc_df.loc[doc_name, 'policy_doc_paths']

#can also simply specify the first or whichever document in the list by order: 
#(this will be helpful for future versions that involve more iterating over different documents)
#i=53
#doc_name = policy_doc_df.iloc[i]['policy_doc_name_clean']
#doc_path = policy_doc_df.iloc[i]['policy_doc_paths']
print(doc_name, doc_path)
tokens, token_list, sentences = preprocess_doc(doc_path)

#### The token object: 
The tokens have all sorts of useful information associated with them, for instance their positions (in token.idx), which we can use later to define windows. See below for an example.

In [None]:
#print the first 20 unfiltered tokens together with their position (token.idx)
for token in token_list[:20]:
    print (token, token.idx)

We can see from above that the tokens need to be filtered, and it would be useful to make all the words lowercase and to lemmatize them so that the different forms of a word are recognized as the same thing.

In [None]:
#drop stop words/punctuation and lowercase + lemmatize the remaining tokens of the document
filtered_tokens = filter_modify_tokens(tokens)
#show the filtered tokens number 200 to 299 as a sample
print('These are some of the filtered tokens: ', filtered_tokens[200:300])

### Find most common (and unique) words

In [None]:
from collections import Counter
# Count the words of the filtered tokens (stop words and punctuation symbols were already
# removed by filter_modify_tokens)
words = [token.text for token in filtered_tokens]
word_freq = Counter(words)
# The 30 most commonly occurring words with their frequencies
common_words = word_freq.most_common(30)
print(common_words)

# Unique words
#unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
#print (unique_words)

In [None]:
#list the NDC topics (the keys of the processed keyword dictionary)
for key in ndc_dict:
    print(key)

### Make some plots of the NDC/Thematic key words

In [None]:
#just a test for now - still needs to be modularised/made more reproducible, with output that is 
#useful across documents
#could also link this back to the df of the documents to output a table with some summary metrics 
#for the keywords in each document of interest

graphs_folder = '../../outputs/test_plots/bar_charts/'

#one bar chart per NDC topic for the document selected above
for key in ndc_dict:
    topic = str(key)
    print("Graphing the occurences of %s words in the document" % key)
    words, word_scores = calculate_word_freq_ndc(word_freq, ndc_dict, topic)
    graph_word_freq_ndc(words, word_scores, topic, doc_name, graphs_folder)

In [None]:
#same graph as above, but only for the 'climate change' topic
for key in ndc_dict:
    if key != 'climate change':
        continue
    topic = str(key)
    print("Graphing the occurences of %s words in the document" % key)
    words, word_scores = calculate_word_freq_ndc(word_freq, ndc_dict, topic)
    graph_word_freq_ndc(words, word_scores, topic, doc_name, graphs_folder)

### Where do these words appear in the document?

In [None]:
#positions (token.idx) of the document tokens whose text is one of the keywords currently in `words`
#NOTE(review): `words` holds processed (lowercased, lemmatized) keywords while token.text is the 
#raw document text, so capitalized/inflected occurrences are presumably missed - confirm whether 
#preprocess_token(token) should be compared instead
#[token.idx in tokens for token.text in words]
ndc_word_set = set(words) #set membership instead of scanning the whole list for every token
ndc_climate_idxs = [token.idx for token in tokens if token.text in ndc_word_set]
ndc_climate_idxs
#window = [3886, 5299]
#[i.idx for i in filtered_tokens[:100]]

In [None]:
#filtered tokens whose position (token.idx) lies inside the window from 3700 to 5600 (both included)
window_start, window_end = 3700, 5600
filtered_tokens_in_window = [token for token in filtered_tokens
                             if window_start <= token.idx <= window_end]
[tok.idx for tok in filtered_tokens_in_window]
#positions of the filtered tokens number 1400 to 1699, for comparison
[tok.idx for tok in filtered_tokens[1400:1700]]

### Parts of speech tagging

In [None]:
#show the tag, the part of speech and the explanation of the tag for the first 50 filtered tokens
for token in filtered_tokens[:50]:
    print(token, token.tag_, token.pos_, spacy.explain(token.tag_))
print('\n')

#sort the filtered tokens tagged as nouns or adjectives into their own lists
nouns, adjectives = [], []
pos_buckets = {'NOUN': nouns, 'ADJ': adjectives}
for token in filtered_tokens:
    bucket = pos_buckets.get(token.pos_)
    if bucket is not None:
        bucket.append(token)

print('some nouns spacy called in the document include:', '\n', nouns[:50], '\n')
print('some adjectives spacy called in the document include:', '\n', adjectives[:50])

### Entity recognition

In [None]:
from spacy import displacy
#doc = 
#entities=[(i, i.label_, i.label) for i in filtered_tokens[1400:1700].ents]
#entities

#render the entities of the tokens number 100 to 499 of the unfiltered document inside the notebook
token_subset = tokens[100:500]
displacy.render(token_subset, style = "ent", jupyter = True) #use original tokens

### Dependency visualization in sentences

In [None]:
#NOTE(review): sent_example is not used by the rest of this cell
sent_example = nlp("For example, it is estimated that between 9 and 12 million DATE people in impoverished rural areas directly use natural resources such as fuel wood, wild fruits and wooden utensils as a source of energy, food and building material respectively (Shackleton ORG 2004)")
sentence_spans = list(sentences)
#this expression is not the last one of the cell, so its value is not displayed
sentence_spans[:10]
#displacy.serve(sentence_spans[30:40], style="dep")
#draw the dependency parse of the sentence at index 80 
#NOTE(review): raises an IndexError if the document has fewer than 81 sentences
displacy.render(sentence_spans[80], style="dep", jupyter= True)

### Experimenting with gensim 

In [None]:
###using this resource: https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint

#the corpus of this cell is a single document: the filtered tokens of the document selected above
test_doc = [token.text for token in filtered_tokens]
doc_list = [test_doc]
print(doc_list[0][:15])

# Creates a dictionary, which is a mapping of word IDs to words.
# NOTE: this rebinds the name `words`, which earlier cells use for a list of words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [None]:
#NOTE(review): lda_model is only assigned in the LdaModel cell further below, so this cell raises 
#a NameError when the notebook is run from top to bottom - confirm where it should go
pprint(lda_model.print_topics(num_words=10))

### Experimenting with sklearn and tf-idf

In [None]:
def apply_nlp_processing_framework(doc_name, doc_path):
    """
    Applies the NLP processing functions to one document.

    Input: the name of the document (doc_name, accepted but not used here) and its path (doc_path).
    Output: the texts of the filtered tokens of the document, as a list of strings.
    """
    #only the tokenized document is needed; the token list and the sentences are ignored
    doc_tokens = preprocess_doc(doc_path)[0]
    #remember there is still a lot of weird stuff in the filtered tokens
    return [token.text for token in filter_modify_tokens(doc_tokens)]

#filtered token texts of every document in the dataframe, in dataframe order
doc_list = []
name_path_pairs = zip(policy_doc_df['policy_doc_name_clean'], policy_doc_df['policy_doc_paths'])
for i, (doc_name, doc_path) in enumerate(name_path_pairs):
    print(i, doc_name, doc_path)
    filtered_token_text = apply_nlp_processing_framework(doc_name, doc_path)
    doc_list.append(filtered_token_text)

In [None]:
#first 15 filtered words of three of the documents
#NOTE(review): the indexes 12 and 68 are hard-coded and raise an IndexError with fewer than 69 documents
print(doc_list[0][:15], doc_list[12][:15], doc_list[68][:15])

In [None]:
#from this resource: https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/
from gensim import corpora, models
print(doc_list[0][:15], doc_list[12][:15], doc_list[68][:15])

#mapping of word IDs to words over all documents, and one bag of words per document
dictionary_LDA = corpora.Dictionary(doc_list)
#dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in doc_list]

#train the LDA model with 10 topics; alpha gets one value of 0.01 per topic and eta one value 
#of 0.01 per dictionary entry
#NOTE: %time is an IPython line magic (it times the statement), so this cell only runs in 
#IPython/Jupyter; the backslashes keep the call on one logical line for the magic
num_topics = 10
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

#print the 10 highest-weighted words of every topic
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(str(i)+": "+ topic)
    print()