### GIZ Initial Data Exploration
#### author: Emily Robitschek

Purpose: Before we build any model, we need to be able to take a look at the documents we have.

Some resources: 

#### papers mentioned in project proposal: 
https://medium.com/fiscalnoteworthy/citing-your-sources-79062248f468
https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/natural-language-processing-examples-in-government-data.html
https://documents1.worldbank.org/curated/en/634591516387264234/pdf/WPS8310.pdf

#### NLP related links: 
- https://spacy.io/usage/spacy-101
- https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a
- https://arunm8489.medium.com/getting-started-with-natural-language-processing-6e593e349675
- https://towardsdatascience.com/natural-language-processing-pipeline-decoded-f97a4da5dbb7

### Import libraries

In [None]:
import os
import glob
import time

import scipy
import numpy as np
import pandas as pd

#set up packages for NLP analysis
import nltk
#nltk.download('punkt') 
#nltk.download('stopwords') #had to also download this
import spacy
nlp=spacy.load('en_core_web_sm') #or the multi-language one: spacy.load('xx_ent_wiki_sm')

### Define helper functions for preprocessing

In [None]:
#input data helper functions
def list_docs(folder):
    """Collect the .txt documents found anywhere beneath ``folder``.

    The folder is walked recursively with ``os.walk``. Files whose name does
    not end in '.txt' are ignored, as are the source-information files
    ('Source.txt', 'Source Link.txt', 'Source Links.txt'), which we might not
    want in the analysis.

    Returns a ``(doc_names, doc_paths)`` pair of lists in traversal order,
    where ``doc_paths[i]`` is the path of the file named ``doc_names[i]``.
    """
    source_info_files = ('Source.txt', 'Source Link.txt', 'Source Links.txt')

    matches = []
    for parent, _subdirs, entries in os.walk(folder):
        for entry in entries:
            # Skip anything that is not a usable text document.
            if not entry.endswith('.txt') or entry in source_info_files:
                continue
            matches.append((entry, os.path.join(parent, entry)))

    doc_names = [name for name, _ in matches]
    doc_paths = [path for _, path in matches]
    return doc_names, doc_paths


#NLP related helper functions
#from this resource: https://realpython.com/natural-language-processing-spacy-python/#how-to-download-models-and-data
def is_token_allowed(token):
    """Return True when ``token`` should be kept for analysis.

    A token is rejected when it is falsy, when its text is empty or only
    whitespace, when it is a stop word, or when it is punctuation.
    """
    if not token:
        return False
    has_visible_text = bool(token.text.strip())
    return has_visible_text and not token.is_stop and not token.is_punct

def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, in lowercase."""
    trimmed_lemma = token.lemma_.strip()
    return trimmed_lemma.lower()

### Import data

In [None]:
# Placeholder keyword groups (testing only), taken from the GIZ policy proposal.
# TODO: eventually import these from a keyword file instead of hard-coding them.
# NOTE(review): later cells look these up in a count of lowercased lemmas, so
# plural entries (e.g. "diets", "projects") and multi-word entries
# (e.g. "sector plan") presumably never match -- confirm before trusting scores.
policy = [
    "policy",
    "integrate",
    "implement",
    "committee",
    "consultation",
]
food = [
    "nutritions",
    "diets",
    "farm",
    "agriculture",
    "ecology",
]
ndc_national_adaption_plan = [
    "nap",
    "sector plan",
    "nccrp",
    "vulnerable sector",
    "geographic vulnerability",
]
ndc_climate_change = [
    "adaption",
    "program",
    "projects",
    "resilience",
    "institution",
    "capacity",
    "response",
    "budget",
    "reprioritisation",
    "development",
    "planner",
    "regulator",
    "practitioners",
    "geographical",
    "circumstances",
    "land",
    "scheme",
    "authorisation",
    "system",
    "spluma",
]
ndc_early_warning = [
    "system",
    "vulnerability",
    "needs",
    "assessment",
    "network",
    "weather",
    "earth",
    "observation",
    "academic",
    "community",
]

In [None]:
# IPython "automagic" command (not plain Python): move the working directory up to the parent folder.
cd ../


In [None]:
# General folder holding the different types of policy documents.
# NOTE: this is an absolute path on the original author's machine; point it at
# your local copy of the data before running.
policy_doc_folder = '/Users/emilyrobitschek/git/giz-policy/giz-policy_tracking_docs/SouthAfrica/Data'
policy_doc_names, policy_doc_paths = list_docs(policy_doc_folder)
print("Some of the policy docs include: ", policy_doc_names[:10])

# Keep document names and paths together so they are easy to search. A
# dictionary converts directly into a dataframe, which can later hold more
# summary information about each document.
policy_doc_dict = {'policy_doc_names': policy_doc_names, 'policy_doc_paths': policy_doc_paths}
policy_doc_df = pd.DataFrame(data=policy_doc_dict, dtype='string')
# Index by document name so a path can be looked up with .loc[name, ...].
policy_doc_df.index = policy_doc_df['policy_doc_names']
# A bare `policy_doc_df.head()` is only rendered when it is the last
# expression of a cell, so print the preview explicitly.
print(policy_doc_df.head())

# Start with the South African document used as the example in the proposal.
test_doc_name = '2nd National Biodiversity Strategy Action Plan 2015-2025.pdf_ocr.txt'
# Single .loc[row, column] lookup rather than chained .loc[row][column].
test_doc_path = policy_doc_df.loc[test_doc_name, 'policy_doc_paths']

### Preprocess data

In [None]:
### expand contractions (from this resource: https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/)
import contractions

# Read the raw (contracted) document text. The context manager closes the file
# handle, and the explicit encoding avoids depending on the platform default.
# NOTE(review): assumes the OCR .txt files are UTF-8 -- confirm.
with open(test_doc_path, encoding='utf-8') as doc_file:
    text = doc_file.read()

# Expand each whitespace-separated word with contractions.fix, then re-join
# with single spaces (so the original line breaks and spacing are not kept).
expanded_words = [contractions.fix(word) for word in text.split()]
expanded_text = ' '.join(expanded_words)
print('Original text: ', len(text))
print('Expanded_text: ', len(expanded_text))

# Run the spaCy pipeline on the expanded text and collect its sentences.
expanded_text_doc = nlp(expanded_text)
sentences = list(expanded_text_doc.sents)
# Printed explicitly: a bare `len(sentences)` in the middle of a cell is discarded.
print('Number of sentences: ', len(sentences))

# Preview the first 20 sentences.
for sentence in sentences[:20]:
    print(sentence)

In [None]:
# Preview the first 50 tokens of the expanded document, each followed by its
# position (token.idx).
preview_tokens = expanded_text_doc[:50]
for tok in preview_tokens:
    print(tok, tok.idx)

In [None]:
# Materialise every token of the expanded document for inspection.
tokens = list(expanded_text_doc)
print('These are some of the unfiltered tokens: ', tokens[200:300], '\n')

# Keep only the tokens accepted by is_token_allowed, and reduce each one to
# its lowercase lemma with preprocess_token.
filtered_text_list = list(
    map(preprocess_token, filter(is_token_allowed, expanded_text_doc))
)

# Join the cleaned lemmas back into one string and re-run the spaCy pipeline
# on it, so later cells work with spaCy tokens again.
filtered_text = ' '.join(filtered_text_list)
filtered_tokens = nlp(filtered_text)

print('These are some of the filtered tokens: ', filtered_tokens[200:300])

# TODO: filter out the odd non-word tokens, and possibly numbers.
# TODO: may also want to pick out important acronyms.

### Find most common (and unique) words

In [None]:
from collections import Counter

# Text of every token in the filtered document (filtering happened earlier;
# nothing is removed here).
words = [tok.text for tok in filtered_tokens]

# Tally how often each word occurs.
word_freq = Counter()
word_freq.update(words)

# The 30 most frequent words together with their counts.
common_words = word_freq.most_common(n=30)
print(common_words)

# Words that occur exactly once (currently disabled):
#unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
#print (unique_words)

In [None]:
# Spot-check of a single word's count. As a bare expression that is not the
# last statement of the cell, its value is not displayed.
word_freq['environmental']

# Per-keyword counts for the climate-change NDC keyword list.
ndc_climate_change_scores = [word_freq[term] for term in ndc_climate_change]
print(
    "This document has the following number of words related to climate change NDCs: ",
    sum(ndc_climate_change_scores),
    '\n',
)

# Per-keyword counts for the food keyword list.
food_scores = [word_freq[term] for term in food]
print(
    "This document has the following number of words related to food: ",
    sum(food_scores),
)

### Make some plots of the NDC/thematic keywords

In [None]:
# Test plots only: this still needs to be modularised, made more reproducible,
# and tailored so the output is useful across documents.
# It could also be linked back to the dataframe of documents to produce a table
# of keyword summary metrics for each document of interest.

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
#plt.style.use('ggplot')

# --- Bar chart: food keywords ---
x, y = food, food_scores
x_pos = list(range(len(x)))

plt.rcParams["figure.figsize"] = (2, 4)
plt.bar(x, y)
plt.title("Food words in the documents")
plt.xlabel("Food Words")
plt.ylabel("Frequency")
# Rotate the tick labels so the keywords stay readable.
plt.xticks(x_pos, x, rotation=90)
plt.show()

# --- Bar chart: climate-change NDC keywords ---
x, y = ndc_climate_change, ndc_climate_change_scores
x_pos = list(range(len(x)))

plt.rcParams["figure.figsize"] = (8, 4)
plt.bar(x, y, color='green')
plt.title("NDC Climate Change words in the document")
plt.xlabel("NDC Climate Change Words")
plt.ylabel("Frequency")
plt.xticks(x_pos, x, rotation=90)
plt.show()

### Parts of speech tagging

In [None]:
# Show the fine-grained tag, the coarse part of speech and spaCy's explanation
# of the tag for the first 50 filtered tokens.
# NOTE(review): filtered_tokens comes from re-parsing the cleaned, lemmatised
# text, so these tags may differ from tagging the original sentences -- confirm.
for token in filtered_tokens[:50]:
    tag = token.tag_
    print(token, tag, token.pos_, spacy.explain(tag))
print('\n')

# Collect the tokens spaCy labelled as nouns and as adjectives.
nouns = [tok for tok in filtered_tokens if tok.pos_ == 'NOUN']
adjectives = [tok for tok in filtered_tokens if tok.pos_ == 'ADJ']

print('some nouns spacy called in the document include:', '\n', nouns[:50], '\n')
print('some adjectives spacy called in the document include:', '\n', adjectives[:50])