### GIZ Initial Data Exploration
#### author: Emily Robitschek

Purpose: Before we build any model, we need to be able to take a look at the documents we have.

Some resources: 

#### papers mentioned in project proposal: 
https://medium.com/fiscalnoteworthy/citing-your-sources-79062248f468
https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/natural-language-processing-examples-in-government-data.html
https://documents1.worldbank.org/curated/en/634591516387264234/pdf/WPS8310.pdf

#### NLP related links: 
- https://spacy.io/usage/spacy-101
- https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a
- https://arunm8489.medium.com/getting-started-with-natural-language-processing-6e593e349675
- https://towardsdatascience.com/natural-language-processing-pipeline-decoded-f97a4da5dbb7

### Import libraries

In [None]:
import os
import glob
import time
import json
import codecs
import re

import scipy
import numpy as np
import pandas as pd

#set up packages for processing data types and for NLP analysis
from collections import OrderedDict, Counter
import contractions
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy import displacy
nlp = spacy.load('en_core_web_sm') #or the multi-language one: spacy.load('xx_ent_wiki_sm')

#graphing/visualization packages: 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

### Define helper functions for preprocessing

In [None]:
from datahelper import *
from nlppreprocess import *
from nlpanalysis import *
from n_gram_correlation import *
from correlation import *
from textutils import importer

### Import data: Keywords from NDCs

In [None]:
#keywords (just for testing purposes - from the policy proposal from GIZ)
# NOTE(review): these hand-written lists are not referenced again in the visible part of
# this notebook; the analysis below works from the JSON keyword file instead.
policy = ["policy", "integrate", "implement", "committee", "consultation"]
food = ["nutritions", "diets", "farm", "agriculture", "ecology"]
ndc_national_adaption_plan = ["nap", "sector plan", "nccrp", "vulnerable sector", 
                              "geographic vulnerability"]
ndc_climate_change = ["adaption", "program", "projects", "resilience", "institution",
                      "capacity", "response", "budget", "reprioritisation", "development", 
                      "planner", "regulator", "practitioners", "geographical", 
                      "circumstances", "land", "scheme", "authorisation", "system", 
                      "spluma"]
ndc_early_warning = ["system", "vulnerability", "needs", "assessment", "network", "weather",
   "earth", "observation", "academic", "community"]

#keywords from json file
json_keywords_SA_file = '../ndc_keywords/ndc_south_africa.json'
keywords_SA_dict = None
with open(json_keywords_SA_file, 'r') as f: 
    keywords_SA_dict = json.load(f)
keywords_SA_dict

# ndc_dict is the per-topic keyword lookup used by the cells below
# (make_filtered_tokens_from_ndc is not defined in this notebook; presumably it comes
# from one of the star-imported helper modules - confirm).
ndc_dict = make_filtered_tokens_from_ndc(keywords_SA_dict)

In [None]:
### THIS IS A PLACEHOLDER ###
#get words directly from NDCs (eventually want to get words from the NDC itself and group them by topics.)
#
#
#
ndc_dict

In [None]:
#extra functions
def make_ndc_keyword_tidy_df_from_dict(key, col_group_name, ndc_dict):
    """Build a tidy two-column frame for a single NDC topic.

    One row is produced per entry of ``ndc_dict[key]``. The ``keyword``
    column holds the entry and the column named ``col_group_name`` repeats
    ``key`` on every row.
    """
    columns = {
        'keyword': ndc_dict[key],
        col_group_name: key,  # scalar value, repeated by pandas for every row
    }
    return pd.DataFrame(columns)

def stack_tidy_ndc_dfs(col_group_name, ndc_dict):
    """Stack the tidy keyword frames of every topic in ``ndc_dict``.

    Parameters:
        col_group_name: name of the column that records the topic key.
        ndc_dict: mapping of topic key -> keywords.

    Returns:
        A DataFrame with one row per (topic, keyword) pair, in the dictionary's
        iteration order. Each topic keeps its own 0..n-1 row labels, so index
        values repeat across topics. An empty ``ndc_dict`` gives an empty frame.
    """
    frames = [make_ndc_keyword_tidy_df_from_dict(key, col_group_name, ndc_dict)
              for key in ndc_dict]
    if not frames:
        # pd.concat refuses an empty list; keep the original "empty frame" result
        return pd.DataFrame()
    # a single concat avoids re-copying the accumulated frame on every iteration
    return pd.concat(frames, axis=0)

# lets apply: 
# build one long table with a 'keyword' column and an 'NDC' column naming the topic
col_group_name = 'NDC'
ndc_df = stack_tidy_ndc_dfs(col_group_name, ndc_dict)
ndc_df.head()


def make_ndc_idx_tidy_df(ndc_dict, topic_name, tokens):
    """Record where each topic's keywords occur among ``tokens``.

    Parameters:
        ndc_dict: mapping of topic key -> container of keywords.
        topic_name: name of the output column that stores the topic key.
        tokens: iterable of objects exposing ``.text`` and ``.idx``; it is
            traversed once per topic.

    Returns:
        A DataFrame with the columns ``topic_name`` and ``word_index``, holding
        one row per token whose ``.text`` is found in the topic's keywords.
        ``word_index`` is the token's ``.idx``. An empty ``ndc_dict`` gives an
        empty frame.
    """
    frames = []
    for key, keywords in ndc_dict.items():
        hits = [token.idx for token in tokens if token.text in keywords]
        frames.append(pd.DataFrame({topic_name: key, 'word_index': hits}))
    if not frames:
        # pd.concat refuses an empty list; keep the original "empty frame" result
        return pd.DataFrame()
    # frames without hits are kept on purpose so the columns exist even if nothing matched
    return pd.concat(frames, axis=0)

def filter_idx_for_overlap(idxs, min_dist):
    """Thin out keyword positions so that the kept ones are not crowded.

    A position is kept when the next position is at least ``min_dist`` away.
    The final position has no successor and is therefore always kept.
    (Previously it was always dropped, so a document with a single hit
    produced no windows at all.)

    Parameters:
        idxs: indexable sequence of positions, in the order they should be compared.
        min_dist: minimum gap to the following position for a position to be kept.

    Returns:
        List of the kept positions. Also prints the first 20 gaps and a
        before/after count.
    """
    n = len(idxs)
    distance_btwn_idxs = [(idxs[i + 1] - idxs[i]) for i in range(n - 1)]
    print(distance_btwn_idxs[:20])
    filtered_idxs = [idxs[i] for i, distance in enumerate(distance_btwn_idxs)
                     if distance >= min_dist]
    if n > 0:
        # nothing follows the last position, so it cannot be crowded by a successor
        filtered_idxs.append(idxs[n - 1])
    print("The number of times the idx words were found was: ", len(idxs), "\n", 
          "The number of idx words seperated by at least the min_distance was : ", len(filtered_idxs))
    return filtered_idxs
    
def make_window_text(tokens, max_length):
    """Re-parse the document as lower-cased text without over-long tokens.

    Tokens whose ``len()`` is ``max_length`` or more are dropped (strict ``<``);
    the remaining token texts are lower-cased, joined with single spaces and
    run through the notebook-level ``nlp`` pipeline.
    """
    short_words = (token.text.lower() for token in tokens if len(token) < max_length)
    return nlp(' '.join(short_words))

def return_window(ndc_word_index, tokens, size=100):
    """Cut a window of text around a position and re-parse it.

    Every token whose ``.idx`` lies in
    ``[ndc_word_index - size, ndc_word_index + size]`` (both ends inclusive) is
    kept. The kept token texts are joined with single spaces and passed through
    the notebook-level ``nlp`` pipeline.

    Returns:
        ``(lower_limit, upper_limit, window_tokens)``; the limits are not clamped.
    """
    lower_limit, upper_limit = ndc_word_index - size, ndc_word_index + size
    inside = [token for token in tokens if lower_limit <= token.idx <= upper_limit]
    window_tokens = nlp(' '.join(token.text for token in inside))
    return lower_limit, upper_limit, window_tokens

def find_patterns_df(pattern_list, text, topic_name):
    """Count and locate every pattern of one topic in ``text``.

    Each entry of ``pattern_list`` is handed to ``re.finditer`` unchanged, so it
    is interpreted as a regular expression and may match inside longer words.

    Parameters:
        pattern_list: list of pattern strings (the topic's keywords).
        text: the string to search.
        topic_name: label written to the ``sdg_topic`` column of every row.

    Returns:
        DataFrame with one row per pattern and the columns ``sdg_topic``,
        ``sdg_keywords``, ``sdg_keywords_num`` (number of matches) and
        ``sdg_keyword_locations`` (list of match start offsets).
    """
    # one scan per pattern; the earlier extra re.findall call discarded its result
    pattern_locations = [[m.start(0) for m in re.finditer(pattern, text)]
                         for pattern in pattern_list]
    pattern_num = [len(locations) for locations in pattern_locations]
    return pd.DataFrame({'sdg_topic': topic_name,
                         'sdg_keywords': pattern_list,
                         'sdg_keywords_num': pattern_num,
                         'sdg_keyword_locations': pattern_locations})

def make_sdg_df(sdg_list, sdg_ontology, text):
    """Run ``find_patterns_df`` for every topic in ``sdg_list`` and stack the results.

    Parameters:
        sdg_list: iterable of topic labels to look up.
        sdg_ontology: DataFrame with a ``clasification`` column (spelled as in
            the data) and a ``keyword`` column.
        text: the string to search.

    Returns:
        The per-topic frames concatenated in ``sdg_list`` order, or an empty
        DataFrame when ``sdg_list`` is empty.
    """
    frames = []
    for sdg in sdg_list:
        sdg_keywords = list(sdg_ontology[sdg_ontology['clasification'] == sdg]['keyword'])
        frames.append(find_patterns_df(sdg_keywords, text, topic_name=sdg))
    if not frames:
        # pd.concat refuses an empty list; keep the original "empty frame" result
        return pd.DataFrame()
    # a single concat avoids re-copying the accumulated frame on every iteration
    return pd.concat(frames)

### SDG and other topic keywords

In [None]:
# semicolon-separated ontology table; make_sdg_df reads its 'clasification'
# (spelling as in the file) and 'keyword' columns
sdg_ontology = pd.read_csv('../additional_resources/Ontology_final_modified.csv', sep=';')#, #skiprows=0)

# topic labels to look up in the 'clasification' column
sdg_list = ['SDG1', 'SDG2', 'SDG3', 'SDG4', 'SDG5', 'SDG6', 'SDG7', 'SDG8', 'SDG9', 
            'SDG10', 'SDG11', 'SDG12', 'SDG13', 'SDG14', 'SDG15', 'SDG16', 'SDG17', 
            "mention_money"]

sdg_ontology.head(20)

### Import data: Policy-related documents to analyze

In [None]:
#here is the general folder with the different types of policy documents
policy_doc_folder = '../test_resources/data'
#get df of docs
# the cells below read the 'policy_doc_name_clean' and 'policy_doc_paths' columns of this frame
policy_doc_df = read_docs_to_df(policy_doc_folder)
print(len(policy_doc_df))
policy_doc_df.head()

### Preprocess data

In [None]:
#can also simply specify the first or whichever document in the list by order: 
#(this will be helpful for future versions that involve more iterating over different documents)
# NOTE: the loop further down reuses `i` as its loop variable, so this value is overwritten there.
i=53
#doc_name = policy_doc_df.iloc[i]['policy_doc_name_clean']
#doc_path = policy_doc_df.iloc[i]['policy_doc_paths']

def run_nlp_pipeline(doc_name, doc_path, ndc_dict, max_word_length):
    """Preprocess one document and summarise its NDC and SDG keyword hits.

    Reads the notebook-level ``sdg_list`` and ``sdg_ontology``. The SDG search
    runs on the text of the unfiltered tokens, skipping token texts longer than
    ``max_word_length``.

    Returns:
        ``(tokens, token_list, filtered_tokens, ndc_idx_df, doc_summary_sdg,
        doc_summary_sdg_df)``. ``ndc_idx_df`` is indexed by its ``NDC`` column;
        ``doc_summary_sdg_df`` has a single column named ``doc_name`` with the
        match count per SDG topic.
    """
    print(doc_name, doc_path)
    tokens, token_list, _sentences = preprocess_doc(doc_path)
    filtered_tokens = filter_modify_tokens(tokens)
    print('These are some of the filtered tokens: ', filtered_tokens[0:10])

    # text searched for SDG keywords
    kept_words = [token.text for token in tokens if len(token.text) <= max_word_length]
    document_text = ' '.join(kept_words)

    # positions of NDC keywords among the unfiltered tokens
    ndc_idx_df = make_ndc_idx_tidy_df(ndc_dict, 'NDC', tokens)
    ndc_idx_df.index = ndc_idx_df.NDC.copy()

    per_keyword = make_sdg_df(sdg_list, sdg_ontology, document_text)
    doc_summary_sdg = per_keyword.groupby('sdg_topic')['sdg_keywords_num'].sum()
    doc_summary_sdg_df = doc_summary_sdg.to_frame().rename(columns={"sdg_keywords_num": doc_name})
    return (tokens, token_list, filtered_tokens, ndc_idx_df,
            doc_summary_sdg, doc_summary_sdg_df)


# Run the pipeline on the first three documents and collect one column of
# SDG keyword counts per document in summary_sdg_df_doc.
# After the loop, tokens / filtered_tokens / ndc_idx_df / doc_name / total still
# hold the values of the LAST processed document; later cells read them.
max_word_length=25
summary_sdg_df_doc = pd.DataFrame()
count = 1
for i in range(0, len(policy_doc_df[:3])):
    doc_name = policy_doc_df.iloc[i]['policy_doc_name_clean']
    doc_path = policy_doc_df.iloc[i]['policy_doc_paths']
    tokens, token_list, filtered_tokens, ndc_idx_df, doc_summary_sdg, doc_summary_sdg_df = run_nlp_pipeline(doc_name, doc_path, ndc_dict, max_word_length)   
    print(ndc_idx_df.NDC.value_counts())
    if count == 1: 
        # first document: its summary becomes the table
        summary_sdg_df_doc = doc_summary_sdg_df
        total = doc_summary_sdg_df[doc_name].sum()
        print(doc_name, len(filtered_tokens), total)
    else: 
        # later documents: append as a new column, aligned on the SDG topic index
        # NOTE(review): merge defaults to an inner join, so only topics present in
        # every document survive - confirm that is intended.
        summary_sdg_df_doc_for_merge = doc_summary_sdg_df
        total = summary_sdg_df_doc_for_merge[doc_name].sum()
        print(doc_name, len(filtered_tokens), total)
        summary_sdg_df_doc = summary_sdg_df_doc.merge(summary_sdg_df_doc_for_merge, left_index=True, right_index=True)
    count += 1
    #print(ndc_idx_df.head())
summary_sdg_df_doc.head() 

In [None]:
summary_sdg_df_doc

In [None]:
#make heatmap of the summary table above (rows: SDG topics, columns: documents)
# plot_folder and file_name are only used by the commented-out savefig call
plot_folder = '../../outputs/heatmaps/'
plt.figure(figsize=(30, 6)) #16, 6
ax = sns.heatmap(summary_sdg_df_doc,
                 #annot=True, fmt="d", 
                 cmap="YlGnBu")
plt.xlabel("Documents")
plt.ylabel("SDG Topics")
title = ("Distribution of topic keywords in the documents (counts)")
plt.title(title)
file_name=(title + (' all docs SA v1.png'))
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
#summary_sdg_df_doc.sum(axis=0)
#print(summary_sdg_df_doc.div(summary_sdg_df_doc.sum(axis=0), axis=1))
# divide every document column by its own total so each column holds shares, rounded to 3 decimals
# NOTE(review): a document with a total of 0 would give NaN here - confirm this cannot happen.
normed_by_col_sum = summary_sdg_df_doc.div(summary_sdg_df_doc.sum(axis=0), axis=1).round(3)
normed_by_col_sum.head()
#normed_by_col_sum.round(2)

In [None]:
# heatmap of the per-document shares computed in normed_by_col_sum
# (file_name is only used by the commented-out savefig call)
plt.figure(figsize=(30, 6)) #16, 6
ax = sns.heatmap(normed_by_col_sum,
                 #annot=True, fmt="d", 
                 cmap="YlGnBu")
plt.xlabel("Documents")
plt.ylabel("SDG Topics")
title = ("Distribution of topic keywords in the documents (normalized)")
plt.title(title)
file_name=(title + (' all docs SA v1.png'))
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# NOTE: draws the same normalised heatmap as the previous cell
# (same data, size, colour map, labels and title).
plt.figure(figsize=(30, 6))
ax = sns.heatmap(normed_by_col_sum,
                 #annot=True, #fmt="f", 
                 cmap="YlGnBu")
plt.xlabel("Documents")
plt.ylabel("SDG Topics")
title = ("Distribution of topic keywords in the documents (normalized)")
plt.title(title)
file_name=(title + (' all docs SA v1.png'))
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# sns.clustermap is a figure-level function: it creates its own figure from the
# `figsize` argument, so no plt.figure() call is made beforehand (one would only
# leave an additional empty figure behind).
# NOTE(review): `ax` is whatever clustermap returns (a grid object, not a single Axes).
ax = sns.clustermap(normed_by_col_sum,
                 #annot=True,
                 cbar_pos=(0, .45, .03, .2),
                 cmap="YlGnBu", 
                 xticklabels=1, 
                 figsize=(30, 12))
#plt.xlabel("Documents")
#plt.ylabel("SDG Topics")
title = ("Distribution of topic keywords in the documents (normalized)")
#plt.title(title)
file_name=(title + (' clustermap all docs SA v2.png'))
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
ndc_idx_df.NDC.value_counts()#head()

In [None]:
#look at SDGs across document/at document level
# `tokens` and `doc_name` are whatever the most recently executed pipeline cell left behind
max_length=25
document_text = ' '.join([token.text for token in tokens if len(token.text)<=max_length])

df_sdg = make_sdg_df(sdg_list, sdg_ontology, document_text)
# total matches per SDG topic, as a one-column frame named after the document
doc_summary_sdg = df_sdg.groupby('sdg_topic')['sdg_keywords_num'].sum()
doc_summary_sdg_df = doc_summary_sdg.to_frame().rename(columns={"sdg_keywords_num": ("%s"% doc_name)})
doc_summary_sdg_df 

#lets see the 50 keywords that occur the most often
df_sdg.sort_values(by=['sdg_keywords_num'], ascending=False)[:50]

In [None]:
##for i in range(): 
#    print('Processing doc: ', doc_name)

doc_summary_sdg_df 
#if count == 1: 
#    summary_sdg_df = summary_sdg.to_frame().rename(columns={"sdg_keywords_num": ("sdg_kw_%d"% index)})
#else: 
#    summary_sdg_df_for_merge = summary_sdg.to_frame().rename(columns={"sdg_keywords_num": ("sdg_kw_%d"% index)})
#    summary_sdg_df = summary_sdg_df.merge(summary_sdg_df_for_merge, left_index=True, right_index=True)

In [None]:
total = doc_summary_sdg_df[doc_name].sum()
print(doc_name, len(filtered_tokens), total)

### Apply Jonathan's fuzzy search method to find NDC related words

In [None]:
#sample to work with 
doc_name = '2nd National Biodiversity Strategy Action Plan 2015-2025.pdf_ocr.txt'
# label-based lookup: assumes policy_doc_df is indexed by document name - TODO confirm against read_docs_to_df
doc_path = policy_doc_df.loc[doc_name]['policy_doc_paths']

def run_nlp_pipeline_no_lemma(doc_name, doc_path, ndc_dict, max_word_length):
    """Variant of ``run_nlp_pipeline`` that filters with ``filter_tokens``.

    Differences visible here: tokens are filtered with ``filter_tokens``, the
    SDG search text is built from the *filtered* tokens, and ``sentences`` and
    ``document_text`` are returned as well. Reads the notebook-level
    ``sdg_list`` and ``sdg_ontology``.

    Returns:
        ``(tokens, token_list, sentences, filtered_tokens, document_text,
        ndc_idx_df, doc_summary_sdg, doc_summary_sdg_df)``.
    """
    print(doc_name, doc_path)
    tokens, token_list, sentences = preprocess_doc(doc_path)
    filtered_tokens = filter_tokens(tokens)
    print('These are some of the filtered tokens: ', filtered_tokens[0:10])

    # text searched for SDG keywords: filtered tokens, minus over-long strings
    short_enough = (token.text for token in filtered_tokens
                    if len(token.text) <= max_word_length)
    document_text = ' '.join(short_enough)

    # NDC keyword positions are still taken from the unfiltered tokens
    label_column = 'NDC'
    ndc_idx_df = make_ndc_idx_tidy_df(ndc_dict, label_column, tokens)
    ndc_idx_df.index = ndc_idx_df.NDC.copy()

    per_keyword = make_sdg_df(sdg_list, sdg_ontology, document_text)
    doc_summary_sdg = per_keyword.groupby('sdg_topic')['sdg_keywords_num'].sum()
    doc_summary_sdg_df = doc_summary_sdg.to_frame().rename(columns={"sdg_keywords_num": doc_name})
    return (tokens, token_list, sentences, filtered_tokens, document_text,
            ndc_idx_df, doc_summary_sdg, doc_summary_sdg_df)

# Process the single sample document. `count` is kept at 1 because a later cell
# reads it; the former `else:` merge branch could never run right after
# `count=1` and has been removed.
count=1
max_word_length=25
tokens, token_list, sentences, filtered_tokens, document_text, ndc_idx_df, doc_summary_sdg, doc_summary_sdg_df = run_nlp_pipeline_no_lemma(doc_name, doc_path, ndc_dict, max_word_length)   
print(ndc_idx_df.NDC.value_counts())
# with a single document the summary table is just this document's column
summary_sdg_df_doc = doc_summary_sdg_df
total = doc_summary_sdg_df[doc_name].sum()
print(doc_name, len(filtered_tokens), total)
summary_sdg_df_doc.head() 

In [None]:
document_text[100:200]

In [None]:
##lets take a look at the document from South Africa used in the example in the proposal first:
doc_name = '2nd National Biodiversity Strategy Action Plan 2015-2025.pdf_ocr.txt'
doc_path = policy_doc_df.loc[doc_name]['policy_doc_paths']

# NOTE: annual_report is only referenced in the trailing comment on the last line of this cell
annual_report = importer.TextImporter(doc_path)
# unprocessed keyword dictionary, loaded from the same JSON file as keywords_SA_dict
with open("../ndc_keywords/ndc_south_africa.json") as f:
    ndc_keywords = json.load(f)

climate_keywords = ndc_keywords['climate change']
#climate_keywords = list(sdg_ontology[sdg_ontology['clasification']=='SDG13']['keyword'])
doc = nlp(document_text) #modified to use filtered version of text #nlp(annual_report.text)

In [None]:
#sdg_ontology[sdg_ontology['clasification']=='SDG13']
climate_keywords

In [None]:
# NGramCorrelateSpacy is not defined in this notebook (presumably from n_gram_correlation);
# the meaning of 0.7 and of the `2` passed to correlate_spans should be checked there.
n_gram_cor = NGramCorrelateSpacy(climate_keywords, 0.7, "CLIMATE_N")
doc.ents = []  # drop the entities already on the doc before correlating
n_gram_cor.correlate_spans(doc, 2)
print(len(doc.ents))

In [None]:
# show the first 10 entities with up to 20 tokens of context on either side
for e in doc.ents[0:10]:
    # clamp the left edge at 0: a negative slice start would be counted from the
    # END of the doc, giving the wrong (typically empty) span for early entities
    displacy.render(doc[max(e.start - 20, 0):e.end + 20], style='ent')

In [None]:
print(doc.ents[:20])
#idx = [ent.start for ent in doc.ents]
ndc_climate_idxs = [ent.start for ent in doc.ents]
print(len(ndc_climate_idxs))

In [None]:
token_correlator = TokenArrayCorrelator(climate_keywords, 0.4, "CLIMATE_TOKEN")
span_correlator = SpanCorrelator(climate_keywords, 0.4, "CLIMATE_SPAN")
generic_correlator = KeywordCorrelator(climate_keywords)

sentence_to_correlate = "We need to adapt our project to be more resillient to geographical circumstances."
unrelated_sentence = "The next time the leaders will meet in paris"

#print(generic_correlator([sent for sent in sentences]))

In [None]:
ndc_keywords

In [None]:
def label_ndc_spans(ndc_keywords, doc):
    """Tag every NDC keyword phrase found in ``doc`` as an entity.

    Parameters:
        ndc_keywords: dictionary mapping an NDC topic name to its keyword phrases.
        doc: the document text with nlp run on it (filtered but not lemmatized).
            It is modified in place: matching spans are added to ``doc.ents``.

    Returns:
        ``(entity_labels, doc)`` where ``entity_labels`` lists the labels used,
        one per topic, each formed as ``"<topic> NDC"``.
    """
    matcher = PhraseMatcher(nlp.vocab)
    # one label per NDC topic so the matches can be told apart afterwards
    entity_labels = []
    for entity_reference, keywords in ndc_keywords.items():
        entity_label = entity_reference + ' NDC'
        print(entity_label)
        entity_labels.append(entity_label)
        patterns = [nlp(i) for i in keywords]
        matcher.add(entity_label, None, *patterns)
    for match_id, start, end in matcher(doc):
        span = Span(doc, start, end, label=match_id)
        try:
            doc.ents = list(doc.ents) + [span]  # add span to doc.ents
        except ValueError:
            # the span overlaps an entity that is already set; the existing one is kept.
            # Any other error (e.g. a missing name) now surfaces instead of being hidden.
            pass
    return entity_labels, doc

def label_ndc_spans_return_index(entity_reference, ndc_keywords, document_text):
    """Tag the keyword phrases of ONE NDC topic and return where they start.

    Parameters:
        entity_reference: key of the topic in ``ndc_keywords``.
        ndc_keywords: dictionary mapping an NDC topic name to its keyword phrases.
        document_text: the filtered but not lemmatized document text (a string).

    Returns:
        ``(doc, idxs_from_matcher)``: the parsed document with the matches added
        to ``doc.ents``, and the ``.start`` of every entity labelled
        ``"<entity_reference> NDC"``.
    """
    entity_label = entity_reference + ' NDC'
    patterns = [nlp(i) for i in ndc_keywords[entity_reference]]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add(entity_label, None, *patterns)
    doc = nlp(document_text)
    for match_id, start, end in matcher(doc):
        span = Span(doc, start, end, label=match_id)
        try:
            doc.ents = list(doc.ents) + [span]  # add span to doc.ents
        except ValueError:
            # the span overlaps an entity that is already set; the existing one is kept.
            # Any other error (e.g. a missing name) now surfaces instead of being hidden.
            pass
    # starts of the spans that belong to this NDC topic
    idxs_from_matcher = [ent.start for ent in doc.ents if ent.label_ == entity_label]
    return doc, idxs_from_matcher

# label the whole filtered document text; a short hand-written sample is kept in the trailing comment
test_text = document_text #"natural nationa ationa organisi association of conservancies of south africa action plan biodiversity assessment biodiversity and business network biodiversity economy strategy biodiversity framework biodiversity strategy and action plan rimental finding development plan department of tourism environmental advisory forum environmental management act environmental skills planning forum implementing"
test_doc = nlp(test_text)
ndc_labels, labelled_doc = label_ndc_spans(ndc_keywords, test_doc)

#displacy.render(labelled_doc, style = "ent", jupyter = True)
#labelled_doc.ents

In [None]:
def make_ndc_num_dict(ndc_keywords, labelled_doc, doc_label=None):
    """Count the labelled NDC entities per topic.

    Parameters:
        ndc_keywords: the NDC keyword dictionary; only its keys are used.
        labelled_doc: the document labelled with the spans of NDC keywords;
            its ``.ents`` are read.
        doc_label: text appended to the count column's name. Defaults to the
            notebook-level ``doc_name`` so existing two-argument calls keep
            working.

    Returns:
        ``(ndc_doc_idx_dict, ndc_doc_num_df)``: a dict mapping each
        ``"<topic> NDC"`` label to the ``.start`` of its entities, and a
        DataFrame indexed by label with one column
        ``NDC_word_count_<doc_label>`` holding the number of entities.
    """
    if doc_label is None:
        doc_label = doc_name  # notebook-level global, as in the original version
    ndc_doc_idx_dict = {}
    ndc_doc_num = {}
    for entity_reference in ndc_keywords:
        entity_label = entity_reference + ' NDC'
        print(entity_label)
        idxs_from_matcher = [ent.start for ent in labelled_doc.ents
                             if ent.label_ == entity_label]
        ndc_doc_idx_dict[entity_label] = idxs_from_matcher
        ndc_doc_num[entity_label] = len(idxs_from_matcher)
    ndc_doc_num_df = pd.DataFrame.from_dict(ndc_doc_num, orient='index').rename(
        columns={0: 'NDC_word_count_%s' % doc_label})
    return ndc_doc_idx_dict, ndc_doc_num_df

ndc_doc_idx_dict, ndc_doc_num_df = make_ndc_num_dict(ndc_keywords, labelled_doc)
ndc_doc_num_df

In [None]:
# NOTE: same first-document / merge bookkeeping as in the pipeline cells above,
# driven by whatever value `count` currently holds in the notebook session.
if count == 1: 
    summary_sdg_df_doc = doc_summary_sdg_df
    total = doc_summary_sdg_df[doc_name].sum()
    print(doc_name, len(filtered_tokens), total)
else: 
    summary_sdg_df_doc_for_merge = doc_summary_sdg_df
    total = summary_sdg_df_doc_for_merge[doc_name].sum()
    print(doc_name, len(filtered_tokens), total)
    summary_sdg_df_doc = summary_sdg_df_doc.merge(summary_sdg_df_doc_for_merge, left_index=True, right_index=True)
    # NOTE(review): count is only incremented inside this branch - confirm that is intended
    count += 1

In [None]:
test_text = "helps to address poverty and unemployment . operation phakisa is initially implemented in two sectors , the ocean economy and health , and will be rolled out in other sectors . in the oceans economy four priority areas for unlocking the oceans economy through inclusive economic growth have been identified , one of which is marine protection services and ocean governance . other biodiversity and"
sdg_ontology[sdg_ontology['clasification']=='SDG8']['keyword']
#list(ndc_dict['climate change'])

entity_reference = 'climate change'
#print([(ent.text, ent.start, ent.label_) for ent in doc.ents]) #there are some cool default entitites as well
# label the document once and keep both results; the function parses the whole
# document on every call, so calling it twice only doubled the run time
new_doc, climate_ndc_idxs_from_matcher = label_ndc_spans_return_index(entity_reference, ndc_keywords, document_text)
climate_ndc_idxs_from_matcher

### Window functions 

In [None]:
# # # From the Windows Notebook
def return_window_i(ndc_word_index, tokens, size=20):
    """Slice a window of tokens around a token position.

    Parameters:
        ndc_word_index: position of the NDC keyword within ``tokens``.
        tokens: the original tokens of the document (any sliceable sequence).
        size: number of tokens taken before the keyword; the slice ends
            ``size`` positions after it (end exclusive).

    Returns:
        ``(lower_limit, upper_limit, window_tokens)``. The limits are returned
        unclamped (``ndc_word_index -/+ size``); the slice itself is clamped at
        0 so that a keyword near the start yields the tokens from the beginning
        instead of wrapping around to the end of the sequence.
    """
    lower_limit = ndc_word_index - size
    upper_limit = ndc_word_index + size
    window_tokens = tokens[max(lower_limit, 0):max(upper_limit, 0)]
    return lower_limit, upper_limit, window_tokens

def return_window_idx(ndc_word_index, tokens, size=100):
    """Build a window from the ``.idx`` start positions of the tokens.

    Tokens whose ``.idx`` falls within ``size`` of ``ndc_word_index`` (limits
    included) are collected in order; their texts are joined with single spaces
    and re-parsed with the notebook-level ``nlp`` pipeline.

    Returns:
        ``(lower_limit, upper_limit, window_tokens)``; the limits are not clamped.
    """
    lower_limit = ndc_word_index - size
    upper_limit = ndc_word_index + size
    collected_words = []
    for token in tokens:
        if token.idx < lower_limit or token.idx > upper_limit:
            continue  # outside the window
        collected_words.append(token.text)
    window_tokens = nlp(' '.join(collected_words))
    return lower_limit, upper_limit, window_tokens

### Where do these words appear in the document?

In [None]:
window_tokens_overall = make_window_text(tokens, max_length=25)

#### Use indexes from spacy matcher 

In [None]:
#may want to increase the min_distance/set it as a function of window size
window_size = 200
min_dist = 200
# thin out the matcher hits so that neighbouring windows are not built around crowded positions
idx_for_window = filter_idx_for_overlap(idxs=climate_ndc_idxs_from_matcher, min_dist=min_dist)

In [None]:
# For every kept position: cut a window, count SDG keywords inside it and collect
# the per-topic totals as one column ("sdg_kw_<position>") of summary_sdg_df.
# NOTE(review): `index` originates from `ent.start` in label_ndc_spans_return_index,
# while return_window selects tokens by `token.idx` of window_tokens_overall. In spaCy
# these are a token position and a character offset respectively, and the two docs are
# parsed from different text - confirm the windows land where intended.
# NOTE: summary_sdg_df is only created inside the loop, so it stays undefined
# (or stale) when idx_for_window is empty.
count = 0
for index in idx_for_window[0:len(idx_for_window)]:
    count += 1
    print(index)
    window_tokens = return_window(index, window_tokens_overall, size=window_size)[2]
    window_text = ' '.join([token.text for token in window_tokens])
    #print(window_text)
    df_sdg = make_sdg_df(sdg_list, sdg_ontology, window_text) #may want to keep this for the windows for a more granular analysis
    df_sdg_sorted = df_sdg.sort_values(by=['sdg_keywords_num'], ascending=False)
    #print(df_sdg_sorted.head())
    print(list(df_sdg_sorted[df_sdg_sorted['sdg_keywords_num'] > 0]['sdg_keywords'][:10])) #print top 10 positive valued keywords
    #print a summary of the SDG words found: 
    summary_sdg = df_sdg.groupby('sdg_topic')['sdg_keywords_num'].sum()
    if count == 1: 
        # first window starts the table
        summary_sdg_df = summary_sdg.to_frame().rename(columns={"sdg_keywords_num": ("sdg_kw_%d"% index)})
    else: 
        # later windows are added as columns, aligned on the SDG topic index
        summary_sdg_df_for_merge = summary_sdg.to_frame().rename(columns={"sdg_keywords_num": ("sdg_kw_%d"% index)})
        summary_sdg_df = summary_sdg_df.merge(summary_sdg_df_for_merge, left_index=True, right_index=True)

In [None]:
summary_sdg_df

In [None]:
#make heatmap of the window summary above (rows: SDG topics, columns: one per window)
# file_name is only used by the commented-out savefig call
plt.figure(figsize=(16, 6))
ax = sns.heatmap(summary_sdg_df,
                 annot=True, #fmt="d", 
                 cmap="YlGnBu")
plt.xlabel("climate change NDC-associated windows")
plt.ylabel("SDG Topics")
title = ("Distribution of topic keywords in climate change NDC-associated windows in %s" % (doc_name))
plt.title(title)
file_name=(title + (' across windows from spacy matching.png'))
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Inspect individual windows by position.
# NOTE(review): 5868 and 11424 are hard-coded; the column lookups below raise KeyError
# unless summary_sdg_df has a matching "sdg_kw_<index>" column - presumably these
# values were copied from an earlier run, confirm for the current document.
index=5868
print(return_window(index, window_tokens_overall, size=window_size)[2]) #print(window_tokens[2])
summary_sdg_df[('sdg_kw_%d' % index)]
print("")

index=11424
print(return_window(index, window_tokens_overall, size=window_size)[2]) #print(window_tokens[2])
summary_sdg_df[('sdg_kw_%d' % index)]
print("")

# despite the name, test_text now holds the third element returned by return_window
# (the re-parsed window), not a plain string
test_text = return_window(index, window_tokens_overall, size=window_size)[2]
test_text

In [None]:
#lets make another function to label categories:

def label_keywords(entity_reference, keyword_list, text):
    """Tag every phrase of ``keyword_list`` found in ``text`` as an entity.

    Parameters:
        entity_reference: label given to the matched spans.
        keyword_list: list of keyword phrases to match.
        text: the text to label. A string is parsed with the notebook-level
            ``nlp`` pipeline; anything else is assumed to be an already parsed
            document and is labelled directly (callers in this notebook pass
            both). Previously this argument was ignored and the global
            ``document_text`` was labelled instead.

    Returns:
        ``(doc, idxs_from_matcher)``: the labelled document and the ``.start``
        of every entity carrying ``entity_reference`` as its label.
    """
    entity_label = entity_reference
    patterns = [nlp(i) for i in keyword_list]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add(entity_label, None, *patterns)
    doc = nlp(text) if isinstance(text, str) else text
    for match_id, start, end in matcher(doc):
        span = Span(doc, start, end, label=match_id)
        try:
            doc.ents = list(doc.ents) + [span]  # add span to doc.ents
        except ValueError:
            # the span overlaps an entity that is already set; the existing one is kept.
            # Any other error (e.g. a missing name) now surfaces instead of being hidden.
            pass
    # starts of the spans that carry this label
    idxs_from_matcher = [ent.start for ent in doc.ents if ent.label_ == entity_label]
    return doc, idxs_from_matcher

sdg_list = ['SDG1', 'SDG2', 'SDG3', 'SDG4', 'SDG5', 'SDG6', 'SDG7', 'SDG8', 'SDG9', 
            'SDG10', 'SDG11', 'SDG12', 'SDG13', 'SDG14', 'SDG15', 'SDG16', 'SDG17', 
            "mention_money"]

# only the last assignment takes effect; the earlier values are kept for quick switching
index=5868
index=11424
index=18783
print(return_window(index, window_tokens_overall, size=window_size)[2]) #print(window_tokens[2])
summary_sdg_df[('sdg_kw_%d' % index)]

test_text = return_window(index, window_tokens_overall, size=window_size)[2]
test_text

text = test_text
# NOTE: this rebinds the notebook-level sdg_list to a single topic; every later
# make_sdg_df(sdg_list, ...) call sees only 'SDG15' until sdg_list is assigned again.
sdg_list = ['SDG15']
for i in sdg_list:
    entity_reference = i
    keyword_list = list(sdg_ontology[sdg_ontology['clasification']==entity_reference]['keyword'])
    text = label_keywords(entity_reference, keyword_list, text)[0]

In [None]:
#lets see if we can find SDG phrases with spacy matcher
#document_text

sdg14_keywords = list(sdg_ontology[sdg_ontology['clasification']=='SDG14']['keyword'])
sdg8_keywords = list(sdg_ontology[sdg_ontology['clasification']=='SDG8']['keyword'])

patterns14 = [nlp(i) for i in sdg14_keywords]
patterns8 = [nlp(i) for i in sdg8_keywords]
matcher = PhraseMatcher(nlp.vocab)
matcher.add('SDG14', None, *patterns14)
matcher.add('SDG8', None, *patterns8)

#doc = nlp("I like bacon and chicken but unfortunately I only had an apple and a carrot in the fridge")
doc = nlp(test_text)
matches = matcher(doc)

for match_id, start, end in matches:
    span = Span(doc, start, end, label=match_id)
    try:
        doc.ents = list(doc.ents) + [span]  # add span to doc.ents
    except ValueError:
        # the span overlaps an entity that is already set; the existing one is kept.
        # Any other error (e.g. a missing name) now surfaces instead of being hidden.
        pass

# highlight only the two SDG labels, each in its own colour
colors = {"SDG14": "#85C1E9", "SDG8": "#ff6961"}
options = {"ents": ['SDG14', 'SDG8'], "colors": colors}
displacy.render(doc, style='ent', options=options) 

### Find most common (and unique) words

In [None]:
# re-run the preprocessing for the current document and count word frequencies
print(doc_name, doc_path)
tokens, token_list, sentences = preprocess_doc(doc_path)
filtered_tokens = filter_modify_tokens(tokens)
print('These are some of the filtered tokens: ', filtered_tokens[0:10])
words = [token.text for token in filtered_tokens]
word_freq = Counter(words)
# frequency table of each NDC topic's keywords
for key in ndc_dict.keys():
    topic_frequencies =  calculate_topic_frequency_subset(word_freq, ndc_dict, str(key))
    print(topic_frequencies)
    
# .idx of every unfiltered token whose text is a 'climate change' keyword
ndc_climate_idxs = [token.idx for token in tokens if token.text in ndc_dict['climate change']]


# NOTE: recomputes the same words / word_freq as a few lines above, from the already filtered tokens
words = [token.text for token in filtered_tokens]
word_freq = Counter(words)
# the 30 most frequently occurring words with their frequencies
common_words = word_freq.most_common(30)
print(common_words)

### Make some plots of the NDC/Thematic key words

In [None]:
#just to test - need to modularise/make more reproducible and tailor output to be useful across documents
#could also link this back to the df of the documents to output a table with some summary metrics for keywords 
#in each document of interest
graphs_folder = '../../outputs/bar_charts/'    
    
# one bar chart of keyword frequencies per NDC topic
for key in ndc_dict.keys(): 
    print("Graphing the occurences of %s words in the document" % key)
    topic_frequencies =  calculate_topic_frequency_subset(word_freq, ndc_dict, str(key))
    plot_word_freq_barchart_ndc(topic_frequencies, str(key), doc_name, graphs_folder) 

In [None]:
# NOTE(review): this overwrites ndc_climate_idxs with the .start of EVERY entity on
# whatever `doc` currently is, regardless of label - confirm that is the intended set.
ndc_climate_idxs = [ent.start for ent in doc.ents]
# NOTE(review): `key` is not set in this cell; it is presumably the value left over
# from the last iteration of an earlier `for key in ndc_dict.keys()` loop.
if key == 'climate change':
        print("Graphing the occurences of %s words in the document" % key)
        topic_frequencies = calculate_topic_frequency_subset(word_freq, ndc_dict, str(key))
        plot_word_freq_barchart_ndc(topic_frequencies, str(key), doc_name, graphs_folder) 

### Using other resources:

### NDC Ontology with SDG classifications

In [None]:
sdg_ontology = pd.read_csv('../additional_resources/Ontology_final_modified.csv', sep=';')#, #skiprows=0)
SDG1_keywords = list(sdg_ontology[sdg_ontology['clasification']=='SDG1']['keyword'])
#print(SDG1_keywords)
sdg_ontology.head(20)
#print(list(ndc_ontology[ndc_ontology['clasification']=='SDG3']['keyword']))

In [None]:
# rebuild the document text from the lower-cased, length-filtered window tokens
# (this replaces the document_text produced by the pipeline cell)
document_text = ' '.join([token.text for token in window_tokens_overall])

#look at SDGs across document/at document level
sdg_list = ['SDG1', 'SDG2', 'SDG3', 'SDG4', 'SDG5', 'SDG6', 'SDG7', 'SDG8', 'SDG9', 
            'SDG10', 'SDG11', 'SDG12', 'SDG13', 'SDG14', 'SDG15', 'SDG16', 'SDG17', 
            "mention_money"]

df_sdg = make_sdg_df(sdg_list, sdg_ontology, document_text)

#lets see the 50 keywords that occur the most often
df_sdg.sort_values(by=['sdg_keywords_num'], ascending=False)[:50]

In [None]:
print('Processing doc: ', doc_name)
    
document_text = ' '.join([token.text for token in window_tokens_overall])
df_sdg = make_sdg_df(sdg_list, sdg_ontology, document_text)
doc_summary_sdg = df_sdg.groupby('sdg_topic')['sdg_keywords_num'].sum()
doc_summary_sdg.to_frame().rename(columns={"sdg_keywords_num": ("%s"% doc_name)})

In [None]:
# plot_folder and file_name are only used by the commented-out savefig call
plot_folder = '../../outputs/heatmaps/'
#print a summary of the SDG words found: 
doc_summary_sdg = df_sdg.groupby('sdg_topic')['sdg_keywords_num'].sum()
doc_summary_sdg.to_frame()
#doc_summary_sdg.to_frame()
# single-column heatmap: one cell per topic, annotated with the integer count
plt.figure(figsize=(2, 6))
ax = sns.heatmap(doc_summary_sdg.to_frame(), 
                 annot=True, fmt="d", 
                 cmap="YlGnBu")

#plt.xlabel("climate change NDC-associated windows")
plt.ylabel("Topics")
title = ("Topic keywords in %s" % (doc_name))
plt.title(title)
file_name=(title + (' across document v1.png'))
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Rank the rows by keyword count, highest first.
df_sdg_sorted = df_sdg.sort_values(by='sdg_keywords_num', ascending=False)

# Up to ten top-ranked keywords that occur at least once.
has_hits = df_sdg_sorted['sdg_keywords_num'] > 0
list(df_sdg_sorted.loc[has_hits, 'sdg_keywords'].head(10))

In [None]:
# Display the indices that the following cells pass to return_window.
# NOTE(review): in this part of the notebook idx_for_window is only assigned further
# down (from filter_idx_for_overlap), so this cell presumably relies on an earlier
# definition or on out-of-order execution -- confirm.
idx_for_window

In [None]:
# Score the first three windows against the SDG ontology and collect one column
# of per-topic keyword totals per window in summary_sdg_df.
count = 0  # stays 0 when there are no windows
for count, index in enumerate(idx_for_window[0:3], start=1):
    print(index)
    # Element [2] of return_window's result is iterated as the window's tokens.
    window_tokens = return_window(index, window_tokens_overall, size=200)[2]
    window_text = ' '.join(token.text for token in window_tokens)
    # This line shows the window's text; the index itself is printed above.
    print("Window text: ", window_text)
    # NOTE: df_sdg is rebound on every pass, so after the loop it holds the
    # last window's frame rather than the document-level one.
    df_sdg = make_sdg_df(sdg_list, sdg_ontology, window_text)
    df_sdg_sorted = df_sdg.sort_values(by=['sdg_keywords_num'], ascending=False)
    # Up to ten top-ranked keywords with a positive count in this window.
    print(list(df_sdg_sorted[df_sdg_sorted['sdg_keywords_num'] > 0]['sdg_keywords'][:10]))
    # Per-topic totals for this window, as a one-column frame named after the index.
    summary_sdg = df_sdg.groupby('sdg_topic')['sdg_keywords_num'].sum()
    summary_sdg_df_for_merge = summary_sdg.to_frame().rename(
        columns={"sdg_keywords_num": "sdg_kw_%d" % index})
    if count == 1:
        # The first window starts the summary table.
        summary_sdg_df = summary_sdg_df_for_merge
    else:
        # Index-on-index merge appends this window's column to the table.
        summary_sdg_df = summary_sdg_df.merge(summary_sdg_df_for_merge,
                                              left_index=True, right_index=True)
    print(100 * "-")

In [None]:
# Window size and minimum spacing between retained indices, both 200 for now.
# Open question from the author: min_dist may need to grow, or be derived from
# the window size.
window_size = min_dist = 200
idx_for_window = filter_idx_for_overlap(min_dist=min_dist, idxs=ndc_climate_idxs)

In [None]:
# Score every retained window against the SDG ontology, adding one column of
# per-topic keyword totals per window to summary_sdg_df.
count = 0  # stays 0 when there are no windows
for count, index in enumerate(idx_for_window, start=1):
    print(index)
    # Element [2] of return_window's result is iterated as the window's tokens.
    window_tokens = return_window(index, window_tokens_overall, size=window_size)[2]
    window_text = ' '.join(token.text for token in window_tokens)
    # Per-window frame; may be worth keeping per window for a more granular analysis.
    # NOTE: df_sdg is rebound on every pass, so it ends up holding the last window.
    df_sdg = make_sdg_df(sdg_list, sdg_ontology, window_text)
    df_sdg_sorted = df_sdg.sort_values(by=['sdg_keywords_num'], ascending=False)
    # Up to ten top-ranked keywords with a positive count in this window.
    matched_rows = df_sdg_sorted[df_sdg_sorted['sdg_keywords_num'] > 0]
    print(list(matched_rows['sdg_keywords'][:10]))
    # Per-topic totals for this window, as a one-column frame named after the index.
    summary_sdg = df_sdg.groupby('sdg_topic')['sdg_keywords_num'].sum()
    summary_sdg_df_for_merge = summary_sdg.to_frame().rename(
        columns={"sdg_keywords_num": "sdg_kw_%d" % index})
    if count == 1:
        # The first window starts the summary table.
        summary_sdg_df = summary_sdg_df_for_merge
    else:
        # Index-on-index merge appends this window's column to the table.
        summary_sdg_df = summary_sdg_df.merge(summary_sdg_df_for_merge,
                                              left_index=True, right_index=True)

In [None]:
# Display the per-window SDG keyword totals (one 'sdg_kw_<index>' column per window).
summary_sdg_df

In [None]:
# Heatmap of the per-window summary table: SDG topics as rows, windows as columns.
title = "Distribution of topic keywords in climate change NDC-associated windows in %s" % doc_name
# File name for the (currently disabled) export into plot_folder.
file_name = title + ' across all windows v1.png'

plt.figure(figsize=(16, 6))
ax = sns.heatmap(data=summary_sdg_df, cmap="YlGnBu")  # cell annotations left off
plt.xlabel("climate change NDC-associated windows")
plt.ylabel("SDG Topics")
plt.title(title)
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Same heatmap of the per-window summary table, with document-agnostic labels.
title = "Distribution of keywords related to different SDGs in NDC-associated windows in the document"

plt.figure(figsize=(16, 6))
ax = sns.heatmap(data=summary_sdg_df, cmap="YlGnBu")  # cell annotations left off
plt.xlabel("Windows with NDC words in the document")
plt.ylabel("SDG Topics")
plt.title(title)
plt.show()

In [None]:
# Annotated heatmap of every window column except the last one
# (pass summary_sdg_df unsliced to show all windows).
title = "Distribution of topic keywords in climate change NDC-associated windows in %s" % doc_name
# File name for the (currently disabled) export into plot_folder.
# NOTE(review): the name says "windows 40-100" but the slice below drops only the
# last column -- confirm which range is intended before re-enabling the export.
file_name = title + ' across windows 40-100 v1.png'

plt.figure(figsize=(16, 6))
ax = sns.heatmap(data=summary_sdg_df.iloc[:, :-1], cmap="YlGnBu", annot=True, fmt="d")
plt.xlabel("climate change NDC-associated windows")
plt.ylabel("SDG Topics")
plt.title(title)
# plt.savefig((plot_folder+file_name), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Inspect two windows: print element [2] of return_window's result for each index,
# followed by that window's per-topic keyword totals from summary_sdg_df.
# NOTE(review): the indices are hard-coded; each must match an existing
# 'sdg_kw_<index>' column, which presumably depends on the loaded document.
index = 41020
print(return_window(index, window_tokens_overall, size=window_size)[2])
# A notebook cell only echoes its final expression, so the first window's totals
# must be printed explicitly or they are never shown.
print(summary_sdg_df['sdg_kw_{}'.format(index)])
print("")
index = 41340
print(return_window(index, window_tokens_overall, size=window_size)[2])
summary_sdg_df['sdg_kw_{}'.format(index)]

#### SDG14: "Conserve and sustainably use the oceans, seas and marine resources for sustainable development"
https://sdgs.un.org/goals/goal14

In [None]:
# Print element [2] of return_window's result for index 55289, then show that
# window's per-topic keyword totals.
index = 55289
print(return_window(index, window_tokens_overall, size=window_size)[2])
summary_sdg_df['sdg_kw_{}'.format(index)]

#### SDG8: "Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all"

In [None]:
# Print element [2] of return_window's result for index 72837, then show that
# window's per-topic keyword totals.
index = 72837
print(return_window(index, window_tokens_overall, size=window_size)[2])
summary_sdg_df['sdg_kw_{}'.format(index)]

#### SDG11: "Make cities and human settlements inclusive, safe, resilient and sustainable"

In [None]:
# Print element [2] of return_window's result for index 74881, then show that
# window's per-topic keyword totals.
index = 74881
print(return_window(index, window_tokens_overall, size=window_size)[2])
summary_sdg_df['sdg_kw_{}'.format(index)]

In [None]:
# Print element [2] of return_window's result for index 241379, then show that
# window's per-topic keyword totals.
index = 241379
print(return_window(index, window_tokens_overall, size=window_size)[2])
summary_sdg_df['sdg_kw_{}'.format(index)]

In [None]:
# Print element [2] of return_window's result for index 67819, then show that
# window's per-topic keyword totals.
index = 67819
print(return_window(index, window_tokens_overall, size=window_size)[2])
summary_sdg_df['sdg_kw_{}'.format(index)]

### Parts of speech tagging

In [None]:
# Show the fine-grained tag, coarse part of speech and spaCy's explanation of
# the tag for the first 50 filtered tokens.
for token in filtered_tokens[:50]:
    print(token, token.tag_, token.pos_, spacy.explain(token.tag_))
print('\n')

# Split the filtered tokens into nouns and adjectives by coarse POS label.
nouns = [tok for tok in filtered_tokens if tok.pos_ == 'NOUN']
adjectives = [tok for tok in filtered_tokens if tok.pos_ == 'ADJ']

print('some nouns spacy called in the document include:', '\n', nouns[:50], '\n')
print('some adjectives spacy called in the document include:', '\n', adjectives[:50])

### Entity recognition

In [None]:
# List (entity, label string, label id) for the entities found in a slice of the
# filtered tokens, and print the first ten.
# NOTE(review): `.ents` on a slice assumes filtered_tokens supports it (e.g. a
# spaCy Doc rather than a plain list) -- confirm.
token_slice = filtered_tokens[1400:1700]
entities = [(ent, ent.label_, ent.label) for ent in token_slice.ents]
print(entities[:10])

# Render entity highlighting inline, using the original (unfiltered) tokens.
token_subset = tokens[100:500]
displacy.render(token_subset, style="ent", jupyter=True)

### Dependency visualization in sentences

In [None]:
# Parse a hand-picked example sentence.
# NOTE(review): sent_example is not rendered in this cell; the picture below
# comes from entry 80 of `sentences` -- confirm which one was intended.
sent_example = nlp(
    "For example, it is estimated that between 9 and 12 million DATE people in "
    "impoverished rural areas directly use natural resources such as fuel wood, "
    "wild fruits and wooden utensils as a source of energy, food and building "
    "material respectively (Shackleton ORG 2004)"
)

# Materialise the sentences so one can be picked by position, then draw the
# dependency parse of entry 80 inline.
sentence_spans = list(sentences)
displacy.render(sentence_spans[80], style="dep", jupyter=True)