<h1><span style="color:red">NEMO Annotation for SuAVE</span></h1>

This notebook uses the NEMO (Named Entities Made Obvious) service developed at Microsoft Research (author: Dr. Silviu Cucerzan, see https://doi.org/10.1145/2633211.2634360) to generate a set of named entities for the user's text.


## 1. Retrieve survey parameters from the URL

In [None]:
%%javascript
/**
 * Look up one parameter in the page's query string (window.location.search).
 *
 * @param {string} key  Parameter name; matched case-insensitively.
 * @returns {string} The unescaped value, or "" when the parameter is absent
 *                   or carries no "=value" part.
 */
function getQueryStringValue (key)
{
    // Encode the key the way it would appear in the URL, then backslash-escape
    // the regex metacharacters (. + *) that escape() leaves as they are.
    var keyPattern = escape(key).replace(/[\.\+\*]/g, "\\$&");
    // The pattern always matches the entire query string, so replace() swaps
    // all of it for capture group 1, i.e. the parameter's raw value.
    var matcher = new RegExp("^(?:.*[&\\?]" + keyPattern + "(?:\\=([^&]*))?)?.*$", "i");
    var rawValue = window.location.search.replace(matcher, "$1");
    return unescape(rawValue);
}
// Copy the SuAVE launch parameters from the page URL into Python variables
// of the same names in the notebook kernel.
//
// The values come straight from the URL, i.e. they are untrusted. Each one is
// serialised with JSON.stringify, which yields a double-quoted, fully escaped
// literal that Python parses as the same string. A quote, backslash or newline
// in a parameter can therefore neither break the assignment nor smuggle extra
// statements into the kernel (plain '...' concatenation allowed both).
var kernelVariables = {
    survey_url: getQueryStringValue("surveyurl"),
    views: getQueryStringValue("views"),
    view: getQueryStringValue("view"),
    user: getQueryStringValue("user"),
    csv_file: getQueryStringValue("csv"),
    dzc_file: getQueryStringValue("dzc"),
    params: getQueryStringValue("params"),
    active_object: getQueryStringValue("activeobject"),
    full_notebook_url: String(window.location)
};
Object.keys(kernelVariables).forEach(function (name) {
    IPython.notebook.kernel.execute(name + "=" + JSON.stringify(kernelVariables[name]));
});

## 2. Setting up the environment, importing libraries, defining functions

In [None]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display

import pandas as pd
pd.set_option('display.max_colwidth', 0)
    
import numpy as np
import panel as pn

pn.extension()
def printmd(string):
    """Render ``string`` as Markdown in the cell's output area."""
    rendered = Markdown(string)
    display(rendered)

absolutePath = "../../temp_csvs/"

# local imports
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint


# specific imports
import requests
from urllib.parse import urlparse
import re
import nemofunc as nemo


In [None]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

## 3. Select a survey file from SuAVE or import a local CSV file

In [None]:
# Radio buttons for the data source. The two labels are compared verbatim by
# later cells, so they must not be reworded.
data_select = pn.widgets.RadioBoxGroup(
    name='Select notebook',
    options=[
        'Load survey file from SuAVE',
        'Import a local CSV file',
    ],
    inline=False,
)
data_select

In [None]:
# Upload widget; only shown when the user chose to import a local CSV file.
data_input = pn.widgets.FileInput()


def check_selection():
    """Act on the data-source choice made in ``data_select``.

    Local import: returns a panel Column holding the instructions and the
    upload widget. SuAVE survey: sets the global ``fname`` to the survey CSV
    under ``absolutePath``, prints a notice and returns None.
    """
    global fname

    if data_select.value != 'Load survey file from SuAVE':
        message = pn.pane.HTML("<b><span style='color:red; font-size: 200%;'>Upload data and continue to step 4.</span><br><span style='font-size: 150%;'>IMPORTANT: The local CSV file should not have SuAVE-specific variable names!</span></b>", width=700)
        return pn.Column(message, data_input)

    fname = absolutePath + csv_file
    printmd("<b><span style='color:red; font-size: 200%;'>Current SuAVE survey will be loaded. Continue to step 4.</span></b>")


check_selection()

## 4. Visualize the data and select a text variable to parse

In [None]:
# A non-null filename means a file was uploaded through the FileInput widget:
# store it under absolutePath and load that copy. Otherwise fname keeps the
# value set by check_selection().
if pd.notnull(data_input.filename):
    fname = absolutePath + data_input.filename
    data_input.save(fname)

# Read the chosen CSV with the project helper, then show it in the slider view.
df = panellibs.extract_data(fname)
panellibs.slider(df)


In [None]:
# if the above didn't display the dataframe:
with pd.option_context("display.max_columns", None):
    # Show the frame without its 'geometry' column, if it has one.
    # errors='ignore' is what makes this safe: the previous guard tested
    # "geometry" as a substring of any column name, so a frame holding e.g.
    # "geometry#hidden" but no exact 'geometry' column still reached drop()
    # and raised KeyError.
    display(df.drop(columns=['geometry'], errors='ignore'))
    

## 5. Generate pre-defined #multi variables for entity types extracted by NEMO

In [None]:
# Candidate variables: every column except those whose name carries a
# qualifier that makes parsable free text unlikely.
non_text_tags = ('#number', '#date', '#img', '#href', '#link')
varcols = [x for x in df.columns.tolist() if not any(tag in x for tag in non_text_tags)]

# Left panel
left_text = pn.Row("####Select Variables for NEMO Processing", margin=(0,0,-15,270))
binary_selector = pn.widgets.CrossSelector(options=varcols, width=630)
left_panel = pn.Column(left_text, binary_selector, css_classes=['widget-box'], margin=(0,30,0,0))

remap_text = pn.pane.Markdown('####      Make selections and run the next cell ', width=650)

# Display widgets
# The row is deliberately NOT called `widgets`: that name is the ipywidgets
# module alias from the setup cell, and rebinding it to a panel Row breaks any
# later `widgets.<Class>` call unless ipywidgets is imported again first.
selector_row = pn.Row(left_panel)
full_display = pn.Column(selector_row, remap_text)
full_display

In [None]:

# Build one text payload per row from the variables picked in the selector:
#   1. missing values become empty strings
#   2. every cell is cast to str
#   3. the cells of each row are joined with single spaces
concatted = (
    df[binary_selector.value]
    .fillna('')
    .astype(str)
    .dropna()
    .apply(' '.join, axis=1)
)


## 6. Run NEMO

In [None]:
def extract_nemo(payload):
    """Annotate one text payload with NEMO and return the first result item.

    ``nemo.nemo_annotate`` returns a pair. Only the first element is kept
    (presumably a DataFrame of annotations -- confirm in nemofunc); the second,
    the raw JSON response judging by its former name, is discarded.
    """
    annotations, _ = nemo.nemo_annotate(payload)
    return annotations


# progress_apply is the progress-bar variant of Series.apply registered by tqdm.
extracted_df = concatted.progress_apply(extract_nemo)

## 7. Add generated tags to dataframe and explore

In [None]:
# NOTE(review): no visible cell reads the bare name `nan`; kept in case code
# outside this view relies on it.
nan = np.nan

# One dict of entity columns per annotated row. The values of extracted_df
# (the Series built in step 6) are consumed in order rather than looked up by
# label, which also works when the index contains duplicate labels. The former
# per-row debug print was dropped because it flooded the output.
rows = [nemo.create_nemo_dict(annotation) for annotation in extracted_df]

# Carry over the index of the source rows. The frame is later concatenated
# column-wise with df, which aligns on the index; a fresh 0..n-1 index would
# attach tags to the wrong rows whenever df is indexed differently.
extracted_multi_df = pd.DataFrame(rows, index=concatted.index)


In [None]:
# explore the generated dataframe
# (the frame of NEMO-derived columns built in the previous cell, rendered
# through the project's panellibs.slider helper)
panellibs.slider(extracted_multi_df)

In [None]:
# alternatively:
with pd.option_context("display.max_columns", None):
    # Show the frame without its 'geometry' column, if it has one.
    # errors='ignore' is what makes this safe: the previous guard tested
    # "geometry" as a substring of any column name, so a near-match such as
    # "geometry#hidden" without an exact 'geometry' column raised KeyError.
    display(extracted_multi_df.drop(columns=['geometry'], errors='ignore'))
    

## 8. Reorder the columns, and decide which columns to keep

In [None]:
# re-order the columns in the df
# nemo.column_order() supplies the target column list. reindex(columns=...)
# keeps only the listed columns, in that order, and creates any listed column
# that is missing from the frame as an all-NaN column.
columnTitles = nemo.column_order()
extracted_multi_df = extracted_multi_df.reindex(columns=columnTitles)

In [None]:
# Radio buttons: keep only the pre-modeled entity columns, or everything NEMO
# returned. The first label is compared verbatim by the filtering cell, so it
# must not be reworded.
tags_to_keep = pn.widgets.RadioBoxGroup(
    name='Tags to keep:',
    options=[
        'Pre-modeled categories: Location, Geopolitical, Organization, Person',
        'All internal NEMO output',
    ],
    inline=False,
)
tags_to_keep

#### Columns to keep:

        "e_G-y": "Entity: Geopolitical#multi#sortquan",
        "e_G-y_WP": "Entity: Geopolitical URL#hidden",
        "e_O-y": "Entity: Organization#multi#sortquan",
        "e_O-y_WP": "Entity: Organization URL#hidden",
        "e_L-y": "Entity: Location#multi#sortquan",
        "e_L-y_WP": "Entity: Location URL#hidden",
        "e_P-y": "Entity: Personal#multi#sortquan",
        "e_P-y_WP": "Entity: Personal URL#hidden",

        "e_C-y": "Entity: Abstract#multi#sortquan",
        "e_C-y_WP" : "Entity: Abstract URL#hidden",


In [None]:
# NEMO output columns retained when only the pre-modeled categories are wanted:
# each entity column plus its companion "_WP" column.
cols_to_keep = [
    "e_G-y",
    "e_G-y_WP",
    "e_O-y",
    "e_O-y_WP",
    "e_L-y",
    "e_L-y_WP",
    "e_P-y",
    "e_P-y_WP",

    "e_C-y",
    "e_C-y_WP",
]

if tags_to_keep.value == 'Pre-modeled categories: Location, Geopolitical, Organization, Person':
    # The axis is named by keyword (columns=). Passing it positionally, as in
    # drop(labels, 1), was deprecated in pandas 1.3 and raises TypeError from
    # pandas 2.0 on.
    unwanted_columns = extracted_multi_df.columns.difference(cols_to_keep)
    extracted_multi_df = extracted_multi_df.drop(columns=unwanted_columns)

panellibs.slider(extracted_multi_df)

In [None]:
# alternatively:
with pd.option_context("display.max_columns", None):
    # Show the frame without its 'geometry' column, if it has one.
    # errors='ignore' is what makes this safe: the previous guard tested
    # "geometry" as a substring of any column name, so a near-match such as
    # "geometry#hidden" without an exact 'geometry' column raised KeyError.
    display(extracted_multi_df.drop(columns=['geometry'], errors='ignore'))
    

## 9. Rename the columns

In [None]:
# this is a complete list of columns to rename
# (fetched from the nemofunc helper; the renaming cell passes it through dict()
# and uses it as an old-name -> new-name mapping)
columns_dict = nemo.columns_dict()

In [None]:
#rename the columns
# rename() returns a new frame and leaves extracted_multi_df untouched; columns
# without an entry in the mapping keep their current names.
extracted_multi_df_ren = extracted_multi_df.rename(columns=dict(columns_dict))

In [None]:
# Place the renamed NEMO columns to the right of the original survey columns.
# With axis=1 the rows of both frames are matched on their index.
df_new = pd.concat([df, extracted_multi_df_ren], axis=1)

# Shape report: original frame, NEMO frame, combined frame.
print('Dimensions:\n --- The original df: ', df.shape,
      '\n --- The NEMO-generated df: ', extracted_multi_df.shape,
      '\n --- The concatenated df:', df_new.shape,
      sep='')


## 10. Only keep columns with more than N records as facets (for new fields, or for all fields), make others #hidden

In [None]:
# set min number of values in a column
mincount = 0

# whether columns of the original dataframe may be hidden as well
# (columns are never removed here, only renamed with a #hidden qualifier)
drop_from_original=True

# Position of the first column that is eligible for hiding: 0 covers every
# column, len(df.columns) covers only the NEMO columns appended after df's own.
start_column = 0 if drop_from_original else len(df.columns)

new_names = []
for position, column in enumerate(df_new.columns):
    new_col = column
    # ">=" rather than ">": the column sitting exactly at start_column (the
    # very first column, or the first NEMO column) must be examined too.
    # Columns are addressed by position so the check also works when two
    # columns share a name, e.g. after annotating an already annotated survey.
    if position >= start_column and df_new.iloc[:, position].count() < mincount:
        if column.find("#multi") > 0:
            new_col = column.split('#')[0]+"#multi#hidden"
        elif column.find("#link") > 0:
            # link columns keep their name unchanged
            new_col = column
        else:
            new_col = column.split('#')[0]+"#hidden"
        print(column + " renamed to "+ new_col)
    new_names.append(new_col)

# Rebind df_new to a relabelled copy only when a name actually changed.
if new_names != list(df_new.columns):
    df_new = df_new.copy()
    df_new.columns = new_names
            


## 11. Visualize the generated dataframe

In [None]:
# Browse the combined dataframe (original columns plus the NEMO-generated
# ones) through the project's panellibs.slider helper.
panellibs.slider(df_new)

In [None]:
# alternatively:
with pd.option_context("display.max_columns", None):
    # Show the frame without its 'geometry' column, if it has one.
    # errors='ignore' is what makes this safe: the previous guard tested
    # "geometry" as a substring of any column name, so a near-match such as
    # "geometry#hidden" without an exact 'geometry' column raised KeyError.
    display(df_new.drop(columns=['geometry'], errors='ignore'))
    

In [None]:
# if needed, save to a local file. OPTIONAL
# fillna() already returns a new frame, so df_new itself keeps its NaNs; the
# CSV is written without the index column.
df_to_save = df_new.fillna('')
df_to_save.to_csv('dataframe_after_nemo.csv', index=False)


## 12. Save the new version of CSV file, and give a name to new survey

In [None]:
# For a locally uploaded file the URL-supplied names do not apply: use the
# uploaded file's name as csv_file and leave dzc_file blank.
if data_select.value == 'Import a local CSV file':
    csv_file = data_input.filename
    dzc_file = ''
    
# NOTE(review): presumably writes df_new under absolutePath and returns the
# name/path of the new file -- confirm in suave_integration.
new_file = suaveint.save_csv_file(df_new, absolutePath, csv_file)

In [None]:
#Input survey name

import ipywidgets as widgets
from IPython.display import display

# Entry box for the survey name, plus a second box mirroring the submitted value
input_text = widgets.Text(placeholder='Enter Survey Name...')
output_text = widgets.Text()


def bind_input_to_output(sender):
    """Copy the typed survey name into ``output_text``.

    ``sender`` is the widget that fired the event; it is not used.
    """
    output_text.value = input_text.value


# Run the callback when Enter is pressed inside the entry box
input_text.on_submit(bind_input_to_output)

printmd("<b><span style='color:red'>Input survey name here, press Enter, and then run the next cell:</span></b>")
# Show the entry box followed by the mirror box
display(input_text, output_text)

In [None]:
#Print survey name
# output_text only receives a value once Enter has been pressed in the input
# box of the previous cell; until then survey_name is an empty string.
survey_name = output_text.value
printmd("<b><span style='color:red'>Survey Name is: </span></b>" + survey_name)

In [None]:
# Hand everything over to the SuAVE integration helper to create the new
# survey: launch parameters from the URL, the saved CSV, the chosen survey
# name, and the data-source choice.
suaveint.create_survey(
    survey_url,
    new_file,
    survey_name,
    dzc_file,
    user,
    csv_file,
    view,
    views,
    data_select.value,
)