<h1><span style="color:red">NEMO Annotation for SuAVE</span></h1>

This notebook uses the NEMO (Named Entities Made Obvious) service developed at Microsoft Research (author: Dr. Silviu Cucerzan, see https://doi.org/10.1145/2633211.2634360) to generate a set of named entities for the user's text.


## 1. Retrieve survey parameters from the URL

In [1]:
%%javascript
/**
 * Return the (unescaped) value of a query-string parameter of the current
 * page URL, matched case-insensitively.
 *
 * @param {string} key  Name of the query parameter.
 * @returns {string} The parameter value, or "" when the parameter is absent
 *                   or carries no value.
 */
function getQueryStringValue (key)
{
    // URL-escape the key, then backslash-escape ., + and * (which escape()
    // leaves untouched) so they match literally inside the RegExp.
    var keyPattern = escape(key).replace(/[\.\+\*]/g, "\\$&");
    // The key must be followed by "=value", by "&", or by the end of the
    // string. Without that requirement a key that is a prefix of another
    // parameter (e.g. "view" vs. "views") could match the longer name and
    // yield an empty value.
    var matcher = new RegExp("^(?:.*[&\\?]" + keyPattern + "(?:\\=([^&]*)|(?=&|$)))?.*$", "i");
    // Group 1 is the raw value; it is empty when nothing matched.
    var rawValue = window.location.search.replace(matcher, "$1");
    return unescape(rawValue);
}
// Hand each URL query parameter to the Python kernel as a string variable.
// The values come straight from the URL, so they are serialised with
// JSON.stringify instead of being pasted between single quotes: the result is
// a double-quoted literal with quotes, backslashes and control characters
// escaped, which Python reads as a plain string. A value containing a quote
// can therefore no longer terminate the literal and inject Python code.
var kernelVariables = {
    survey_url: "surveyurl",
    views: "views",
    view: "view",
    user: "user",
    csv_file: "csv",
    dzc_file: "dzc",
    params: "params",
    active_object: "activeobject"
};
Object.keys(kernelVariables).forEach(function (pythonName) {
    var value = getQueryStringValue(kernelVariables[pythonName]);
    IPython.notebook.kernel.execute(pythonName + "=" + JSON.stringify(value));
});
// String(window.location) is the same text the original "+" concatenation produced.
IPython.notebook.kernel.execute("full_notebook_url=" + JSON.stringify(String(window.location)));

<IPython.core.display.Javascript object>

## 2. Setting up the environment, importing libraries, defining functions

In [2]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display

import pandas as pd
pd.set_option('display.max_colwidth', 0)
    
import numpy as np
import panel as pn

pn.extension()
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

# Directory used for CSV files; later cells build file names by plain string
# concatenation with this value, so the trailing slash is required.
absolutePath = "/home/jovyan/temp_csvs/"

# local imports
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint


# specific imports
import requests
from urllib.parse import urlparse
import re
import nemofunc as nemo


In [3]:
%%capture
# `tqdm.tqdm_notebook` and `tqdm(...).pandas()` are deprecated spellings;
# the notebook progress bar lives in `tqdm.notebook` and pandas integration
# is registered through the class-level `tqdm.pandas()`.
from tqdm.notebook import tqdm
# Adds `progress_apply` to pandas objects (used in step 6).
tqdm.pandas()

## 3. Select a survey file from SuAVE or import a local CSV file

In [4]:
# Radio buttons for choosing where the input data comes from
data_select = pn.widgets.RadioBoxGroup(
    name='Select notebook',
    options=[
        'Load survey file from SuAVE',
        'Import a local CSV file',
    ],
    inline=False,
)
data_select

In [5]:
# Upload widget, shown only when a local CSV file is requested
data_input = pn.widgets.FileInput()

def check_selection():
    """Act on the data source chosen in ``data_select``.

    For a SuAVE survey the global ``fname`` is set to ``absolutePath`` plus
    ``csv_file`` and a notice is rendered; nothing is returned. For any other
    choice a panel column holding upload instructions and the file-input
    widget is returned so that the notebook displays it.
    """
    global fname

    if data_select.value != 'Load survey file from SuAVE':
        message = pn.pane.HTML("<b><span style='color:red; font-size: 200%;'>Upload data and continue to step 4.</span><br><span style='font-size: 150%;'>IMPORTANT: The local CSV file should not have SuAVE-specific variable names!</span></b>", width=700)
        return pn.Column(message, data_input)

    fname = absolutePath + csv_file
    printmd("<b><span style='color:red; font-size: 200%;'>Current SuAVE survey will be loaded. Continue to step 4.</span></b>")

check_selection()

## 4. Visualize the data and select a text variable to parse

In [6]:
# A file uploaded through the widget takes over as the input file:
# it is saved under absolutePath and `fname` is pointed at it.
if pd.notnull(data_input.filename):
    fname = absolutePath + data_input.filename
    data_input.save(fname)

# Load the chosen file and show it in the slider viewer
df = panellibs.extract_data(fname)
panellibs.slider(df)


In [7]:
# Fallback view in case the slider above did not display the dataframe.
# A column named 'geometry' is left out of the display. errors='ignore'
# makes the drop a no-op when no such column exists; the earlier substring
# test raised KeyError for names that merely contain the word
# (e.g. 'geometry#hidden').
with pd.option_context("display.max_columns", None):
    display(df.drop(columns=['geometry'], errors='ignore'))
    

## 5. Generate pre-defined #multi variables for entity types extracted by NEMO

In [8]:
# Column-name tags marking variables that are unlikely to contain parsable text
non_text_tags = ('#number', '#date', '#img', '#href', '#link')
varcols = [col for col in df.columns.tolist()
           if not any(tag in col for tag in non_text_tags)]

# Left panel
left_text = pn.Row("####Select Variables for NEMO Processing", margin=(0,0,-15,270))
binary_selector = pn.widgets.CrossSelector(options=varcols, width=630)
left_panel = pn.Column(left_text, binary_selector, css_classes=['widget-box'], margin=(0,30,0,0))

remap_text = pn.pane.Markdown('####      Make selections and run the next cell ', width=650)

# Display widgets. The row is deliberately not called `widgets`: that name is
# the alias under which the ipywidgets module is imported in this notebook.
selector_row = pn.Row(left_panel)
full_display = pn.Column(selector_row, remap_text)
full_display

In [9]:

# Build one text payload per row from the selected variables:
#   1. replace missing values with empty strings,
#   2. convert every cell to a string,
#   3. join the cells of each row with single spaces.
# No dropna() is needed: after steps 1-2 there are no missing values left.
concatted = (
    df[binary_selector.value]
    .fillna('')
    .astype(str)
    .apply(lambda row: ' '.join(row), axis=1)
)


## 6. Run NEMO

In [11]:
def extract_nemo(payload):
    """Send one text payload to NEMO and return the first element of the result.

    ``nemo.nemo_annotate`` returns a pair; the second element (named
    ``respjson`` by the original author) is not used and is discarded.
    """
    annotations, _response_json = nemo.nemo_annotate(payload)
    return annotations

# One NEMO call per row of `concatted`, via tqdm's progress_apply
extracted_df = concatted.progress_apply(extract_nemo)

## 7. Add generated tags to dataframe and explore

In [12]:
# NOTE(review): `nan` is kept as a module-level name in case later cells or
# saved outputs rely on it — confirm and remove if unused.
nan = np.nan

# Build one record per input row. Results are looked up by index *label*
# (.loc), which stays correct even when the index is not 0..n-1.
rows=[]
for i in concatted.index:
    print ("###### index == " + str(i))
    rows.append(nemo.create_nemo_dict(extracted_df.loc[i]))

# Reuse the input index so that the column-wise concat with `df` in step 9
# lines the rows up instead of aligning on a fresh 0..n-1 range.
extracted_multi_df = pd.DataFrame(rows, index=concatted.index)


In [13]:
# Browse the NEMO-generated dataframe (same viewer as used for the input
# data in step 4)
panellibs.slider(extracted_multi_df)

In [14]:
# Alternative plain display of the NEMO-generated dataframe.
# A column named 'geometry' is left out. errors='ignore' makes the drop a
# no-op when no such column exists; the earlier substring test raised
# KeyError for names that merely contain the word.
with pd.option_context("display.max_columns", None):
    display(extracted_multi_df.drop(columns=['geometry'], errors='ignore'))
    

## 8. Reorder the columns, and decide which columns to keep

In [18]:
# Re-order the columns of the NEMO dataframe to the order returned by
# nemo.column_order(). Note that reindex() also drops columns missing from
# that list and adds empty (NaN) columns for listed names that are absent.
columnTitles = nemo.column_order()
extracted_multi_df = extracted_multi_df.reindex(columns=columnTitles)

In [19]:
# Radio buttons: keep only the pre-modeled entity columns, or everything
tags_to_keep = pn.widgets.RadioBoxGroup(
    name='Tags to keep:',
    options=[
        'Pre-modeled categories: Location, Geopolitical, Organization, Person',
        'All internal NEMO output',
    ],
    inline=False,
)
tags_to_keep

#### Columns to keep:

        "e_G-y": "Entity: Geopolitical#multi#sortquan",
        "e_G-y_WP": "Entity: Geopolitical URL#hidden",
        "e_O-y": "Entity: Organization#multi#sortquan",
        "e_O-y_WP": "Entity: Organization URL#hidden",
        "e_L-y": "Entity: Location#multi#sortquan",
        "e_L-y_WP": "Entity: Location URL#hidden",
        "e_P-y": "Entity: Personal#multi#sortquan",
        "e_P-y_WP": "Entity: Personal URL#hidden",

        "e_C-y": "Entity: Abstract#multi#sortquan",
        "e_C-y_WP" : "Entity: Abstract URL#hidden",


In [20]:
# NEMO output columns retained for the pre-modeled categories: each entity
# column together with its "_WP" URL column.
cols_to_keep = [
    "e_G-y",
    "e_G-y_WP",
    "e_O-y",
    "e_O-y_WP",
    "e_L-y",
    "e_L-y_WP",
    "e_P-y",
    "e_P-y_WP",

    "e_C-y",
    "e_C-y_WP"
]

if tags_to_keep.value == 'Pre-modeled categories: Location, Geopolitical, Organization, Person':
    # Keep only the listed columns. `columns=` replaces the positional axis
    # argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts, and
    # re-binding the name avoids mutating the dataframe in place.
    extracted_multi_df = extracted_multi_df.drop(
        columns=extracted_multi_df.columns.difference(cols_to_keep)
    )

panellibs.slider(extracted_multi_df)

In [21]:
# Alternative plain display of the filtered NEMO dataframe.
# A column named 'geometry' is left out. errors='ignore' makes the drop a
# no-op when no such column exists; the earlier substring test raised
# KeyError for names that merely contain the word.
with pd.option_context("display.max_columns", None):
    display(extracted_multi_df.drop(columns=['geometry'], errors='ignore'))
    

Unnamed: 0,c_U-n,c_U-y,c_U-y_WP,c_C-y,c_C-y_WP,c_W-y,c_W-y_WP,e_N-y,e_N-y_WP,e_C-y,e_C-y_WP,e_G-y,e_G-y_WP,e_O-y,e_O-y_WP,e_M-y,e_M-y_WP,e_U-y,e_U-y_WP,e_F-y,e_F-y_WP,e_H-y,e_H-y_WP,e_W-y,e_W-y_WP,e_L-y,e_L-y_WP,e_P-y,e_P-y_WP,e_J-y,e_J-y_WP,e_V-y,e_V-y_WP,e_A-y,e_A-y_WP,e_I-y,e_I-y_WP,e_B-y,e_B-y_WP,e_Y-y,e_Y-y_WP,e_S-y,e_S-y_WP,e_E-y,e_E-y_WP,e_K-y,e_K-y_WP,e_R-y,e_R-y_WP,e_Q-y,e_Q-y_WP,e_U-n,e_D-d,d_url-n,d_quantity-n,d_age-n,d_phone-n,d_street-n,d_date-n
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1373,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1375,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## 9. Rename the columns

In [22]:
# Complete list of column renames, as returned by nemo.columns_dict();
# the next cell passes it through dict() and uses it to rename the columns.
columns_dict = nemo.columns_dict()

In [23]:
# Rename the NEMO columns. dict() accepts either a mapping or an iterable of
# (old, new) pairs; names not present in the dataframe are ignored by rename().
extracted_multi_df_ren = extracted_multi_df.rename(columns=dict(columns_dict))

In [24]:
# Put the renamed NEMO columns next to the original ones (rows are matched
# on their index labels)
df_new = pd.concat([df, extracted_multi_df_ren], axis=1)

# Report the shapes of the three dataframes involved
shape_report = [
    'Dimensions:',
    ' --- The original df: ' + str(df.shape),
    ' --- The NEMO-generated df: ' + str(extracted_multi_df.shape),
    ' --- The concatenated df:' + str(df_new.shape),
]
print('\n'.join(shape_report))


Dimensions:
 --- The original df: (1377, 16)
 --- The NEMO-generated df: (1377, 59)
 --- The concatenated df:(1377, 75)


## 10. Only keep columns with more than N records as facets (for new fields, or for all fields), make others #hidden

In [25]:
# Minimum number of non-null values a column needs in order to stay a facet
mincount = 0

# True: apply the threshold to every column, including those of the original
# dataframe. False: apply it only to the columns appended after them.
drop_from_original=True

# Position of the first column the threshold applies to
start_column = 0 if drop_from_original else len(df.columns)

# Collect the renames first and apply them once after the loop.
hidden_renames = {}
for position, column in enumerate(df_new.columns):
    # The column sitting exactly at start_column is eligible too; the former
    # strict ">" comparison skipped the first eligible column.
    if position < start_column:
        continue
    if df_new.iloc[:, position].count() >= mincount:
        continue

    # `in` also recognises a tag at the very start of the name, which the
    # former `.find(...) > 0` test missed.
    base_name = column.split('#')[0]
    if "#multi" in column:
        new_col = base_name + "#multi#hidden"
    elif "#link" in column:
        # link columns keep their name
        new_col = column
    else:
        new_col = base_name + "#hidden"

    if new_col != column:
        hidden_renames[column] = new_col
        print(column + " renamed to "+ new_col)

df_new = df_new.rename(columns=hidden_renames)
            


## 11. Visualize the generated dataframe

In [26]:
# Browse the combined dataframe (original columns plus NEMO-generated ones)
panellibs.slider(df_new)

In [27]:
# Alternative plain display of the combined dataframe.
# A column named 'geometry' is left out. errors='ignore' makes the drop a
# no-op when no such column exists; the earlier substring test raised
# KeyError for names that merely contain the word.
with pd.option_context("display.max_columns", None):
    display(df_new.drop(columns=['geometry'], errors='ignore'))
    

Unnamed: 0,Name,OAID#link#multi,Affiliation#sortquan,City#sortquan,Region#sortquan,Country#sortquan,Latitude#hidden,Longitude#hidden,Collaborators#multi#link#sortquan,Scope#multi#sortquan,Keywords#multi#sortquan,OA concepts#multi#sortquan,Publications#hidden,Publication Dates#multi#sortquan,#img,#netvis,Concept: Common Phrase#multi#sortquan,Concept: Formal Term#multi#sortquan,Concept: Formal Term URL#hidden,Concept: Abstract#multi#sortquan,Concept: Abstract URL#hidden,Concept W#multi#sortquan,Concept W URL#hidden,Entity: Covid Term#multi#sortquan,Entity: Covid Term URL#hidden,Entity: Abstract#multi#sortquan,Entity: Abstract URL#hidden,Entity: Geopolitical#multi#sortquan,Entity: Geopolitical URL#hidden,Entity: Organization#multi#sortquan,Entity: Organization URL#hidden,Entity: Media#multi#sortquan,Entity: Media URL#hidden,Entity: Formal#multi#sortquan,Entity: Formal URL#hidden,Entity: Facility#multi#sortquan,Entity: Facility URL#hidden,Entity: Holidays#multi#sortquan,Entity: Holidays URL#hidden,Entity: Arts#multi#sortquan,Entity: Arts URL#hidden,Entity: Location#multi#sortquan,Entity: Location URL#hidden,Entity: Personal#multi#sortquan,Entity: Personal URL#hidden,Entity: Computer Term#multi#sortquan,Entity: Computer Term URL#hidden,Entity: Vehicle#multi#sortquan,Entity: Vehicle URL#hidden,Entity: Medical#multi#sortquan,Entity: Medical URL#hidden,Entity: Informatic#multi#sortquan,Entity: Informatic URL#hidden,Entity: Linguistic#multi#sortquan,Entity: Linguistic URL#hidden,Entity: Peoples#multi#sortquan,Entity: Peoples URL#hidden,Entity: Transportation#multi#sortquan,Entity: Transportation URL#hidden,Entity: Event#multi#sortquan,Entity: Event URL#hidden,Entity: Bureaucratic#multi#sortquan,Entity: Bureaucratic URL#hidden,Entity R#multi#sortquan,Entity R URL#hidden,Entity Q#multi#sortquan,Entity Q URL#hidden,Entity: Formal no-URL#multi#sortquan,Entity for Disambiguation#multi#sortquan,Data 
URL#hidden,Quantity#multi#sortquan,Age#multi#sortquan,Phone#multi#sortquan,Street#multi#sortquan,Date#multi#sortquan
0,A Olioso,https://openalex.org/A4227955457,Unknown,,,,,,https://openalex.org/A4227955454|https://openalex.org/A4227955461|https://openalex.org/A4227955455|https://openalex.org/A4227955463|https://openalex.org/A4227955453|https://openalex.org/A4227955464|https://openalex.org/A4227955456|https://openalex.org/A4227955462|https://openalex.org/A4227955460|https://openalex.org/A4227955459|https://openalex.org/A4227955452|https://openalex.org/A4227955458,aquifer|transboundary,,Groundwater|Geology|Geotechnical engineering|Hydrology (agriculture)|Aquifer|Environmental science|Computer science|Water resource management,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A4227955457"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2021,US,02ac504b6e11517e2110d174ea70a1a7ac1cf19899e1a0f23c29558f6225db03,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,A Olioso,https://openalex.org/A4226682424,Unknown,,,,,,https://openalex.org/A4226682420|https://openalex.org/A4226682425|https://openalex.org/A4226682421|https://openalex.org/A4226682429|https://openalex.org/A4226682431|https://openalex.org/A4226682426|https://openalex.org/A4226682422|https://openalex.org/A4226682428|https://openalex.org/A4226682427|https://openalex.org/A4226682419|https://openalex.org/A4226682423|https://openalex.org/A4226682430,aquifer|transboundary,,Groundwater|Geology|Geotechnical engineering|Hydrology (agriculture)|Aquifer|Environmental science|Water resource management,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A4226682424"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2021,US,8f95a1d08aacc416f1abe22426fe9c9fd2f8f338bb7365407f284e3985165d23,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,A. Alassane,https://openalex.org/A2484425674,Cheikh Anta Diop University,Dakar,,Senegal,14.686944,-17.463333,https://openalex.org/A2434763705|https://openalex.org/A3069707669|https://openalex.org/A2182351332|https://openalex.org/A3051995119,aquifer|transboundary,,Sociology|Population|Water supply|Demography|Groundwater|Water quality|Ecology|Geology|Geotechnical engineering|Groundwater recharge|Hydrology (agriculture)|Environmental engineering|Aquifer|Biology|Environmental science|Water resource management,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A2484425674"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2010,US,f55f8f5c25002f0f2a2e121be602248623f07494d5161a13d399ab12aa746bac,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,A. Aureli,https://openalex.org/A2422334401,Unknown,,,,,,https://openalex.org/A2304341794|https://openalex.org/A2182540860,aquifer|transboundary,,Karst|Biology|Tourism|Business|Environmental planning|Archaeology|Environmental science|Groundwater|Geotechnical engineering|Water resources|Water resource management|Environmental resource management|Environmental protection|Law|Ecology|Engineering|Multidisciplinary approach|Aquifer|Geography|Political science,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A2422334401"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2010,US,fee72a7c6e6595abd9a1fe8878cb3c9be76652d50b3986c6b1cb4ac610869e76,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,A. Aureli,https://openalex.org/A3086349667,Unknown,,,,,,https://openalex.org/A3085518772|https://openalex.org/A3085940897|https://openalex.org/A3086175637|https://openalex.org/A3086707070|https://openalex.org/A3216340081|https://openalex.org/A3084770820|https://openalex.org/A3085504345,aquifer|transboundary,,Environmental resource management|Groundwater|Environmental planning|Geology|Geotechnical engineering|Hydrology (agriculture)|Aquifer|Environmental science|Water resource management,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A3086349667"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2015,US,bd15eb707485fff043de670a9d011e35d88324f1f4113701f894b92ce4c64d88,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,Ä½udovÃ­t MolnÃ¡r,https://openalex.org/A2292818572,Unknown,,,,,,https://openalex.org/A2291355566|https://openalex.org/A2292165767,aquifer|transboundary,,Oceanography|Structural basin|Inflow|Cartography|Groundwater|Geology|Tributary|Geotechnical engineering|Hydrology (agriculture)|Aquifer|Alluvium|Environmental science|Geomorphology|Geography,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A2292818572"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2005,US,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1373,Å½. PekaÅ¡,https://openalex.org/A2491145178,Unknown,,,,,,https://openalex.org/A2478352227|https://openalex.org/A2641079011|https://openalex.org/A3114708001|https://openalex.org/A2061648226,aquifer|transboundary,,Karst|Business|Archaeology|Environmental planning|Process management|Geography,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A2491145178"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2016,US,46e5ed3ab9553bfac4c049dec30c1d226638344208cecd69b71825f0a43c111b,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1374,Å½elimir PekaÅ¡,https://openalex.org/A4267790636,Unknown,,,,,,https://openalex.org/A4267790634|https://openalex.org/A4267790637|https://openalex.org/A4267790638|https://openalex.org/A4267790639|https://openalex.org/A4267790635,aquifer|transboundary,,Paleontology|Ideal (ethics)|Civil engineering|Karst|Groundwater|Law|Water resource management|Geology|Engineering|Geotechnical engineering|Hydrology (agriculture)|Aquifer|Environmental science|Geography|Political science,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A4267790636"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2016,US,ae12a61c22472adc53867b99f9e307e4e45871a8cc14c841b752512b2835f1a9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1375,Å½eljko KramariÄ,https://openalex.org/A2591057633,Unknown,,,,,,https://openalex.org/A3200145164|https://openalex.org/A1183704316,aquifer|transboundary,,Virology|Karst|Groundwater|Business|Environmental planning|Archaeology|Geology|Geotechnical engineering|Replication (statistics)|Hydrology (agriculture)|Aquifer|Biology|Environmental science|Geography|Water resource management,"<a href='#' onClick='javascript:getPublication({oaids:""https://openalex.org/A2591057633"",search:""Keywords,Scope"",OAConcepts:""OA concepts""})'>Show publications</a>",2012,US,fe94c788e0bd99c18f77c0f32e6e3976c77e2c0b1e420fead4293dda6417de75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# OPTIONAL: write a local copy of the result, with missing values stored as
# empty strings and without the index column. fillna() already returns a new
# dataframe, so df_new itself is left untouched.
df_to_save = df_new.fillna('')
df_to_save.to_csv('dataframe_after_nemo.csv', index=None)


## 12. Save the new version of CSV file, and give a name to new survey

In [None]:
# For an uploaded CSV the file name comes from the upload widget and
# dzc_file is blanked out.
if data_select.value == 'Import a local CSV file':
    csv_file, dzc_file = data_input.filename, ''

# Save the combined dataframe through the SuAVE integration helper
new_file = suaveint.save_csv_file(df_new, absolutePath, csv_file)

In [None]:
# Survey-name entry form
import ipywidgets as widgets
from IPython.display import display

# Entry box, plus a second box that echoes the submitted name
input_text = widgets.Text(placeholder='Enter Survey Name...')
output_text = widgets.Text()

def bind_input_to_output(_submitted_widget):
    """Copy the text typed into ``input_text`` into ``output_text``."""
    output_text.value = input_text.value

printmd("<b><span style='color:red'>Input survey name here, press Enter, and then run the next cell:</span></b>")

# Pressing Enter in the entry box triggers the copy
input_text.on_submit(bind_input_to_output)

# Show the entry box followed by the echo box
display(input_text)
display(output_text)

In [None]:
# Read the submitted survey name back from the echo box and show it.
# The value stays empty until Enter has been pressed in the entry box above.
survey_name = output_text.value
printmd("<b><span style='color:red'>Survey Name is: </span></b>" + survey_name)

In [None]:
# Hand everything to the SuAVE integration helper: the values read from the
# notebook URL in step 1 (csv_file/dzc_file may have been replaced in step 12
# for an uploaded file), the saved CSV, the survey name and the data-source
# choice. NOTE(review): presumably this creates the new survey in SuAVE —
# confirm in suave_integration.
suaveint.create_survey(survey_url,new_file, survey_name, dzc_file, user, csv_file, view, views, data_select.value)