# load related library

In [1]:
%matplotlib inline
# import matplotlib as mpl
# import matplotlib.pyplot as plt

from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame

import ipywidgets
from ipywidgets import widgets 
from ipywidgets import *  
from IPython.display import display,clear_output
#https://github.com/jupyter-widgets/ipywidgets/issues/134 about ouput
from ipywidgets import Layout
import folium
from datetime import datetime
from datetime import date
from dateutil import rrule

from traitlets import directional_link


# install rtree
# https://github.com/kjordahl/SciPy-Tutorial-2015/issues/1
# http://jspeis.com/installing-rtree-on-mac-os-x/

#plt.style.use('ggplot')



In [2]:
# print the version of libraries being used
pd.__version__, gpd.__version__, folium.__version__

('0.19.2', '0.3.0', '0.5.0')

# side function

In [10]:
def loadJson(filename):
    import json
    f = open(filename, "r", encoding = "utf-8")
    output = json.load(f)
    f.close()
    return output
def add_datetime_column(df,unit):
    # unit: date,month,year
    datetime_format = ""
    if unit == "date":
        datetime_format = "%Y-%m-%d"
    elif unit == "month":
        datetime_format = "%Y-%m"
    elif unit == "year":
        datetime_format = "%Y"
    new_list = [datetime.strptime(one,datetime_format) for one in df[unit]]
    if unit == "date":
        df = df.assign(datetime_date=pd.Series(new_list).values)
    elif unit == "month":
        df = df.assign(datetime_month=pd.Series(new_list).values)
    elif unit == "year":
        df = df.assign(datetime_year=pd.Series(new_list).values)
    return df

# load dataset

# load remote dataset (when running in mybinder)

If you do it locally, check the commented one, replace all wiki package with pd.read_csv feeding the file path by yours.

In [5]:
# we use quilt to make all the datasets as package
import quilt
# install dataset
quilt.install("qianhongye/WikiSenti")

# import the dataset
from quilt.data.qianhongye import WikiSenti as wiki

ImportError: No module named 'quilt'

### get score and geo information

In [None]:
# loading csv containing title, score*8, geometry, country, continent
title_score_geo = {}
title_score_geo["Articles"] = wiki.article_score_with_geo._data()
# #if you do it locally
# title_score_geo["Articles"] = pd.read_csv("score_file/title_score_geo_country_continent.csv")

title_score_geo["Talks"] = wiki.talk_score_with_geo._data()
# #if you do it locally
# title_score_geo["Talks"] = pd.read_csv("talk_score_file/title_score_geo_country_continent.csv")

### get date information and exclude BC date

In [16]:

# load csv containing date for people and events
# exclude BC
# change dates to datetime type
dates = {}
temp_df = None
# for people
temp_df = wiki.people_birthdate._data()
#temp_df = pd.read_csv("person_birthdate_space.csv",names=["title","date"])
# exclude BC
temp_df = temp_df[~temp_df['date'].str.startswith("-")]
# add datetime column
temp_df = add_datetime_column(temp_df,'date')
del temp_df['date']
dates["People"] = temp_df

temp_df = None
temp_df = wiki.events_occurrence_date._data()
#temp_df = pd.read_csv("event_date_space.csv",names=["title","date"])
# exclude BC
temp_df = temp_df[~temp_df['date'].str.startswith("-")]
temp_df = add_datetime_column(temp_df,'date')
del temp_df['date']
dates["Events"] = temp_df


In [18]:
print (dates["People"].head(3))

             title        datetime_date
0    Ricardo Prado  1965-01-03 00:00:00
1  Gottlieb GÃ¶ller  1935-05-31 00:00:00
2     Henry Wiggin  1824-02-14 00:00:00


### get people list and events list

In [4]:
people_list = wiki.people_list._data()
#people_list = pd.read_csv("personList_space.csv",names=["title"])
events_list = wiki.events_list._data()
#events_list = pd.read_csv("eventList_space.csv",names=["title"])

### merge score and date to get three dataframes separately for people, events, others

In [19]:
# split title_score_geo into people, events, others three groups, and add date to people, events
df = {"Articles":{},"Talks":{}}
temp_df = None
for one in df:
    # for people
    temp_df = title_score_geo[one].merge(people_list,how='inner')
    print ("length of People for %s with geolocation is %d"%(one,len(temp_df)))
    df[one]["People"] = temp_df.merge(dates["People"],how='left')
    
    temp_df = None
    # for events
    temp_df = title_score_geo[one].merge(events_list,how='inner')
    print ("length of Events for %s with geolocation is %d"%(one,len(temp_df)))
    df[one]["Events"] = temp_df.merge(dates["Events"],how='left')

    temp_df = None
    # for others
    temp_df = title_score_geo[one][~title_score_geo[one]['title'].isin(people_list['title'])]
    df[one]["Others"] = temp_df[~temp_df['title'].isin(events_list['title'])]
    print ("length of Others for %s with geolocation is %d"%(one,len(df[one]["Others"])))
#     print (df[one]["People"].head(2))
#     print ("---")
#     print (df[one]["Others"].head(2))

length of People for Articles with geolocation is 1816
length of Events for Articles with geolocation is 7477
length of Others for Articles with geolocation is 913577
length of People for Talks with geolocation is 1191
length of Events for Talks with geolocation is 4442
length of Others for Talks with geolocation is 268773


In [20]:
print (df["Articles"]["People"].head(3))

                                title  pos_score_OL  neg_score_OL  \
0  Phillips' Sound Recording Services        1.8801        2.9377   
1                Alimpiu Barboloviciu        1.1236        0.0000   
2                   Sarah Fairbrother        3.2590        7.7187   

   pos_score_MPQA  neg_score_MPQA  pos_score_LIWC  neg_score_LIWC  \
0          3.0552          4.1128          2.7027          2.7027   
1          2.2472          0.0000          2.2472          0.0000   
2          4.4597          5.3173          4.2882          3.7736   

   pos_score_ANEW  neg_score_ANEW  length  \
0          3.6187          0.8058     851   
1          0.3989          0.0000      89   
2          4.3353          1.4867     583   

                                     geometry         country continent  \
0           POINT (-2.960055555555555 53.411)  United Kingdom    Europe   
1  POINT (22.91944444444445 47.2988888888889)         Romania    Europe   
2       POINT (-0.2273800000000001 51.52

# get continent, country list from geopandas library

In [9]:
# prepare data for dropdown: continent list and country list

# get world info
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# get continent list plus "all"
continent_list = ["all"] + list(set(world['continent']))
#continent_list.append("all")

continent_country_dict = {}
for continent in continent_list:
    continent_country_dict[continent] = ["all"] + list(world[world['continent']== continent]['name'])
    #continent_country_dict[continent].append("all")
    
print ("continent list for dropdown:")
print (continent_list)

continent list for dropdown:
['all', 'Seven seas (open ocean)', 'South America', 'Asia', 'Africa', 'North America', 'Antarctica', 'Oceania', 'Europe']


# visualization

In [30]:
# design the panel: map panel and side panel
# for map panel:
#     map
#
# for side panel:
#     sentiment container: tell user the color to indicate negative and positive
#         title label
#         negative label
#         positive label
#     lexicon container: Which lexicon you want to choose? (OL, MPQA, LIWC)
#         title label
#         radio_button
#     group container: Which group you want to show? (People, Events, Others)
#         title label
#         checkboxes container:
#             checkbox
#             checkbox
#             checkbox
#     area filter: Do you want to filter out the continent or country for entities?
#         title label
#         dropdown_continent
#         dropdown_country
#     time filter: Do you want to filter out time range?
#         title label
#         slider
#     button to update


# change label to HTML (for set style more flexible)
    
# set a general style and layout    
score_description_style = {'description_width':'50px'}
with_border_layout = Layout(border='solid 1px')
sp_items_layout = Layout(flex='1 1 auto',
                         width='auto',
                         border='solid 1px',)


# container for all
all_container = widgets.VBox(layout=with_border_layout)


# preparing a container for side panel
sp_container = widgets.VBox(layout=Layout(
    display='flex',
    flex_flow='column',
    align_items='stretch',
    border='solid 1px',
    width='30%'
)
)


# for sentiment lexicon
# lexicon container for lexicons
lexicon_container = widgets.VBox(layout=sp_items_layout)
# add title
html_lexicon = widgets.HTML(value="<b>Sentiment lexicon</b>")
#label_lexicon = widgets.Label(value="sentiment lexicon")
# preparing a container to put in radio buttons
radio_button_lexicon = widgets.RadioButtons(
    options=['OL', 'MPQA', 'LIWC','ANEW'],
    #description='sentiment lexicon',
    #style=style,
    disabled=False
)
# put text and button into lexicon container
lexicon_container.children = [html_lexicon,radio_button_lexicon]


# for domain
domain_container = widgets.VBox(layout=sp_items_layout)
# add title
html_domain = widgets.HTML(value="<b>Domain</b>")
# preparing a container to put in radio buttons
radio_button_domain = widgets.RadioButtons(
    options=['Articles', 'Talks'],
    disabled=False
)
# put text and button into domain container
domain_container.children = [html_domain,radio_button_domain]


# for target group
# container for groups
group_container = widgets.VBox(layout=sp_items_layout)
html_group = widgets.HTML(value="<b>Target group</b>")
# checkboxes container
cb_container = widgets.VBox(layout=Layout(
)) 
# preparing a container to put in created checkbox
checkboxes = []  
# create checkbox
checkboxes.append(widgets.Checkbox(description = 'People', value=False))
checkboxes.append(widgets.Checkbox(description = 'Events', value=False))
checkbox_others = widgets.Checkbox(description = 'Others', value=False)
checkboxes.append(checkbox_others)
# put check box into checkboxes container
cb_container.children=[i for i in checkboxes]
group_container.children=[html_group,cb_container]


# for description: how to indicate positive and negative
# container for sentiment description
sentiment_container = widgets.VBox()
html_sentiment = widgets.HTML(value=" <b>Sentiment representation: </b>red for negative, blue for positive.")
#text_neg_sentiment = widgets.Label(value="--red for negative")
#text_pos_sentiment = widgets.Label(value="--blue for positive")
#text_sentiment = widgets.Label(value="red for negative, blue for positive")
sentiment_container.children=[html_sentiment]


# for area filter
#container for area
area_container = widgets.VBox(layout=sp_items_layout)
html_area = widgets.HTML(value="<b>Target area</b>")
#dropdown_container = widgets.HBox()
#dropdown
dropdown_continent = widgets.Dropdown(
    options = continent_list,
    value = 'all',
    description='continent:',
    layout=Layout(width='250px'),
    disabled=False,
)
    
dropdown_country = widgets.Dropdown(
    #options = country_list,
    description='country:',
    #options = ["all"],
    #value='all',
    layout=Layout(width='250px'),
    
    disabled = False,
)

def transform(case):
    return continent_country_dict[case]

directional_link((dropdown_continent,'value'),(dropdown_country,'options'),transform)


#l = traitlets.link((dropdown_continent,'value'),(dropdown_country,'value'))
#dropdown_container.children = [dropdown_continent,dropdown_country]
area_container.children=[html_area,dropdown_continent,dropdown_country]


# for length
length_container = widgets.VBox(layout = sp_items_layout)
length_checkbox = widgets.Checkbox(description = 'minimum length setting', value=False)
length_dropdown = widgets.Dropdown(options=[5,10,20,30],
                                   description='minimum length',
                                   layout=Layout(width='250px'),
                                   disabled = True,)
length_container.children = [length_checkbox,length_dropdown]



# container for slider
slider_container = widgets.VBox(layout=with_border_layout)
# add a time slider: SelectionRangeSlider
slider_dates = [day for day in rrule.rrule(rrule.MONTHLY, dtstart=date(1900,1,1), until=date(2017,12,31))]#date.today()
options = [(i.strftime('%b%Y'),i) for i in slider_dates]
#slider = widgets.SelectionRangeSlider
slider = ipywidgets.SelectionRangeSlider(
    options=options,
    index=(0,len(slider_dates)-1),
    description='          ',
    layout=Layout(width='900px',height='40px'),
    #,description_width='200px'
    description_width='20px',
    #style = style,
    disabled=True
)
html_slider = widgets.HTML(value="<b>Time range</b>")
checkbox_slider = widgets.Checkbox(
    layout=Layout(width='800px'),
    description = 'Do not filter out date (if you tick it or select "Others" in "Target group", the time range will be ignored)', value=False)
slider_container.children=[html_slider,checkbox_slider,slider]
#print (slider.value[0].date())



# add a link between others and sliders
def transform_others_slider(case):
    return {True:True,False:False}[case]

directional_link((checkbox_others,'value'),(slider,'disabled'),transform_others_slider)

# add a link from others to check_slider
def transform_others_check_slider(case):
    return {True:True,False:False}[case]

directional_link((checkbox_others,'value'),(checkbox_slider,'disabled'),transform_others_check_slider)

# add a link between checkbox_slider and sliders (directly disabled slider)
def transform_check_slider(case):
    return {True:True,False:False}[case]

directional_link((checkbox_slider,'value'),(slider,'disabled'),transform_check_slider)


# for score range
# container for score 
score_container = widgets.VBox(layout=sp_items_layout)
# add a score slider: IntRangeSlider
slider_score = widgets.IntRangeSlider(value=[1,100],
                                      min=0,
                                      max=100,
                                      step=1,
                                      disabled=False,
                                      continuous_update=False,
                                      orientation='horizontal',
                                      readout=True,
                                      layout=Layout(width='273px'),
                                      readout_format='d',
                                      description='total',
                                      style=score_description_style,)
slider_score_positive = widgets.IntRangeSlider(value=[1,100],
                                      min=0,
                                      max=100,
                                      step=1,
                                      disabled=False,
                                      continuous_update=False,
                                      orientation='horizontal',
                                      readout=True,
                                      layout=Layout(width='273px'),
                                      readout_format='d',
                                      description='positive',
                                      style=score_description_style,)

slider_score_negative = widgets.IntRangeSlider(value=[1,100],
                                      min=0,
                                      max=100,
                                      step=1,
                                      disabled=False,
                                      continuous_update=False,
                                      orientation='horizontal',
                                      readout=True,
                                      layout=Layout(width='273px'),
                                      readout_format='d',
                                      description='negative',
                                      style=score_description_style,)


#html_score = widgets.HTML(value="<b>Score range</b><br>score=positive score + negative score<br>default range is 1 to 500.<br>")
# for labels
html_score = widgets.HTML(value="<b>Score range</b>")  
html_score_total = widgets.HTML(value="total score") 
html_score_positive = widgets.HTML(value="positive score") 
html_score_negative = widgets.HTML(value="negative score") 

#score_container.children=[html_score,html_score_total,slider_score,html_score_positive,slider_score_positive,html_score_negative,slider_score_negative]
score_container.children=[html_score,slider_score,slider_score_positive,slider_score_negative]
#print (slider.value[0].date())



# use an area for text output
output_text_container = widgets.VBox(layout=Layout(width='100%',border='solid 1px'))
output_label_str_container = widgets.VBox(layout=Layout(width='100%'))
output_label = widgets.Label(value="Here is the summary...",
                               layout=Layout(width='100%'),
                              disabled=False)
text_out = widgets.Output()
with text_out:
    display(output_label_str_container)


output_label_str_container.children=[output_label]
output_text_container.children = [text_out]


#for button
update_container = widgets.VBox(layout=Layout(display='flex',
                                              flex_flow='column',
                                              align_items='center',
                                              width='100%',
                                              border='solid 1px'))
#add button that updates the graph based on the checkboxes
button = widgets.Button(description="Update the graph", button_style='primary')
update_container.children=[button]


sp_container.children=[lexicon_container, domain_container, group_container,score_container, area_container,length_container]


# for map
map_container = widgets.VBox(layout=Layout(width='80%',height='100%',border='solid 1px'))
# preparing the plot 
myMap = folium.Map(location=[45.5236, -122.6750], zoom_start=1)
map_out = widgets.Output()
with map_out:
    display(myMap)
map_container.children = [map_out,sentiment_container]



    
# container for upper part:map container + side panel
upper_container = widgets.HBox(layout=with_border_layout
)
upper_container.children=[map_container,sp_container]


# container for all
#all_container = widgets.VBox(layout=with_border_layout)
all_container.children = [upper_container,slider_container,update_container,output_text_container]
display(all_container)

def on_button_clicked(b): 
#   # create a new map
    myMap = folium.Map(location=[45.5236, -122.6750], zoom_start=1)
    # create a new output container
    output_label_str_container = widgets.VBox(layout=Layout(width='100%',border='solid 1px'))
    # get continent and country info
    continent = dropdown_continent.value
    country = dropdown_country.value
    # get score range info
    score_start, score_end = slider_score.value
    score_start_positive, score_end_positive = slider_score_positive.value
    score_start_negative, score_end_negative = slider_score_negative.value
    print (score_start,score_end)
    #time_start, time_end = slider.value
    print (node_continent,node_country)
    # get domain: Articles or Talks
    domain = radio_button_domain.value
    # get lexicon name: OL, MPQA, LIWC, ANEW
    lexicon = radio_button_lexicon.value
    
    output_label_list = []
    counter_showed = 0
    for c in cb_container.children:
        # deal with one target group
        if c.value:
            group = c.description
            output_label_list.append("There are %d entities for %s group in %s."%(stat_dict[group],group,domain))

            # area filter
            target_data = None
            if not node_continent == "all":
                if not node_country == "all":
                    target_data = df[domain][group][df[domain][group]['country']==country]
                else:
                    # set continent but not country
                    target_data = df[domain][group][df[domain][group]['continent']==continent]
            else:
                # include all continent
                target_data = df[domain][group]
            output_label_list.append("After filtering with geolocation, we got %d entities left"%(len(target_data)))
            
            
            # here we add doc length filter
            if length_checkbox == True:
                length_temp = len(target_data)
                target_data = target_data[target_data['length']>=length_dropdown.value]
                output_label_list.append("%d entities are excluded because their doc lengths are too short."%(length_temp-len(target_data)))
                
            # here we do time filtering
            if checkbox_slider.value == True:
                # time start ,time end = slider.value
                xaxis_value = [day for day in rrule.rrule(rrule.DAILY,dtstart=slider.value[0],until=slider.value[1])]
                target_data = target_data[target_data['datetime_date'].isin(xaxis_value)]
            output_label_list.append("After filtering with time, %d entities left."%(len(target_data)))
            
            # here we do score filtering
            counter_0 = len(target_data)
            target_data = target_data[target_data['pos_score_'+lexicon]>=score_start_positive]
            counter_1 = len(target_data)
            counter_less_pos = counter_0 - counter_1
            target_data = target_data[target_data['pos_score_'+lexicon]<=score_end_positive]
            counter_2 = len(target_data)
            counter_larger_pos = counter_1-counter_2
            target_data = target_data[target_data['neg_score_'+lexicon]>=score_start_negative]
            counter_3 = len(target_data)
            counter_less_neg = counter_2 - counter_3
            target_data = target_data[target_data['neg_score_'+lexicon]<=score_end_negative]
            counter_4 = len(target_data)
            counter_larger_neg = counter_3 - counter_4
            # add total score
            target_data = target_data.assign(total=target_data['pos_score_'+lexicon]+target_data['neg_score_'+lexicon])
            # filtering total score
            target_data = target_data[target_data['total']>=score_start]
            counter_5 = len(target_data)
            counter_less_total = counter_4 - counter_5
            target_data = target_data[target_data['total']<=score_end]
            counter_6 = len(target_data)
            counter_larger_total = counter_5 - counter_6
            output_label_list.append("---After filtering with score, there are %d entities left."%(len(target_data)))
            output_label_list.append("------for total score: %d entities have a too high score, %d entities have a too low score."%(counter_larger_total,counter_less_total))
            output_label_list.append("------for positive score: %d entities have a too high score, %d entities have a too low score."%(counter_larger_pos,counter_less_pos))
            output_label_list.append("------for negative score: %d entities have a too high score, %d entities have a too low score."%(counter_larger_neg,counter_less_neg))
            counter_showed += len(target_data)
            

            for each in target_data.iterrows():              
                node_color = '#{:02x}00{:02x}'.format(int(255 * each[1]['neg_score_'+lexicon]/each[1]['total']),int(255 * each[1]['neg_score_'+lexicon]/each[1]['total']))

                popup = folium.Popup(title+". pos score:"+str(each[1]['pos_score_'+lexicon])+", neg score:"+str(each[1]['neg_score_'+lexicon]),parse_html=True)
                #print (node_color)

                node_lon,node_lat = each[1]['geometry'][7:-1].split(" ")
                #print (title)
                #print (float(node_lat),float(node_lon))

                folium.CircleMarker([float(node_lat),float(node_lon)],
                                radius= total_score,
                                #radius = 5,     
                                popup=popup,
                                fill = True,
                                color=node_color,
                                weight = 1,
                                fill_color=node_color,
                                ).add_to(myMap)
  
    with map_out:
        clear_output(wait=True)
        display(myMap)
    
    output_label_list.append("Totally %d entities are displayed in map."%(counter_showed))
    output_labels = [widgets.HTML(value=i) for i in output_label_list]
    output_label_str_container.children = [i for i in output_labels]
    with text_out:
        clear_output(wait=True)
        display(output_label_str_container)
    print ("finish")

button.on_click(on_button_clicked)  




