# load related library

In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame


from ipywidgets import widgets 
from ipywidgets import *  
from IPython.display import display,clear_output
#https://github.com/jupyter-widgets/ipywidgets/issues/134 about ouput
from ipywidgets import Layout
import folium
from datetime import datetime
from datetime import date
from dateutil import rrule

from traitlets import directional_link

# install rtree
# https://github.com/kjordahl/SciPy-Tutorial-2015/issues/1
# http://jspeis.com/installing-rtree-on-mac-os-x/

plt.style.use('ggplot')

def loadJson(filename):
	import json
	f = open(filename, "r", encoding = "utf-8")
	output = json.load(f)
	f.close()
	return output

In [37]:
# print the version of libraries being used
mpl.__version__, pd.__version__, gpd.__version__, folium.__version__, ipywidgets.__version__

('2.0.2', '0.16.2', '0.3.0')

# load score information

In [2]:
#load score information

pos_score_OL = loadJson("score_file/pos_score_OL.json")
neg_score_OL = loadJson("score_file/neg_score_OL.json")
pos_neg_score_OL = loadJson("score_file/pos_neg_score_OL.json")

pos_score_MPQA = loadJson("score_file/pos_score_MPQA.json")
neg_score_MPQA = loadJson("score_file/neg_score_MPQA.json")
pos_neg_score_MPQA = loadJson("score_file/pos_neg_score_MPQA.json")

pos_score_LIWC = loadJson("score_file/pos_score_LIWC.json")
neg_score_LIWC = loadJson("score_file/neg_score_LIWC.json")
pos_neg_score_LIWC = loadJson("score_file/pos_neg_score_LIWC.json")

scores = {}
scores["OL"] = {}
scores["MPQA"] = {}
scores["LIWC"] = {}
    
scores["OL"]["positive"] = pos_score_OL
scores["OL"]["negative"] = neg_score_OL
#scores["OL"]["positive+negative"] = pos_neg_score_OL
scores["MPQA"]["positive"] = pos_score_MPQA
scores["MPQA"]["negative"] = neg_score_MPQA
#scores["MPQA"]["positive+negative"] = pos_neg_score_MPQA
scores["LIWC"]["positive"] = pos_score_LIWC
scores["LIWC"]["negative"] = neg_score_LIWC
#scores["LIWC"]["positive+negative"] = pos_neg_score_LIWC

# filter document length 

ignore articles with too short length

In [3]:
# create a csv file to store the title->doc_length for articles

# f = open("token_file/titles","r",encoding='utf-8')
# f1 = open("token_file/Titles.csv","w",encoding='utf-8')
# for line in f:
#     f1.write('"'+line.strip()+'"'+'\n')
# f.close()
# f1.close()

# doc_length_len_df= pd.read_csv("token_file/doc_length_for_title",names=['length'])
# doc_length_title_df = pd.read_csv("token_file/Titles.csv",names=['title'])
    
# doc_length_df = pd.concat([doc_length_title_df,doc_length_len_df],axis=1,join_axes=[doc_length_title_df.index])
# doc_length_df.head()
# doc_length_df.to_csv('docs_length.csv',columns=['title','length'],header=False,index=False,encoding='utf-8')

In [4]:
# get a set of articles whose length < 10
doc_length_df = pd.read_csv("docs_length.csv",names=['title','length'])
doc_length_okay = set(doc_length_df[doc_length_df['length'] < 10]['title'])
print ("There are %d out of %d articles whose token length smaller than 10."%(len(doc_length_okay),len(doc_length_df)))


There are 733537 out of 5416537 articles whose token length smaller than 10.


# load date information

In [5]:
### here we load date information

dates = {}
dates["People"] = loadJson("person_birthdate.json")
dates["Events"] = loadJson("event_date.json")

In [6]:
# print (type(dates))

# load entity-geo-country-continent info directly

In [7]:
# load entity-geo-country-continent info directly
names=['title', 'geometry', 'country','continent']
all_gdf = pd.read_csv("title_geo_country_continent.csv", names=names)

In [8]:
#loading people and event set

# load event set (with _)
event_set = set([one for one in loadJson("eventList.json")])
#event_list = [one for one in loadJson("eventList.json")]
# load people set
people_set = set([one for one in loadJson("personList.json")])




In [9]:
# split all gdf to people, events, others
sub_gdf = {}
sub_gdf["People"] = all_gdf[all_gdf['title'].isin(people_set)]
sub_gdf["Events"] = all_gdf[all_gdf['title'].isin(event_set)]
temp = all_gdf[~all_gdf['title'].isin(people_set)]
sub_gdf["Others"] = temp[~all_gdf['title'].isin(event_set)]
print ("There are %d entities for people, %d entities for events, %d entities for others from total %d entities"%(len(sub_gdf["People"]),len(sub_gdf["Events"]),len(sub_gdf["Others"]),len(all_gdf)))
print ("structure example of people")
print (sub_gdf["People"].head())

There are 1730 entities for people, 6720 entities for events, 889701 entities for others from total 898151 entities
structure example of people
                 title                               geometry        country  \
10     André_the_Giant   POINT (-79.80634000000001 35.116211)  United States   
58  Frederick_Douglass                POINT (-75.958 38.8845)  United States   
72     H._P._Lovecraft  POINT (-71.3810921 41.85401760000001)  United States   
83   John_Wilkes_Booth               POINT (-77.2302 38.1385)  United States   
87           Jack_Ruby            POINT (-87.826853 41.95811)  United States   

        continent  
10  North America  
58  North America  
72  North America  
83  North America  
87  North America  




# Prepare other required data

In [10]:
# prepare data for dropdown: continent list and country list

# get world info
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# get continent list plus "all"
continent_list = ["all"] + list(set(world['continent']))
#continent_list.append("all")

continent_country_dict = {}
for continent in continent_list:
    continent_country_dict[continent] = ["all"] + list(world[world['continent']== continent]['name'])
    #continent_country_dict[continent].append("all")
    
print ("continent list for dropdown:")
print (continent_list)

continent list for dropdown:
['all', 'Antarctica', 'Africa', 'North America', 'Europe', 'Oceania', 'Asia', 'Seven seas (open ocean)', 'South America']


# visualization

In [13]:
# design the panel: map panel and side panel
# for map panel:
#     map
#
# for side panel:
#     sentiment container: tell user the color to indicate negative and positive
#         title label
#         negative label
#         positive label
#     lexicon container: Which lexicon you want to choose? (OL, MPQA, LIWC)
#         title label
#         radio_button
#     group container: Which group you want to show? (People, Events, Others)
#         title label
#         checkboxes container:
#             checkbox
#             checkbox
#             checkbox
#     area filter: Do you want to filter out the continent or country for entities?
#         title label
#         dropdown_continent
#         dropdown_country
#     time filter: Do you want to filter out time range?
#         title label
#         slider
#     button to update


# change label to HTML (for set style more flexible)
    
# set a general style and layout    
style = {'description_width':'50px'}
with_border_layout = Layout(border='solid 1px')
sp_items_layout = Layout(flex='1 1 auto',
                         width='auto',
                         border='solid 1px',)


# container for all
all_container = widgets.VBox(layout=with_border_layout)


# preparing a container for side panel
sp_container = widgets.VBox(layout=Layout(
    display='flex',
    flex_flow='column',
    align_items='stretch',
    border='solid 1px',
    width='30%'
)
)


# for sentiment lexicon
# lexicon container for lexicons
lexicon_container = widgets.VBox(layout=sp_items_layout)
# add title
html_lexicon = widgets.HTML(value="<b>Sentiment lexicon</b>")
#label_lexicon = widgets.Label(value="sentiment lexicon")
# preparing a container to put in radio buttons
radio_button_lexicon = widgets.RadioButtons(
    options=['OL', 'MPQA', 'LIWC'],
    #description='sentiment lexicon',
    #style=style,
    disabled=False
)
# put text and button into lexicon container
lexicon_container.children = [html_lexicon,radio_button_lexicon]


# for target group
# container for groups
group_container = widgets.VBox(layout=sp_items_layout)
html_group = widgets.HTML(value="<b>Target group</b>")
# checkboxes container
cb_container = widgets.VBox(layout=Layout(
)) 
# preparing a container to put in created checkbox
checkboxes = []  
# create checkbox
checkboxes.append(widgets.Checkbox(description = 'People', value=False))
checkboxes.append(widgets.Checkbox(description = 'Events', value=False))
checkbox_others = widgets.Checkbox(description = 'Others', value=False)
checkboxes.append(checkbox_others)
# put check box into checkboxes container
cb_container.children=[i for i in checkboxes]
#display(cb_container)
# # add a new container to control the arrangement
# temp_container = widgets.HBox()
# temp = widgets.Label(description='choose target group')
# temp_container.children=[cb_container,temp]
group_container.children=[html_group,cb_container]


# for description: how to indicate positive and negative
# container for sentiment description
sentiment_container = widgets.VBox()
html_sentiment = widgets.HTML(value=" <b>Sentiment representation: </b>red for negative, blue for positive.")
#text_neg_sentiment = widgets.Label(value="--red for negative")
#text_pos_sentiment = widgets.Label(value="--blue for positive")
#text_sentiment = widgets.Label(value="red for negative, blue for positive")
sentiment_container.children=[html_sentiment]


# for area filter
#container for area
area_container = widgets.VBox(layout=sp_items_layout)
html_area = widgets.HTML(value="<b>Target area</b>")
#dropdown_container = widgets.HBox()
#dropdown
dropdown_continent = widgets.Dropdown(
    options = continent_list,
    value = 'all',
    description='continent:',
    layout=Layout(width='250px'),
    disabled=False,
)
    
dropdown_country = widgets.Dropdown(
    #options = country_list,
    description='country:',
    #options = ["all"],
    #value='all',
    layout=Layout(width='250px'),
    
    disabled = False,
)

def transform(case):
    return continent_country_dict[case]

directional_link((dropdown_continent,'value'),(dropdown_country,'options'),transform)


#l = traitlets.link((dropdown_continent,'value'),(dropdown_country,'value'))
#dropdown_container.children = [dropdown_continent,dropdown_country]
area_container.children=[html_area,dropdown_continent,dropdown_country]


# container for slider
slider_container = widgets.VBox(layout=with_border_layout)
# add a time slider: SelectionRangeSlider
slider_dates = [day for day in rrule.rrule(rrule.MONTHLY, dtstart=date(1800,1,1), until=date.today())]
options = [(i.strftime('%b%Y'),i) for i in slider_dates]

slider = widgets.SelectionRangeSlider(
    options=options,
    index=(0,len(slider_dates)-1),
    description='          ',
    layout=Layout(width='900px',height='40px'),
    #,description_width='200px'
    description_width='20px',
    #style = style,
    disabled=True
)
html_slider = widgets.HTML(value="<b>Time range</b>")
checkbox_slider = widgets.Checkbox(
    layout=Layout(width='800px'),
    description = 'Do not filter out date (if you tick it or select "Others" in "Target group", the time range will be ignored)', value=False)
slider_container.children=[html_slider,checkbox_slider,slider]
#print (slider.value[0].date())



# add a link between others and sliders
def transform_others_slider(case):
    return {True:True,False:False}[case]

directional_link((checkbox_others,'value'),(slider,'disabled'),transform_others_slider)

# add a link from others to check_slider
def transform_others_check_slider(case):
    return {True:True,False:False}[case]

directional_link((checkbox_others,'value'),(checkbox_slider,'disabled'),transform_others_check_slider)

# add a link between checkbox_slider and sliders (directly disabled slider)
def transform_check_slider(case):
    return {True:True,False:False}[case]

directional_link((checkbox_slider,'value'),(slider,'disabled'),transform_check_slider)


# for score range
# container for score 
score_container = widgets.VBox(layout=sp_items_layout)
# add a score slider: IntRangeSlider
slider_score = widgets.IntRangeSlider(value=[1,500],
                                      min=0,
                                      max=500,
                                      step=1,
                                      disabled=False,
                                      continuous_update=False,
                                      orientation='horizontal',
                                      readout=True,
                                      layout=Layout(width='250px'),
                                      readout_format='d',)

html_score = widgets.HTML(value="<b>Score range</b><br>score=positive score + negative score<br>default range is 1 to 500.<br>")
                                
score_container.children=[html_score,slider_score]
#print (slider.value[0].date())



# use an area for text output
output_text_container = widgets.VBox(layout=Layout(width='100%',border='solid 1px'))
output_label_str_container = widgets.VBox(layout=Layout(width='100%'))
output_label = widgets.Label(value="Here is the summary...",
                               layout=Layout(width='100%'),
                              disabled=False)
text_out = widgets.Output()
with text_out:
    display(output_label_str_container)


output_label_str_container.children=[output_label]
output_text_container.children = [text_out]


#for button
update_container = widgets.VBox(layout=Layout(display='flex',
                                              flex_flow='column',
                                              align_items='center',
                                              width='100%',
                                              border='solid 1px'))
#add button that updates the graph based on the checkboxes
button = widgets.Button(description="Update the graph", button_style='primary')
update_container.children=[button]


sp_container.children=[lexicon_container, group_container,score_container, area_container]


# for map
map_container = widgets.VBox(layout=Layout(width='80%',height='100%',border='solid 1px'))
# preparing the plot 
myMap = folium.Map(location=[45.5236, -122.6750], zoom_start=1)
map_out = widgets.Output()
with map_out:
    display(myMap)
map_container.children = [map_out,sentiment_container]



    
# container for upper part:map container + side panel
upper_container = widgets.HBox(layout=with_border_layout
)
upper_container.children=[map_container,sp_container]
#display(all_container)
#print (dir(myMap))

# container for all
#all_container = widgets.VBox(layout=with_border_layout)
all_container.children = [upper_container,slider_container,update_container,output_text_container]
display(all_container)

def on_button_clicked(b): 
#   # create a new map
    myMap = folium.Map(location=[45.5236, -122.6750], zoom_start=1)
    # create a new output container
    output_label_str_container = widgets.VBox(layout=Layout(width='100%',border='solid 1px'))
    # get dropdown info
    node_continent = dropdown_continent.value
    node_country = dropdown_country.value
    # get score range info
    score_start, score_end = slider_score.value
    print (score_start,score_end)
    #time_start, time_end = slider.value
    print (node_continent,node_country)
    
    #output_text_str = ""
    output_label_list = []
    index_displayed_node = 0
    for c in cb_container.children:
        # deal with one target group
        if c.value:
            group = c.description
            index_node_with_date = 0
            index_node_date_in_range = 0
            index_node_has_score = 0
            index_score_too_high = 0
            index_score_too_low = 0
            # area filter
            if not node_continent == "all":
                if not node_country == "all":
                    target_data = sub_gdf[group][sub_gdf[group]['country']==node_country]
                else:
                    # set continent but not country
                    target_data = sub_gdf[group][sub_gdf[group]['continent']==node_continent]
            else:
                # include all continent
                target_data = sub_gdf[group]
            #print (target_data)
            
            
            # here we add doc length filter
            count_before_docLength = len(target_data)
            print (count_before_docLength)
            target_data = target_data[~target_data['title'].isin(doc_length_okay)]
            index_docLength_too_small = count_before_docLength-len(target_data) 
            print (len(target_data))
                
            
            
            
            # until now we got target entities with certain area
            
            for each in target_data.iterrows():              
                #print (node_continent,node_country)
                #print ("finished")
                #break
                #[each[1]['lat'],each[1]['lon']]
                title_underline = each[1]['title']
                title = title_underline.replace("_", " ")
                
                
                                        
                # here we add time filter
                # if checkbox_slider is ticked or others is ticked, skip this step
                # otherwise check if date is inside the time rage
                if checkbox_slider.value == False and checkbox_others.value == False:
                    #print ("checkbox_slider.value == False and checkbox_others == False")
                    time_start, time_end = slider.value
                    if title_underline in dates[group]:
                        
                        index_node_with_date += 1
                        # dates[group][title] is date of entity
                        # if date is BC, datetime library is not able to handle ('%Y-%m-%d' cannot match), 
                        # we excluded it manually (we say that it is out of time range)
                        if dates[group][title_underline].startswith("-"):
                            continue
                        # convert str to datetime with datetime.strptime()
                        node_date = datetime.strptime(dates[group][title_underline],'%Y-%m-%d')
                        #print (title,str(node_date),str(time_start),str(time_end))
                        if node_date < time_start or node_date > time_end:
                            
                            continue
                        else:
                            index_node_date_in_range += 1
                    else:
                        # title has no date info, then skip
                        continue
                        
                        
                                        
                pos_score_dict = scores[radio_button_lexicon.value]["positive"]
                if title in pos_score_dict:
                    #print ("there")
                    index_node_has_score += 1

                    
                    #deal with color
                    pos_score = pos_score_dict[title]

                    neg_score = scores[radio_button_lexicon.value]["negative"][title]
                    #print (neg_score)
#                     print (pos_score+neg_score)
#                     if index_score_too_high >= 20:
#                         break
                    total_score = pos_score + neg_score
                    if total_score < score_start:
                        index_score_too_low += 1
                        continue
                    elif total_score > score_end:
                        index_score_too_high += 1
                        continue

                    else:
                        
                        # formular
                        # ff * pos_score/total_score
#                         # 
#                         pos_color = str(hex(int(255 * pos_score/(pos_score+neg_score))))[2:]
#                         neg_color = str(hex(int(255 * neg_score/(pos_score+neg_score))))[2:]
#                         if len(pos_color) ==1:
#                             pos_color = '0' + pos_color
#                         if len(neg_color) == 1:
#                             neg_color = '0' + neg_color
#                         #print (pos_color,neg_color)
#                         node_color = "#"+neg_color+"00"+pos_color
                        #int(255 * neg_score/total_score)
                        #print (pos_score,neg_score)
                        #print (neg_score)
                        node_color = '#{:02x}00{:02x}'.format(int(255 * neg_score/total_score),int(255 * pos_score/total_score))
                        #print (node_color)
#                     if index_score_has_score >= 20:
#                         break
                    popup = folium.Popup(title,parse_html=True)
                    #print (node_color)

                    node_lon,node_lat = each[1]['geometry'][7:-1].split(" ")
                    #print (title)
                    #print (float(node_lat),float(node_lon))

                    folium.CircleMarker([float(node_lat),float(node_lon)],
                                  radius= total_score/10,
                                  #radius = 5,     
                                  popup=popup,
                                  fill = True,
                                  color=node_color,
                                  weight = 1,
                                  fill_color=node_color,
                                 ).add_to(myMap)
            
            #output_text_str += "There are %d entities for %s group. %d of them are displayed.\n"%(len(target_data),group,index_node_has_score - index_score_too_high)
            print (index_node_has_score,index_score_too_high,index_score_too_low)
            output_label_list.append("There are %d entities for %s group. %d of them are displayed.\n"%(count_before_docLength,group,index_node_has_score - index_score_too_high - index_score_too_low))
            if index_docLength_too_small > 0:
                output_label_list.append("------for document length filter: %d entities with document tokens smaller than 10 which are filtered out."%(index_docLength_too_small))
            if checkbox_slider.value == False and checkbox_others.value == False:
                #output_text_str += "   for time filter: %d entities with date infomation, %d of them fit the range.\n"%(index_node_with_date,index_node_date_in_range)
                output_label_list.append("------for time filter: %d entities with date infomation, %d of them fit the range.\n"%(index_node_with_date,index_node_date_in_range))
            #output_text_str += "    for score filter: %d entities with score information, %d of them have a too high score so be filtered out.\n"%(index_node_has_score,index_score_too_high)
            output_label_list.append("------for score filter: %d entities with score information, %d of them too high and %d of them too low which are filtered out.\n"%(index_node_has_score,index_score_too_high,index_score_too_low))
            index_displayed_node += index_node_has_score - index_score_too_high - index_score_too_low
    
    with map_out:
        clear_output(wait=True)
        display(myMap)
    #output_text_str += "Totally %d entities are displayed in map."%(index_displayed_node)
    output_label_list.append("Totally %d entities are displayed in map."%(index_displayed_node))

    #checkboxes.append(widgets.Checkbox(description = 'People', value=False))
    output_labels = [widgets.Label(value=i) for i in output_label_list]
    output_label_str_container.children = [i for i in output_labels]
    with text_out:
        clear_output(wait=True)
        display(output_label_str_container)
    print ("finish")

button.on_click(on_button_clicked)  





# Time line plot (individual part)

# load related library

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


import pandas as pd
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame


from ipywidgets import widgets 
from ipywidgets import *  
from IPython.display import display,clear_output
#https://github.com/jupyter-widgets/ipywidgets/issues/134 about ouput
from ipywidgets import Layout
import folium
from datetime import datetime
from datetime import date
from dateutil import rrule

from traitlets import directional_link

# install rtree
# https://github.com/kjordahl/SciPy-Tutorial-2015/issues/1
# http://jspeis.com/installing-rtree-on-mac-os-x/

plt.style.use('ggplot')

def loadJson(filename):
	import json
	f = open(filename, "r", encoding = "utf-8")
	output = json.load(f)
	f.close()
	return output

# load score information

In [None]:
#load score information

names=['title', 'score']
pos_score_OL = pd.read_csv("title_geo_country_continent.csv", names=names)

pos_score_OL = loadJson("score_file/pos_score_OL.json")
neg_score_OL = loadJson("score_file/neg_score_OL.json")
pos_neg_score_OL = loadJson("score_file/pos_neg_score_OL.json")

pos_score_MPQA = loadJson("score_file/pos_score_MPQA.json")
neg_score_MPQA = loadJson("score_file/neg_score_MPQA.json")
pos_neg_score_MPQA = loadJson("score_file/pos_neg_score_MPQA.json")

pos_score_LIWC = loadJson("score_file/pos_score_LIWC.json")
neg_score_LIWC = loadJson("score_file/neg_score_LIWC.json")
pos_neg_score_LIWC = loadJson("score_file/pos_neg_score_LIWC.json")

scores = {}
scores["OL"] = {}
scores["MPQA"] = {}
scores["LIWC"] = {}
    
scores["OL"]["positive"] = pos_score_OL
scores["OL"]["negative"] = neg_score_OL
scores["OL"]["positive+negative"] = pos_neg_score_OL
scores["MPQA"]["positive"] = pos_score_MPQA
scores["MPQA"]["negative"] = neg_score_MPQA
scores["MPQA"]["positive+negative"] = pos_neg_score_MPQA
scores["LIWC"]["positive"] = pos_score_LIWC
scores["LIWC"]["negative"] = neg_score_LIWC
scores["LIWC"]["positive+negative"] = pos_neg_score_LIWC

In [47]:
# a = '1996-01-01'
# b = '2006-01-01'
# a_date = datetime.strptime(a,'%Y-%m-%d')
# b_date = datetime.strptime(b,'%Y-%m-%d')
# print (a_date-b_date)
                        

-3653 days, 0:00:00


# The codes below are for an old version

This version load title_long_lat file and world_map file, and combine country border and entities geo-info to check which country the entity is belong to dynamically.

load geo information for analyzed data (old version)


column: title,	geometry



#load geo information for analyzed data
For this version, we get dataframe (geodata) and geometry for each category, and then generate geopandas for each of them.

In [2]:

# load all geo information
names=['title', 'lon', 'lat']
geodata = pd.read_csv("title_long_lat.csv", names=names)
geometry = [Point(xy) for xy in zip(geodata['lon'], geodata['lat'])]

# load event set (with _)
event_set = set([one for one in loadJson("eventList.json")])
# load people set
people_set = set([one for one in loadJson("personList.json")])

print (len([one for one in geodata['title'] if one in event_set]))

print (len([one for one in geodata['title'] if one in people_set]))

event_geodata = geodata[geodata['title'].isin(event_set)]
event_geometry = [Point(xy) for xy in zip(event_geodata['lon'], event_geodata['lat'])]
people_geodata = geodata[geodata['title'].isin(people_set)]
people_geometry = [Point(xy) for xy in zip(people_geodata['lon'], people_geodata['lat'])]
other_geodata = geodata[~geodata['title'].isin(event_set)]
other_geodata = other_geodata[~geodata['title'].isin(people_set)]
other_geometry = [Point(xy) for xy in zip(other_geodata['lon'], other_geodata['lat'])]

# event_gdf = GeoDataFrame(event_geodata, geometry=event_geometry)
# people_gdf = GeoDataFrame(people_geodata, geometry=people_geometry)
# other_gdf = GeoDataFrame(other_geodata, geometry=other_geometry)

sub_gdf = {}
sub_gdf["People"] = GeoDataFrame(people_geodata, geometry=people_geometry)[['title','geometry']] #people_gdf
sub_gdf["Events"] = GeoDataFrame(event_geodata, geometry=event_geometry)[['title','geometry']] # event_gdf
sub_gdf["Others"] = GeoDataFrame(other_geodata, geometry=other_geometry)[['title','geometry']] # other_gdf

# sub_geodata = {}
# sub_geodata["People"]=people_geodata
# sub_geodata["Events"]=event_geodata
# sub_geodata["Others"]=other_geodata

print(len(geodata))
print (len(event_geodata))
print (len(people_geodata))
print (len(other_geodata))

7754
1911




956916
7754
1911
947251


In [3]:
#how the data sub_geodata["People"] looks like
sub_gdf["People"].head()

Unnamed: 0,title,geometry
16,Alfred_Nobel,POINT (18.01928611111111 59.35681111111111)
19,A._A._Milne,POINT (0.107 51.09)
36,Abraham,POINT (35.110726 31.524744)
39,Ahmad_Shah_Durrani,POINT (65.70694444444446 31.61944444444445)
40,Aga_Khan_III,POINT (32.878722 24.088254)


# load score information

In [6]:
#load score information

pos_score_OL = loadJson("score_file/pos_score_OL.json")
neg_score_OL = loadJson("score_file/neg_score_OL.json")
pos_neg_score_OL = loadJson("score_file/pos_neg_score_OL.json")

pos_score_MPQA = loadJson("score_file/pos_score_MPQA.json")
neg_score_MPQA = loadJson("score_file/neg_score_MPQA.json")
pos_neg_score_MPQA = loadJson("score_file/pos_neg_score_MPQA.json")

pos_score_LIWC = loadJson("score_file/pos_score_LIWC.json")
neg_score_LIWC = loadJson("score_file/neg_score_LIWC.json")
pos_neg_score_LIWC = loadJson("score_file/pos_neg_score_LIWC.json")

scores = {}
scores["OL"] = {}
scores["MPQA"] = {}
scores["LIWC"] = {}
    
scores["OL"]["positive"] = pos_score_OL
scores["OL"]["negative"] = neg_score_OL
scores["OL"]["positive+negative"] = pos_neg_score_OL
scores["MPQA"]["positive"] = pos_score_MPQA
scores["MPQA"]["negative"] = neg_score_MPQA
scores["MPQA"]["positive+negative"] = pos_neg_score_MPQA
scores["LIWC"]["positive"] = pos_score_LIWC
scores["LIWC"]["negative"] = neg_score_LIWC
scores["LIWC"]["positive+negative"] = pos_neg_score_LIWC

# get border info for world map (old version)

column: country, geometry, continent

In [26]:
# get borders of countries 
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
print ("structure of 'world':")
print (world.head())
# pick only country name and geometry column
country_border = world[['name','geometry', 'continent']]
# rename column name to 'country','geometry'
country_border = country_border.rename(columns={'name':'country'})
print ("structure of 'country_border':")
print (country_border.head())



structure of 'world':
    pop_est      continent                  name iso_a3  gdp_md_est  \
0  28400000           Asia           Afghanistan    AFG       22270   
1  12799293         Africa                Angola    AGO      110300   
2   3639453         Europe               Albania    ALB       21810   
3   4798491           Asia  United Arab Emirates    ARE      184300   
4  40913584  South America             Argentina    ARG      573900   

                                            geometry  
0  POLYGON ((61.21081709172574 35.65007233330923,...  
1  (POLYGON ((16.32652835456705 -5.87747039146621...  
2  POLYGON ((20.59024743010491 41.85540416113361,...  
3  POLYGON ((51.57951867046327 24.24549713795111,...  
4  (POLYGON ((-65.50000000000003 -55.199999999999...  
structure of 'country_border':
                country                                           geometry  \
0           Afghanistan  POLYGON ((61.21081709172574 35.65007233330923,...   
1                Angola  (POLYGON 

In [39]:
#show_table(world[world['name']=='China'].)
#show_table(world.head())
HTML(world[world['name']=='China'].to_html())
#HTML(world[world['name']=='Germany'].to_html())



Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
30,1338612970,Asia,China,CHN,7973000,(POLYGON ((110.3391878601516 18.67839508714761...


In [5]:

#cities = gpd.read_file(gpd.datasets.get_path('naturalearth_cities'))

# build continent->country dict for dropdown

#continent_list = world.continent.unique()
continent_list = ["all"] + list(set(world['continent']))
#continent_list.append("all")

continent_country_dict = {}
for continent in continent_list:
    continent_country_dict[continent] = ["all"] + list(world[world['continent']== continent]['name'])
    #continent_country_dict[continent].append("all")
    
print ("continent list for dropdown:")
print (continent_list)
for one in ["People","Events","Others"]:
    sub_gdf[one].crs = country_border.crs
#print (conti_country[1])
#area_dict = {}

structure of 'world':
    pop_est      continent                  name iso_a3  gdp_md_est  \
0  28400000           Asia           Afghanistan    AFG       22270   
1  12799293         Africa                Angola    AGO      110300   
2   3639453         Europe               Albania    ALB       21810   
3   4798491           Asia  United Arab Emirates    ARE      184300   
4  40913584  South America             Argentina    ARG      573900   

                                            geometry  
0  POLYGON ((61.21081709172574 35.65007233330923,...  
1  (POLYGON ((16.32652835456705 -5.87747039146621...  
2  POLYGON ((20.59024743010491 41.85540416113361,...  
3  POLYGON ((51.57951867046327 24.24549713795111,...  
4  (POLYGON ((-65.50000000000003 -55.199999999999...  
structure of 'country_border':
                country                                           geometry  \
0           Afghanistan  POLYGON ((61.21081709172574 35.65007233330923,...   
1                Angola  (POLYGON 

# Old Version

check which country the entities is belonging to dynamically


In [13]:


# preparing a container for side panel
sp_container = widgets.VBox(layout=Layout(
    border='solid 1px',
    width='30%'
)
)
style = {'description_width':'50px'}
my_layout = Layout(border='solid 1px')

# for sentiment lexicon
# lexicon container for lexicons
lexicon_container = widgets.VBox(layout=my_layout)
# add title
label_lexicon = widgets.Label(value="sentiment lexicon")
# preparing a container to put in radio buttons
radio_button_lexicon = widgets.RadioButtons(
    options=['OL', 'MPQA', 'LIWC'],
    #description='sentiment lexicon',
    #style=style,
    disabled=False
)
# put text and button into lexicon container
lexicon_container.children = [label_lexicon,radio_button_lexicon]


# for target group
# container for groups
group_container = widgets.VBox(layout=my_layout)
label_group = widgets.Label(value="target group")
# checkboxes container
cb_container = widgets.VBox(layout=Layout(
)) 
# preparing a container to put in created checkbox
checkboxes = []  
# create checkbox
checkboxes.append(widgets.Checkbox(description = 'People', value=False))
checkboxes.append(widgets.Checkbox(description = 'Events', value=False))
checkboxes.append(widgets.Checkbox(description = 'Others', value=False))
# put check box into checkboxes container
cb_container.children=[i for i in checkboxes]
#display(cb_container)
# # add a new container to control the arrangement
# temp_container = widgets.HBox()
# temp = widgets.Label(description='choose target group')
# temp_container.children=[cb_container,temp]
group_container.children=[label_group,cb_container]


# for target sentiment, pos | neg | pos+neg
# container for sentiment
sentiment_container = widgets.VBox(layout=my_layout)
label_sentiment = widgets.Label(value="target sentiment")
# checkboxes container
st_container = widgets.VBox(
    #description='target sentiment'
)
radio_button_sentiment = widgets.RadioButtons(
    options=['positive', 'negative', 'positive+negative'],
    #description='sentiment',
    #style=style,
    disabled=False
)
sentiment_container.children=[label_sentiment,radio_button_sentiment]


# for area filter
#container for area
area_container = widgets.VBox(layout=my_layout)
label_area = widgets.Label(value="target area")
#dropdown_container = widgets.HBox()
#dropdown
dropdown_continent = widgets.Dropdown(
    options = continent_list,
    value = 'all',
    description='continent:',
    layout=Layout(width='250px'),
    disabled=False,
)
    
dropdown_country = widgets.Dropdown(
    #options = country_list,
    description='country:',
    #options = ["all"],
    #value='all',
    layout=Layout(width='250px'),
    
    disabled = False,
)

def transform(case):
    return continent_country_dict[case]

directional_link((dropdown_continent,'value'),(dropdown_country,'options'),transform)


#l = traitlets.link((dropdown_continent,'value'),(dropdown_country,'value'))
#dropdown_container.children = [dropdown_continent,dropdown_country]
area_container.children=[label_area,dropdown_continent,dropdown_country]


# container for slider
slider_container = widgets.VBox(layout=my_layout)
# add a time slider: SelectionRangeSlider
dates = [day for day in rrule.rrule(rrule.MONTHLY, dtstart=date(2001,1,1), until=date.today())]
options = [(i.strftime('%b%Y'),i) for i in dates]

slider = widgets.SelectionRangeSlider(
    options=options,
    index=(0,192),
    #description='time range',
    layout=Layout(width='250px',height='40px'),
    #,description_width='200px'
    #description_width:'200px'
    style = style,
    disabled=False
)
label_slider = widgets.Label(value="time range")
slider_container.children=[label_slider,slider]


#for button
update_container = widgets.VBox(layout=my_layout)
#add button that updates the graph based on the checkboxes
button = widgets.Button(description="Update the graph")
update_container.children=[button]


sp_container.children=[lexicon_container, group_container, area_container, slider_container, button]


# for map
map_container = widgets.VBox(layout=Layout(width='80%',height='100%',border='solid 4px'))
# preparing the plot 
myMap = folium.Map(location=[45.5236, -122.6750], zoom_start=1)
map_out = widgets.Output()
with map_out:
    display(myMap)
map_container.children = [map_out]
    
# container for all
all_container = widgets.HBox(layout=Layout(border='solid 4px')
)
all_container.children=[map_container,sp_container]
display(all_container)
#print (dir(myMap))

def on_button_clicked(b): 
#     # delete map and create a new one
    #import rtree
    myMap = folium.Map(location=[45.5236, -122.6750], zoom_start=1)
    # get dropdown info
    node_continent = dropdown_continent.value
    node_country = dropdown_country.value
    print (node_continent,node_country)
    cc_e = 0
    for c in cb_container.children:
        
        if c.value:
            group = c.description
            # area filter
            if not node_continent == "all":
                if not node_country == "all":
                    target_data = gpd.sjoin(sub_gdf[group],country_border,how="inner",op='intersects')
                    target_data = target_data[target_data['country']==node_country]
                else:
                    target_data = gpd.sjoin(sub_gdf[group],country_border,how="inner",op='intersects')
                    target_data = target_data[target_data['continent']==node_continent]
            else:
                target_data = sub_gdf[group]
            #print (target_data)
            print ("totally %d nodes will be shown"%(len(target_data)))
            
            
            for each in target_data.iterrows():              
                #print (node_continent,node_country)
                #print ("finished")
                #break
                #[each[1]['lat'],each[1]['lon']]
                title = each[1]['title']
                title = title.replace("_", " ")
                pos_score_dict = scores[radio_button_lexicon.value]["positive"]
                if title in pos_score_dict:
                    #print ("there")
                    cc_e += 1
#                     if cc_e >= 20:
#                         break
                    if cc_e % 500 == 0:
                        print (cc_e)
                    
                    #deal with color
                    pos_score = pos_score_dict[title]
                    #print (pos_score)
                    neg_score = scores[radio_button_lexicon.value]["negative"][title]
                    #print (neg_score)
                    
#                     if pos_score + neg_score >= 1000:
#                         continue
                    if (pos_score + neg_score) == 0:
                        node_color = 'yellow'
                    else:
                        # formular
                        # ff * pos_score/total_score
                        # 
                        pos_color = str(hex(int(255 * pos_score/(pos_score+neg_score))))[2:]
                        neg_color = str(hex(int(255 * neg_score/(pos_score+neg_score))))[2:]
                        #print (pos_color,neg_color)
                        node_color = "#"+neg_color+"00"+pos_color
#                     pos_color = (1000-pos_score) * 255/1000
                    
#                     pos_color = hex(int(pos_color))
                    
#                     pos_color = str(pos_color)[2:]
#                     #print ("pos_color:"+pos_color)
#                     #print (neg_score)
#                     neg_color = int((1000-neg_score) * 255/1000)
#                     #print (neg_color)
#                     neg_color = str(hex(neg_color))[2:]
#                     #print ("neg_color:"+neg_color)
                
                    #print (node_color)
                    #print (title+node_color)
                    #break
                    #print ((scores[radio_button_lexicon.value]["positive+negative"][title]))
                    popup = folium.Popup(title,parse_html=True)
                    folium.CircleMarker([each[1]['geometry'].y,each[1]['geometry'].x],
                                  radius=(scores[radio_button_lexicon.value]["positive+negative"][title])/20,
                                  #radius = 5,     
                                  popup=popup,
                                  fill = True,
                                  color=node_color,
                                  fill_color=node_color,
                                 ).add_to(myMap)
                    
        else:
            # delete
            print ("here")
    print (cc_e)
    with map_out:
        clear_output(wait=True)
        display(myMap)
    print ("finish")

button.on_click(on_button_clicked)  



# function to save entity-geo-country-continent info
(with which we don't need to compute the country for each node dynamically, and make it running faster
    

In [27]:
def get_entity_geo_country_continent_csv():
    
    ### get border info for world map
    ### get "country_border"
    
    
    #load existing geo information: title_long_lat.csv
    names=['title', 'lon', 'lat']
    geodata = pd.read_csv("title_long_lat.csv", names=names)
    geometry = [Point(xy) for xy in zip(geodata['lon'], geodata['lat'])]
    all_gdf = GeoDataFrame(geodata, geometry=geometry)[['title','geometry']] 
    all_gdf.crs = country_border.crs
    
    #merge title-geometry and country-continent-geometry
    joindata = gpd.sjoin(all_gdf,country_border,how="inner",op='intersects')
    
    # write into gdf
    #joindata.to_file("title_geo_country_continent_gdf")
    
    # write into csv
    #newnames = ['title','geometry','country','continent']
    #joindata.to_csv('title_geo_country_continent.csv',columns=newnames,header=False,index=False,encoding='utf-8')
    
    
    
get_entity_geo_country_continent_csv()

#load existing geo information: title_long_lat.csv
# names2=['title', 'lon', 'lat']
# geodata2 = pd.read_csv("title_long_lat.csv", names=names2)
# geometry2 = [Point(xy) for xy in zip(geodata2['lon'], geodata2['lat'])]
# all_gdf2 = GeoDataFrame(geodata2, geometry=geometry2)[['title','geometry']]
# show_table(all_gdf2.head())


In [33]:
def show_table(df):
    from IPython.display import display, HTML
    #print (all_gdf[all_gdf['title']=='Fuzhou_University'].to_html())
    #HTML(all_gdf[all_gdf['title']=='Fuzhou_University'].to_html())
    #HTML(all_gdf[all_gdf['title']=='Alfred_Nobel'].to_html())
    #HTML(all_gdf.head().to_html())
    HTML(df.to_html())

In [5]:
import pandas as pd
names=['title', 'lon', 'lat']
data_1 = pd.read_csv("title_long_lat_2.csv", names=names)
data_2 = pd.read_csv("title_long_lat_3.csv", names=names)

data_3 = pd.concat([data_2,data_1],ignore_index=True)
data_4 = data_3.drop_duplicates(subset='title')

data_4.to_csv('title_long_lat_4.csv',columns=names,header=False,index=False,encoding='utf-8')
#data_4.to_csv('title_long_lat_title.csv',columns=['title'],header=False,index=False,encoding='utf-8')
#data_4.to_csv('title_long_lat_long_lat.csv',columns=['lon','lat'],header=False,index=False,encoding='utf-8')
# add " " automatically if there is comma inside title


In [47]:
data_8 = pd.concat([data_1,data_2],join='outer')
data_8

Unnamed: 0,title,lon,lat
0,Algeria,3.216667,36.7
1,Andorra,1.516667,42.5
2,Aruba,-70.033333,12.516667
3,Angola,13.333333,-8.833333
0,Alabama,-86.7,32.7
1,Algeria,2.0,28.0
2,Apollo_8,-165.016667,8.133333
3,Apollo_11,-169.15,13.316667
4,Andorra,1.5,42.5


In [3]:
data_4 = data_1.append(data_2)
data_4

Unnamed: 0,title,lon,lat
0,Algeria,3.216667,36.7
1,Andorra,1.516667,42.5
2,Aruba,-70.033333,12.516667
3,Angola,13.333333,-8.833333
0,Alabama,-86.7,32.7
1,Algeria,2.0,28.0
2,Apollo_8,-165.016667,8.133333
3,Apollo_11,-169.15,13.316667
4,Andorra,1.5,42.5


In [5]:
data_5 = pd.concat([data_1,data_2],ignore_index=True)
data_5

Unnamed: 0,title,lon,lat
0,Algeria,3.216667,36.7
1,Andorra,1.516667,42.5
2,Aruba,-70.033333,12.516667
3,Angola,13.333333,-8.833333
4,Alabama,-86.7,32.7
5,Algeria,2.0,28.0
6,Apollo_8,-165.016667,8.133333
7,Apollo_11,-169.15,13.316667
8,Andorra,1.5,42.5


In [8]:
data_6 = pd.merge(data_1,data_2,how='outer',on=['title'])
data_6

Unnamed: 0,title,lon_x,lat_x,lon_y,lat_y
0,Algeria,3.216667,36.7,2.0,28.0
1,Andorra,1.516667,42.5,1.5,42.5
2,Aruba,-70.033333,12.516667,,
3,Angola,13.333333,-8.833333,,
4,Alabama,,,-86.7,32.7
5,Apollo_8,,,-165.016667,8.133333
6,Apollo_11,,,-169.15,13.316667


In [36]:
data_7 = data_5.duplicated()
data_7.head()
data_5
# data_7 = data_1.join(data_2,on='title',how='outer',)
# data_7

Unnamed: 0,title,lon,lat
0,Algeria,3.216667,36.7
1,Andorra,1.516667,42.5
2,Aruba,-70.033333,12.516667
3,Angola,13.333333,-8.833333
4,Alabama,-86.7,32.7
6,Apollo_8,-165.016667,8.133333
7,Apollo_11,-169.15,13.316667


In [23]:
data_2

Unnamed: 0,title,lon,lat
0,Alabama,-86.7,32.7
1,Algeria,2.0,28.0
2,Apollo_8,-165.016667,8.133333
3,Apollo_11,-169.15,13.316667
4,Andorra,1.5,42.5


overview of files with underline and space

underline:
EventList.json
personList.json
dates

space:
scores
