In [1]:
# related library
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import geopandas as gpd

import ipywidgets
from ipywidgets import widgets 
from ipywidgets import *  
from IPython.display import display,clear_output

from ipywidgets import Layout
from traitlets import directional_link

from datetime import datetime
from datetime import date
from dateutil import rrule

import plotly.graph_objs as go
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()

In [48]:
ipywidgets.__version__


'5.2.2'

In [3]:
# prepare data for dropdown: continent list and country list

# get world info
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# get continent list plus "all"
continent_list = ["all"] + list(set(world['continent']))
#continent_list.append("all")

continent_country_dict = {}
for continent in continent_list:
    continent_country_dict[continent] = ["all"] + list(world[world['continent']== continent]['name'])
    #continent_country_dict[continent].append("all")
    
print ("continent list for dropdown:")
print (continent_list)

continent list for dropdown:
['all', 'Asia', 'Europe', 'Oceania', 'North America', 'Seven seas (open ocean)', 'South America', 'Africa', 'Antarctica']


# load dataset

You can download dataset in the following link:
https://www.dropbox.com/sh/mt7by5f1wgl6n3z/AACddwkFPq5lPpH3ry83MgSDa?dl=0

In [None]:
# we use quilt to make all the datasets as package
import quilt
# install dataset
quilt.install("qianhongye/WikiSentiment")

# import the dataset
from quilt.data.qianhongye import WikiSentiment as wiki

In [4]:
t_people = wiki.talk_score_with_date_for_people._data()
## for locally dataset
#t_people = pd.read_csv("talk_people_score_date_geo.csv")

In [5]:
t_events = wiki.talk_score_with_date_for_events._data()
## for locally dataset
#t_events = pd.read_csv("talk_events_score_date_geo.csv")

In [6]:
a_people = wiki.article_score_with_date_for_people._data()
## for locally dataset
#a_people = pd.read_csv("article_people_score_date_geo.csv")

In [7]:
a_events = wiki.article_score_with_date_for_events._data()
## for locally dataset
#a_events = pd.read_csv("article_events_score_date_geo.csv")

In [8]:
complete_df = {}
complete_df['Articles'] = {}
complete_df['Talks'] = {}
complete_df['Articles']['People'] = a_people
complete_df['Articles']['Events'] = a_events
complete_df['Talks']['People'] = t_people
complete_df['Talks']['Events'] = t_events

# side function

Please run it first

In [9]:
def exclude_BC(df):
    bc_list = [one for one in df['date'] if one.startswith("-")]
    df_withoutBC = df[~df['date'].isin(bc_list)]
    #print ("There are %d dates before Christ,%d entities left after excluding"%(len(bc_list),len(df_withoutBC)))
    return df_withoutBC

In [10]:
def add_datetime_column(df,unit):
    # unit: date,month,year
    datetime_format = ""
    if unit == "date":
        datetime_format = "%Y-%m-%d"
    elif unit == "month":
        datetime_format = "%Y-%m"
    elif unit == "year":
        datetime_format = "%Y"
    new_list = [datetime.strptime(one,datetime_format) for one in df[unit]]
    if unit == "date":
        df = df.assign(datetime_date=pd.Series(new_list).values)
    elif unit == "month":
        df = df.assign(datetime_month=pd.Series(new_list).values)
    elif unit == "year":
        df = df.assign(datetime_year=pd.Series(new_list).values)
    return df

In [11]:
def add_month_column(df):
    new_list = [one[:7] for one in df['date']]
    df = df.assign(month=pd.Series(new_list).values)
    return df

In [12]:
def add_year_column(df):
    new_list = [one[:4] for one in df['date']]
    df = df.assign(year=pd.Series(new_list).values)
    return df

In [13]:
def score_median_group_by_column(df,tcn,scn):
    #print (df)
    out_df = df.groupby(tcn).agg({scn:np.median}).reset_index()
    #print (df)
    return out_df
# df_month_score=score_median_group_by_column(monthSenti,'month','score')
# print (df_month_score.head())

In [14]:
def score_percentile_group_by_column(df,tcn,scn,percentile):
    #print (df)
    out_df = df.groupby(tcn).quantile(percentile).reset_index()
    #print (out_df)
    return out_df

In [18]:
# plot function
def prepare_scatter(lexicon,sentiment,group,domain,geo,times,unit):
    # lexicon,sentiment,group,domain,geo,time,ra,unit
    # lexicon: OL, MPQA, LIWC, ANEW
    # sentiment: pos, neg, total
    # group: [group_people,group_events,group_others] boolean value
    # domain: Articles, Talks
    # geo: [continent,country] continent,country could be 'all'
    # times: [start_year,start_month,end_year,end_month] int
    # unit: 'year' or 'month'
    
    # output_text
    statistics = []
    dict_a_people = {"total":"1,146,257","date":"775,664","BC":"27"}
    dict_a_events = {"total":"54,071","date":"22,582","BC":"33"}
    dict_t_people = {"total":"415,124","date":"289,108","BC":"26"}
    dict_t_events = {"total":"21,621","date":"10,283","BC":"27"}
    statistics_dict = {"Articles":{"People":dict_a_people,"Events":dict_a_events},"Talks":{"People":dict_t_people,"Events":dict_t_events}}
    birth_occurrance = {"People":"birth","Events":"occurrance"}
    
    # prepare showed dates depending on time range
    # get time range
    start_yy,start_mm,end_yy,end_mm = times
    start_date = date(start_yy,start_mm,1)
    end_date = date(end_yy,end_mm,1)
    # prepare x-axis
    xaxis_value = [day for day in rrule.rrule(rrule.MONTHLY,dtstart=start_date,until=end_date)]
    #print ("value for x-axis looks like this: \n%s \nlen(x-axis): %d"%(xaxis_value[-1],len(xaxis_value)))
    
    # prepare dataframe to display
    df = pd.DataFrame()
    
    #group_people,group_events,group_others = group
    
    # choose corresponding dataframe depend on domain and group
    for one in group:
        if one.value:
            # add statistics for total entities for this group
            statistics.append("There are totally %s %s entities in Wikipedia %s."%(statistics_dict[domain][one.description]['total'],one.description,domain))
            statistics.append("Among them, %s have %s date information. Inside them we exclude %s entities whose date before Christ (BC)."%
                              (statistics_dict[domain][one.description]['date'],birth_occurrance[one.description],statistics_dict[domain][one.description]['BC']))

            if df.empty:
                df = complete_df[domain][one.description][['title','pos_score_'+lexicon,'neg_score_'+lexicon,'date','country','continent']]
            else:
                df = pd.concat([df,complete_df[domain][one.description][['title','pos_score_'+lexicon,'neg_score_'+lexicon,'date','country','continent']]])

    statistics.append("After all, there are {:,d} entities with AC date information in this run.".format(len(df)))

    # filter out entities based on geo
    continent,country = geo
    # if continent == 'all', do nothing
    if not continent == 'all':
        if country == 'all':
            # filter out entities with target continent
            df = df[df['continent']==continent]
        else:
            # filter out entities with target country
            df = df[df['country']==country]
            
    statistics.append("After filtering with area, there are {:,d} entities left.".format(len(df)))
          
    # add target unit and corresponding datetime type for entities
    if unit == "month":
        df = add_month_column(df)
    elif unit == "year":
        df = add_year_column(df)
        
    # we need datetime type because we use it to filter out entities within time range
    df = add_datetime_column(df,unit)
    
    # filter out entities based on time range
    df = df[df["datetime_"+unit].isin(xaxis_value)]
    statistics.append("After filtering with date, there are {:,d} entities left, and collected in the plot.".format(len(df)))

    # get score depending on lexicon and sentiment
    df = df.assign(total=df["pos_score_"+lexicon]+df["neg_score_"+lexicon])
    df = df.assign(final=df["pos_score_"+lexicon]-df["neg_score_"+lexicon])
    #print (df.head(5))
    
    return df,statistics,start_date,end_date


# widget 2: sentiment score for identical Wikipedia concepts over time

In [41]:
def on_button_clicked(b):
    # prepare and filter dataset
    # get parameters from input
    # get geo
    continent = dropdown_continent.value
    country = dropdown_country.value
    geo = [continent,country]
    # get time
    time_start_year = dropdown_start_year.value
    time_start_month = dropdown_start_month.value
    time_end_year = dropdown_end_year.value
    time_end_month = dropdown_end_month.value
    time = [time_start_year,time_start_month,time_end_year,time_end_month]
    # get lexicon
    lexicon = radio_button_lexicon.value
    # get sentiment
    sentiment = sen_dict[radio_button_sentiment.value]
    # get domain
    domain = radio_button_domain.value
    # get group
    # group_people,group_events,group_others = cb_container.children
    group = cb_container.children
    
    # define time unit to month
    unit = "month"
    
    # output text
    #output_label_list = []
    output_label_str_container = widgets.VBox(layout=Layout(width='100%',border='solid 1px'))
    
    
    df,output_label_list,start_date,end_date=prepare_scatter(lexicon,sentiment,group,domain,geo,time,unit)
    # add column 'total' and 'final' during preparing scatter
    # total = pos+neg
    # final = pos-neg
    
    #for i in range(len(df)):
    text_list = [df['title'].tolist()[i]+"<br>"+"pos:"+str(df['pos_score_'+lexicon].tolist()[i])+", neg:"+str(df['neg_score_'+lexicon].tolist()[i]) for i in range(len(df))]
    print (text_list[0],text_list[1])
    trace = go.Scatter(
        x = df['datetime_'+unit].tolist(),
        y = df['final'].tolist(),
        mode = 'markers',
        marker=dict(size=[i+2 for i in df['total'].tolist()],
                    opacity=0.8,
                    color=df['total'],
                    colorscale='Blues',#'Hot',#'Bluered',#'Blues',
                    reversescale=True,
                    showscale=True
                   ),
        name='marker',
        text = df['title'],
        hovertext = text_list,
        hoverinfo='text+x'
        #text=df['title']
    )
    mylayout = go.Layout(
        title='Sentiment score for Wikipedia entities',
        hovermode='closest',
        xaxis=dict(
            title='time',
            zeroline=True,
            range=[start_date,end_date]
        ),
        yaxis=dict(
            title='score',
        ),
        showlegend=False
    )
    data = [trace]
    fig = go.Figure(data=data,layout=mylayout)
        
    
    #py.iplot(data)
    

    
#     # draw
#     figure = plt.figure(figsize=(13,5))
#     ax = figure.add_subplot(111)
#     size = np.asarray(df['total'].tolist())*10
#     plt.scatter(df['datetime_'+unit].tolist(),df['final'].tolist(),c=df['total'].tolist(),s=size,picker=True)
    
#     ax.axhline(color='r')
    
    


    with box_out:
        clear_output(wait=True)
        #display(all_container)
        
        iplot(data)
    #plt.close(figure)
        

In [42]:
# framework
#     lexicon container: Which lexicon you want to choose? (OL, MPQA, LIWC)
#         title html
#         radio_button
#     group container: Which group you want to show? (People, Events)
#         title html
#         checkboxes container:
#             checkbox
#             checkbox
#     area filter: Do you want to filter out the continent or country for entities?
#         title html
#         dropdown_continent
#         dropdown_country
#     domain container: What domain you want to use? (Article, Talks)
#         title html
#         checkboxes container:
#             radio_button
#     time filter: Set the start and stop of time range
#         dropdown
#         

# change label to HTML (for set style more flexible)
    
container_width = '100px'#'auto'
with_border_layout = Layout(border='solid 0.5px')


# preparing a container for header
header_container = widgets.VBox(layout=Layout(width='100%',border='solid 0.5px'))
header_text = """<h1>Time Widget 2</h1><br>
<p>This widget is used to show detail sentiment score for Wikipedia concepts verying with time.</p><br>
<p>Wikipedia concepts (or Wikipedia entities) here contains Wikipedia articles and Wikipedia talks, which can be entered from upper left side of article pages.
This widget includes entities both People and Events, with people the date indicates birth date, while with events the date indicates occurrance date.
The text of Articles and Talks is extracted from Wikipedia Dump, and time stamps are extracted from DBPedia.
The scores are calculated with term frequency for sentiment words based on certain lexicons (OL, MPQA, LIWC, ANEW). For ANEW we take valency into account too.
</p><br>
<p>
For each run it plots sentiment for every entity fitting the time and area range as one circle. 
Value in x-axis indicates the date (united by month) and y-axis indiates merging value, which means to subtract negative_score from its positive_score.
The size and color of the circle indicates its total score, which means sum up positive_score and negative_score.
You can check which entity the circle present for and what its positive and negative scores are by hovering the mouse on it.

In the bottom you will get the data characteristics for the current run.</p>"""
html_header = widgets.HTML(value=header_text)
header_container.children=[html_header]


# for lexicon
lexicon_container = widgets.VBox(layout=Layout(width='8%',border='solid 0.5px'))
# add title
html_lexicon = widgets.HTML(value="<b>Lexicon</b>")
#label_lexicon = widgets.Label(value="sentiment lexicon")
# preparing a container to put in radio buttons
radio_button_lexicon = widgets.RadioButtons(
    options=['OL', 'MPQA', 'LIWC','ANEW'],
    #description='sentiment lexicon',
    #style=style,
    disabled=False
)
# put text and button into lexicon container
lexicon_container.children = [html_lexicon,radio_button_lexicon]


# for sentiment
sentiment_container = widgets.VBox(layout=lexicon_container.layout)
# add title
html_sentiment = widgets.HTML(value="<b>Sentiment</b>")
# preparing a container to put in radio buttons
radio_button_sentiment = widgets.RadioButtons(
    options=['total', 'positive', 'negative'],
    #description='',
    #style=style,
    disabled=False
)
# put text and button into lexicon container
sentiment_container.children = [html_sentiment,radio_button_sentiment]


# for target group
group_container = widgets.VBox(layout=lexicon_container.layout)
html_group = widgets.HTML(value="<b>Group</b>")
# checkboxes container
cb_container = widgets.VBox(layout=Layout(
)) 
# preparing a container to put in created checkbox
checkboxes = []  
# create checkbox
checkboxes.append(widgets.Checkbox(description = 'People', value=False))
checkboxes.append(widgets.Checkbox(description = 'Events', value=False))
# put check box into checkboxes container
cb_container.children=[i for i in checkboxes]
#display(cb_container)
# # add a new container to control the arrangement
# temp_container = widgets.HBox()
# temp = widgets.Label(description='choose target group')
# temp_container.children=[cb_container,temp]
group_container.children=[html_group,cb_container]


# for area 
area_container = widgets.VBox(layout=Layout(width='25%',border='solid 0.5px'))
html_area = widgets.HTML(value="<b>Geo</b>")
#dropdown_container = widgets.HBox()
#dropdown
dropdown_continent = widgets.Dropdown(
    options = continent_list,
    value = 'all',
    description='continent:',
    layout=Layout(width='240px'),
    disabled=False,
)  
dropdown_country = widgets.Dropdown(
    #options = country_list,
    description='country:',
    #options = ["all"],
    #value='all',
    layout=Layout(width='240px'),
    disabled = False,
)
def transform(case):
    return continent_country_dict[case]
directional_link((dropdown_continent,'value'),(dropdown_country,'options'),transform)
area_container.children=[html_area,dropdown_continent,dropdown_country]


# for domain
domain_container = widgets.VBox(layout=lexicon_container.layout)
# add title
html_domain = widgets.HTML(value="<b>Domain</b>")
# preparing a container to put in radio buttons
radio_button_domain = widgets.RadioButtons(
    options=['Articles', 'Talks'],
    disabled=False
)
# put text and button into domain container
domain_container.children = [html_domain,radio_button_domain]


# for date
time_container = widgets.VBox(layout=Layout(width='25%',border='solid 0.5px'))
#add title
html_time = widgets.HTML(value="<b>Time</b>")
# prepare time year list
# dates = [day for day in rrule.rrule(rrule.YEARLY, dtstart=date(1700,1,1), until=date.today())]
# year_list = [(i.strftime('%Y'),i) for i in dates]
year_list = list(range(1900,2019,1))
month_list = list(range(1,13,1))
#time_list = [1940,1960,1992,1993]
dropdown_start_year = widgets.Dropdown(
    options = year_list,
    #description='start(year,month):',
    layout=Layout(width='80px'),
    #style={'description_width':'initial'},
    disabled=False,
)
dropdown_start_month = widgets.Dropdown(
    options = month_list,
    #description='',
    layout=Layout(width='40px'),
    disabled=False,
)
time_container_start = widgets.HBox()
time_container_start.children=[widgets.Label('start (year,month):'),dropdown_start_year,dropdown_start_month]

dropdown_end_year = widgets.Dropdown(
    options = year_list,
    #description='end(year,month):',
    #options = ["all"],
    value=year_list[-1],
    layout=Layout(width='80px'),
    disabled = False,
)
dropdown_end_month = widgets.Dropdown(
    options = month_list,
    #description='',
    #options = ["all"],
    #value=month_list[-1],
    layout=Layout(width='40px'),
    disabled = False,
)
time_container_end = widgets.HBox()
time_container_end.children = [widgets.Label('end (year,month):'),dropdown_end_year,dropdown_end_month]
time_container.children = [html_time,time_container_start,time_container_end]


#for button
update_container = widgets.VBox(layout=Layout(#display='flex',
                                              #flex_flow='column',
                                              align_items='center',
                                              #width='100%',
                                              width='10%',
                                              #border='solid 0.5px'
))
#add button that updates the graph based on the checkboxes
button = widgets.Button(description="Go",layout=Layout(width='90%',height='100px'))
update_container.children=[button]


# preparing a container for input panel
input_container = widgets.HBox(layout=Layout(
    display='flex',
    flex_flow='row',
    align_items='stretch',
    border='solid 0.5px',
    #height='120px'
    #width='30%'
))
input_container.children=[lexicon_container, sentiment_container,group_container,domain_container,area_container,time_container,update_container]


# for plot
box_out = ipywidgets.Output(layout=Layout(width='100%',height='500px',border='solid 0.5px'))
# for out_text
#label_out = widgets.Label(value="Here is the statistics of output...",layout=Layout(width='100%'))
text_out = widgets.Output(layout=Layout(width='100%',border='solid 0.5px'))


# for output
output_container = widgets.VBox(layout=Layout(border='solid 0.5px'))
output_container.children = [box_out,text_out]


# container for all: including header, input and output two sections
all_container = widgets.VBox(layout=with_border_layout)
all_container.children = [header_container,input_container,output_container]

sen_dict = {"total":"total","positive":"pos","negative":"neg"}

# run the widget
display(all_container)
button.on_click(on_button_clicked)

                            title  pos_score_OL  neg_score_OL        date  \
10804   2001 Greyhound bus attack        1.4245        6.8376  1972-09-21   
15810                  Les Stroud        5.7343        2.3776  1961-10-20   
18407               Jonathan Hole        2.0576        2.0576  1904-08-13   
38553  2015 San Bernardino attack        2.1899        7.1705  1987-06-14   
45714              John R. McGann        2.6738        1.0695  1924-12-02   

             country      continent    month       datetime_month   total  \
10804  United States  North America  1972-09  1972-09-01 00:00:00  8.2621   
15810         Canada  North America  1961-10  1961-10-01 00:00:00  8.1119   
18407  United States  North America  1904-08  1904-08-01 00:00:00  4.1152   
38553  United States  North America  1987-06  1987-06-01 00:00:00  9.3604   
45714  United States  North America  1924-12  1924-12-01 00:00:00  3.7433   

        final  
10804 -5.4131  
15810  3.3567  
18407  0.0000  
38553 -4.9