### Visualisation with Bokeh
This is just a simple piece of visualisation code to display the text data which has been processed in other notebooks.  

In [1]:
from datetime import datetime as dt

# Banner announcing this pipeline step, stamped with the time the cell ran.
separator = '------------------------------------------------------'
for banner_line in (separator, 'Step 5:  Creating visualisation', dt.now(), separator):
    print(banner_line)

------------------------------------------------------
Step 5:  Creating visualisation
2018-04-04 10:35:55.050354
------------------------------------------------------


In [2]:
import pandas as pd
import numpy as np
import pickle

# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from bokeh.plotting import *
# from bokeh.models import ColumnDataSource, OpenURL, TapTool
from bokeh.models import *
from bokeh.io import output_notebook

In [3]:
from config import Config as c

# Pull the settings this notebook needs out of the shared Config object
# so later cells can refer to them by short names.

# File name passed to Bokeh's output_file, a second file name, and the plot title.
tfidf_filename, bar_filename = c.tfidf_filename, c.bar_filename
plot_title = c.title

# Paths of the pickled DOIs and of the processed article CSV.
doi_datapath, working_data = c.dois_pkl, c.working_data

# Dataset name, number of clusters, and the journal to highlight in the scatter plot.
set_name = c.set_name
# test_set = c.test_set
n_clusters = c.n_clusters
journal_of_interest = c.joi

In [4]:
# global variable
# Load the pickled DOIs. The context manager closes the file handle as soon as
# the load finishes instead of leaving it open for the life of the kernel.
# NOTE(review): unpickling can execute arbitrary code, so doi_datapath should
# only ever point at a file written by this pipeline.
with open(doi_datapath, 'rb') as doi_file:
    dois = pickle.load(doi_file)

In [5]:
# Load the processed article table, using the first CSV column as the index.
data = pd.read_csv(working_data, index_col=0)

# Show five random rows as a quick sanity check of the columns.
data.sample(n=5)

Unnamed: 0,DI,PY,TI,AB,WD,AU,EM,AF,SO,SC,...,highly_cited_1,highly_cited_10,highly_cited_5,recent_citations,relative_citation_ratio,times_cited,Citations,Cluster,Cluster_no,Article_kws
800,10.1016/j.ijedudev.2014.02.005,2014,"Academic effectiveness of private, public and ...","recent years, there has been major growth in l...","Academic effectiveness of private, public and ...","Amjad, R; MacLeod, G",ravish.ita@gmail.com,"Amjad, Ravish; MacLeod, Gordon",INTERNATIONAL JOURNAL OF EDUCATIONAL DEVELOPMENT,Education & Educational Research,...,False,False,False,5,,5,5,"['school', 'student', 'privat']",11,"['privat', 'outperform', 'school']"
1264,10.1177/1367493510397624,2011,The importance of including both a child persp...,The UN Convention on the Rights of the Child (...,The importance of including both a child persp...,"Soderback, M; Coyne, I; Harder, M",maja.soderback@mdh.se,"Soderback, Maja; Coyne, Imelda; Harder, Maria",JOURNAL OF CHILD HEALTH CARE,Nursing; Pediatrics,...,False,True,False,24,3.89,58,58,"['servic', 'care', 'famili']",4,"['child', 'perspect', 'care']"
548,10.1111/j.1365-2214.2010.01125.x,2011,Parent and service providers' perceptions rega...,Background Family-centred service (FCS) provis...,Parent and service providers' perceptions rega...,"Dickens, K; Matthews, LR; Thompson, J",,"Dickens, K.; Matthews, L. R.; Thompson, J.",CHILD CARE HEALTH AND DEVELOPMENT,Psychology; Pediatrics,...,False,False,False,8,2.28,24,24,"['servic', 'care', 'famili']",4,"['fcs', 'servic', 'rehabilit']"
1270,10.1177/1367493510385020,2011,Diet and sleep in children with attention defi...,Sleep disturbances are common and consequentia...,Diet and sleep in children with attention defi...,"Blunden, SL; Milte, CM; Sinn, N",sarah.blunden@unisa.edu.au,"Blunden, Sarah Lee; Milte, Catherine M.; Sinn,...",JOURNAL OF CHILD HEALTH CARE,Nursing; Pediatrics,...,False,False,False,2,0.3,4,4,"['sleep', 'adhd', 'depress']",7,"['sleep', 'diet', 'adhd']"
1236,10.1177/1367493511423649,2012,Sibling involvement in childhood chronic heart...,In interviews with parents for the evaluation ...,Sibling involvement in childhood chronic heart...,"Redshaw, S; Wilson, V",Valerie.wilson@health.nsw.gov.au,"Redshaw, Sarah; Wilson, Valerie",JOURNAL OF CHILD HEALTH CARE,Nursing; Pediatrics,...,False,False,False,0,0.27,5,5,"['activ', 'sibl', 'food']",13,"['bead', 'sibl', 'heart']"


## Set NaNs in citations col to zero.

In [6]:
# Articles with no citation count recorded are treated as having zero citations.
citations_filled = data['Citations'].fillna(value=0)
data['Citations'] = citations_filled

In [7]:
# Number of articles per journal (the 'SO' column), most frequent first.
journal_counts = data['SO'].value_counts()
print('Showing data for the following journals:')
print(journal_counts)

Showing data for the following journals:
CHILD CARE HEALTH AND DEVELOPMENT                   559
INTERNATIONAL JOURNAL OF EDUCATIONAL DEVELOPMENT    485
JOURNAL OF CHILD HEALTH CARE                        232
JOURNAL OF RESEARCH IN CHILDHOOD EDUCATION           76
Name: SO, dtype: int64


In [8]:
# for col in data.columns:
#     print(col,': ',sum(data[col].isnull()))

## Bokeh code
The first step is to define a few things that will go into the plot.

#### Hover tool
The hover tool defines what happens when you hover your mouse over the plot. 

In [9]:
# (label, value) pairs shown in the tooltip; "@NAME" is replaced by the value
# of column NAME in the plot's data source for the point under the cursor.
hover_fields = [
    ("Journal", "@SO"),
    ("DOI", "@DI"),
    ("Article Keywords", "@Article_kws"),
    ("Citations", "@Citations"),
    ("Cluster_no", "@Cluster_no"),
    ("Cluster Keywords", "@Cluster"),
]
hover = HoverTool(tooltips=hover_fields)

In [10]:
# Configure Bokeh to display plots inline in this notebook.
output_notebook()

#### Other tools
Other tools to add to the right hand side of the plot can be selected from a list.

In [11]:
# Toolbar contents for the scatter plot. `hover` is the customised HoverTool
# defined earlier; put a plain HoverTool() here instead for the default tooltips.
TOOLS = [
    BoxSelectTool(),
    hover,
    'tap',
    'box_zoom',
    'reset',
    'crosshair',
]

## Add Alpha and size data

'Alpha' is the transparency or 'brightness' of the dots.  The formula below ensures that low-cited articles are dimmer than highly cited ones.  This has quite a subtle effect on the final plot and can be removed, but it does help to make individual articles stand out, even if their coloring puts them in a group that has low citations.

'Sizes' is our setting for the size of the dots.  I used to use this to help accentuate highly cited papers, but I decided that it made the plot look cluttered.  Perhaps it's worth uncommenting this line if you are using a small dataset.

In [12]:
# data['Sizes'] = 3+(1.5*(np.log(1+data.Citations)))

# Alpha bounds: the least-cited articles are drawn at min_alpha and the
# most-cited at max_alpha.
min_alpha = 0.2
max_alpha = 1.0

# Attenuate citation counts with a log so one very highly cited paper does not
# produce a spike, then normalise so the most-cited article maps to 1.
log_cites = np.log10(1 + data.Citations)
log_cites_max = log_cites.max()
if log_cites_max > 0:
    normed = log_cites / log_cites_max
else:
    # Every article has zero citations: dividing would be 0/0 and give NaN
    # alphas, so fall back to the minimum alpha for every point.
    normed = log_cites * 0

# set alpha
data['Alpha'] = min_alpha + ((max_alpha - min_alpha) * normed)

## Colour clusters by average citation rates
This is where we define our colour scheme. 

In [13]:

# Preview the reversed "YlOrRd" colour ramp with 50 steps.
preview_palette = sns.color_palette("YlOrRd", 50)
sns.palplot(preview_palette[::-1])

In [14]:
# Spot check: rows with exactly 100 citations.
has_100_citations = data['Citations'].eq(100)
data[has_100_citations]

Unnamed: 0,DI,PY,TI,AB,WD,AU,EM,AF,SO,SC,...,highly_cited_10,highly_cited_5,recent_citations,relative_citation_ratio,times_cited,Citations,Cluster,Cluster_no,Article_kws,Alpha
477,10.1111/j.1365-2214.2011.01282.x,2011,A social-ecological model of readiness for tra...,Background Policy and research related to tran...,A social-ecological model of readiness for tra...,"Schwartz, LA; Tuchman, LK; Hobbie, WL; Ginsber...",schwartzl@email.chop.edu,"Schwartz, L. A.; Tuchman, L. K.; Hobbie, W. L....",CHILD CARE HEALTH AND DEVELOPMENT,Psychology; Pediatrics,...,True,True,59,5.52,100,100,"['transit', 'adult', 'care']",17,"['smart', 'transit', 'readi']",0.985472


In [None]:
# Assign every article a colour:
#   * articles from the journal of interest get a fixed sky-blue;
#   * all others are coloured by where their cluster's mean citation count
#     sits between the lowest and highest cluster means.

av_years = c.av_years # this won't work if you have zero articles from these years in a particular cluster.  Use with care or just set to c.years

# Mean citations per cluster, using only articles published in av_years.
avcits = data[data['PY'].isin(av_years)].groupby('Cluster_no')['Citations'].mean()
maxav = avcits.max()
minav = avcits.min()
avrange = maxav-minav

# define the palette
palette = sns.color_palette("YlOrRd", n_clusters+1).as_hex()[::-1] # note +1 to fix rounding errors

cit_col_ls = []      # one colour per article, in row order
color_numerics = []  # palette positions of the articles NOT in the journal of interest

# Walk journal and cluster number together instead of keeping a manual row counter.
for journal, cl_no in zip(data['SO'].tolist(), data['Cluster_no'].tolist()):
    if journal == journal_of_interest:
        cit_col_ls.append('#7ec0ee') # this colour is a bright sky-blue.  'green' or maybe purple would show up well, too
        continue

    # Raises KeyError if this cluster has no articles in av_years (see note above).
    cluster_mean = avcits[cl_no]
    if avrange > 0:
        # relative position of the cluster's average cites in the distribution, scaled to 0..n_clusters
        cl_avcit = n_clusters*(cluster_mean-minav)/avrange
    else:
        # All cluster means are equal (e.g. a single cluster): the division
        # would be 0/0 and int(nan) would raise, so use the first palette slot.
        cl_avcit = 0.0
    color_numerics.append(cl_avcit)
    cit_col_ls.append(palette[int(cl_avcit)])

data['Cit_col'] = cit_col_ls

Check the distribution of the colours.  In some datasets, you'll find a poor distribution of red/orange/yellow simply due to the distribution of citations in the dataset.  Worth fiddling with the code to make the differences stand out.  

In [None]:
# Histogram of the palette positions computed for the colour-by-citation articles.
colour_positions = pd.Series(color_numerics)
colour_positions.hist()

## Build plot
This is where we define the figure itself using the Bokeh package.

In [None]:
from bokeh.io import show
from bokeh.models import ColumnDataSource
from bokeh.palettes import RdBu3
from bokeh.plotting import figure

# If the plot comes up as a blank window, see
# https://stackoverflow.com/questions/41856999/bokeh-plots-just-bring-up-a-blank-window
# (BOKEH_RESOURCES=inline)

# The glyph below refers to columns by name, so the frame must be wrapped in a
# ColumnDataSource -- this conversion is crucial.
plot_source = ColumnDataSource(data)

# Figure: title comes from the config file, toolbar from TOOLS.
p = figure(
    title=plot_title,
    tools=TOOLS,
    plot_width=950,
    plot_height=600,
    x_axis_label="Textual similarity axis_1 (arbitrary units)",
    y_axis_label='Textual similarity axis_2 (arbitrary units)',
)
p.background_fill_color = "black"

# One dot per article: position from the TSNE columns, colour and transparency
# from the 'Cit_col' and 'Alpha' columns, fixed size, no outline.
p.circle(
    x='TSNE1',
    y='TSNE2',
    source=plot_source,
    size=5,
    color='Cit_col',
    alpha='Alpha',
    line_alpha=0,
)

p.legend.location = "bottom_right"

# Also write the plot to a standalone HTML file (toggle for write-to-file).
output_file(tfidf_filename, mode='inline')

# add links: clicking a point opens the URL held in that row's 'Link' column.
# NOTE(review): assumes `data` has a 'Link' column -- confirm.
url = "@Link"
taptool = p.select(type=TapTool)
taptool.callback = OpenURL(url=url)

In [None]:
# Display the Bokeh figure `p`.
show(p)

### Add bar plots
Now that the plot is completed, there are a few simple images that might help to describe the data.

In [None]:
# % matplotlib inline
# Per-cluster summary table, using the first CSV column as the index.
cluster_data_path = 'data/cluster_data.csv'
df = pd.read_csv(cluster_data_path, index_col=0)

# Show one random row to check the columns.
df.sample(n=1)

In [None]:
# Inspect the column types of the cluster table.
df.dtypes

In [None]:
# Convert Cluster to an ordered categorical of strings.
# `astype('category', ordered=True)` relied on passing `ordered` as a keyword,
# which pandas deprecated and later removed; an explicit CategoricalDtype
# expresses the same thing and works on current pandas.
ordered_category = pd.api.types.CategoricalDtype(ordered=True)
df['Cluster'] = df['Cluster'].astype(str).astype(ordered_category)
#check
df.dtypes

### With Seaborn
Means first

In [None]:
x = 'Cluster'
y = 'mean_cites'

# Order the rows from highest to lowest mean citations.
# The earlier `df.Cluster = df.sort_values(y).Cluster` was dropped: pandas aligns
# an assigned Series on the index, so it put every value back on its own row
# and changed nothing.
df = df.sort_values(y, ascending=False)

# Confirm the Cluster column is an ordered categorical.
df.Cluster.cat.ordered

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of mean citations per cluster, in the row order of `df`.
f, ax = plt.subplots(figsize=(20, 12))

# Draw onto the axes created above by passing ax explicitly. The result is not
# stored in `p`: the original bound `p` to the Text object returned by
# set_title, which overwrote the Bokeh figure of the same name.
sns.barplot(data=df,
            x=df[y],
            y=df[x],
            color='b',
            orient='h',
            order=df[x],
            ax=ax)
ax.set_title('Citation rates for {} split into {} K-Means clusters'.format(c.set_name, c.n_clusters))
ax.set(xlabel='Mean citations', ylabel='Cluster')

# Save the figure; the 'outputs' directory must already exist.
f.savefig('outputs/K_Means_Barplot_means_k{}.png'.format(n_clusters))

Now plot the medians

In [None]:
import seaborn as sns

# Median citations per cluster label, with the label moved from the index
# back into an ordinary column.
median_by_cluster = data.groupby('Cluster')['Citations'].median()
df = median_by_cluster.reset_index()

In [None]:
# Give the two columns plot-friendly names and make the label categorical.
df.columns = ['Cluster', 'Median citations']
df['Cluster'] = df['Cluster'].astype('category')

# Show the first rows.
df.head()

In [None]:
# Column names used by the next cells: category column and value column.
x, y = 'Cluster', 'Median citations'

In [None]:
# NOTE(review): pandas aligns an assigned Series on the index, so this appears
# to leave the column values unchanged rather than reorder them -- confirm intent.
df[x] = df.sort_values(y).Cluster

In [None]:
# Order the rows from highest to lowest median citations.
df = df.sort_values(by=y, ascending=False)

# Report whether the Cluster categorical is ordered.
df['Cluster'].cat.ordered

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of median citations per cluster, in the row order of `df`.
f, ax = plt.subplots(figsize=(20, 12))

# Draw onto the axes created above by passing ax explicitly. The result is not
# stored in `p`: the original bound `p` to the Text object returned by
# set_title, which overwrote the Bokeh figure of the same name.
sns.barplot(data=df,
            x=df[y],
            y=df[x],
            color='b',
            orient='h',
            order=df[x],
            ax=ax)
ax.set_title('Citation rates for {} split into {} K-Means clusters'.format(c.set_name, c.n_clusters))
ax.set(xlabel='Median citations', ylabel='Cluster')

# Save the figure; the 'outputs' directory must already exist.
f.savefig('outputs/K_Means_Barplot_medians_k{}.png'.format(n_clusters))

## Show relative sizes of journals over the years

In [None]:
# see clusters 15, 47, 10

# Count rows per (journal, year) pair; add 'Cluster' to the grouping keys
# to break the counts down further.
group_keys = ['SO', 'PY']
bar_df = data.groupby(group_keys).size().reset_index(name="Count")

# Show one random row.
bar_df.sample()

In [None]:
# Grouped bar chart: number of rows per year, one bar colour per journal.
f, ax = plt.subplots(figsize=(20, 12))

# sns.factorplot (since renamed catplot) is a figure-level function: it builds
# its own figure and ignores the `ax` argument, which left the 20x12 figure
# above empty and is the likely reason saving the image "did not work".
# The axes-level barplot draws directly onto `ax`.
sns.barplot(data=bar_df,
            x='PY',
            y='Count',
            hue='SO',
            ax=ax)
ax.set(xlabel='Year', ylabel='Number of articles')
# f.savefig('outputs/Cluster_15_Barplot.png')