In [1]:
import pandas as pd
import sqlite3
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from datetime import datetime
import warnings

# Ignore every UserWarning for the rest of the session (presumably to keep
# library notices out of the cell output).
# NOTE(review): the filter is process-wide and which library motivated it is
# not visible here -- consider narrowing it with `module=` or `message=`.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# First and last year covered by the analysis (both inclusive).
start_year = 2014
end_year = 2018
num_years = (end_year - start_year) + 1

# Hand-written topic labels, keyed by year.  The order inside each list is
# significant: the loading cell assigns a list to the rows of that year's
# `topic_rank` sheet by position.
topics_by_year = {
    2014: [
        "Action needed", "Legal action", "Men's actions",
        "Advocacy & campaigning", "Government inaction", "Violence justified",
        "Services", "Abuse experiences", "Social causes", "Culture of violence",
    ],
    2015: [
        "Culture & attitudes", "Government inaction", "Services needed",
        "Men's actions", "Advocacy & campaigning", "Policing", "Prevalence",
    ],
    2016: [
        "Violence & policing", "Male perpetrators", "Government inaction",
        "Community support", "Law reform", "Successful programs",
        "Social determinants", "Victims' experiences",
    ],
    2017: [
        "Social context", "Male perpetrators", "Survival & inspiration",
        "Prevalence & risk", "Programs & services", "Men as victims",
    ],
    2018: [
        "Gun violence", "Prevalence", "Men's actions", "Contexts & causes",
        "Prevention strategy", "Government inaction", "Abuse experiences",
        "Advocacy & campaigning", "Politics & governance",
    ],
}

# The same labels as a year-ordered list of lists (index 0 is `start_year`).
yearly_topics = [topics_by_year[year] for year in range(start_year, end_year + 1)]

In [3]:
# (year, cluster count embedded in the workbook's file name), oldest first.
_clusters_per_year = [(2014, 10), (2015, 7), (2016, 8), (2017, 6), (2018, 9)]

# Yearly workbooks live at input/<year>/data/<year>_<k>_clusters.xlsx.
file_paths = [
    'input/{0}/data/{0}_{1}_clusters.xlsx'.format(year, n_clusters)
    for year, n_clusters in _clusters_per_year
]

In [9]:
import numpy as np

def _load_topic_rank(xlsx_path, topic_labels):
    """Read one workbook's ``topic_rank`` sheet and add ``Topic`` and ``Log``.

    Parameters
    ----------
    xlsx_path : str
        Workbook to read; only its ``topic_rank`` sheet is used.
    topic_labels : list of str
        Labels stored in a new ``Topic`` column.  They are matched to rows by
        position, in the order the sheet delivers them (before sorting).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``Dominant_Topic_Num`` (index renamed to ``idx``) and
        sorted by that index, with two extra columns: ``Topic`` and ``Log``,
        where ``Log`` = (percentage / 100) * log2(sum of ``Num_Documents``).
    """
    ranked = (
        pd.read_excel(xlsx_path, 'topic_rank')
        .set_index('Dominant_Topic_Num')
        .rename_axis('idx')
    )
    # Labels must be attached while the rows are still in sheet order.
    ranked['Topic'] = topic_labels
    ranked = ranked.sort_index()

    log_total = np.log2(ranked['Num_Documents'].sum())
    # 'Perc_Documents' holds strings such as '13.8%'.
    ranked['Log'] = [
        float(perc.replace('%', '')) / 100 * log_total
        for perc in ranked['Perc_Documents']
    ]
    return ranked


# One frame per year, in the same order as `file_paths` / `yearly_topics`.
df = []
for i, path in enumerate(file_paths):
    df.append(_load_topic_rank(path, yearly_topics[i]))

# Preview the most recent year.
df[num_years-1]

Unnamed: 0_level_0,Topic_Keywords,Num_Documents,Perc_Documents,Topic,Log
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"year, kill, murder, stop, death, happen, gun, ...",3089,13.8%,Gun violence,1.994125
2,"woman, men, report, problem, increase, link, r...",2823,12.61%,Prevalence,1.822168
3,"victim, police, man, partner, court, week, mal...",2748,12.28%,Men's actions,1.774482
4,"people, home, good, kid, violent, live, feel, ...",2348,10.49%,Government inaction,1.515824
5,"#domesticviolence, read, community, victoria, ...",2530,11.3%,Contexts & causes,1.63287
6,"support, service, fund, woman, program, house,...",2339,10.45%,Abuse experiences,1.510044
7,"woman, child, abuse, experience, relationship,...",2116,9.45%,Advocacy & campaigning,1.365542
8,"work, leave, survivor, great, story, end, comm...",2386,10.66%,Prevention strategy,1.540389
9,"issue, call, change, #auspol, law, perpetrator...",2005,8.96%,Politics & governance,1.294736


In [10]:
# `topics`: every distinct label seen in any year.
# `freqs`:  label -> number of rows (across all yearly frames) carrying it.
topics = set()
freqs = {}

for yearly_frame in df:
    for label in yearly_frame['Topic']:
        topics.add(label)
        freqs[label] = 1 + freqs.get(label, 0)

# Most frequent first.  The ascending (stable) sort is reversed afterwards, so
# labels with equal counts are listed in reverse first-seen order.
labels_by_frequency = sorted(freqs, key=freqs.get)
labels_by_frequency.reverse()
for label in labels_by_frequency:
    print(label, freqs[label])

Government inaction 4
Advocacy & campaigning 3
Men's actions 3
Male perpetrators 2
Prevalence 2
Abuse experiences 2
Politics & governance 1
Prevention strategy 1
Contexts & causes 1
Gun violence 1
Men as victims 1
Prevalence & risk 1
Programs & services 1
Survival & inspiration 1
Social context 1
Victims' experiences 1
Successful programs 1
Social determinants 1
Law reform 1
Community support 1
Violence & policing 1
Policing 1
Services needed 1
Culture & attitudes 1
Social causes 1
Culture of violence 1
Services 1
Violence justified 1
Action needed 1
Legal action 1


In [11]:
import colorlover as cl
from IPython.display import HTML

In [12]:
import random

# Fixed seed so the shuffled part of the palette -- and with it every figure
# coloured from `colors` -- is the same after Restart & Run All.
PALETTE_SEED = 42

reds = cl.interp(cl.scales['3']['seq']['Reds'], 30)
greys = cl.scales['3']['seq']['Greys']
others = cl.interp(
    cl.scales['3']['seq']['Greens']
    + cl.scales['3']['seq']['Blues']
    + cl.scales['3']['seq']['Purples']
    + cl.scales['3']['seq']['YlGnBu']
    + cl.scales['10']['div']['BrBG']
    + greys
    + cl.scales['11']['div']['RdGy'][5:],
    120,
)

# Leading colours: every fifth step of the 30-step red ramp (six entries),
# taken from the far end of the ramp first.
colors = list(reversed(reds[0:30:5]))

# A private generator keeps the shuffle reproducible without touching the
# state of the global `random` module.
random.Random(PALETTE_SEED).shuffle(others)

# The plotting cell indexes `colors` once per distinct topic, so supply at
# least one colour per key in `freqs` (and never fewer than the 24 extra
# colours this cell has always provided).
n_other_colors = max(24, len(freqs) - len(colors))
colors.extend(others[:n_other_colors])

HTML(cl.to_html(colors))

# Plot all topics

In [13]:
import numpy as np

# Topics at positions 0..LAST_LABELLED_POS of the frequency ranking get their
# 1-based rank written inside the bar segments and appended to the legend name.
LAST_LABELLED_POS = 5


def _percent_to_float(value):
    """Turn a ``Perc_Documents`` cell such as ``'13.8%'`` into ``13.8``."""
    return float(str(value).replace('%', ''))


years = list(range(start_year, end_year+1))

# One {topic label -> percentage} lookup per yearly frame, built once instead
# of re-scanning every row of every frame for every topic.  Heights are stored
# as real numbers so the stack does not depend on plotly coercing '%' strings.
# If a label were repeated within a year, the last row wins, which keeps `y`
# aligned with `years`.
share_by_year = [
    {
        label: _percent_to_float(perc)
        for label, perc in zip(one_df['Topic'], one_df['Perc_Documents'])
    }
    for one_df in df
]

traces = []

# Same ordering as the frequency listing: most frequent topic first.
for i, topic in enumerate(reversed(sorted(freqs, key=freqs.get))):
    labelled = i <= LAST_LABELLED_POS
    rank_text = str(i+1)

    y = []
    text = []
    for shares in share_by_year:
        if topic in shares:
            y.append(shares[topic])
            text.append(rank_text if labelled else None)
        else:
            # Topic absent that year: zero-height segment, no label.
            y.append(0)
            text.append(None)

    if labelled:
        name = topic+' ('+rank_text+')'
    else:
        name = topic

    traces.append(go.Bar(
        x=years,
        y=y,
        name=name,
        hoverinfo='none',
        text=text,
        textposition='inside',
        showlegend=True,
        marker=dict(
            color=colors[i]
        )
    ))

layout = go.Layout(
    barmode='stack',
    bargap=0,
    plot_bgcolor=greys[0],
    width=650,
    xaxis=dict(title='Year'),
    yaxis=dict(title='Share of documents (%)'),
)

fig = go.Figure(data=traces, layout=layout)
# Uploads the figure to the configured plotly account and embeds it here.
py.iplot(fig, filename='stacked-bar')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~qjtjjt/0 or inside your plot.ly account where it is named 'stacked-bar'


# Chord

In [17]:
from Chord.ChordDiagram import ChordDiagram

from bokeh.layouts import row
from bokeh.io import push_notebook, show, output_notebook
# Direct Bokeh output into the notebook; must run before any `show(...)` call.
output_notebook()

In [18]:
from IPython.display import display

# 5x5 demo matrix fed to the chord diagram.
# NOTE(review): presumably matrix[i][j] is the flow from group i to group j --
# confirm against ChordDiagram.
matrix = np.array([[16,  3, 28,  0, 18],
                   [18,  0, 12,  5, 29],
                   [ 9, 11, 17, 27,  0],
                   [19,  0, 31, 11, 12],
                   [23, 17, 10,  0, 34]], dtype=int)

labels = ['One', 'Two', 'Three', 'Four', 'Five']

# Only a cell's last expression is rendered automatically, so the labelled
# view of the matrix has to be displayed explicitly to be visible.
display(pd.DataFrame(matrix, columns=labels, index=labels))

# NOTE(review): `labels` is not handed to ChordDiagram; check whether it
# accepts group names.
cd = ChordDiagram(matrix)
fig = cd.plot(group=0)
# `notebook_handle=True` returns a handle for later in-place updates of the plot.
t = show(row(fig), notebook_handle=True)
