# The Rise and Fall of the Third Reich
# A History of Nazi Germany 
# by William L. Shirer
## A Textual Analysis

Read in file.

In [2]:
import book_reader as br
import book_viz as bv
import datetime as dt 
import pandas as pd
import os
import json
from itertools import chain
from wordcloud import WordCloud, get_single_color_func

In [4]:
from IPython.display import Image, display

In [5]:
generate_book_df = False
generate_toc_df = False
generate_pivots = False
generate_places_vs_chapter = False
generate_people_vs_chapter = False
generate_places_vs_range = False
generate_people_vs_range = False
generate_csvs = False
generate_ents = False
book_short_name = 'rafo3r'
book_file = 'rafo3r.txt'
h5_file = 'rafo3r.h5'

In [6]:
places_json = json.loads(open('places.json', 'r', encoding='utf-8').read())
people_json = json.loads(open('people.json', 'r', encoding='utf-8').read())

In [7]:
people_list = (list(people_json.keys()) + 
               (list(chain.from_iterable(people_json.values()))))
places_list = (list(places_json.keys()) +
               (list(chain.from_iterable(places_json.values()))))

assert len(people_list) == len(set(people_list))
assert len(places_list) == len(set(places_list))
assert list(set(people_list) & set(places_list)) == []

In [8]:
%reload_ext book_reader
rafo3r_reader = br.book_reader(book_short_name,
                               generate_book_df, generate_toc_df,
                               generate_pivots, generate_places_vs_chapter,
                               generate_people_vs_chapter,
                               generate_places_vs_range,
                               generate_people_vs_range,
                               places_json, people_json)
returned_list = rafo3r_reader.main(book_file, h5_file)
rafo3r = returned_list[0]
toc = returned_list[1]
rafo3r_wordvscount_pivot = returned_list[2]
rafo3r_wordchaptervscount_pivot = returned_list[3]
rafo3r_wordbookvscount_pivot = returned_list[4]
places_vs_chapter_df = returned_list[5]
people_vs_chapter_df = returned_list[6]
places_vs_range_df = returned_list[7]
people_vs_range_df = returned_list[8]

Starting
Book processed
TOC processed
Chapter markers made
Pivots made
Places v Chapter made
People v Chapter made
Places v Range made
People v Range made


In [9]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('yg2bsm', '8e3m3cer5e')

In [10]:
def _col_clean(name):
    return name.replace('_', ' ').title()

In [None]:
words_on_graph = 10
drop_top_word = False

if drop_top_word:
    places_vs_chapter_df.drop(places_vs_chapter_df[
                              list(places_vs_chapter_df.sum(axis=0).
                              sort_values(ascending=False).index)].
                              columns[[0]], axis=1, inplace=True)
places_vs_chapter_df = places_vs_chapter_df[list(places_vs_chapter_df.
                                                 sum(axis=0).
                                                 sort_values(ascending=
                                                             False)
                                                 [:words_on_graph].index)]

#remove last chapter (aftword)
if places_vs_chapter_df[-1:].index == 33:
    places_vs_chapter_df = places_vs_chapter_df[:-1]

visibility_list = []
for i in range(len(places_vs_chapter_df.columns)):
    if i == Luke Starnes0: 
        visibility_list.append('legendonly')
    elif 1 <= i <= 4: 
        visibility_list.append('true')
    else:
        visibility_list.append('legendonly')

#colors = [(230, 84, 0), (116, 32, 104), (35, 68, 131), 
#          (12, 99, 124), (1, 137, 130), (71, 100, 117), 
#          (80, 84, 77)]

colors = [(191, 184, 162), (78, 77, 74), (148, 186, 101), (153, 0, 0), 
          (12, 99, 124), (39, 144, 176), (230, 84, 0), (35, 68, 131), 
          (177, 140, 29), (116, 32, 104), (1, 137, 130), (86, 87, 114), 
          (163, 30, 57), (71, 100, 117), (107, 121, 140), (235, 104, 37)]

color_list = []
for i in range(len(places_vs_chapter_df.columns)):
    index = i - (len(colors) * int(i/len(colors)))
    this_color = colors[index]
    color_list.append('rgb(%i, %i, %i)'%(this_color[0],
                                         this_color[1],
                                         this_color[2]))

#c = 255 / len(places_vs_chapter_df.columns)
new_col_names = list(map(_col_clean,
                         list(places_vs_chapter_df.columns)))
py.iplot(dict(data=[{'x': places_vs_chapter_df.index,
                     'y': places_vs_chapter_df[col],
                     'name': new_col_names[i],
                     'visible': visibility_list[i],
                     'fill': 'none',
                     'line': dict(color=(color_list[i]),
                                  width=4,
                                  smoothing=.8,
                                  shape="spline"),
                    } for i, col in enumerate(places_vs_chapter_df.columns)],
              layout=dict(title='RaFo3R Places vs Chapter',
                          #dragmode='zoom',
                          xaxis=dict(title='Chapter',
                                     tickvals=list(range(2, 36, 2)),
                                     tickmode='array',
                                     rangeslider=dict(thickness=0.2)),
                          yaxis=dict(title='Word Count'))),
         filename='plotly/places_vs_chapter')

In [None]:
words_on_graph = 10
drop_top_word = False

if drop_top_word:
    people_vs_chapter_df.drop(people_vs_chapter_df[
                              list(people_vs_chapter_df.sum(axis=0).
                                   sort_values(ascending=False).index)].
                              columns[[0]], axis=1, inplace=True)
people_vs_chapter_df = people_vs_chapter_df[list(people_vs_chapter_df.
                                                 sum(axis=0).
                                                 sort_values(ascending=False)
                                                 [:words_on_graph].index)]
#remove last chapter (aftword)
if places_vs_chapter_df[-1:].index == 33:
    people_vs_chapter_df = people_vs_chapter_df[:-1]

visibility_list = []
for i in range(len(places_vs_chapter_df.columns)):
    if i == 0: 
        visibility_list.append('legendonly')
    elif 1 <= i <= 4: 
        visibility_list.append('true')
    else:
        visibility_list.append('legendonly')
        
colors = [(191, 184, 162), (78, 77, 74), (148, 186, 101), (153, 0, 0), 
          (12, 99, 124), (39, 144, 176), (230, 84, 0), (35, 68, 131), 
          (177, 140, 29), (116, 32, 104), (1, 137, 130), (86, 87, 114), 
          (163, 30, 57), (71, 100, 117), (107, 121, 140), (235, 104, 37)]

color_list = []
for i in range(len(people_vs_chapter_df.columns)):
    index = i - (len(colors) * int(i/len(colors)))
    this_color = colors[index]
    color_list.append('rgb(%i, %i, %i)'%(this_color[0],
                                         this_color[1],
                                         this_color[2]))
new_col_names = list(map(_col_clean,
                         list(people_vs_chapter_df.columns)))


py.iplot(dict(data=[{
                   'x': people_vs_chapter_df.index,
                   'y': people_vs_chapter_df[col],
                   'name': new_col_names[i],
                   'visible': visibility_list[i],
                   'fill': 'none',
                   'line': dict(color=(color_list[i]),
                                width=4,
                                smoothing=.8,
                                shape="spline"),
                   } for i, col in enumerate(people_vs_chapter_df.columns)],
                   layout=dict(title = 'RaFo3R People vs Chapter',
                               #plot_bgcolor="rgb(238, 235, 234)",
                               #dragmode = 'zoom',
                               xaxis = dict(title = 'Chapter',
                                            tickvals = list(range(2,36,2)),
                                            tickmode = 'array',
                                            rangeslider = dict(thickness=0.2)),
                               yaxis = dict(title = 'Word Count'))), filename='plotly/people_vs_chapter')

In [None]:
words_on_graph = 10
drop_top_word = False

if drop_top_word:
    places_vs_range_df.drop(places_vs_range_df[
                            list(places_vs_range_df.sum(axis=0).
                            sort_values(ascending=False).index)].
                            columns[[0]], axis=1, inplace=True)

places_vs_range_df = places_vs_range_df[list(places_vs_range_df.
                                             sum(axis=0).
                                             sort_values(ascending=False)
                                             [:words_on_graph].index)]

new_col_names = list(map(_col_clean,
                                 list(places_vs_range_df.columns)))

visibility_list = []
for i in range(len(places_vs_chapter_df.columns)):
    if i == 0: 
        visibility_list.append('legendonly')
    elif 1 <= i <= 4: 
        visibility_list.append('true')
    else:
        visibility_list.append('legendonly')
        
colors = [(191, 184, 162), (78, 77, 74), (148, 186, 101), (153, 0, 0), 
          (12, 99, 124), (39, 144, 176), (230, 84, 0), (35, 68, 131), 
          (177, 140, 29), (116, 32, 104), (1, 137, 130), (86, 87, 114), 
          (163, 30, 57), (71, 100, 117), (107, 121, 140), (235, 104, 37)]

py.iplot(dict(data=[{
                'x': places_vs_range_df.index,
                'y': places_vs_range_df[col],
                'name': new_col_names[i],
                'visible': visibility_list[i],
                'fill' : 'none',
                'line': dict(color=(color_list[i]),
                                width=4,
                                smoothing=.8,
                                shape="spline"),
                   }  for i, col in enumerate(places_vs_range_df.columns)],
                   layout=dict(title = 'RaFo3R Places vs 10k Words',
                               #dragmode = 'zoom',
                               #legend=dict(y=1,x=0.02),
                               xaxis = dict(title = 'Per 10k Words',
                                            rangeslider = dict(thickness=0.20)),
                               yaxis = dict(title = 'Word Count'))), filename='plotly/places_vs_range')

In [None]:
words_on_graph = 10
drop_top_word = False

if drop_top_word:
    people_vs_range_df.drop(places_vs_range_df[
                            list(people_vs_range_df.sum(axis=0).
                            sort_values(ascending=False).index)].
                            columns[[0]], axis=1, inplace=True)

people_vs_range_df = people_vs_range_df[list(people_vs_range_df.
                                             sum(axis=0).
                                             sort_values(ascending=False)
                                             [:words_on_graph].index)]

new_col_names = list(map(_col_clean,
                                 list(people_vs_range_df.columns)))

visibility_list = []
for i in range(len(people_vs_range_df.columns)):
    if i == 0: 
        visibility_list.append('legendonly')
    elif 1 <= i <= 4: 
        visibility_list.append('true')
    else:
        visibility_list.append('legendonly')
        
colors = [(191, 184, 162), (78, 77, 74), (148, 186, 101), (153, 0, 0), 
          (12, 99, 124), (39, 144, 176), (230, 84, 0), (35, 68, 131), 
          (177, 140, 29), (116, 32, 104), (1, 137, 130), (86, 87, 114), 
          (163, 30, 57), (71, 100, 117), (107, 121, 140), (235, 104, 37)]

py.iplot(dict(data=[{
                'x': people_vs_range_df.index,
                'y': people_vs_range_df[col],
                'name': new_col_names[i],
                'visible': visibility_list[i],
                'fill' : 'none',
                'line': dict(color=(color_list[i]),
                                width=4,
                                smoothing=.8,
                                shape="spline"),
                   }  for i, col in enumerate(people_vs_range_df.columns)],
                   layout=dict(title = 'RaFo3R People vs 10k Words',
                               #dragmode = 'zoom',
                               xaxis = dict(title = 'Per 10k Words',
                                            rangeslider = dict(thickness=0.20)),
                               yaxis = dict(title = 'Word Count'))), filename='plotly/places_vs_range')

In [19]:
num_chapters = max(people_vs_chapter_df.index)
ch_list = list(range(1, num_chapters + 1))

#remove last chapter (aftword)
if ch_list[-1] == 33:
    ch_list = ch_list[:-1]
assert (len(ch_list) == 32)

In [12]:
top_words = []
num_top_words = 7



for i in ch_list:
    top_words.append(list(people_vs_chapter_df.loc[i].
                          sort_values(ascending=False)
                          [:num_top_words].index))
top_words_df = pd.DataFrame(top_words, index=ch_list,
                            columns=list(range(1, num_top_words + 1)))

py.iplot(FF.create_table(top_words_df, index=True),
         filename='plotly/top_people_table')

In [13]:
import random
def grey_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)

In [14]:
import PIL
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec
import numpy as np

In [15]:
book_df = rafo3r

In [23]:
def matrix_cloud_maker(img_per_side=(1,1), image_inches=1,dpi=96, book_dict=[], file_name=''):
    #assumes a list of dicts in the following format:
        #[{section_num : book_list_for_section},{section_num : book_list_for_section},...]
    width = (img_per_side[0] * image_inches) + (0.025 * (img_per_side[0]-1))
    height = (img_per_side[1] * image_inches) + (0.025 * (img_per_side[1]-1))
    print (width, height)
    print (width*dpi, height*dpi)
    print (img_per_side[0],img_per_side[1])
    fig = plt.figure(figsize=(width,height), dpi=dpi)
    fig.set_figwidth(width)
    fig.set_figheight(height)
    ax = [fig.add_subplot(img_per_side[0],img_per_side[1],i+1) for i in range(len(book_dict))]
    print(dir(ax))
    #for it in iterable:
    for num, book_list in book_dict.items():
        i = num - 1
        #book_list = list(book_df['Word'][book_df[col_to_iterate] == it])
        book_wordcloud = WordCloud(width=image_inches * dpi,
                                   height=image_inches * dpi,
                                   #max_words=300,
                                   min_font_size=8,
                                   #max_font_size=100,
                                   color_func=get_single_color_func('darkred'),
                                   stopwords=rafo3r_reader.stopwords).generate(
                                   ' '.join(book_list))
        #ax[i].set_xticklabels([])
        #ax[i].set_yticklabels([])
        ax[i].axis('off')
        #ax[i].set_xticks([])
        #need to turn off ticks
        ax[i].set_aspect('equal')
        book_wordcloud.recolor(color_func=grey_color_func)
        ax[i].imshow(book_wordcloud.to_image())
    fig.subplots_adjust(wspace=0.025, hspace=0.025)
    plt.savefig(file_name, dpi=dpi)
    plt.close(fig)
    print ('done')

In [18]:
book_people_df = book_df.copy()
book_people_df = book_people_df[book_people_df['Word'].isin(list(people_json.keys()))][book_df['Count'] > 1]
book_people_df['Word'] = book_people_df['Word'].apply(lambda x: x.title())

book_places_df = book_df.copy()
book_places_df = book_places_df[book_df['Word'].isin(places_list)][book_df['Count'] > 1]
book_places_df['Word'] = book_places_df['Word'].apply(lambda x: x.title().replace('_', ''))


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [70]:
book_full_dict = {}
book_people_dict = {}
book_places_dict = {}

for ch in ch_list:
    book_full_dict[ch] = list(book_df['Word'][book_df['Chapter'] == ch])
    book_people_dict[ch] = list(book_people_df['Word'][book_people_df['Chapter'] == ch])
    book_places_dict[ch] = list(book_places_df['Word'][book_places_df['Chapter'] == ch])


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [71]:
matrix_cloud_maker(img_per_side = (6,6), image_inches = 3, dpi = 400, 
                   book_dict = book_full_dict, file_name = 'rafo3r_matrix_cloud.png')
matrix_cloud_maker(img_per_side = (6,6), image_inches = 3, dpi = 400, 
                   book_dict = book_people_dict, file_name = 'rafo3r_people_matrix_cloud.png')
matrix_cloud_maker(img_per_side = (6,6), image_inches = 3, dpi = 400, 
                   book_dict = book_places_dict, file_name = 'rafo3r_places_matrix_cloud.png')

done
done
done


In [22]:
bk_list = range(1,6)

book_full_dict = {}
book_people_dict = {}
book_places_dict = {}

for bk in bk_list:
    book_full_dict[bk] = list(book_df['Word'][book_df['Book'] == bk])
    book_people_dict[bk] = list(book_people_df['Word'][book_people_df['Book'] == bk])
    book_places_dict[bk] = list(book_places_df['Word'][book_places_df['Book'] == bk])

In [None]:
matrix_cloud_maker(img_per_side = (2,3), image_inches = 6, dpi = 400, 
                   book_dict = book_full_dict, file_name = 'rafo3r_by_book_matrix_cloud.png')
matrix_cloud_maker(img_per_side = (3,2), image_inches = 6, dpi = 400, 
                   book_dict = book_people_dict, file_name = 'rafo3r_by_book_people_matrix_cloud.png')
matrix_cloud_maker(img_per_side = (3,2), image_inches = 6, dpi = 400, 
                   book_dict = book_places_dict, file_name = 'rafo3r_by_book_places_matrix_cloud.png')

12.025 18.05
4810.0 7220.0
2 3
[<matplotlib.axes._subplots.AxesSubplot object at 0x11d8187f0>, <matplotlib.axes._subplots.AxesSubplot object at 0x11d585710>, <matplotlib.axes._subplots.AxesSubplot object at 0x1a0063b00>, <matplotlib.axes._subplots.AxesSubplot object at 0x111c205c0>, <matplotlib.axes._subplots.AxesSubplot object at 0x11c2cac18>]


In [None]:
%matplotlib

In [1]:
fig = plt.figure(figsize=(8,8)) # Notice the equal aspect ratio
ax = [fig.add_subplot(2,2,i+1) for i in range(4)]

for a in ax:
    a.set_xticklabels([])
    a.set_yticklabels([])
    a.set_aspect('equal')
    a.imshow(book_wordcloud.to_image())

fig.subplots_adjust(wspace=0.025, hspace=0.05)
plt.show()

NameError: name 'plt' is not defined

In [None]:
from wordcloud import random_color_func
wordcloud = WordCloud(background_color="white", width=1024, height=768,
                      color_func=random_color_func, palette=0).generate(' '.join(rafo3r_people))
#font_path="/Users/Emiel/Library/Fonts/FiraMono-Regular.ttf",

for i in range(0,7):
    wordcloud.recolor(palette=i)
    wordcloud.to_file('rafo3r_people_cloud' +str(i)+ '.png')

In [None]:
from wordcloud import WordCloud, get_single_color_func

places_list = list(places_json.keys()) + (list(chain.from_iterable(places_json.values())))

rafo3r_full_list = list(rafo3r['Word'])
rafo3r_people = list(rafo3r['Word'][rafo3r['Word'].isin(list(people_json.keys()))][rafo3r['Count']>1].apply(lambda x: x.title()))
rafo3r_places = list(rafo3r['Word'][rafo3r['Word'].isin(places_list)][rafo3r['Count']>1].apply(lambda x: x.title().replace('_','')))


rafo3r_wordcloud = WordCloud(width = 1280, height = 960, max_words = 300,min_font_size = 8,
                             max_font_size = 100,color_func = get_single_color_func('darkred'),
                             stopwords = rafo3r_reader.stopwords).generate(' '.join(rafo3r_full_list))#rafo3r_text)
rafo3r_places_wordcloud = WordCloud(width = 1280, height = 960, max_words = 200,min_font_size = 8,
                                    max_font_size = 150,color_func = get_single_color_func('lightsteelblue'),
                                    stopwords = rafo3r_reader.stopwords).generate(' '.join(rafo3r_places))
rafo3r_people_wordcloud = WordCloud(width = 1280, height = 960, max_words = 300,min_font_size = 8,
                                     max_font_size = 100,color_func = get_single_color_func('darkred'),
                                     stopwords = rafo3r_reader.stopwords).generate(' '.join(rafo3r_people))
#regexp = rafo3r_reader.re_splitter
full_cloud_file = "rafo3r_full_cloud.png"
places_cloud_file = "rafo3r_places_cloud.png"
people_cloud_file = "rafo3r_people_cloud.png"
rafo3r_wordcloud.to_file(full_cloud_file)
rafo3r_places_wordcloud.to_file(places_cloud_file)
rafo3r_people_wordcloud.to_file(people_cloud_file)
display(Image(full_cloud_file))
display(Image(places_cloud_file))
display(Image(people_cloud_file))

In [None]:
def word_vs_range_df_maker(book_df, word_json, break_point = 10000, min_count_req = 400):
    peak = len(book_df)
    broken_list = list(range(0,peak,break_point))
    broken_list.pop(0)#remove 0
    if broken_list[-1] != peak: broken_list.append(peak)
    plotter_df = pd.DataFrame()
    for word_main in word_json:
        these_words = word_json[word_main]
        for i, v in enumerate(broken_list):
            if i == 0:
                v0 = 0
            else:
                v0 = broken_list[i-1]
            plotter_df.loc[str(v),word_main] = _count_within_range(word_main,v0,v)
            for word_sub in these_words:
                plotter_df.loc[str(v),word_main] += _count_within_range(word_sub,v0,v)
    plotter_df = plotter_df.drop(plotter_df.sum(axis=0)
                                 [plotter_df.sum(axis=0)<min_count_req].index,axis=1)
    return plotter_df
#places_json = json.loads('{"germany":["german"],"austria":[]}')
#tester00 = word_vs_range_df_maker(rafo3r, places_json, 10000, 1)
#tester01 = word_vs_range_df_maker(rafo3r, people_json, 10000, 1)

In [None]:
rafo3r_viz = bv.book_viz(rafo3r, toc, p1, p2, places_json, people_json)
rafo3r_viz.places_vs_chapters()

In [None]:
#using pivot2, create a new dataframe with words (subset based on places from json) as columns and chapter (counts)
#as rows.
def word_vs_chapter_df_maker(book_pivot2, word_json, ch_list, min_count_req = 400):
    bp2 = book_pivot2.copy()  #to my suprise, without this i was modifying the actual df (didnt think would happen in func)
    
    #######                      
    #remove this once you fix bug in pivot maker
    #print('P1',bp2.head())
    bp2.index = bp2.index.droplevel(1)
    bp2 = bp2[~bp2.index.duplicated(keep='first')]
    #print('P2',bp2.head())
    #######                      
    
    plotter_df = pd.DataFrame()
    for word_main in word_json:
        other_words = word_json[word_main]
        s = 'Word == "%s"'%(word_main)
        master_df = bp2.query(s).reset_index().set_index('Chapter')
        master_df = master_df.reindex(ch_list).fillna(0)
        master_df['Word'] = word_main
        master_df.sort_index(inplace=True)
        try:
            del master_df['Book']
        except:
            #remove this try once you fix the issue that lets you remove the stuff at start.
            #issue is that you remove book up there so you cant delete it here.
            pass
        for word_sub in other_words:
            s = 'Word == "%s"'%(word_sub)
            minor_df = bp2.query(s).reset_index().set_index('Chapter')
            minor_df = minor_df.reindex(ch_list).fillna(0)
            minor_df.sort_index(inplace=True)
            try:
                del minor_df['Book']
            except:
                #remove this try once you fix the issue that lets you remove the stuff at start.
                #issue is that you remove book up there so you cant delete it here.
                pass
            master_df['Count'] = master_df['Count'] + minor_df['Count']
        plotter_df = pd.concat([plotter_df,master_df])
    plotter_df = plotter_df.reset_index()
    plotter_df.set_index(['Chapter', 'Word'], inplace=True)
    plotter_df = plotter_df.unstack(level=1) 
    plotter_df = plotter_df.drop(plotter_df.sum(axis=0)
                                 [plotter_df.sum(axis=0)<min_count_req].index,axis=1)
    plotter_df.columns = plotter_df.columns.droplevel(0)
    return plotter_df

In [None]:
Lukenum_chapters = max(book_pivot2.reset_index()['Chapter'])
ch_list = list(range(1,num_chapters+1))

In [None]:
#why does chapters total for poland to 1047.0, but range totals to 957.0
#print(_count_within_range(word, v0, v))
print (toc[-2:])
print (len(rafo3r))
print (rafo3r.query('Chapter == 33'))
#bug: words marked ch. 1 go from pos 0 to 2490, but 2490 is *start* of Ch. 1...
#and there are 33 chapter.  last 2...
#Ch32         A BRIEF EPILOGUE      1023    570995
#Ch33                AFTERWORD      1027    572270
#but words marked ch 33 runs from 570996 to 572270... 
#length of book is 573408

In [None]:
Starnesprint(places_vs_chapter_df.sum(axis=0).sort_values()[-5:])
print(places_vs_range_df.sum(axis=0).sort_values()[-5:])
print(people_vs_chapter_df.sum(axis=0).sort_values()[-5:])
print(people_vs_range_df.sum(axis=0).sort_values()[-5:])
#why are counts not the same between chapter split and 10k split?

In [None]:
def _col_clean(name):
    return name.replace('_',' ').title()

In [None]:
Luke Starnesc = 256 / len(places_vs_chapter_df.columns)
new_col_names = list(map(_col_clean, list(places_vs_chapter_df.columns)))

py.iplot(dict(data=[{
    'x': places_vs_chapter_df.index,
    'y': places_vs_chapter_df[col],
    'name': new_col_names[i],
    'fill' : 'tonexty',
    'line' : dict(color = ('rgb(%i, %i, 100)'%(int(c * i),int(255 - c * i)))),
}  for i, col in enumerate(places_vs_chapter_df.columns)], layout=dict(
            title = 'RaFo3R Places vs Chapter',
            dragmode = 'zoom',
            xaxis = dict(title = 'Chapter', tickvals = list(range(2,36,2)), tickmode = 'array', 
                         rangeslider = dict(thickness=0.2)),
            yaxis = dict(title = 'Word Count'))), filename='plotly/places_vs_chapter')

In [None]:
Luke Starnesc = 256 / len(people_vs_chapter_df.columns)
new_col_names = list(map(_col_clean, list(people_vs_chapter_df.columns)))

py.iplot(dict(data=[{
    'x': people_vs_chapter_df.index,
    'y': people_vs_chapter_df[col],
    'name': new_col_names[i],
    'fill' : 'tonexty',
    'line' : dict(color = ('rgb(%i, %i, 100)'%(int(c * i),int(255 - c * i)))),
}  for i, col in enumerate(people_vs_chapter_df.columns)], layout=dict(
            title = 'RaFo3R People vs Chapter',
            dragmode = 'zoom',
            xaxis = dict(title = 'Chapter', tickvals = list(range(2,36,2)), tickmode = 'array', 
                         rangeslider = dict(thickness=0.2)),
            yaxis = dict(title = 'Word Count'))), filename='plotly/people_vs_chapter')

In [None]:
Luke Starnesc = 256 / len(places_vs_range_df.columns)
new_col_names = list(map(_col_clean, list(places_vs_range_df.columns)))
x.insert(0,'0')

py.iplot(dict(data=[{
    'x': list(places_vs_range_df.index).insert(0,'0'),
    'y': places_vs_range_df[col],
    'name': new_col_names[i],
    'fill' : 'tonexty',
    'line' : dict(color = ('rgb(%i, %i, 100)'%(int(c * i),int(255 - c * i)))),
}  for i, col in enumerate(places_vs_range_df.columns)], layout=dict(
            title = 'RaFo3R Places vs 10k Words',
            xaxis = dict(title = 'Per 10k Words', rangeslider = dict(thickness=0.20)),
            yaxis = dict(title = 'Word Count'))), filename='plotly/places_vs_range')

In [None]:
Luke Starnesc = 256 / len(people_vs_range_df.columns)
new_col_names = list(map(_col_clean, list(people_vs_range_df.columns)))

py.iplot(dict(data=[{
    'x': people_vs_range_df.index,
    'y': people_vs_range_df[col],
    'name': new_col_names[i],
    'fill' : 'tonexty',
    'line' : dict(color = ('rgb(%i, %i, 100)'%(int(c * i),int(255 - c * i)))),
}  for i, col in enumerate(people_vs_range_df.columns)], layout=dict(
            title = 'RaFo3R People vs 10k Words',
            #dragmode = 'zoom',
            xaxis = dict(title = 'Per 10k Words', rangeslider = dict(thickness=0.20)),
            yaxis = dict(title = 'Word Count'))), filename='plotly/people_vs_range')

In [None]:
Luke Starnes#table - top 5 people by chapter

In [None]:
grouped = rafo3r.groupby('Word')
grouped.describe()
#display(rafo3r.head())

In [None]:
rafo3r['Position'] = rafo3r.index

occurances = rafo3r.groupby('Word').size()
mega_words = occurances.index[occurances >= 1000]
rafo3r.index = rafo3r['Word']
#sw = rafo3r.ix['Stop Word']
#sw.head()

In [None]:
def start_loc(x):
    return int(100 * min(x) / len(rafo3r))
def end_loc(x):
    return int(100 * max(x) / len(rafo3r))
rafo3r_pt = rafo3r[rafo3r['Stop Word'] == False].pivot_table(values='Position', 
            aggfunc=[len,start_loc,end_loc], index='Words')
rafo3r_pt.sort_values('len',ascending=False,inplace=True)
rafo3r_pt.rename(columns={'len': 'Count'}, inplace=True)
display(rafo3r_pt[:10])

rafo3r_pt2 = rafo3r[rafo3r['Stop Word'] == False].pivot_table(values='Position', 
             aggfunc=[len], index=['Word','Book','Chapter'])
rafo3r_pt2.sort_values('len',ascending=False,inplace=True)

display(rafo3r_pt2[:10])