In [None]:
import json
import sys
sys.path.append('../src/')

from utils_tiramisu import *

from tqdm import tqdm

from pathlib import Path

# this is the same TIRAMISU_PATH as shown in start_here.ipynb
# TODO: replace the placeholder with your local path. A bare `TIRAMISU_PATH = `
# is a SyntaxError, which prevents every import in this cell from running.
TIRAMISU_PATH = None

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

### Find all the file types inside the corpus

In [None]:
# Number of files directly contained in a folder, grouped by file extension.
# The result columns are `file_extension` and `count`.
file_extension_query = """
match (n:Folder) - [:CONTAINS] -> (c:File) return c.fileExtension as file_extension, count(c) as count
"""
file_extensions = return_from_neo4j(file_extension_query)

file_extensions

In [None]:
# Total number of files. The frame is named `file_extensions` and the query
# aliases the aggregate as `count` (not `count(c)`).
file_extensions['count'].sum()

### Number of PDF pages

In [None]:
# Count the File nodes `d` reached through SPLIT_INTO -> CONVERT_TO where the
# intermediate split node has extension 'pdf'.
# NOTE(review): presumably one `d` node per PDF page - confirm against the graph schema.
pdf_page_query = """
match (n:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (d:File) where c.fileExtension = 'pdf' 
     return count(d) as count
"""
pdf_page_result = return_from_neo4j(pdf_page_query)

# The query returns a single row; keep only the scalar count.
pdf_pages = pdf_page_result['count'].iloc[0]

### Number of MS Office document pages
### Each Excel file is counted as a single page (sheet)

In [None]:
# Every Word / PowerPoint file directly contained in a folder, together with
# the node id that is used further down to locate its extracted JSON file.
ms_office_query = """
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as file_extension
"""
all_ms = return_from_neo4j(ms_office_query)

all_ms

In [None]:
def get_page_count(path):
    """Return the page (or slide) count stored in an extracted-metadata JSON file.

    The file is expected to hold a JSON object with a ``metadata`` mapping.
    ``xmpTPg:NPages`` is used when present, otherwise ``meta:slide-count``.
    The stored value may be a scalar or a list; for a list the first entry
    is used.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    int
        The page/slide count, or 0 when the ``metadata`` mapping or both
        count keys are missing, or when the stored list is empty.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the file is not valid JSON or the stored count is not numeric.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    # NOTE(review): assumes the extraction step writes UTF-8 JSON - confirm.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    try:
        metadata = data['metadata']
    except KeyError:
        return 0

    # Page count takes precedence over slide count, as before.
    for key in ('xmpTPg:NPages', 'meta:slide-count'):
        if key in metadata:
            value = metadata[key]
            break
    else:
        return 0

    # Some files store the count as a list of repeated values.
    if isinstance(value, (list, tuple)):
        if not value:
            return 0
        value = value[0]

    return int(value)

In [None]:
%%time

# Page counts per Word file and slide counts per PowerPoint file, read from
# the per-node JSON files under ../text_extraction/ms_tika/.
word_pages = []
ppt_pages = []
for _, ms_file in tqdm(all_ms.iterrows(), total = len(all_ms)):
    metadata_json = f"../text_extraction/ms_tika/{ms_file['nodeID']}.json"
    n_pages = get_page_count(metadata_json)
    # all_ms only holds doc/docx/ppt/pptx, so anything that is not Word is PowerPoint.
    if ms_file['file_extension'] in ('doc', 'docx'):
        word_pages.append(n_pages)
    else:
        ppt_pages.append(n_pages)
    

In [None]:
# Total number of pages over all Word (.doc / .docx) files.
sum(word_pages)

In [None]:
# Total number of slides over all PowerPoint (.ppt / .pptx) files.
sum(ppt_pages)

In [None]:
# Each Excel file counts as one page, so the page count is simply the number
# of .xlsx and .xls files. Summing over an `isin` mask yields 0 when an
# extension is absent from the corpus instead of raising IndexError.
excel_pages = file_extensions.loc[file_extensions.file_extension.isin(['xlsx', 'xls']), 'count'].sum()

### Total number of documents

In [None]:
# Number of Document nodes in the graph (single-row result, column `count`).
document_count_query = """
match (d:Document) return count(d) as count
"""
all_documents = return_from_neo4j(document_count_query)

all_documents['count'].iloc[0]

In [None]:
### total number of documents 
# NOTE(review): these addends are hard-coded and not derived from any query in
# this notebook; presumably copied from an earlier run - confirm their source
# before relying on the result.
20780 + 1014 + 364 + 310 + 76 + 309

In [None]:
# Show the per-extension file counts again for reference.
file_extensions

In [None]:
# Values for the "documents" ring, in the order given by document_values_legend.
# NOTE(review): the "pdf" slot holds the number of Document nodes, while the
# other slots hold file counts per Office type - confirm this is intended.
office_extension_groups = [['docx', 'doc'], ['pptx', 'ppt'], ['xlsx', 'xls']]

# Summing over an `isin` mask gives 0 for extensions that do not occur in the
# corpus, where `.iloc[0]` on an empty selection would raise IndexError.
document_values = [all_documents['count'].iloc[0]] + [
    file_extensions.loc[file_extensions.file_extension.isin(group), 'count'].sum()
    for group in office_extension_groups
]
document_values_legend = ["pdf", "word", "powerpoint", "excel"]

In [None]:
# Number of files whose extension is none of the PDF / Word / PowerPoint /
# Excel extensions listed in the mask.
sum(file_extensions.loc[~file_extensions.file_extension.isin(["pdf", "docx", "doc", "pptx", "ppt", "xlsx", "xls"]), "count"])

In [None]:
# Values for the "pages" ring: one entry per document type.
page_values = [pdf_pages, sum(word_pages), sum(ppt_pages), excel_pages]
page_values_legend = ['pdf', 'word', 'powerpoint', 'excel']

# Values for the "files" ring: file counts per document type plus an "other"
# bucket for every remaining extension.
typed_extension_groups = [['pdf'], ['docx', 'doc'], ['pptx', 'ppt'], ['xlsx', 'xls']]
typed_extensions = [ext for group in typed_extension_groups for ext in group]

# Summing over an `isin` mask gives 0 for extensions that do not occur in the
# corpus, where `.iloc[0]` on an empty selection would raise IndexError.
file_values = [
    file_extensions.loc[file_extensions.file_extension.isin(group), 'count'].sum()
    for group in typed_extension_groups
]
file_values.append(
    file_extensions.loc[~file_extensions.file_extension.isin(typed_extensions), 'count'].sum()
)
file_values_legend = ['pdf', 'word', 'powerpoint', 'excel', "other"]

In [None]:
def cmap_key(key):
    """Return the RGB colour of a document category as three floats in [0, 1].

    `key` must be one of "pdf", "word", "powerpoint", "excel" or "other";
    any other value raises KeyError.
    """
    hex_by_category = {
        "pdf": "FFC06D",
        "word": "1588E0",
        "powerpoint": "BB443A",
        "excel": "00A651",
        "other": "D3D3D3",
    }

    # Each pair of hex digits is one 0-255 colour channel.
    channels = list(bytes.fromhex(hex_by_category[key]))
    return np.array(channels) / 255.


In [None]:
# taken from https://towardsdatascience.com/create-eye-catching-radial-bar-charts-with-matplotlib-fd03ff732048
#
# Radial bar chart with three concentric rings (files, documents, pages from
# the outside in). Each ring is split by document type and spans three
# quarters of the circle.

matplotlib.rcParams.update({"axes.labelsize": 7,
                            "xtick.labelsize": 7,
                            "ytick.labelsize": 7,
                            "legend.fontsize": 7,
                            "font.size": 7})
matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex=False)  # a real bool instead of the string 'false'
matplotlib.rcParams['axes.unicode_minus'] = False

# Same tick geometry for major and minor ticks on both axes.
for axis_name in ('xtick', 'ytick'):
    for tick_kind in ('major', 'minor'):
        matplotlib.rcParams[f'{axis_name}.{tick_kind}.size'] = 2
        matplotlib.rcParams[f'{axis_name}.{tick_kind}.width'] = 0.5

side_inches = 50 * (1/2.54/10)  # 50 mm expressed in inches
fig, ax = plt.subplots(1, 1, figsize = (side_inches, side_inches), dpi = 300, subplot_kw=dict(projection="polar"))

# Angles grow counter-clockwise, with zero at the top of the circle.
ax.set_theta_direction(1)
ax.set_theta_offset(np.pi / 2.0)


def _ring_layout(counts):
    """Turn per-category counts into (start_angles, arc_widths) in radians.

    The widths are proportional to the counts and add up to 1.5 * pi; every
    segment starts where the previous one ends, the first one at angle 0.
    """
    widths = np.asarray(counts, dtype=float)
    widths = widths / np.sum(widths) * 1.5 * np.pi
    starts = np.concatenate(([0.0], np.cumsum(widths)[:-1]))
    return starts, widths


# As before, page_values and file_values are rebound from raw counts to arc
# widths here; re-running the cell yields the same widths again.
vals_page, page_values = _ring_layout(page_values)
vals_file, file_values = _ring_layout(file_values)
vals_doc, doc_values = _ring_layout(document_values)

# Outermost ring first; each further ring sits one height plus one gap inwards.
ring_height = 0.3
ring_gap = 0.05
rings = [
    (vals_file, file_values, file_values_legend),
    (vals_doc, doc_values, document_values_legend),
    (vals_page, page_values, page_values_legend),
]
for ring_index, (starts, widths, legend) in enumerate(rings):
    ax.bar(x=starts, width=widths, bottom=1 - ring_index * (ring_height + ring_gap),
           color=[cmap_key(k) for k in legend], height=ring_height, align='edge', linewidth=0.1)

# Hide the outer frame and the radial ticks; the angular ticks mark 0% to 100%
# along the three-quarter arc.
ax.spines['polar'].set_visible(False)
tick_fractions = np.linspace(0, 1, 6)
ax.set_xticks(tick_fractions * 1.5 * np.pi, [f"{fraction:.0%}" for fraction in tick_fractions])
ax.set_rticks([])

ax.grid(linewidth=0.5, color = 'k')

for spine_name in ('polar', 'start', 'inner', 'end'):
    ax.spines[spine_name].set_color('black')
    ax.spines[spine_name].set_linewidth(0.5)

ax.xaxis.label.set_color('black')
ax.tick_params(axis='x', colors='black')
ax.yaxis.label.set_color('black')
ax.tick_params(axis='y', colors='black')

# Keep only the angular grid lines.
ax.xaxis.grid(True, color='k', alpha = 1)
ax.yaxis.grid(False)

fig.tight_layout()
# plt.savefig('../cache/circle_stats_together_smaller.pdf', transparent=True, dpi = 300)
plt.show()