In [37]:
from typing import Iterable

from django.contrib.auth.decorators import login_required
from django.http import HttpRequest, HttpResponse
from django.shortcuts import render

from apps.views_decorators import objects_exists, user_has_access
from apps.files_management.models import File, FileVersion
from apps.projects.models import Project

from django.db.models import Q

import pandas as pd
from lxml import etree as et
from tqdm import tqdm

In [96]:
# Prefix -> namespace-URI map handed to lxml lookups via `namespaces=` so that
# path expressions can use the `tei:` prefix (an `xml:` mapping is declared too).
NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}

In [6]:
# Fetch the project whose title is exactly 'p' (hard-coded for this exploration).
project = Project.objects.get(title__exact='p')
# Filter expression selecting the File rows attached to that project.
files_query = Q(project=project)
# Queryset of the project's files; it is indexed and iterated by later cells.
files = File.objects.filter(files_query)

In [36]:
# Take the first file of the queryset as a sample and display its name.
file = files[0] 
file.name

'dep_839064r043_tei_(original_normalized_depositions_marked_persons_gazetteer_gazetteer_date_annotation).xml'

In [47]:
# NOTE(review): `get_latest_content` is only defined in a later cell of this
# notebook, so on a fresh kernel this cell fails unless that cell is run first.
content = get_latest_content(file)

In [201]:
def create_summary_for_document(doc_raw: str, doc_name: str = 'undefined') -> pd.DataFrame:
    """Tabulate every attribute of every node in a TEI document's header and body.

    The document is parsed with lxml, the first ``tei:teiHeader`` and ``tei:body``
    descendants of the root are located, and each section is walked with
    ``iter()`` (the section element itself included). One row is produced per
    (node, attribute) pair.

    Args:
        doc_raw: XML source of the document as text; it is encoded to bytes
            before parsing.
        doc_name: Label stored in the ``document`` column and used as the
            prefix of every ``tag_id``.

    Returns:
        A DataFrame with the columns ``document``, ``tag``, ``tag_id``,
        ``location``, ``attr_name`` and ``attr_value``. ``tag_id`` is
        ``doc_name`` followed by a running node counter that spans the header
        first and then the body. Nodes without attributes add no rows but still
        advance the counter. A section that is missing from the document
        contributes no rows; if nothing is found the frame is empty but keeps
        its columns.
    """
    columns = ['document', 'tag', 'tag_id', 'location', 'attr_name', 'attr_value']
    doc_tree = et.fromstring(doc_raw.encode())

    # (location label, section root) in the order the counter must run.
    sections = (
        ('header', doc_tree.find('.//tei:teiHeader', namespaces=NAMESPACES)),
        ('body', doc_tree.find('.//tei:body', namespaces=NAMESPACES)),
    )

    # Rows are collected in a plain list and turned into a frame once at the
    # end; enlarging a DataFrame row by row copies it on every insertion.
    records = []
    tag_count = 0
    for location, section in sections:
        # Compare with None explicitly: `find` returns None when the section is
        # absent, and the truthiness of an element is not a presence check.
        if section is None:
            continue
        for tag in section.iter():
            tag_id = doc_name + str(tag_count)
            for attr_name, attr_value in tag.items():
                records.append({
                    'document': doc_name,
                    'tag': tag.tag,
                    'tag_id': tag_id,
                    'location': location,
                    'attr_name': attr_name,
                    'attr_value': attr_value
                })
            tag_count += 1

    return pd.DataFrame(records, columns=columns)

In [190]:
def create_summary_for_document_collection(doc_gen: Iterable[tuple]) -> pd.DataFrame:
    """Build one attribute summary table for a whole collection of documents.

    Args:
        doc_gen: Iterable of ``(doc_raw, doc_name)`` pairs, where ``doc_raw`` is
            the XML text of a document and ``doc_name`` its label. Progress is
            reported through ``tqdm`` while the iterable is consumed.

    Returns:
        The per-document frames returned by ``create_summary_for_document``
        stacked in input order. Each document keeps its own row index, so index
        values repeat across documents. An empty input yields an empty frame
        with the summary columns.
    """
    # Collect all per-document frames first and concatenate once:
    # `DataFrame.append` no longer exists in pandas 2.x and re-copied the
    # accumulated frame on every iteration.
    frames = [create_summary_for_document(doc, doc_name) for doc, doc_name in tqdm(doc_gen)]

    if not frames:
        # `pd.concat` rejects an empty list, so build the empty result directly.
        return pd.DataFrame(columns=['document', 'tag', 'tag_id', 'location', 'attr_name', 'attr_value'])

    return pd.concat(frames)

In [202]:
def get_latest_content(f):
    """Return the text of a file's newest version without its XML declaration.

    Args:
        f: Object exposing ``versions.latest('creation_date')`` whose result has
            a ``get_content()`` method returning text (presumably a ``File``
            with related ``FileVersion`` rows; confirm against the models).

    Returns:
        The text that follows the first ``<?xml version="1.0"?>`` declaration
        (up to the next occurrence, if any), stripped of surrounding
        whitespace. When the content carries no such declaration the whole
        content is returned stripped instead of raising ``IndexError``.
    """
    content = f.versions.latest('creation_date').get_content()
    parts = content.split('<?xml version="1.0"?>')
    # parts[1] exists only when the declaration was found; otherwise the split
    # leaves the untouched content in parts[0].
    return (parts[1] if len(parts) > 1 else parts[0]).strip()

# Lazily pair each file's latest content with its name, then summarise the
# whole collection; contents are fetched one by one as the generator is consumed.
file_gen = (
    (get_latest_content(tei_file), tei_file.name)
    for tei_file in files
)
stats_df = create_summary_for_document_collection(file_gen)

44it [00:24,  1.80it/s]


In [205]:
def get_stats(stats_df):
    """Lazily summarise an attribute table tag by tag.

    Args:
        stats_df: Frame with the columns ``document``, ``tag``, ``tag_id``,
            ``location``, ``attr_name`` and ``attr_value``.

    Returns:
        A generator yielding one dict per distinct tag with the keys:
        ``name`` (the tag), ``count`` (distinct ``tag_id`` values),
        ``coverage`` (percentage of all documents containing the tag),
        ``n_docs`` (distinct documents containing the tag), ``location``
        (most frequent location) and ``attributes``. ``attributes`` is a tuple
        of dicts, one per attribute name, holding ``name``, ``top_value`` (most
        frequent value), ``top_perc`` (share of that value in percent),
        ``coverage`` (attribute rows relative to ``count``, in percent) and an
        always-empty ``values`` list.
    """
    total_docs = len(stats_df['document'].unique())
    by_tag = stats_df.groupby('tag')
    per_tag_summary = by_tag.describe()

    def _attribute_entries(tag_name, tag_occurrences):
        # One entry per attribute name seen on this tag.
        per_attr_summary = by_tag.get_group(tag_name).groupby('attr_name').describe()
        for attr_name, attr_row in per_attr_summary.iterrows():
            yield {
                'name': attr_name,
                'top_perc': 100*attr_row['attr_value']['freq']/attr_row['attr_value']['count'],
                'top_value': attr_row['attr_value']['top'],
                'coverage': 100*attr_row['document']['count']/tag_occurrences,
                'values': []
            }

    def _tag_entries():
        # One entry per tag, produced only as the caller iterates.
        for tag_name, tag_row in per_tag_summary.iterrows():
            tag_occurrences = tag_row['tag_id']['unique']
            docs_with_tag = tag_row['document']['unique']
            yield {
                'name': tag_name,
                'count': tag_occurrences,
                'coverage': 100*docs_with_tag / total_docs,
                'n_docs': docs_with_tag,
                'location': tag_row['location']['top'],
                'attributes': tuple(_attribute_entries(tag_name, tag_occurrences))
            }

    return _tag_entries()
    
# get_stats returns a generator; wrap it in tuple() so the cell displays every entry.
tuple(get_stats(stats_df))

({'name': 'include',
  'count': 45,
  'coverage': 100.0,
  'location': 'header',
  'attributes': ({'name': 'href',
    'top_perc': 100.0,
    'top_value': 'responsiblePeople.xml',
    'coverage': 100.0,
    'values': []},
   {'name': 'xpointer',
    'top_perc': 97.77777777777777,
    'top_value': 'RP5',
    'coverage': 100.0,
    'values': []})},
 {'name': '{http://www.tei-c.org/ns/1.0}add',
  'count': 346,
  'coverage': 72.72727272727273,
  'location': 'body',
  'attributes': ({'name': 'place',
    'top_perc': 100.0,
    'top_value': 'inline',
    'coverage': 100.0,
    'values': []},)},
 {'name': '{http://www.tei-c.org/ns/1.0}certainty',
  'count': 603,
  'coverage': 81.81818181818181,
  'location': 'header',
  'attributes': ({'name': 'assertedValue',
    'top_perc': 37.81094527363184,
    'top_value': 'date',
    'coverage': 100.0,
    'values': []},
   {'name': 'category',
    'top_perc': 100.0,
    'top_value': 'incompletness',
    'coverage': 100.0,
    'values': []},
   {'name':

In [204]:
# Number of distinct documents present in the summary table.
len(stats_df['document'].unique())

44

In [194]:
# Per-tag_id count / unique / top / freq of every other column.
stats_df.groupby('tag_id').describe()

Unnamed: 0_level_0,document,document,document,document,tag,tag,tag,tag,location,location,location,location,attr_name,attr_name,attr_name,attr_name,attr_value,attr_value,attr_value,attr_value
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
tag_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
9,88,44,dep_839072r048_tei_(original_normalized_deposi...,2,88,1,include,88,88,1,header,88,88,2,xpointer,44,88,3,responsiblePeople.xml,44
12,2,1,dep_839106r073_tei_(original_normalized_deposi...,2,2,1,include,2,2,1,header,2,2,2,xpointer,1,2,2,RP5,1
14,86,43,dep_839072r048_tei_(original_normalized_deposi...,2,86,1,{http://www.tei-c.org/ns/1.0}include,86,86,1,header,86,86,2,xpointer,43,86,2,ms839,43
17,88,44,dep_839072r048_tei_(original_normalized_deposi...,2,88,2,{http://www.w3.org/2001/XInclude}include,86,88,1,header,88,88,2,xpointer,44,88,4,projectDesc.xml,43
18,86,43,dep_839072r048_tei_(original_normalized_deposi...,2,86,1,{http://www.w3.org/2001/XInclude}include,86,86,1,header,86,86,2,xpointer,43,86,2,id1641-editorialDecl,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,1,1,dep_839096r065_tei_(original_normalized_deposi...,1,1,1,{http://www.tei-c.org/ns/1.0}person,1,1,1,body,1,1,1,sameAs,1,1,1,person839096r065-1,1
650,2,1,dep_839096r065_tei_(original_normalized_deposi...,2,2,1,{http://www.tei-c.org/ns/1.0}pb,2,2,1,body,2,2,2,pagenum,1,2,2,fol. 97v,1
657,2,1,dep_839096r065_tei_(original_normalized_deposi...,2,2,1,{http://www.tei-c.org/ns/1.0}date,2,2,1,body,2,2,2,{http://www.w3.org/XML/1998/namespace}id,1,2,2,1643-06-2,1
666,1,1,dep_839096r065_tei_(original_normalized_deposi...,1,1,1,{http://www.tei-c.org/ns/1.0}rolename,1,1,1,body,1,1,1,type,1,1,1,commissioner,1


In [179]:
# Per-attribute summary for the TEI `certainty` tag only; here 'coverage' is the
# raw row count rather than a percentage.
# NOTE(review): relies on a module-level `tag_g` groupby that no visible cell defines.
tuple(
    {
        'name': attr_name,
        'top_perc': 100*attr_summary['attr_value']['freq']/attr_summary['attr_value']['count'],
        'top_value': attr_summary['attr_value']['top'],
        'coverage': attr_summary['document']['count'],
        'values': []
    }
    for attr_name, attr_summary in (
        tag_g.get_group('{http://www.tei-c.org/ns/1.0}certainty')
        .groupby('attr_name')
        .describe()
        .iterrows()
    )
)

({'name': 'assertedValue',
  'top_perc': 37.81094527363184,
  'top_value': 'date',
  'coverage': 603,
  'values': []},
 {'name': 'category',
  'top_perc': 100.0,
  'top_value': 'incompletness',
  'coverage': 603,
  'values': []},
 {'name': 'cert',
  'top_perc': 100.0,
  'top_value': 'unknown',
  'coverage': 603,
  'values': []},
 {'name': 'degree',
  'top_perc': 22.388059701492537,
  'top_value': '0.8',
  'coverage': 603,
  'values': []},
 {'name': 'desc',
  'top_perc': 6.301824212271973,
  'top_value': "Automatically annotated based on the initial annotation '1643' from the document 'dep_814242r151_tei_(original_normalized_depositions_marked_persons).xml'. 1 out of 1 highly coincident terms. Normalized Levenshtein similarity between terms: (0.75,).",
  'coverage': 603,
  'values': []},
 {'name': 'locus',
  'top_perc': 100.0,
  'top_value': 'name',
  'coverage': 603,
  'values': []},
 {'name': 'resp',
  'top_perc': 100.0,
  'top_value': '#automatic_gazeetter_ner',
  'coverage': 603,
  

In [152]:
# Print the rows of every tag group, separated by blank lines.
g = stats_df.groupby('tag')
for key, item in g:
    # Iterating a groupby already yields each group's frame as `item`, so it is
    # printed directly instead of being looked up again with g.get_group(key).
    print(item, "\n\n")

                                             document      tag location  \
0   dep_839064r043_tei_(original_normalized_deposi...  include   header   
1   dep_839064r043_tei_(original_normalized_deposi...  include   header   
0   dep_839060r041_tei_(original_normalized_deposi...  include   header   
1   dep_839060r041_tei_(original_normalized_deposi...  include   header   
0   dep_839066r044_tei_(original_normalized_deposi...  include   header   
..                                                ...      ...      ...   
1   dep_839145r101_tei_(original_normalized_deposi...  include   header   
0   dep_839147r102_tei_(original_normalized_deposi...  include   header   
1   dep_839147r102_tei_(original_normalized_deposi...  include   header   
0   dep_839149r103_tei_(original_normalized_deposi...  include   header   
1   dep_839149r103_tei_(original_normalized_deposi...  include   header   

   attr_name             attr_value  
0       href  responsiblePeople.xml  
1   xpointer           