In [37]:
from typing import Iterable

from django.contrib.auth.decorators import login_required
from django.http import HttpRequest, HttpResponse
from django.shortcuts import render

from apps.views_decorators import objects_exists, user_has_access
from apps.files_management.models import File, FileVersion
from apps.projects.models import Project

from django.db.models import Q

import pandas as pd
from lxml import etree as et
from tqdm import tqdm

In [96]:
NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}

In [6]:
project = Project.objects.get(title__exact='p')
files_query = Q(project=project)
files = File.objects.filter(files_query)

In [36]:
file = files[0] 
file.name

'dep_839064r043_tei_(original_normalized_depositions_marked_persons_gazetteer_gazetteer_date_annotation).xml'

In [47]:
content = get_latest_content(file)

In [201]:
def create_summary_for_document(doc_raw: str, doc_name:str='undefined')->pd.DataFrame: 
    doc_tree = et.fromstring(doc_raw.encode())

    header_tags = doc_tree.find('.//tei:teiHeader', namespaces=NAMESPACES).iter()
    body_tags = doc_tree.find('.//tei:body', namespaces=NAMESPACES).iter()

    stats_df = pd.DataFrame(columns=['document', 'tag', 'tag_id', 'location', 'attr_name', 'attr_value'])

    tag_count = 0
    for tag in header_tags:
        for attr_name, attr_value in tag.items():
            stats_df.loc[len(stats_df)] = {
                'document': doc_name,
                'tag': tag.tag,
                'tag_id': doc_name + str(tag_count),
                'location': 'header',
                'attr_name': attr_name,
                'attr_value': attr_value
            }
        tag_count += 1
        
    for tag in body_tags:
        for attr_name, attr_value in tag.items():
            stats_df.loc[len(stats_df)] = {
                'document': doc_name,
                'tag': tag.tag,
                'tag_id': doc_name + str(tag_count),
                'location': 'body',
                'attr_name': attr_name,
                'attr_value': attr_value
            }
        tag_count += 1

    return stats_df

In [190]:
def create_summary_for_document_collection(doc_gen: Iterable[str])->pd.DataFrame: 
    stats_df = pd.DataFrame(columns=['document', 'tag', 'tag_id', 'location', 'attr_name', 'attr_value'])

    for doc, doc_name in tqdm(doc_gen):
        stats = create_summary_for_document(doc, doc_name)
        stats_df = stats_df.append(stats)

    return stats_df

In [202]:
get_latest_content = lambda f: f.versions \
        .latest('creation_date') \
        .get_content() \
        .split('<?xml version="1.0"?>')[1] \
        .strip()

file_gen = ((get_latest_content(file), file.name) for file in files)
stats_df = create_summary_for_document_collection(file_gen)

44it [00:24,  1.80it/s]


In [205]:
def get_stats(stats_df):
    n_docs = len(stats_df['document'].unique())
    tag_g = stats_df.groupby('tag')
    tag_stats = tag_g.describe()
    
    stats = ({
        'name': row[0],
        'count': row[1]['tag_id']['unique'],
        'coverage': 100*row[1]['document']['unique'] / n_docs,
        'n_docs': row[1]['document']['unique'],
        'location': row[1]['location']['top'],
        'attributes': tuple({
                'name': attr[0],
                'top_perc': 100*attr[1]['attr_value']['freq']/attr[1]['attr_value']['count'],
                'top_value': attr[1]['attr_value']['top'],
                'coverage': 100*attr[1]['document']['count']/row[1]['tag_id']['unique'],
                'values': []
            }for attr in tag_g.get_group(row[0]).groupby('attr_name').describe().iterrows())
    } for row in tag_stats.iterrows())
    
    return stats
    
tuple(get_stats(stats_df))

({'name': 'include',
  'count': 45,
  'coverage': 100.0,
  'location': 'header',
  'attributes': ({'name': 'href',
    'top_perc': 100.0,
    'top_value': 'responsiblePeople.xml',
    'coverage': 100.0,
    'values': []},
   {'name': 'xpointer',
    'top_perc': 97.77777777777777,
    'top_value': 'RP5',
    'coverage': 100.0,
    'values': []})},
 {'name': '{http://www.tei-c.org/ns/1.0}add',
  'count': 346,
  'coverage': 72.72727272727273,
  'location': 'body',
  'attributes': ({'name': 'place',
    'top_perc': 100.0,
    'top_value': 'inline',
    'coverage': 100.0,
    'values': []},)},
 {'name': '{http://www.tei-c.org/ns/1.0}certainty',
  'count': 603,
  'coverage': 81.81818181818181,
  'location': 'header',
  'attributes': ({'name': 'assertedValue',
    'top_perc': 37.81094527363184,
    'top_value': 'date',
    'coverage': 100.0,
    'values': []},
   {'name': 'category',
    'top_perc': 100.0,
    'top_value': 'incompletness',
    'coverage': 100.0,
    'values': []},
   {'name':

In [370]:
def create_fast_stats_for_document_collection(doc_gen: Iterable[str])->pd.DataFrame: 
    stats_df = pd.DataFrame(columns=['body', 'header', 'count', 'attr_count'])

    for doc, doc_name in tqdm(doc_gen):
        doc_tree = et.fromstring(doc.encode())

        header_tags = doc_tree.find('.//tei:teiHeader', namespaces=NAMESPACES).iter()
        body_tags = doc_tree.find('.//tei:body', namespaces=NAMESPACES).iter()
        

        for tag in header_tags:
            try:
                if not tag.tag in stats_df.index:
                    stats_df.loc[tag.tag] = {
                        'body': 0,
                        'header': 1,
                        'count': 1,
                        'attr_count': len(tag.keys()),
                    }
                else:
                    stats_df.loc[tag.tag] = {
                        'body': stats_df.loc[tag.tag, 'body'],
                        'header': stats_df.loc[tag.tag, 'header'] + 1,
                        'count': stats_df.loc[tag.tag, 'header'] + 1,
                        'attr_count': stats_df.loc[tag.tag, 'attr_count'] + len(tag.keys()),
                    }
            except Exception:
                pass

        for tag in body_tags:
            try:
                if not tag.tag in stats_df.index:
                    stats_df.loc[tag.tag] = {
                        'body': 1,
                        'header': 0,
                        'count': 1,
                        'attr_count': len(tag.keys()),
                    }
                else:
                    stats_df.loc[tag.tag] = {
                        'body': stats_df.loc[tag.tag, 'body'] + 1,
                        'header': stats_df.loc[tag.tag, 'header'],
                        'count': stats_df.loc[tag.tag, 'header'] + 1,
                        'attr_count': stats_df.loc[tag.tag, 'attr_count'] + len(tag.keys()),
                    }
            except Exception:
                pass
    
    most_common = stats_df.sort_values('count', ascending=False).iloc[:4] 
    most_common['tag'] = most_common.index
    most_common['attr_count'] = most_common['attr_count'] / most_common['count']

    return most_common.to_dict(orient='records')        

In [371]:
get_latest_content = lambda f: f.versions \
        .latest('creation_date') \
        .get_content() \
        .split('<?xml version="1.0"?>')[1] \
        .strip()

file_gen = ((get_latest_content(file), file.name) for file in files)
create_fast_stats_for_document_collection(file_gen)

44it [00:06,  6.36it/s]


[{'body': 0,
  'header': 603,
  'count': 603,
  'attr_count': 8.0,
  'tag': '{http://www.tei-c.org/ns/1.0}certainty'},
 {'body': 0,
  'header': 340,
  'count': 340,
  'attr_count': 0.047058823529411764,
  'tag': '{http://www.tei-c.org/ns/1.0}surname'},
 {'body': 960,
  'header': 324,
  'count': 325,
  'attr_count': 4.812307692307693,
  'tag': '{http://www.tei-c.org/ns/1.0}person'},
 {'body': 0,
  'header': 324,
  'count': 324,
  'attr_count': 0.0,
  'tag': '{http://www.tei-c.org/ns/1.0}persName'}]

In [367]:
tag

[{'document': 'dep_839149r103_tei_(original_normalized_depositions_marked_persons_gazetteer_gazetteer_date_annotation).xml',
  'body': 0,
  'header': 603,
  'count': 603,
  'attr_count': 8.0,
  'tag': '{http://www.tei-c.org/ns/1.0}certainty'},
 {'document': 'dep_839149r103_tei_(original_normalized_depositions_marked_persons_gazetteer_gazetteer_date_annotation).xml',
  'body': 0,
  'header': 340,
  'count': 340,
  'attr_count': 0.047058823529411764,
  'tag': '{http://www.tei-c.org/ns/1.0}surname'},
 {'document': 'dep_839149r103_tei_(original_normalized_depositions_marked_persons_gazetteer_gazetteer_date_annotation).xml',
  'body': 960,
  'header': 324,
  'count': 325,
  'attr_count': 4.812307692307693,
  'tag': '{http://www.tei-c.org/ns/1.0}person'},
 {'document': 'dep_839149r103_tei_(original_normalized_depositions_marked_persons_gazetteer_gazetteer_date_annotation).xml',
  'body': 0,
  'header': 324,
  'count': 324,
  'attr_count': 0.0,
  'tag': '{http://www.tei-c.org/ns/1.0}persName'

In [325]:
def get_fast_stats(stats_df):
    most_common = tag.sort_values('count', ascending=False).iloc[:3]
    
    stats = ({
        'name': row[0].split('}')[1] if '}' in row[0] else row[0],
        'count': row[1]['count'],
        'header': 
        'coverage': 100*row[1]['document']['unique'] / n_docs,
        'n_docs': row[1]['document']['unique'],
        'location': row[1]['location']['top'],
        'attributes': tuple({
                'name': attr[0],
                'top_perc': 100*attr[1]['attr_value']['freq']/attr[1]['attr_value']['count'],
                'top_value': attr[1]['attr_value']['top'],
                'coverage': 100*attr[1]['document']['count']/row[1]['tag_id']['unique'],
                'values': []
            }for attr in tag_g.get_group(row[0]).groupby('attr_name').describe().iterrows())
    } for row in most_common.iterrows())
    
    return stats
    
tuple(get_stats(stats_df))