In [None]:
import xml.etree.ElementTree as ET
import bz2
# Parse the bz2-compressed corpus straight from the archive into an ElementTree.
with bz2.open('./data/gp2d.xml.bz2', mode='rb') as compressed_xml:
    tree = ET.parse(source=compressed_xml)

# Top-level element of the document; the following cells walk it.
root = tree.getroot()

In [None]:
# Flatten the corpus into one record per <w> element. Each record holds the
# word's XML attributes plus the id of the enclosing sentence and the word text.
rows = []
for text in root.findall('text'):
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for sent in text.iter('sentence'):
        sent_id = sent.get('id')

        for word in sent.iter('w'):
            # Copy the attributes so the parsed tree itself is left unmodified.
            r = dict(word.attrib)
            r['sent_id'] = sent_id
            r['word'] = word.text
            rows.append(r)

In [None]:
import pandas as pd
# One DataFrame row per word record collected in `rows`.
whole_df = pd.DataFrame(rows)

In [None]:
# Preview the first rows to check which columns were produced.
whole_df.head()

In [None]:
# Work on an independent copy that holds only the columns used below.
df = whole_df.loc[:, ['word', 'sent_id', 'pos', 'sense', 'msd']].copy()

In [None]:
import re

def first_lemma(word_lemma):
    """Return the first lemma listed for a token, or the word form as fallback.

    Parameters
    ----------
    word_lemma : sequence of two items
        ``(word, lemma)``. ``lemma`` is expected to be a pipe-delimited
        string such as ``"|lemma1|lemma2|"``.

    Returns
    -------
    The text of the first non-empty pipe-delimited entry in ``lemma``.
    ``word`` is returned unchanged when ``lemma`` is not a string or
    contains no such entry (for example a bare ``"|"``).
    """
    word, lemma = word_lemma
    # A missing attribute may reach us as None or NaN; `re` would raise on it.
    if not isinstance(lemma, str):
        return word
    # Match the first entry between pipes. `[^|]+` instead of `\w+` keeps a
    # lemma containing a hyphen, space or colon from being skipped in favour
    # of a later entry.
    m = re.search(r'\|([^|]+)\|', lemma)
    if m is not None:
        return m.group(1)
    return word

# Run first_lemma over each (word, lemma) row and keep the result as a new column.
word_lemma_pairs = whole_df[['word', 'lemma']]
df['first_lemma'] = word_lemma_pairs.apply(first_lemma, axis=1)

In [None]:
# Report the vocabulary size in terms of lemmas and of surface word forms.
for label, column in (("Unique lemmas:", 'first_lemma'), ("Unique word forms:", 'word')):
    print(label, len(df[column].unique()))

In [None]:
# Group the word rows by sentence id.
sent_group = df.groupby('sent_id')
# Words per sentence; count() only counts non-null 'word' values.
word_count = sent_group['word'].count()

In [None]:
# Summary statistics of the sentence lengths, printed one per line.
for label, stat_name in (
    ("Min word count:", 'min'),
    ("Mean word count:", 'mean'),
    ("Median word count:", 'median'),
    ("Max word count:", 'max'),
):
    print(label, getattr(word_count, stat_name)())

In [None]:
# Show the longest sentence. idxmax() returns the index label (the sent_id);
# argmax() returns an integer position in current pandas and would never
# match a sent_id here.
longest_sent_id = word_count.idxmax()
# dropna() skips <w> elements without text, which str.join cannot handle.
" ".join(df.loc[df['sent_id'] == longest_sent_id, 'word'].dropna())

In [None]:
import matplotlib.pyplot as plt

# Histogram of sentence lengths, drawn on the current axes.
ax = plt.gca()
ax.hist(word_count)
plt.show()

In [None]:
import xml.etree.ElementTree as ET
import bz2
import random


def sample_xml(filepath, sample_percent = 0.15, keep_attrs = ['lemma', 'msd', 'pos']):
    """Stream a bz2-compressed XML corpus and collect words from a random subset of sentences.

    Every ``<sentence>`` element is kept independently with probability
    ``sample_percent``. For each kept sentence, one record is produced per
    ``<w>`` element it contains.

    Parameters
    ----------
    filepath : str
        Path to the bz2-compressed XML file.
    sample_percent : float
        Probability, as a fraction in [0, 1], of keeping a sentence.
        ``1.0`` keeps every sentence.
    keep_attrs : list of str
        Attributes copied from each ``<w>`` element into its record.

    Returns
    -------
    list of dict
        One dict per word with the requested attributes plus ``'sent_id'``
        (the sentence's ``id`` attribute) and ``'word'`` (the element text).

    Raises
    ------
    KeyError
        If a ``<w>`` element lacks one of ``keep_attrs``.
    """
    rows = []
    total = 0
    sampled = 0
    with bz2.open(filepath, 'rb') as bf:
        # Listen for "end" events only: at "start" the children of an element
        # have not been parsed yet, so its <w> elements would be missing.
        for _event, elem in ET.iterparse(bf, events=('end',)):
            if elem.tag != "sentence":
                continue
            total += 1
            if random.random() < sample_percent:
                sampled += 1
                sent_id = elem.get('id')
                # Element.iter() replaces getiterator(), removed in Python 3.9.
                for word in elem.iter('w'):
                    r = {k: word.attrib[k] for k in keep_attrs}
                    r['sent_id'] = sent_id
                    r['word'] = word.text
                    rows.append(r)
            # Release the processed sentence so memory does not grow with the file.
            elem.clear()

    print("Total sentences: {}\tSampled sentences: {}".format(total, sampled))
    return rows
                
# Earlier variant, kept for reference: sample GP2013 with the default rate.
# gp2013_rows = sample_xml('/scratch/gussteen/final_project/gp2013.xml.bz2')
# sample_percent=1.0 keeps every sentence, since random.random() is always < 1.0.
atta_rows = sample_xml('/scratch/gussteen/final_project/attasidor.xml.bz2', sample_percent=1.0)

In [None]:
import pandas as pd

# Tabulate the 8 Sidor word records and persist them as CSV without the index.
atta_df = pd.DataFrame(data=atta_rows)

with open('/scratch/gussteen/final_project/attasidor.csv', mode='w+') as csv_out:
    atta_df.to_csv(csv_out, index=False)

In [None]:
# Number of distinct sentence ids in the 8 Sidor frame.
atta_sentence_ids = atta_df['sent_id'].unique()
atta_sent_count = len(atta_sentence_ids)
print("Sentences in 8 Sidor:", atta_sent_count)

In [3]:
import xml.etree.ElementTree as ET
import bz2
import random

from itertools import chain

def reservoir_sample(filepath, number_rows, keep_attrs = ['lemma', 'msd', 'pos']):
    """Draw a uniform random sample of sentences from a bz2-compressed XML corpus.

    The file is streamed once. The first ``number_rows`` sentences fill the
    reservoir. After that, sentence ``i`` (0-based) replaces a random slot
    when ``random.randint(0, i)`` falls inside the reservoir.

    Parameters
    ----------
    filepath : str
        Path to the bz2-compressed XML file.
    number_rows : int
        Number of sentences to keep (the reservoir size).
    keep_attrs : list of str
        Attributes copied from each ``<w>`` element into its record.

    Returns
    -------
    list of dict
        The word records of all sampled sentences, concatenated in slot
        order. Each dict holds the requested attributes plus ``'sent_id'``
        and ``'word'``. If the file has fewer than ``number_rows``
        sentences, all of them are returned.

    Raises
    ------
    KeyError
        If a ``<w>`` element of a sampled sentence lacks one of ``keep_attrs``.
    """
    # One independent list per slot. `[[]] * number_rows` would put the SAME
    # list object in every slot, so each append would show up in all of them.
    rows = [[] for _ in range(number_rows)]

    i = 0
    replaced = 0
    with bz2.open(filepath, 'rb') as bf:
        # Listen for "end" events only: at "start" the children of an element
        # have not been parsed yet, so its <w> elements would be missing.
        for _event, elem in ET.iterparse(bf, events=('end',)):
            if elem.tag != "sentence":
                continue

            j = random.randint(0, i)
            if i < number_rows:
                # Still filling the reservoir: sentence i takes slot i.
                slot = i
            elif j < number_rows:
                # Reservoir full: evict whatever currently sits in slot j.
                slot = j
                replaced += 1
            else:
                slot = None

            if slot is not None:
                sent_id = elem.get('id')
                sentence_rows = []
                # Element.iter() replaces getiterator(), removed in Python 3.9.
                for word in elem.iter('w'):
                    r = {k: word.attrib[k] for k in keep_attrs}
                    r['sent_id'] = sent_id
                    r['word'] = word.text
                    sentence_rows.append(r)
                rows[slot] = sentence_rows

            # Release the processed sentence so memory does not grow with the file.
            elem.clear()
            i += 1

    print("Found total sentences: {}\t Replaced: {}".format(i, replaced))
    return list(chain(*rows))

# Sample 254711 sentences from GP2013.
# NOTE(review): 254711 is presumably the 8 Sidor sentence count, so that both corpora are the same size; confirm.
gp2013_rows = reservoir_sample('/scratch/gussteen/final_project/gp2013.xml.bz2', 254711)

KeyboardInterrupt: 

In [None]:
# Tabulate the sampled GP2013 word records and persist them as CSV without the index.
gp2013_df = pd.DataFrame(data=gp2013_rows)

with open('/scratch/gussteen/final_project/gp2013_sample.csv', mode='w+') as csv_out:
    gp2013_df.to_csv(csv_out, index=False)

In [None]:
# Number of distinct sentence ids in the GP2013 sample.
gp2013_sentence_ids = gp2013_df['sent_id'].unique()
gp2013_sent_count = len(gp2013_sentence_ids)
print("Sentences in GP2013:", gp2013_sent_count)

In [None]:
# Preview the first rows of the GP2013 sample.
gp2013_df.head()

In [None]:
# Preview the first 100 word rows of the 8 Sidor frame.
atta_df.head(100)

In [None]:
# Recount the distinct sentence ids in the 8 Sidor frame.
atta_sentence_ids = atta_df['sent_id'].unique()
atta_sent_count = len(atta_sentence_ids)
print("Sentences in 8 Sidor sample:", atta_sent_count)

In [None]:
import pandas as pd

# Build the frame from the GP2013 sample. The bare name `rows` holds the gp2d
# word records from the first cells, which would be saved under the wrong name.
gp2013_df = pd.DataFrame(gp2013_rows)

# Keep a copy of the sample in the working directory, without the index column.
with open('./gp2013_sample.csv', 'w+') as f:
    gp2013_df.to_csv(f, index=False)

In [None]:
# Drop the in-memory frame before reading the CSV back, so both copies are not held at once.
del gp2013_df
gp2013_df = pd.read_csv('./gp2013_sample.csv')

In [None]:
# Display the reloaded frame to check the CSV round trip.
gp2013_df