# Repräsentation von reinen Text-Korpora (SAC)

Text+Berg-XML mit xmlstarlet in ein reines Textformat umwandeln (ein Satz pro Zeile, Tokens durch Leerzeichen getrennt)

_(Annahme: Im Unterverzeichnis SAC befinden sich alle XML-Dateien von Text+Berg)_

In [None]:
%%bash
# Convert every German / multilingual Text+Berg XML file into plain text:
# one German sentence per line, tokens separated by single spaces.
for f in SAC/*_{mul,de}.xml ; do
    # bash's `-ot` is true when "$f.txt" is OLDER than "$f" or when "$f.txt"
    # does not exist yet, i.e. exactly when a (re-)conversion is required.
    # Expansions are quoted so that paths containing whitespace still work.
    if test "$f.txt" -ot "$f"
    then
        echo "$f needs conversion ..." 1>&2
        # For each <div> emit a newline, for each German <s> another newline,
        # then every <w> followed by a single space.
        xmlstarlet sel -T -t -m //div -nl -m ".//s[@lang='de']" -nl  -m  ".//w" -v "." -o $' ' < "$f" > "$f.txt"
    fi
done

In [None]:
# Shell escape: hand one converted text file to the local `edit` command
# (presumably an editor, for a manual spot check -- confirm on your system).
! edit SAC/SAC-Jahrbuch_1864_mul.xml.txt

### Eigene gutenberg-artige Korpora erstellen

In [None]:
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
# Corpus directory and the regular expression that selects the converted
# plain-text files inside it.
root = 'SAC/'
file_pattern = r'.+\.txt'

# Sentences are the newline-separated chunks of a file; tokens are the pieces
# between spaces or newlines. Empty pieces are discarded in both cases.
my_SAC = PlaintextCorpusReader(
    root,
    file_pattern,
    sent_tokenizer=RegexpTokenizer(r'\n', gaps=True, discard_empty=True),
    word_tokenizer=RegexpTokenizer(r'[ \n]', gaps=True, discard_empty=True),
    encoding='utf-8',
)

# Display the first five sentences as a quick sanity check.
my_SAC.sents()[:5]


# Häufigkeitsverteilungen
Ob man einen Generator oder eine explizite Liste übergibt, macht einen riesigen Unterschied im Speicherverbrauch!

In [None]:
# Alternative kept for comparison: it first materializes every token in an
# explicit list before counting, which needs far more memory.
# %time sac_fd = nltk.FreqDist(list(my_SAC.words()))
# Count token frequencies directly from the corpus reader's word sequence.
%time sac_fd = nltk.FreqDist(my_SAC.words())

In [None]:
# Total number of counted tokens (sum of all frequencies in the distribution).
sac_fd.N()

In [None]:
# Print a table of the 10 most frequent tokens with their counts.
sac_fd.tabulate(10)

In [None]:
%time sorted([w for w in sac_fd \
            if len(w) > 32 and sac_fd[w] > 2])

### Tagged Text+Berg Corpus

In [None]:
from nltk.corpus.reader.tagged import TaggedCorpusReader

In [None]:
%%bash
# Convert every German / multilingual Text+Berg XML file into a tagged text
# format: one German sentence per line, each token written as
# "word<TAB>pos" and followed by a single space.
for f in SAC/*_{mul,de}.xml ; do
    # bash's `-ot` is true when "$f.vttxt" is older than "$f" or when
    # "$f.vttxt" does not exist yet, i.e. when a (re-)conversion is required.
    # Expansions are quoted so that paths containing whitespace still work.
    if test "$f.vttxt" -ot "$f"
    then
        echo "$f needs conversion ..." 1>&2
        # For each <div> emit a newline, for each German <s> another newline,
        # then for every <w>: its text, a tab, its @pos attribute and a space.
        xmlstarlet sel -T -t -m //div -nl -m ".//s[@lang='de']" -nl  -m  ".//w" -v "." -o $'\t' -v '@pos' -o ' ' < "$f" > "$f.vttxt"
    fi
done

In [None]:
# Shell escape: hand one converted tagged file to the local `edit` command
# (presumably an editor, for a manual spot check -- confirm on your system).
! edit SAC/SAC-Jahrbuch_1864_mul.xml.vttxt

In [None]:
# Select the tagged files written by the conversion cell (suffix ".vttxt").
file_pattern = r'.+\.vttxt$'

# Each line is one sentence; tokens are the space-free runs within a line,
# and every token is split into (word, tag) at the tab character.
my_tagged_SAC = TaggedCorpusReader(
    root,
    file_pattern,
    sep='\t',
    sent_tokenizer=RegexpTokenizer(r'[^\n]+'),
    word_tokenizer=RegexpTokenizer(r'[^ ]+'),
    encoding='utf-8',
)

In [None]:
# Display the first 13 tagged sentences as a sanity check of the reader setup.
my_tagged_SAC.tagged_sents()[0:13]

In [None]:
# Build one frequency distribution of word forms per POS tag: the (word, tag)
# pairs from the reader are swapped so that the tag becomes the condition.
%time sac_tfd = nltk.ConditionalFreqDist((t,w) for (w,t) in my_tagged_SAC.tagged_words())

In [None]:
# Tabulate the 3 most frequent word forms tagged 'NE'
# (presumably the proper-noun tag of the corpus tagset -- confirm).
sac_tfd['NE'].tabulate(3)

In [None]:
# Alphabetical list of word forms tagged 'ADJA' that are longer than
# 18 characters and have a positive count.
sorted(
    word
    for word, count in sac_tfd['ADJA'].items()
    if len(word) > 18 and count > 0
)

In [None]:
# Per-POS-tag frequency distributions of word forms longer than 5 characters
# for two individual yearbooks. The file ids carry the ".vttxt" suffix, which
# is what the conversion cell writes and what the reader's file pattern
# matches (the previous ".ttxt" suffix did not correspond to any such file).
sac_1865 = nltk.ConditionalFreqDist(
    (t, w)
    for (w, t) in my_tagged_SAC.tagged_words('SAC-Jahrbuch_1865_mul.xml.vttxt')
    if len(w) > 5
)
sac_1999 = nltk.ConditionalFreqDist(
    (t, w)
    for (w, t) in my_tagged_SAC.tagged_words('SAC-Jahrbuch_1999_de.xml.vttxt')
    if len(w) > 5
)
# NOTE(review): sac_1999 is subtracted twice. This presumably keeps only word
# forms whose 1865 count exceeds twice their 1999 count -- confirm that this
# is intended and not a leftover.
sac_diff = (sac_1865 - sac_1999) - sac_1999

In [None]:
# Tabulate the 10 most frequent word forms tagged 'VVINF' that remain in the
# 1865-vs-1999 difference (presumably the infinitive-verb tag -- confirm).
sac_diff['VVINF'].tabulate(10)