# Prose corpus segmentation

In [1]:
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer

from utils import build_chunks, split_texts_into_segments, unify_texts_amount
from utils import document_term_matrix

In [2]:
# Load the prose corpus CSV (the path is relative to the notebook's working directory).
corpus = pd.read_csv("../data/corpora/prose_corpus.csv")

### Hyperparameter

In [100]:
# Passed as `n` to split_texts_into_segments and interpolated into the output file name.
# NOTE(review): despite the name, this looks like the length of a single segment
# (the segmented rows report textlength 10000), not a number of segments - confirm.
segments = 10000

### Splitting into segments

In [4]:
%%time
new_corpus = split_texts_into_segments(corpus,
                                       max_segments=3,
                                       n=segments,
                                       same_len=True)
new_corpus.head()

CPU times: user 4min 6s, sys: 405 ms, total: 4min 7s
Wall time: 4min 7s


Unnamed: 0,filename,author,title,year,textlength,text
0,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_1,Jakob Christoph Heer,Der Wetterwart_1,1905,10000,1925 I Die feierliche Abendhelle steht über de...
1,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_2,Jakob Christoph Heer,Der Wetterwart_2,1905,10000,"spüre es , die Ingenieurkunst wäre mein innigs..."
2,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_3,Jakob Christoph Heer,Der Wetterwart_3,1905,10000,uns stets ein treuer Nachbar gewesen . « Ich h...
3,Johannes_Richard_zur_Megede_-_Der_Ueberkater_-...,Johannes Richard zur Megede,Der Ueberkater - Band 1_1,1904,10000,Erster Band Fräulein Elisabeth von Skal in dan...
4,Johannes_Richard_zur_Megede_-_Der_Ueberkater_-...,Johannes Richard zur Megede,Der Ueberkater - Band 1_2,1904,10000,. Sollte dieses Dorschgesicht am Ende einem fe...


In [38]:
# (rows, columns) of the segmented corpus.
new_corpus.shape

(1464, 6)

In [63]:
# Keep only the rows of authors that have more than 20 rows (segments) in `new_corpus`.
shorten_new_corpus = (
    new_corpus
    .groupby("author")
    .filter(lambda author_rows: len(author_rows) > 20)
)

In [5]:
# (rows, columns) after dropping authors with 20 or fewer segments.
shorten_new_corpus.shape

(696, 6)

In [6]:
# (rows, columns) of the original corpus as loaded from disk, for comparison.
corpus.shape

(488, 6)

In [7]:
# Disabled save of the author-filtered, segmented corpus.
# NOTE(review): the save cell at the end of the notebook (for `snc2`) targets the same
# path, so whichever of the two ran last decides what a later read of this file returns.
#shorten_new_corpus.to_csv(f"../data/corpora/prose_corpus_{segments}seg.csv", index=False)

## Preprocessing of the segmented corpus

### Remove punctuation marks

In [84]:
# Reload the segmented corpus from disk. The file name is derived from `segments`
# so that it always matches the name used by the save cells of this notebook.
# Note: this rebinds `shorten_new_corpus` to the on-disk version.
shorten_new_corpus = pd.read_csv(f"../data/corpora/prose_corpus_{segments}seg.csv")

In [85]:
# Preview the first two rows of the reloaded corpus.
shorten_new_corpus.head(2)

Unnamed: 0,filename,author,title,year,textlength,text
0,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_1,Jakob Christoph Heer,Der Wetterwart_1,1905,10000,1925 I Die feierliche Abendhelle steht über de...
1,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_2,Jakob Christoph Heer,Der Wetterwart_2,1905,10000,"spüre es , die Ingenieurkunst wäre mein innigs..."


In [86]:
# Strip punctuation: keep only runs of word characters (\w+) and re-join them with
# single spaces. Work on a copy so that `shorten_new_corpus` keeps its original text
# (a plain assignment would only alias the same DataFrame and mutate it as well).
# NOTE(review): `textlength` is left untouched although removing punctuation tokens
# presumably changes the text length - confirm whether it should be recomputed.
tokenizer = RegexpTokenizer(r'\w+')
snc2 = shorten_new_corpus.copy()
snc2["text"] = snc2["text"].apply(lambda text: " ".join(tokenizer.tokenize(text)))

In [88]:
# Preview the first two rows after punctuation removal.
snc2.head(2)

Unnamed: 0,filename,author,title,year,textlength,text
0,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_1,Jakob Christoph Heer,Der Wetterwart_1,1905,10000,1925 I Die feierliche Abendhelle steht über de...
1,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905)_2,Jakob Christoph Heer,Der Wetterwart_2,1905,10000,spüre es die Ingenieurkunst wäre mein innigste...


### Remove words unique to a single author (there are none)

#### Remove all columns except 'author' and 'text' and concatenate the texts of each author

In [89]:
# Drop the metadata columns in one call; `snc2` itself is left unchanged.
tmps = snc2.drop(columns=["filename", "title", "year", "textlength"])
tmps.head(2)

Unnamed: 0,author,text
0,Jakob Christoph Heer,1925 I Die feierliche Abendhelle steht über de...
1,Jakob Christoph Heer,spüre es die Ingenieurkunst wäre mein innigste...


In [90]:
# Collect the segment texts of each author in row order ...
summarized_dict = defaultdict(list)
for author, text in zip(tmps["author"], tmps["text"]):
    summarized_dict[author].append(text)

# ... and join them into one long, space-separated text per author.
sum_dict = {author: " ".join(texts) for author, texts in summarized_dict.items()}

# One row per author, in order of first appearance in `tmps`.
sum_corpus = pd.DataFrame(list(sum_dict.items()), columns=["author", "text"])

In [93]:
# Document-term matrix over the per-author texts, built by the project helper with
# binary=True (the displayed cells are 0/1, one row per author).
# NOTE(review): max_features=5000 presumably caps the vocabulary (e.g. to the most
# frequent terms); if so, the checks on this matrix only cover those terms - confirm.
dtm, vector = document_term_matrix(sum_corpus,
                                   "bow",
                                   "author",
                                   max_features=5000,
                                   binary=True)

In [94]:
# Preview the first two rows of the binary matrix.
dtm.head(2)

Unnamed: 0,10,ab,abend,abendessen,abends,abenteuer,aber,abermals,abgeben,abgelegt,...,überzeugen,überzeugt,überzeugung,übrig,übrige,übrigen,übrigens,übte,üppig,üppige
Jakob Christoph Heer,1,1,1,1,1,1,1,0,0,1,...,0,1,1,1,1,1,1,1,1,1
Wilhelm Heinrich Riehl,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [95]:
# A term whose binary column sums to 1 is present for exactly one author.
# Print the term itself so that a hit can be identified; printing the sum would
# only ever show "1".
for col in dtm.columns:
    sumcol = dtm[col].sum()
    if sumcol == 1:
        print(col)

In this case, there are no words that occur in the texts of only one author (among the terms of the document-term matrix, which was built with `max_features=5000`).

### Remove words with counts of 5 or less (there are none)

In [96]:
# Same helper call as for the binary matrix, but with binary=False; the displayed
# cells hold per-author integer values instead of 0/1.
# `vector` is rebound here and replaces the object returned by the binary run.
dtm2, vector = document_term_matrix(
    sum_corpus, "bow", "author", max_features=5000, binary=False
)

In [97]:
# Preview the first two rows of the non-binary matrix.
dtm2.head(2)

Unnamed: 0,ab,abend,abends,abenteuer,aber,abermals,abgesehen,abgrund,abraham,abreise,...,überrascht,überraschung,übers,überzeugen,überzeugt,überzeugung,übrig,übrige,übrigen,übrigens
Jakob Christoph Heer,66,106,12,3,1157,0,4,4,0,5,...,17,14,2,0,9,2,5,3,20,10
Wilhelm Heinrich Riehl,59,91,29,7,1131,15,1,6,6,3,...,10,8,3,2,1,3,8,7,13,35


In [98]:
# Report every term whose column total over all authors is 5 or less.
# Print the term together with its total so that a hit can be identified.
for col in dtm2.columns:
    sumcol = dtm2[col].sum()
    if sumcol <= 5:
        print(col, sumcol)

There are no words that occur fewer than 6 times.

#### Saving the segmented prose corpus

In [102]:
# Disabled save of the punctuation-free corpus (`snc2`).
# NOTE(review): this targets the same path as the earlier disabled save of
# `shorten_new_corpus`, i.e. the file that the preprocessing section reads back;
# enabling it overwrites the unprocessed segmented corpus - consider a distinct file name.
#snc2.to_csv(f"../data/corpora/prose_corpus_{segments}seg.csv", index=False)