# Speeches corpus segmentation

In [23]:
from nltk import word_tokenize
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Union
from utils import build_chunks, cut_author_from_text, document_term_matrix, split_texts_into_segments

In [15]:
# Load the speeches corpus. The path is relative, so it resolves against the
# kernel's working directory.
corpus = pd.read_csv("../data/corpora/speeches_corpus.csv")

In [16]:
corpus.shape

(130, 3)

## Removing punctuation marks

In [17]:
from nltk.tokenize import RegexpTokenizer

# Strip punctuation on a copy: a plain `tmp_corpus = corpus` would only alias
# the frame, so writing the "text" column would also overwrite the texts in
# `corpus`, which later cells still read.
tmp_corpus = corpus.copy()
tokenizer = RegexpTokenizer(r'\w+')
# \w+ keeps runs of word characters; re-joining the tokens with single spaces
# drops everything that stood between them.
tmp_corpus["text"] = tmp_corpus["text"].apply(
    lambda text: " ".join(tokenizer.tokenize(text))
)

In [19]:
tmp_corpus.head(2)

Unnamed: 0,author,text,title
0,Angela Merkel,werte Festversammlung ich bin heute sehr gerne...,Rede von Bundeskanzlerin Angela Merkel anlässl...
1,Angela Merkel,Sehr geehrter Herr Müller lieber Herr Minister...,Rede von Bundeskanzlerin Angela Merkel anlässl...


#### Summarize the corpus temporarily (one concatenated text per author)

In [20]:
# Temporarily work without the title column. `drop` returns a new frame, so
# `tmp_corpus` itself keeps its title column.
tmp_corpus2 = tmp_corpus.drop(columns="title")
tmp_corpus2.head(2)

Unnamed: 0,author,text
0,Angela Merkel,werte Festversammlung ich bin heute sehr gerne...
1,Angela Merkel,Sehr geehrter Herr Müller lieber Herr Minister...


In [33]:
from collections import defaultdict

# Collect every speech text under its author. Dict insertion order keeps the
# authors in order of first appearance.
summarized_dict = defaultdict(list)
for author, text in zip(tmp_corpus2["author"], tmp_corpus2["text"]):
    summarized_dict[author].append(text)

# Concatenate each author's speeches into one space-separated text.
sum_dict = {author: " ".join(texts) for author, texts in summarized_dict.items()}

# One row per author. The items view is materialised as a list so the
# constructor receives a plain sequence of (author, text) pairs.
sum_corpus = pd.DataFrame(list(sum_dict.items()), columns=["author", "text"])

#### Check whether any word occurs only in the speeches of a single author

In [49]:
from utils import document_term_matrix

dtm, vector = document_term_matrix(sum_corpus,
                                   "bow",
                                   "author",
                                   max_features=5000,
                                   binary=True)

# Column totals of the binary matrix. Presumably each entry is a 0/1 presence
# flag per author text (confirm in utils.document_term_matrix), so a total of
# 1 marks a term that appears in exactly one author's text.
author_counts = dtm.sum(axis=0)

# Print the term itself: the total is always 1 here and would not say which
# word was found.
for term in author_counts[author_counts == 1].index:
    print(term)

#### Check whether there are words that occur only 5 times or fewer

In [59]:
dtm2, vector = document_term_matrix(sum_corpus,
                                    "bow",
                                    "author",
                                    max_features=3000,
                                    binary=False)

# Column totals over all rows. With binary=False these are presumably raw
# term frequencies (confirm in utils.document_term_matrix).
term_totals = dtm2.sum(axis=0)

# Report each rare term together with its total; the total alone cannot be
# traced back to a word.
for term, total in term_totals[term_totals <= 5].items():
    print(term, total)

### Hyperparameters
Segment size `n`: 50, 250

In [19]:
# Segment size; passed as `n` to split_texts_into_segments in the next cell.
segments = 50

### Splitting into segments

In [20]:
%%time
# Split every speech in `corpus` into segments of size `segments`.
# NOTE(review): same_len=True presumably keeps only segments of exactly `n`
# units -- confirm in utils.split_texts_into_segments. The recorded output
# shows textlength 50 for every displayed row and punctuation tokens inside
# the segment texts.
new_corpus = split_texts_into_segments(corpus,
                                       corpus_type="speeches",
                                       n=segments,
                                       same_len=True)
new_corpus.head()

CPU times: user 1.41 s, sys: 3.84 ms, total: 1.42 s
Wall time: 1.42 s


Unnamed: 0,title,author,textlength,text
0,Rede von Bundeskanzlerin Angela Merkel anlässl...,Angela Merkel,50,"werte Festversammlung , ich bin heute sehr ger..."
1,Rede von Bundeskanzlerin Angela Merkel anlässl...,Angela Merkel,50,"einem Engagement , das weithin anerkannt ist ...."
2,Rede von Bundeskanzlerin Angela Merkel anlässl...,Angela Merkel,50,", leben diese Werte täglich und treten beharrl..."
3,Rede von Bundeskanzlerin Angela Merkel anlässl...,Angela Merkel,50,"private Wohneigentum dazu beizutragen , die St..."
4,Rede von Bundeskanzlerin Angela Merkel anlässl...,Angela Merkel,50,"für viele Jahre , wenn nicht sogar lebenslängl..."


In [21]:
new_corpus.shape

(5194, 4)

In [22]:
#new_corpus.to_csv(f"../data/corpora/speeches_corpus_{segments}seg.csv", index=False)