# Corpus Building Decisions Documentation

## Dilemmas:
1. Should the corpus unit be the paragraph rather than the whole text? (B1 articles have more than one paragraph, while B2–C1 articles have only one.)
2. Should regular news articles be used as well?

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Work from the parent of the directory the notebook was started in
# (presumably the project root, so that `dwscraper` is importable — confirm).
# The starting directory is recorded the first time this cell runs: a bare
# `os.chdir('..')` climbs one level further on every re-execution, whereas
# this always lands on the same directory.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

from IPython.display import IFrame

import dwscraper

In [3]:
# Replace the package-level LEVELS constant with all six level labels, A1 through C2.
# NOTE(review): how `dwscraper` consumes `consts.LEVELS` is not visible here —
# presumably it controls which levels are scraped or kept; confirm in the package.
dwscraper.consts.LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

In [4]:
# Build the table of scraped pages. The URL-type counts shown further down
# sum to 300, so the first argument appears to be the number of pages.
# NOTE(review): `to_filter=False` presumably disables the package's own
# filtering so every page can be inspected here — confirm in `dwscraper`.
page_df = dwscraper.build_page_df(300, to_filter=False)

## URL type (`av` - Video without text)

In [5]:
# The URL type is the text before the first "-" in the last path segment,
# e.g. "a" for "/de/hitlers-mein-kampf-erscheint-wieder/a-18952131".
url_types = page_df["url"].map(
    lambda url: url.rpartition("/")[2].partition("-")[0]
)

In [6]:
# Count how many pages fall under each URL type.
url_types.value_counts()

a     266
av     21
l      13
Name: url, dtype: int64

In [7]:
# Article types of the pages whose URL type is "av".
page_df.loc[url_types == "av", "artikel"].value_counts()

Top-Thema             6
Video-Thema           4
Wort der Woche        4
Sprachbar             3
Alltagsdeutsch        2
Top-Thema – Audios    2
Name: artikel, dtype: int64

In [8]:
# Keep only the pages whose URL type is not "av".
page_df = page_df[url_types.ne("av")]

## Levels vs. Type of Article (`artikel`)

In [9]:
# Number of pages for every (article type, levels) combination.
leveled_artikel_count = page_df.groupby(["artikel", "levels"]).size()

# Show only the combinations with more than 5 pages, smallest first.
# `size()` yields plain integer counts with no missing values, so the former
# `.dropna(how="all").fillna(0)` steps had nothing to act on and were removed.
leveled_artikel_count[leveled_artikel_count > 5].sort_values()

artikel                          levels  
Top-Thema – Lektionen            (B1,)        8
Alltagsdeutsch – Podcast         (C1,)       13
Wort der Woche                   (B2, C1)    14
                                 (B2,)       19
Video-Thema – Podcast ohne UT    (B2, C1)    30
Nachrichten                      (B2, C1)    38
Sprachbar                        (C1,)       38
Langsam gesprochene Nachrichten  (B2, C1)    46
Top-Thema – Podcast              (B1,)       53
dtype: int64

### TAKEN TO CORPUS

#### Top-Thema – Podcast - B1 [One Text]


In [10]:
# Embed the first "Top-Thema – Podcast" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Top-Thema – Podcast"].iloc[0]
print(page["url"])
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

/de/hitlers-mein-kampf-erscheint-wieder/a-18952131


#### Nachrichten - B2 & C1 - [Multiple Texts]

In [11]:
# Embed the first "Nachrichten" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Nachrichten"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

#### Langsam gesprochene Nachrichten - B2 & C1 [Multiple Texts]

In [12]:
# Embed the first "Langsam gesprochene Nachrichten" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Langsam gesprochene Nachrichten"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

#### Top-Thema – Lektionen - B1 [One Text]

In [13]:
# Embed the first "Top-Thema – Lektionen" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Top-Thema – Lektionen"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

### NOT TAKEN TO CORPUS

#### Video-Thema – Lektionen - B2 & C1 [Dialogue]

In [14]:
# Embed the first "Video-Thema – Lektionen" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Video-Thema – Lektionen"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

#### Sprachbar - C1 & C2 [Not a news article]

In [15]:
# Embed the first "Sprachbar" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Sprachbar"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

#### Alltagsdeutsch – Podcast - C1 & C2 [Dialogue / No Text at all]

In [16]:
# Embed the first "Alltagsdeutsch – Podcast" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Alltagsdeutsch – Podcast"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

#### Video-Thema – Podcast ohne UT - B2 & C1

In [17]:
# Embed the first "Video-Thema – Podcast ohne UT" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Video-Thema – Podcast ohne UT"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)

#### Wort der Woche - B2, B2 & C1 [Not a news article]

In [18]:
# Embed the first "Wort der Woche" page from dw.com for manual inspection.
page = page_df.loc[page_df["artikel"] == "Wort der Woche"].iloc[0]
# To inspect the scraped text instead, run: print(page["content"])
IFrame(src="http://dw.com" + page["url"], width=1000, height=500)