In [1]:
!pwd

/home/rsebastian/code/elixier-scraper/notebook


In [7]:
import pandas as pd
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import numpy as np
import pickle

In [32]:
oer = pd.read_csv('../data/elixier_physics_all.csv', index_col=0)

In [33]:
bildungsebene = [be.lower() for be in [
    "Sekundarstufe I", # (1645)
    "Sekundarstufe II", # (1118)
    "Primarstufe", # (426)
    "Hochschule", # (85)
    "Berufliche Bildung", # (68)
    "Elementarbildung", # (28)
    "Fort- und Weiterbildung" # (27)
]]

In [34]:
bildungsebene_clean = []
for val in oer.bildungsebene:
    if type(val) == str:
        str_out = []
        val_lower = val.lower()
        for be in bildungsebene:
            if be in val_lower:
                str_out.append(be)
        bildungsebene_clean.append(str_out)
    else:
        bildungsebene_clean.append([])
oer['bildungsebene_clean'] = bildungsebene_clean

In [35]:
oer.bildungsebene_clean.apply(lambda x: ' '.join(x) if type(x) == list else '').value_counts()

sekundarstufe i sekundarstufe ii                                                          962
                                                                                          818
sekundarstufe i                                                                           584
sekundarstufe i primarstufe                                                               268
primarstufe                                                                                70
sekundarstufe i sekundarstufe ii primarstufe                                               58
sekundarstufe i sekundarstufe ii hochschule                                                38
sekundarstufe i berufliche bildung                                                         24
sekundarstufe i sekundarstufe ii hochschule berufliche bildung fort- und weiterbildung     20
sekundarstufe i primarstufe elementarbildung                                               16
sekundarstufe i sekundarstufe ii berufliche bildung         

In [8]:
def get_ignore(url):
    try:
        return requests.get(url)
    except Exception as e:
        return e

In [4]:
responses = [get_ignore(u) for u in tqdm(oer.url)]

  utils.DeprecatedIn35,
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2899/2899 [59:07<00:00,  1.22s/it]


In [8]:
# with open('../data/responses', 'wb') as f:
#     pickle.dump(responses, f)
with open('../data/responses', 'rb') as f:
    responses = pickle.load(f)

In [17]:
success_mask = pd.Series(responses).apply(lambda r: (not isinstance(r, Exception)) and (r.status_code == 200))

In [31]:
oer = oer[success_mask]

TypeError: tuple indices must be integers or slices, not Series

In [30]:
oer.columns

AttributeError: 'tuple' object has no attribute 'columns'

In [26]:
descriptions = oer['lange beschreibung']
existing_descriptions = descriptions[descriptions.notna()]
existing_descriptions.shape[0]

365

In [12]:
rs = rs[success_mask]

In [13]:
content_type = rs.apply(lambda r: r.headers['Content-Type'].lower().split(';')[0])
content_type.value_counts().sort_values(ascending=False)

text/html                                                                  2559
application/pdf                                                             160
audio/mpeg                                                                   29
application/vnd.openxmlformats-officedocument.wordprocessingml.document      14
application/zip                                                              13
application/octet-stream                                                      3
application/x-shockwave-flash                                                 2
image/jpeg                                                                    2
application/x-msdos-program                                                   1
image/gif                                                                     1
dtype: int64

In [20]:
html_responses = rs[content_type.str.contains('text/html')]

In [45]:
def title(soup):
    maybe_title = soup.find('title')
    if maybe_title is None:
        return ""
    else:
        return maybe_title.text

In [36]:
def content_paras(soup):
    return [p.text for p in soup.find_all('p') if len(p.text.strip()) > 200]

In [37]:
def multidict(kvs):
    out = dict()
    for k, v in kvs:
        if k in out:
            out[k].append(v)
        else:
            out[k] = [v]
    return out

def headings(soup):
    return multidict([(t.name, t.text.strip()) for i in range(6) for t in soup.find_all(f'h{i}')])

In [70]:
def content(response):
    url = response.url
    domain = url.split('/')[2]
    soup = BeautifulSoup(response.content)
    return {'paras': content_paras(soup), 
            'headings': headings(soup), 
            'title': title(soup), 
            'url': url,
            'domain': domain}

In [53]:
def content_counts(cont):
    out = {
        'num_paras': len(cont['paras']),
        'tatal_para_words': sum([len(p.split(' ')) for p in cont['paras']]),
        'title_size': len(cont['title']),
        'url': cont['url']
    }
    for h_type in cont['headings']:
        out[h_type + '_size'] = len(cont['headings'][h_type])
    return out

In [72]:
contents = html_responses.apply(content)

In [73]:
with open('../data/cache/parsed_contents.pkl', 'wb') as f:
    pickle.dump(contents, f)

In [58]:
content_estimates = pd.DataFrame([content_counts(c) for c in contents])

In [67]:

content_estimates.groupby('url').aggregate({
    'num_paras': ['min', 'max', 'mean'],
    'avg_para_size_words': ['min', 'max', 'mean'],
    'h1_size': ['min', 'mean'],
    'h2_size': ['min', 'mean'],
    'h3_size': ['min', 'mean'],
    'h4_size': ['min', 'mean'],
    'h5_size': ['min', 'mean'],
    'url': ['count']
}).sort_values(by=('url', 'count'), ascending=False).iloc[:50]

Unnamed: 0_level_0,num_paras,num_paras,num_paras,avg_para_size_words,avg_para_size_words,avg_para_size_words,h1_size,h1_size,h2_size,h2_size,h3_size,h3_size,h4_size,h4_size,h5_size,h5_size,url
Unnamed: 0_level_1,min,max,mean,min,max,mean,min,mean,min,mean,min,mean,min,mean,min,mean,count
url,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
www.leifiphysik.de,0,46,6.048662,0.0,370.0,49.94046,1.0,1.001217,4.0,4.989051,1.0,1.879459,2.0,4.599757,1.0,3.384298,822
www.lehrer-online.de,0,25,3.968153,0.0,357.8125,53.075625,1.0,1.0,1.0,3.983842,3.0,8.958333,4.0,9.278846,,,628
www.supra-lernplattform.de,0,25,4.402174,0.0,150.08,55.54921,1.0,1.0,1.0,1.923913,2.0,2.543478,4.0,4.0,2.0,4.054054,92
www.bildung-lsa.de,0,1,0.825,0.0,27.0,22.275,,,,,,,,,,,80
www.planet-schule.de,0,15,2.824324,0.0,153.5,72.052266,1.0,3.507042,1.0,2.857143,1.0,3.017241,10.0,10.0,2.0,3.358491,74
de.wikipedia.org,0,228,30.145455,0.0,162.25,78.601883,1.0,1.0,1.0,9.872727,1.0,7.636364,1.0,6.375,,,55
www.abi-physik.de,0,11,4.226415,0.0,319.0,47.764368,3.0,3.0,1.0,3.660377,1.0,3.471698,,,,,53
www.zum.de,0,2,0.090909,0.0,204.0,9.409091,1.0,1.95,4.0,4.05,,,12.0,12.0,,,33
www.heise.de,3,31,12.807692,57.375,102.166667,74.022156,1.0,1.307692,2.0,2.076923,1.0,3.68,,,,,26
www.komm-mach-mint.de,1,5,2.038462,38.4,91.5,86.419231,1.0,1.230769,2.0,2.0,,,2.0,2.0,,,26
