In [15]:
import pandas as pd
import numpy as np_
import json
import re
from matplotlib import pyplot as plt
from tqdm import tqdm
import requests
import pickle

In [2]:
def get_domain(url):
    return re.match('https?://(.*?)(/|$).*', url).groups()[0]


In [3]:
def save_to_csv(frequencies, name):
    frequencies.to_frame().reset_index().rename(
        columns={'index': 'domain', 0: 'frequency'}
    ).to_csv(f'../data/{name}_lor_domains.csv', index_label='id')

In [4]:
def read_from_csv(name):
    pd.read_csv(f'../data/{name}_lor_domains.csv')

### Chemie

In [5]:
with open('../data/elixier_chemie_all.json', 'r')  as f:
    chemie_json = json.load(f)

In [6]:
chemie_domain_frequencies = pd.Series([get_domain(u['url']) for u in chemie_json], name='url').value_counts()

In [7]:
chemie_domain_frequencies.head()

www.lehrer-online.de                    504
media.sodis.de                          142
www.seilnacht.com                        86
daten.didaktikchemie.uni-bayreuth.de     51
www.lingonetz.de                         46
Name: url, dtype: int64

### Physik

In [8]:
physik_df = pd.read_csv('../data/elixier_physics_all.csv')

In [9]:
physik_domain_frequencies = physik_df.url.apply(get_domain).value_counts()

In [10]:
physik_domain_frequencies.head()

www.leifiphysik.de            822
www.lehrer-online.de          636
www.supra-lernplattform.de     92
www.bildung-lsa.de             80
media.sodis.de                 76
Name: url, dtype: int64

### Combined

In [11]:
out = dict()
for i in chemie_domain_frequencies.index:
    out[i] = [chemie_domain_frequencies[i], 'chemie']
    
for i in physik_domain_frequencies.index:
    if i in out:
        out[i] = [out[i][0] + physik_domain_frequencies[i], 'physik+chemie']
    else:
        out[i] = [physik_domain_frequencies[i], 'physik']

counts_domain = pd.DataFrame(out).transpose().rename(columns={0: 'count', 1: 'domain'}).sort_values(by='count', ascending=False)
counts = counts_domain.reset_index().rename(columns={'index': 'url_domain'})

In [12]:
counts.head()

Unnamed: 0,url_domain,count,domain
0,www.lehrer-online.de,1140,physik+chemie
1,www.leifiphysik.de,822,physik
2,media.sodis.de,218,physik+chemie
3,www.planet-schule.de,116,physik+chemie
4,www.supra-lernplattform.de,106,physik+chemie


In [13]:
domain_urls = ['https://' + d for d in counts.url_domain]

In [17]:
with open('emails', 'rb') as f:
    emails = pickle.load(f)

In [19]:
dict(zip(domain_urls, emails))

{'https://www.lehrer-online.de': None,
 'https://www.leifiphysik.de': None,
 'https://media.sodis.de': None,
 'https://www.planet-schule.de': None,
 'https://www.supra-lernplattform.de': None,
 'https://www.lingonetz.de': None,
 'https://www.seilnacht.com': None,
 'https://de.wikipedia.org': None,
 'https://www.bildung-lsa.de': None,
 'https://www.zum.de': None,
 'https://www.abi-physik.de': None,
 'https://static.bildung-rp.de': None,
 'https://daten.didaktikchemie.uni-bayreuth.de': None,
 'https://www.komm-mach-mint.de': None,
 'https://lernarchiv.bildung.hessen.de': None,
 'https://www.lncu.de': None,
 'https://www.chemieunterricht.de': None,
 'https://www.heise.de': None,
 'https://www.zauberhafte-physik.net': None,
 'https://www.wissenschaftsjahr.de': None,
 'https://digitallearninglab.de': None,
 'https://mp3.bildung.hessen.de': None,
 'https://www.dlr.de': None,
 'https://primas.ph-freiburg.de': None,
 'https://www.science-on-stage.de': None,
 'https://www.schule-bw.de': None,
 

In [28]:
def get_email(domain):
    try:
        response = requests.get(domain, timeout=3)
        return set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
    except:
        return None

In [None]:
emails = [get_email(d) for d in tqdm(domain_urls)]
with open('emails', 'wb') as f:
    pickle.dump(emails, f)



  0%|                                                                                                               | 0/584 [00:00<?, ?it/s][A[A

  0%|▏                                                                                                      | 1/584 [00:05<50:47,  5.23s/it][A[A

  0%|▎                                                                                                      | 2/584 [00:05<21:54,  2.26s/it][A[A

  1%|▌                                                                                                      | 3/584 [00:05<12:41,  1.31s/it][A[A

  1%|▋                                                                                                      | 4/584 [00:06<10:33,  1.09s/it][A[A

  1%|▉                                                                                                      | 5/584 [00:06<08:33,  1.13it/s][A[A

  1%|█                                                                                                      | 

In [27]:
emails

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

### Scrape Emails

In [63]:
requests.exceptions.TooManyRedirects

requests.exceptions.TooManyRedirects

In [68]:


from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
import re


def get_emails(domain):
    # a queue of urls to be crawled
    new_urls = deque([f'https://{domain}'])

    # a set of urls that we have already crawled
    processed_urls = set()

    # a set of crawled emails
    emails = set()
    
    request_count = 0

    # process urls one by one until we exhaust the queue
    while len(new_urls):
        # move next url from the queue to the set of processed urls
        url = new_urls.popleft()
        processed_urls.add(url)

        # extract base url to resolve relative links
        parts = urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        path = url[:url.rfind('/')+1] if '/' in parts.path else url

        # get url's content
#         print("Processing %s" % url)
        try:
            request_count += 1
            response = requests.get(url)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects):
            # ignore pages with errors
            continue

        # extract all email addresses and add them into the resulting set
        new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        emails.update(new_emails)
        if len(new_emails) > 5:
            break
            
        if request_count > 10:
            break

        # create a beutiful soup for the html document
        soup = BeautifulSoup(response.text)

        # find and process all the anchors in the document
        link = None
        for anchor in soup.find_all("a"):
            # extract link url from the anchor
            link = anchor.attrs["href"] if "href" in anchor.attrs else ''
        if link is None:
            continue
        # resolve relative links
        if link.startswith('/'):
            link = base_url + link
        elif not link.startswith('http'):
            link = path + link
        # add the new url to the queue if it was not enqueued nor processed yet
        if not link in new_urls and not link in processed_urls:
            new_urls.append(link)

#     print(f'Found {emails}')
    return emails


In [69]:
emails_out = dict()
for d in tqdm(counts.url_domain):
    print(d)
    emails_out[d] = get_emails(d)

  0%|                                                   | 0/584 [00:00<?, ?it/s]

www.lehrer-online.de


  0%|                                           | 1/584 [00:00<08:25,  1.15it/s]

www.leifiphysik.de


  1%|▏                                          | 3/584 [00:02<05:41,  1.70it/s]

media.sodis.de
www.planet-schule.de


  1%|▎                                          | 4/584 [00:03<07:27,  1.30it/s]

www.supra-lernplattform.de


  1%|▍                                          | 6/584 [00:04<06:15,  1.54it/s]

www.lingonetz.de
www.seilnacht.com


  1%|▌                                          | 7/584 [00:13<33:25,  3.47s/it]

de.wikipedia.org


  1%|▌                                          | 8/584 [00:14<24:00,  2.50s/it]

www.bildung-lsa.de


  2%|▋                                          | 9/584 [00:14<17:37,  1.84s/it]

www.zum.de


  2%|▋                                         | 10/584 [00:15<14:23,  1.50s/it]

www.abi-physik.de


  2%|▊                                         | 12/584 [00:15<08:05,  1.18it/s]

static.bildung-rp.de
daten.didaktikchemie.uni-bayreuth.de
www.komm-mach-mint.de


  2%|█                                         | 14/584 [00:18<09:45,  1.03s/it]

lernarchiv.bildung.hessen.de


  3%|█                                         | 15/584 [00:19<10:38,  1.12s/it]

www.lncu.de


  3%|█▏                                        | 16/584 [00:29<32:16,  3.41s/it]

www.chemieunterricht.de


  3%|█▏                                        | 17/584 [00:29<24:08,  2.55s/it]

www.heise.de


  3%|█▎                                        | 18/584 [00:33<26:41,  2.83s/it]

www.zauberhafte-physik.net
www.wissenschaftsjahr.de


  3%|█▍                                        | 20/584 [00:35<19:48,  2.11s/it]

digitallearninglab.de


  4%|█▌                                        | 21/584 [00:37<19:31,  2.08s/it]

mp3.bildung.hessen.de


  4%|█▌                                        | 22/584 [00:38<15:23,  1.64s/it]

www.dlr.de


  4%|█▋                                        | 23/584 [00:46<33:20,  3.57s/it]

primas.ph-freiburg.de


  4%|█▋                                        | 24/584 [00:55<46:33,  4.99s/it]

www.science-on-stage.de


  4%|█▊                                        | 25/584 [00:57<37:20,  4.01s/it]

www.schule-bw.de


  4%|█▊                                        | 26/584 [00:57<28:30,  3.07s/it]

www.esa.int


  5%|█▉                                        | 27/584 [01:00<26:59,  2.91s/it]

educ.ethz.ch


  5%|██                                        | 28/584 [01:00<19:51,  2.14s/it]

www.chemie-master.de


  5%|██                                        | 29/584 [01:09<37:50,  4.09s/it]

www.wissenschaft-schulen.de
www.forscher-online.de


  5%|██▏                                       | 31/584 [01:09<21:33,  2.34s/it]

www.tempolimit-lichtgeschwindigkeit.de


  5%|██▎                                       | 32/584 [01:10<17:21,  1.89s/it]

www.bildungsserver.de


  6%|██▎                                     | 33/584 [01:31<1:03:03,  6.87s/it]

www.ltam.lu


  6%|██▍                                       | 34/584 [01:32<48:58,  5.34s/it]

online-media.uni-marburg.de


  6%|██▌                                       | 35/584 [01:41<58:07,  6.35s/it]

blogs.helmholtz.de


  6%|██▌                                       | 36/584 [01:42<42:56,  4.70s/it]

physikaufgaben.de


  6%|██▋                                       | 37/584 [01:50<53:20,  5.85s/it]

physikforkids.de


  7%|██▋                                       | 38/584 [01:52<43:12,  4.75s/it]

mint-zirkel.de


  7%|██▉                                       | 40/584 [01:53<22:50,  2.52s/it]

www.bautschweb.de
www.umwelt-im-unterricht.de


  7%|██▉                                       | 41/584 [01:54<17:46,  1.96s/it]

sodis.de


  7%|██▉                                     | 42/584 [04:04<6:01:41, 40.04s/it]

www.unterrichtsmaterial-schule.de


  7%|██▉                                     | 43/584 [04:04<4:14:29, 28.22s/it]

www.dguv-lug.de


  8%|███                                     | 44/584 [04:05<2:59:47, 19.98s/it]

wiki.zum.de
www.meine-forscherwelt.de


  8%|███▏                                    | 46/584 [04:05<1:37:16, 10.85s/it]

medienportal.siemens-stiftung.org


  8%|███▏                                    | 47/584 [04:05<1:14:01,  8.27s/it]

www.mint-ec.de


  8%|███▍                                      | 48/584 [04:07<59:03,  6.61s/it]

www.schulentwicklung.nrw.de


  8%|███▌                                      | 49/584 [04:08<44:45,  5.02s/it]

melt.fwu.de
www.youtube.com


  9%|███▋                                      | 51/584 [04:10<27:54,  3.14s/it]

chids.online.uni-marburg.de


  9%|███▋                                      | 52/584 [04:18<39:56,  4.50s/it]

www.physikfuerkids.de


  9%|███▉                                      | 54/584 [04:21<25:32,  2.89s/it]

www.seilnacht.tuttlingen.com
www.walter-fendt.de


  9%|███▉                                      | 55/584 [04:30<40:12,  4.56s/it]

physik.uibk.ac.at


 10%|███▊                                    | 56/584 [06:39<5:50:42, 39.85s/it]

www.chempage.de


 10%|███▉                                    | 57/584 [06:48<4:32:05, 30.98s/it]

www.chemgapedia.de


 10%|███▉                                    | 58/584 [06:49<3:15:15, 22.27s/it]

www.physik-am-auto.de


 10%|████                                    | 59/584 [06:58<2:40:40, 18.36s/it]

mediaserve.kompetenzz.net


 10%|████                                    | 60/584 [06:58<1:53:42, 13.02s/it]

www.roentgenmuseum.de


 10%|████▏                                   | 61/584 [07:26<2:30:39, 17.28s/it]

www.ubz-stmk.at


 11%|████▏                                   | 62/584 [07:29<1:54:25, 13.15s/it]

virtuelle-experimente.de


 11%|████▍                                   | 64/584 [07:39<1:13:12,  8.45s/it]

wikis.zum.de
www.compass-project.eu


 11%|████▋                                     | 65/584 [07:39<51:29,  5.95s/it]

www.physik-schule.de


 11%|████▋                                     | 66/584 [07:40<39:32,  4.58s/it]

www.poleninderschule.de


 11%|████▊                                     | 67/584 [07:42<32:24,  3.76s/it]

www.u-helmich.de


 12%|████▉                                     | 68/584 [07:52<48:05,  5.59s/it]

lehrerfortbildung-bw.de


 12%|████▉                                     | 69/584 [07:54<37:58,  4.42s/it]

esero.de


 12%|█████                                     | 70/584 [07:54<27:16,  3.18s/it]

www.xplora.org
www.geomar.de


 12%|█████▏                                    | 72/584 [07:55<16:27,  1.93s/it]

www.didaktik.physik.uni-muenchen.de


 12%|█████▎                                    | 73/584 [08:04<31:03,  3.65s/it]

www.bildungspartner.schulministerium.nrw.de


 13%|█████▎                                    | 74/584 [08:05<25:39,  3.02s/it]

www.energie-macht-schule.de


 13%|█████▍                                    | 75/584 [08:06<21:06,  2.49s/it]

www.educ.ethz.ch


 13%|█████▍                                    | 76/584 [08:07<16:58,  2.01s/it]

www.me-vermitteln.de


 13%|█████▌                                    | 77/584 [08:09<18:33,  2.20s/it]

www.max-wissen.de


 14%|█████▋                                    | 79/584 [08:10<10:49,  1.29s/it]

www.kindernetz.de
www.fwu-mediathek.de


 14%|█████▊                                    | 80/584 [08:11<08:52,  1.06s/it]

www.mybookmachine-online.de


 14%|█████▌                                  | 81/584 [10:20<5:28:23, 39.17s/it]

www.zeitbild.de


 14%|█████▌                                  | 82/584 [10:21<3:52:00, 27.73s/it]

platform.govie.de


 14%|█████▋                                  | 83/584 [10:21<2:43:04, 19.53s/it]

www.genius-community.com


 14%|█████▊                                  | 84/584 [10:22<1:56:10, 13.94s/it]

www.jugendtechnikschule.de


 15%|█████▊                                  | 85/584 [10:26<1:30:11, 10.84s/it]

wissenschaftsjahr-2020.visionkino.de


 15%|█████▉                                  | 86/584 [10:27<1:05:33,  7.90s/it]

idw-online.de


 15%|██████▎                                   | 87/584 [10:27<47:30,  5.74s/it]

www.labbe.de


 15%|██████▎                                   | 88/584 [10:28<35:02,  4.24s/it]

space2school.de


 15%|██████▍                                   | 90/584 [10:38<34:26,  4.18s/it]

www.chemiekiste.de
www.helmholtz.de


 16%|██████▌                                   | 91/584 [10:39<25:11,  3.07s/it]

www.fokus-biologische-vielfalt.de


 16%|██████▌                                   | 92/584 [10:40<20:06,  2.45s/it]

material.kompetenzz.net


 16%|██████▋                                   | 93/584 [10:40<15:33,  1.90s/it]

www.fwu-mediathek.com


 16%|██████▊                                   | 95/584 [10:41<08:53,  1.09s/it]

www.chemieplanet.de
www.chemiedidaktik.uni-wuppertal.de


 17%|██████▉                                   | 97/584 [10:42<06:29,  1.25it/s]

www.chemikus.de
www.medienwerkstatt-online.de


 17%|███████                                   | 99/584 [10:51<18:45,  2.32s/it]

www.bimsev.de
physikunterricht-online.de


 17%|███████                                   | 99/584 [10:52<53:14,  6.59s/it]


InvalidURL: Failed to parse: https://physikunterricht-online.dejavascript:void(0);