In [19]:
import base64
import json
import os
import re
import urllib.parse

import pandas as pd
import requests

In [20]:
# Display every column and every row of a DataFrame (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [57]:
# Search GitHub for repositories that mention Goethe University Frankfurt.
# The individual phrases are combined with OR into one search expression.
suche = ['"university frankfurt"', '"universität frankfurt"', '"goethe universität"', '"goethe university"']
searchmode = ' OR '.join(suche)
query = urllib.parse.quote(searchmode)  # URL-encode the search expression

per_page = 100   # largest page size the search endpoint accepts
max_pages = 10   # the search endpoint serves at most 1000 results per query

all_entrys = []

# Walk through the result pages until a page comes back short (= last page)
# instead of assuming that two pages are always enough.
for page in range(1, max_pages + 1):
    url = f"https://api.github.com/search/repositories?q={query}&per_page={per_page}&page={page}"
    result = requests.get(url, timeout=30)  # never hang forever on a stalled connection
    # Fail loudly on rate limiting / invalid queries; otherwise an error payload
    # without "items" would silently yield an empty result list.
    result.raise_for_status()
    data = result.json()
    items = data.get('items', [])
    all_entrys.extend(items)
    if len(items) < per_page:
        break

print(f"Gesamtanzahl gefundener Repositories: {len(all_entrys)}")

Gesamtanzahl gefundener Repositories: 112


In [58]:
# Reduce every search hit to the fields that are kept, stored under dc.*-style keys.
liste = [
    {
        "dc.title": repo["name"],
        "dc.contributor.author": repo["owner"],
        "dc.identifier.uri": repo["html_url"],
        "dc.description.abstract": repo["description"],
        "dc.relation": repo["topics"],
        "dc.programming.language": repo["language"],
        # "license" can be empty; in that case no licence name is recorded
        "dc.licence": repo["license"]["name"] if repo["license"] else None,
        "dc.date.issued": repo["updated_at"],
        "full_name": repo["full_name"]
    }
    for repo in all_entrys
]

df_git = pd.DataFrame(liste)

In [59]:
# Add columns that hold the same value for every row: the record type and the data source
for column, constant in {"dc.type": "Software", "monitoring.source": "GitHub"}.items():
    df_git[column] = constant

In [60]:
# Convert dc.relation into a plain string
def _relation_to_text(value):
    """Join a list with ", ", keep any other non-missing value, turn missing values into ""."""
    if isinstance(value, list):
        return ", ".join(value)
    if pd.notna(value):
        return value
    return ""

df_git["dc.relation"] = df_git["dc.relation"].apply(_relation_to_text)

In [61]:
# Extract the year from dc.date.issued
def extract_year(value):
    """Return the calendar year of a date-like value.

    Parameters
    ----------
    value : str, datetime-like, int or None
        A timestamp such as "2025-05-07T16:04:00Z", or a value that already
        is a year.

    Returns
    -------
    The year as an integer when ``value`` can be parsed as a date.
    Integers and ``None`` are returned unchanged, as are values that cannot
    be parsed.
    """
    # An integer is already a year. pd.to_datetime would read it as
    # nanoseconds since 1970-01-01, so re-running the cell on an already
    # converted column would turn every year into 1970.
    if value is None or pd.api.types.is_integer(value):
        return value
    try:
        date = pd.to_datetime(value)  # parse into a pandas timestamp
    except (ValueError, TypeError):
        return value  # not parseable: keep the old value
    return date.year  # only the year

# Replace every timestamp in dc.date.issued by the value extract_year returns for it
df_git["dc.date.issued"] = df_git["dc.date.issued"].map(extract_year)

In [62]:
# Reduce the owner record to the profile (login) name.
# The field is read by its key instead of taking "the first value of the dict",
# so the result does not depend on the key order of the API response; values
# that are not dicts (e.g. already converted on a re-run) are kept as they are.
df_git["dc.contributor.author"] = df_git["dc.contributor.author"].apply(
    lambda owner: owner.get("login") if isinstance(owner, dict) else owner)

In [63]:
# Keep only rows whose licence name contains one of the accepted licence families;
# rows without a licence are dropped (na=False). The result gets a fresh 0..n-1 index.
accepted_licences = 'GNU|MIT|BSD|Apache'
has_accepted_licence = df_git["dc.licence"].str.contains(accepted_licences, na=False)
df_git_sw = df_git.loc[has_accepted_licence].reset_index(drop=True)

In [64]:
# Überschrift aus README rausziehen

In [65]:
import base64   #codiert binärdateien zu text (ich brauche ja was aus der readme-datei)
import re   # python modul für reguläre ausdrücke (zb Überschriften)

In [68]:
# A GitHub token is needed to run a larger number of API requests.
# It is read from the environment variable GITHUB_TOKEN so that the secret
# never has to be written into the notebook.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

headers = {
    "Accept": "application/vnd.github.v3+json",
}
# Only send credentials when a token is actually configured; a placeholder
# value in the Authorization header makes the API reject every request.
if GITHUB_TOKEN:
    headers["Authorization"] = f"token {GITHUB_TOKEN}"

def get_readme_heading(owner, repo):
    """Return the first Markdown H1 heading of a repository's README.

    Parameters
    ----------
    owner : str
        Account that owns the repository.
    repo : str
        Repository name.

    Returns
    -------
    str
        The heading text, "keine Überschrift gefunden" when the README has no
        line of the form "# ...", or "kein README (<status>)" when the request
        did not return HTTP 200.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/readme"  # README endpoint of the repository

    r = requests.get(url, headers=headers, timeout=30)  # timeout: never hang forever on a stalled connection

    if r.status_code != 200:  # anything but 200 means the README could not be fetched
        return f"kein README ({r.status_code})"

    data = r.json()
    content_encoded = data.get("content", "")  # base64-encoded file content
    content = base64.b64decode(content_encoded).decode("utf-8", errors="ignore")  # back to readable text

    # First line that starts with "# " (Markdown H1). The separator must stay on
    # the same line: plain \s+ would also match a line break, so a bare "#" line
    # would capture the following line as the heading.
    match = re.search(r"^#[^\S\r\n]+(.+)", content, re.MULTILINE)
    if match:
        return match.group(1).strip()  # the text after the "#"
    return "keine Überschrift gefunden"


# Apply the function defined above to df_git_sw; it needs owner and repo,
# both of which are contained in full_name ("owner/repo").
def _heading_for(full_name):
    """Split full_name at "/" and look up the README heading for it."""
    return get_readme_heading(*full_name.split("/"))

df_git_sw["readme_überschrift"] = df_git_sw["full_name"].apply(_heading_for)

In [69]:
# Copy the README heading into dc.title when it is usable.
# Every "kein README (<status>)" placeholder is skipped, not only the 404 one:
# a rate-limit or authentication failure (403/401) must not end up as a title.
readme_heading = df_git_sw["readme_überschrift"]
has_usable_heading = ~(
    readme_heading.eq("keine Überschrift gefunden")
    | readme_heading.str.startswith("kein README (", na=False)
)
df_git_sw.loc[has_usable_heading, "dc.title"] = readme_heading

Unnamed: 0,dc.title,dc.contributor.author,dc.identifier.uri,dc.description.abstract,dc.relation,dc.programming.language,dc.licence,dc.date.issued,full_name,dc.type,monitoring.source,readme_überschrift
0,Real-time Object Detection for Autonomous Driv...,alen-smajic,https://github.com/alen-smajic/Real-time-Objec...,My Computer Vision project from my Computer Vi...,"bdd100k, berkeley-deep-drive, cnn, computer-sc...",Jupyter Notebook,MIT License,2025,alen-smajic/Real-time-Object-Detection-for-Aut...,Software,GitHub,Real-time Object Detection for Autonomous Driv...
1,Towards Explainable AI Systems for Traffic Sig...,alen-smajic,https://github.com/alen-smajic/Towards-Explain...,This project is part of the CS course 'Systems...,"autonomous-driving, blender, classification, c...",C#,MIT License,2025,alen-smajic/Towards-Explainable-AI-System-for-...,Software,GitHub,Towards Explainable AI Systems for Traffic Sig...
2,EPI Zusammenfassung,Chris022,https://github.com/Chris022/epi-zusammenfassung,Meine Zusammenfassung des Moduls: EPI für die ...,,,GNU General Public License v3.0,2025,Chris022/epi-zusammenfassung,Software,GitHub,EPI Zusammenfassung
3,MSc05 template repository,PeerHerholz,https://github.com/PeerHerholz/MSc05_template_...,A template repository for projects conducted a...,,,"BSD 3-Clause ""New"" or ""Revised"" License",2022,PeerHerholz/MSc05_template_repository,Software,GitHub,MSc05 template repository
4,"Subway Station Hazard Detection, Goethe Univer...",Psarpei,https://github.com/Psarpei/Subway-Station-Haza...,This project is part of the CS course 'Systems...,"blender, computer-vision, csharp, deep-learnin...",Jupyter Notebook,MIT License,2025,Psarpei/Subway-Station-Hazard-Detection,Software,GitHub,"Subway Station Hazard Detection, Goethe Univer..."
5,Introduction to High-Performance Computing in ...,bsotomayorg,https://github.com/bsotomayorg/Intro_HPC_Python,"Extended material of the summer course ""Introd...","high-performance-computing, matplotlib-exercis...",Jupyter Notebook,MIT License,2025,bsotomayorg/Intro_HPC_Python,Software,GitHub,Introduction to High-Performance Computing in ...
6,ACoLi Dicts,acoli-repo,https://github.com/acoli-repo/acoli-dicts,3000+ machine-readable open source dictionarie...,"dataset, dictionary, open-source, rdf, transla...",Shell,Apache License 2.0,2025,acoli-repo/acoli-dicts,Software,GitHub,ACoLi Dicts
7,Real-time detection of student engagement usin...,fd-jian,https://github.com/fd-jian/master-thesis,Master thesis written in 2021 for MSc Wirtscha...,,,GNU General Public License v3.0,2021,fd-jian/master-thesis,Software,GitHub,Real-time detection of student engagement usin...
8,tcs.uni-frankfurt.de,goethe-tcs,https://github.com/goethe-tcs/tcs.uni-frankfur...,Website of the Theoretical Computer Science Gr...,,HTML,GNU General Public License v3.0,2025,goethe-tcs/tcs.uni-frankfurt.de,Software,GitHub,tcs.uni-frankfurt.de
9,DBMS Project 2-B,TheBv,https://github.com/TheBv/DBMS-2A,Repository for the DBMS Course at the Goethe U...,,TypeScript,GNU Lesser General Public License v3.0,2024,TheBv/DBMS-2A,Software,GitHub,DBMS Project 2-B


In [73]:
# Remove the helper columns that are no longer needed and show the result
helper_columns = ["full_name", "readme_überschrift"]
df_git_clean = df_git_sw.drop(helper_columns, axis="columns")
df_git_clean

Unnamed: 0,dc.title,dc.contributor.author,dc.identifier.uri,dc.description.abstract,dc.relation,dc.programming.language,dc.licence,dc.date.issued,dc.type,monitoring.source
0,Real-time Object Detection for Autonomous Driv...,alen-smajic,https://github.com/alen-smajic/Real-time-Objec...,My Computer Vision project from my Computer Vi...,"bdd100k, berkeley-deep-drive, cnn, computer-sc...",Jupyter Notebook,MIT License,2025,Software,GitHub
1,Towards Explainable AI Systems for Traffic Sig...,alen-smajic,https://github.com/alen-smajic/Towards-Explain...,This project is part of the CS course 'Systems...,"autonomous-driving, blender, classification, c...",C#,MIT License,2025,Software,GitHub
2,EPI Zusammenfassung,Chris022,https://github.com/Chris022/epi-zusammenfassung,Meine Zusammenfassung des Moduls: EPI für die ...,,,GNU General Public License v3.0,2025,Software,GitHub
3,MSc05 template repository,PeerHerholz,https://github.com/PeerHerholz/MSc05_template_...,A template repository for projects conducted a...,,,"BSD 3-Clause ""New"" or ""Revised"" License",2022,Software,GitHub
4,"Subway Station Hazard Detection, Goethe Univer...",Psarpei,https://github.com/Psarpei/Subway-Station-Haza...,This project is part of the CS course 'Systems...,"blender, computer-vision, csharp, deep-learnin...",Jupyter Notebook,MIT License,2025,Software,GitHub
5,Introduction to High-Performance Computing in ...,bsotomayorg,https://github.com/bsotomayorg/Intro_HPC_Python,"Extended material of the summer course ""Introd...","high-performance-computing, matplotlib-exercis...",Jupyter Notebook,MIT License,2025,Software,GitHub
6,ACoLi Dicts,acoli-repo,https://github.com/acoli-repo/acoli-dicts,3000+ machine-readable open source dictionarie...,"dataset, dictionary, open-source, rdf, transla...",Shell,Apache License 2.0,2025,Software,GitHub
7,Real-time detection of student engagement usin...,fd-jian,https://github.com/fd-jian/master-thesis,Master thesis written in 2021 for MSc Wirtscha...,,,GNU General Public License v3.0,2021,Software,GitHub
8,tcs.uni-frankfurt.de,goethe-tcs,https://github.com/goethe-tcs/tcs.uni-frankfur...,Website of the Theoretical Computer Science Gr...,,HTML,GNU General Public License v3.0,2025,Software,GitHub
9,DBMS Project 2-B,TheBv,https://github.com/TheBv/DBMS-2A,Repository for the DBMS Course at the Goethe U...,,TypeScript,GNU Lesser General Public License v3.0,2024,Software,GitHub


In [81]:
# Remove rows whose title contains "thesis" or "Thesis".
# The index is not renumbered here, so it keeps gaps where rows were removed.
mentions_thesis = df_git_clean["dc.title"].str.contains('thesis|Thesis')
df_git_clean = df_git_clean.loc[~mentions_thesis]

df_git_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34 entries, 0 to 37
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   dc.title                 34 non-null     object
 1   dc.contributor.author    34 non-null     object
 2   dc.identifier.uri        34 non-null     object
 3   dc.description.abstract  33 non-null     object
 4   dc.relation              34 non-null     object
 5   dc.programming.language  29 non-null     object
 6   dc.licence               34 non-null     object
 7   dc.date.issued           34 non-null     int64 
 8   dc.type                  34 non-null     object
 9   monitoring.source        34 non-null     object
dtypes: int64(1), object(9)
memory usage: 2.9+ KB


In [76]:
# Duplicate check: of several rows with the same dc.title only the first one is kept
is_repeated_title = df_git_clean.duplicated(subset=["dc.title"], keep="first")
df_git_clean = df_git_clean.loc[~is_repeated_title]

In [80]:
# Print the column / non-null / dtype summary of df_git_clean
df_git_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34 entries, 0 to 37
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   dc.title                 34 non-null     object
 1   dc.contributor.author    34 non-null     object
 2   dc.identifier.uri        34 non-null     object
 3   dc.description.abstract  33 non-null     object
 4   dc.relation              34 non-null     object
 5   dc.programming.language  29 non-null     object
 6   dc.licence               34 non-null     object
 7   dc.date.issued           34 non-null     int64 
 8   dc.type                  34 non-null     object
 9   monitoring.source        34 non-null     object
dtypes: int64(1), object(9)
memory usage: 2.9+ KB


In [11]:
# Write the cleaned table to data_github.csv in the working directory.
# No index argument is passed, so pandas' default of writing the row index as the first column applies.
df_git_clean.to_csv("data_github.csv")

In [15]:
# Look at the remaining entries to see which licences they carry; depending on how many there are, they could be checked by hand
# NOTE(review): df_gitgu_sw_all and its "lizenz" column are not defined in any cell visible here — presumably left over from an earlier state of this notebook; confirm before relying on this cell
df_gitgu_check = df_gitgu_sw_all[~df_gitgu_sw_all["lizenz"].str.contains('GNU|MIT|BSD|Apache')] # selects the rows whose licence does NOT match the pattern: ~ is the NOT operator
df_gitgu_check

Unnamed: 0,name,full name,url,beschreibung,sprache,sterne,lizenz,letztes update
2,gu-urlshorter,PhysikOnline-FFM/gu-urlshorter,https://github.com/PhysikOnline-FFM/gu-urlshorter,"URL-Shortener for Goethe University, Frankfurt",PHP,4,Other,2023-01-28T11:26:14Z
8,goetheuni-layout,svenk/goetheuni-layout,https://github.com/svenk/goetheuni-layout,"The webdesign of Goethe-Universität Frankfurt,...",PHP,1,Creative Commons Zero v1.0 Universal,2020-12-13T11:33:07Z
17,Ensemble_Learning_Image_Transformers,cr-heidemann/Ensemble_Learning_Image_Transformers,https://github.com/cr-heidemann/Ensemble_Learn...,This repo was created for my master thesis abo...,Python,0,Other,2024-08-25T08:52:43Z
30,grading-gu-quarto,tisprang/grading-gu-quarto,https://github.com/tisprang/grading-gu-quarto,Quarto template for grading term papers at Goe...,TeX,0,Creative Commons Zero v1.0 Universal,2024-10-30T09:56:11Z
38,scene_orientation_contextual_cueing,jlschnatz/scene_orientation_contextual_cueing,https://github.com/jlschnatz/scene_orientation...,Data and Analysis Scripts of a study conducted...,TeX,0,Other,2023-11-17T22:50:46Z
40,website_editing-film-music,korngold-werkausgabe/website_editing-film-music,https://github.com/korngold-werkausgabe/websit...,This is the repository for the conference webs...,TypeScript,0,Other,2025-05-07T16:04:00Z
