-
Notifications
You must be signed in to change notification settings - Fork 1
/
tools_from_wikidata_sync.py
73 lines (63 loc) · 3.21 KB
/
tools_from_wikidata_sync.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from SPARQLWrapper import SPARQLWrapper, JSON, CSV
import pandas as pd
import datetime
# URL of the Wikidata SPARQL endpoint; the __main__ block passes it to get_tools_data().
endpoint_url = "https://query.wikidata.org/sparql"
# SPARQL query text; the __main__ block passes it to get_tools_data().
#
# Tool selection: ?class is a direct wdt:P279 child of wd:Q124614077, a blank
# node reaches ?class through zero or more wdt:P279 steps, and ?tool must link
# to that node via wdt:P366 and also carry a wdt:P136 value (?genre).
# NOTE(review): what these property IDs mean is not stated in this file;
# confirm them on Wikidata before changing the selection.
#
# Output: rows are grouped by ?tool ?toolLabel ?description ?genreLabel
# ?website ?update and ordered by ?tool. The classification, license,
# programming_language, source_repos and sources variables are each a
# group_concat of the DISTINCT values in the group, joined with ", ".
#
# License, description, update, programming language, source repository,
# website and source are all OPTIONAL, so any of them may be unbound in a row.
# Explicitly fetched rdfs:label / description values are filtered to English;
# the remaining *Label variables are not bound by any triple pattern here and
# presumably come from the wikibase:label SERVICE.
tools_query = """
SELECT DISTINCT ?tool ?toolLabel ?description ?genreLabel ?website
(group_concat(DISTINCT ?classLabel; SEPARATOR=", ") as ?classification)
(group_concat(DISTINCT ?licenseELabel; SEPARATOR=", ") as ?license)
(group_concat(DISTINCT ?programmingLangLabel; SEPARATOR=", ") as ?programming_language)
(group_concat(DISTINCT ?sourceRepo; SEPARATOR=", ") as ?source_repos)
?update
(group_concat(DISTINCT ?sourceLabel; SEPARATOR=", ") as ?sources) WHERE {
?class wdt:P279 wd:Q124614077.
_:subClasses (wdt:P279*) ?class.
?tool wdt:P366 _:subClasses ;
wdt:P136 ?genre ;
OPTIONAL { ?tool wdt:P275 ?licenseE . ?licenseE rdfs:label ?licenseELabel . FILTER(lang(?licenseELabel) = "en") }
OPTIONAL { ?tool schema:description? ?description . FILTER(lang(?description) = "en") }
OPTIONAL { ?tool wdt:P5017 ?update }
OPTIONAL { ?tool wdt:P277 ?programmingLang . ?programmingLang rdfs:label ?programmingLangLabel . FILTER(lang(?programmingLangLabel) = "en")}
OPTIONAL { ?tool wdt:P1324 ?sourceRepo }
OPTIONAL { ?tool wdt:P856 ?website}
OPTIONAL { ?tool wdt:P1343 ?source . ?source rdfs:label ?sourceLabel . FILTER(lang(?sourceLabel) = "en") }
?class rdfs:label ?classLabel .
FILTER(lang(?classLabel) = "en")
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?tool ?toolLabel ?description ?genreLabel ?website ?update
ORDER BY ?tool
"""
# Short forms substituted for the interface-type genre labels.
_GENRE_ABBREVIATIONS = {
    "application programming interface": "API",
    "command-line interface": "CLI",
    "graphical user interface": "GUI",
}

# Result columns that get post-processed below. Any of them that the response
# does not contain is created empty so the processing never hits a KeyError.
_PROCESSED_COLUMNS = ("tool", "genreLabel", "classification", "update")

# Final names given to selected result columns.
_OUTPUT_COLUMN_NAMES = {
    "tool": "wikidata_id",
    "toolLabel": "tool_name",
    "genreLabel": "tool_type",
    "sources": "information_source",
    "update": "last_entry_date",
}

_ENTITY_URI_PREFIX = "http://www.wikidata.org/entity/"
_VALUE_SUFFIX = ".value"


def _strip_entity_prefix(uri):
    """Remove the Wikidata entity URI prefix; non-strings pass through."""
    if isinstance(uri, str):
        return uri.replace(_ENTITY_URI_PREFIX, "")
    return uri


def _abbreviate_classification(text):
    """Shorten "knowledge graph" to "KG"; non-strings pass through."""
    if isinstance(text, str):
        return text.replace("knowledge graph", "KG")
    return text


def _bindings_to_frame(bindings):
    """Turn SPARQL JSON result bindings into the cleaned tools table.

    Args:
        bindings: The ``results.bindings`` list of a SPARQL JSON response,
            i.e. one dict per row mapping a variable name to a dict that
            holds (among other keys) its ``value``.

    Returns:
        A DataFrame with one row per binding and one column per variable,
        after the clean-up and renaming described in ``get_tools_data``.
    """
    raw = pd.json_normalize(bindings)

    # json_normalize flattens each variable into "<name>.type",
    # "<name>.value", ... columns; keep only the values and drop the suffix.
    value_columns = [
        col for col in raw.columns if str(col).endswith(_VALUE_SUFFIX)
    ]
    frame = raw[value_columns].rename(
        columns=lambda col: str(col)[: -len(_VALUE_SUFFIX)]
    )

    # A variable that is unbound in every row (or an empty result set)
    # produces no column at all, so add the ones processed below.
    missing = [col for col in _PROCESSED_COLUMNS if col not in frame.columns]
    if missing:
        frame = frame.reindex(columns=[*frame.columns, *missing])

    frame["update"] = pd.to_datetime(frame["update"]).dt.strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    frame["tool"] = frame["tool"].map(_strip_entity_prefix)
    # Look up with a fallback: mapping through the dict directly would turn
    # every genre label that has no abbreviation into NaN.
    frame["genreLabel"] = frame["genreLabel"].map(
        lambda genre: _GENRE_ABBREVIATIONS.get(genre, genre)
    )
    frame["classification"] = frame["classification"].map(
        _abbreviate_classification
    )

    return frame.rename(columns=_OUTPUT_COLUMN_NAMES)


def get_tools_data(endpoint_url: str, tools_query: str) -> pd.DataFrame:
    """Run a SPARQL query and return its result as a cleaned DataFrame.

    The query is sent to ``endpoint_url`` requesting JSON, and every result
    variable becomes one column. The following clean-up is applied:

    * ``update`` is parsed as a datetime and re-formatted as
      ``YYYY-MM-DD HH:MM:SS``; rows without a value stay empty.
    * ``http://www.wikidata.org/entity/`` is removed from ``tool``.
    * Genre labels with a known abbreviation (API, CLI, GUI) are shortened;
      all other labels are kept unchanged.
    * ``knowledge graph`` is shortened to ``KG`` in ``classification``.
    * ``tool``, ``toolLabel``, ``genreLabel``, ``sources`` and ``update``
      are renamed to ``wikidata_id``, ``tool_name``, ``tool_type``,
      ``information_source`` and ``last_entry_date``.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        tools_query: SPARQL query text. It is expected to project the
            ``tool``, ``genreLabel``, ``classification`` and ``update``
            variables; any of these that is absent from the response is
            returned as an empty column.

    Returns:
        The cleaned table, one row per result binding. It has no rows when
        the query matches nothing.

    Raises:
        KeyError: If the converted response has no ``results``/``bindings``
            entry.
    """
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(tools_query)
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()
    return _bindings_to_frame(results["results"]["bindings"])
if __name__ == "__main__":
    # Fetch the tools table, then dump it as JSON Lines: one JSON object per
    # row, each terminated by a newline.
    df = get_tools_data(endpoint_url, tools_query)
    with open("tools_from_wikidata.jsonl", "w") as jsonl_file:
        jsonl_file.writelines(
            f"{record.to_json()}\n" for _, record in df.iterrows()
        )