# Creating Collections as Data Using Federated Queries
This is a quick pipeline to put together the queries, simplifying the process of obtaining a cumulative dataframe in .CSV format.

In [1]:
# Load modules
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# Define the SPARQL endpoint
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

# Function to execute a query and return results as a list of dictionaries
def execute_query(query):
    sparql.setQuery(query)
    results = sparql.query().convert()
    return results["results"]["bindings"]

## Collating the Queries

In [6]:
# Define queries
queries = {
    'BNE': """
        PREFIX bne-def: <https://datos.bne.es/def/>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        
        SELECT ?author ?authorLabel ?work ?workLabel ?edition ?placeOfProduction ?yearOfPublication ?langCode
        WHERE {
            ?author wdt:P950 ?id .
            ?author wdt:P135 wd:Q530936 .
            ?author rdfs:label ?authorLabel.  FILTER(LANG(?authorLabel) = "en").
            BIND(uri(concat("https://datos.bne.es/resource/", ?id)) as ?bneID)
            SERVICE <http://datos.bne.es/sparql> {
                ?bneID bne-def:OP5001 ?work .
                ?work rdfs:label ?workLabel .
                OPTIONAL {?work bne-def:OP1002 ?m . ?m bne-def:OP2001 ?edition . ?edition bne-def:P3003 ?placeOfProduction}
                OPTIONAL {?work bne-def:OP1002 ?m . ?m bne-def:OP2001 ?edition . ?edition bne-def:P3006 ?yearOfPublication}
                OPTIONAL {?work bne-def:OP1002 ?m . ?m bne-def:OP2001 ?edition . ?edition dcterms:language ?langCode}
            }
        }
        LIMIT 1000
    """,
    'BNF': """
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX rdarelationships: <http://rdvocab.info/RDARelationshipsWEMI/>
        PREFIX rdagroup1elements: <http://rdvocab.info/Elements/>
        
        SELECT ?author ?authorLabel ?expression ?title ?edition ?placeOfPublication ?yearOfPublication ?langCode 
        WHERE {
          ?author wdt:P268 ?id
          BIND(uri(concat(concat("http://data.bnf.fr/ark:/12148/cb", ?id),"#about")) as ?idBnf)
          ?author wdt:P135 wd:Q530936 .
          ?author rdfs:label ?authorLabel.  FILTER(LANG(?authorLabel) = "en").
          SERVICE <http://data.bnf.fr/sparql> {
            ?expression <http://id.loc.gov/vocabulary/relators/aut> ?idBnf .
            OPTIONAL {?expression dcterms:language ?langCode .}
            OPTIONAL {?expression dcterms:publisher ?edition .}
            ?manifestation rdarelationships:expressionManifested ?expression .
            ?manifestation dcterms:title ?title .
            ?manifestation dcterms:date ?yearOfPublication .
            OPTIONAL{ ?manifestation rdagroup1elements:placeOfPublication ?placeOfPublication .}
          }
        }
        LIMIT 1000
    """,
    'BVMC': """
       PREFIX rdaw: <http://rdaregistry.info/Elements/w/>
       PREFIX rdam: <http://rdaregistry.info/Elements/m/>
       PREFIX rdae: <http://rdaregistry.info/Elements/e/>
       PREFIX madsrdf: <http://www.loc.gov/mads/rdf/v1#>
        
       SELECT ?author ?authorLabel ?work ?workLabel ?placeOfProduction ?yearOfPublication ?langCode
       WHERE {
          ?author wdt:P2799 ?id .
          ?author wdt:P135 wd:Q530936 .
          ?author rdfs:label ?authorLabel.  FILTER(LANG(?authorLabel) = "en").
          BIND(uri(concat("https://data.cervantesvirtual.com/person/", ?id)) as ?bvmcID)
          SERVICE <http://data.cervantesvirtual.com/openrdf-sesame/repositories/data> {
            ?work rdaw:author ?bvmcID .
            ?work rdfs:label ?workLabel .
            ?work rdaw:manifestationOfWork ?manifestation .
            ?work rdaw:expressionOfWork ?expression .
            OPTIONAL {?expression rdae:languageOfExpression ?language . ?language madsrdf:code ?langCode .}
            OPTIONAL {?manifestation rdam:placeOfProduction ?placeOfProduction .}
            OPTIONAL {?manifestation rdam:dateOfPublication ?dateOfPublication . BIND(REPLACE(str(?dateOfPublication), "https://data.cervantesvirtual.com/date/", "", "i") AS ?yearOfPublication) .}
          }
       }
       LIMIT 1000
    """
}

### Processing Queries Results

In [7]:
# Function to process query results and return a list of dictionaries
def process_query_results(results, source):
    processed_data = []

    # Define the field name mapping depending on the source
    field_mapping = {
        'BNE': {
            'author': 'author',
            'work': 'work',
            'workLabel': 'workLabel',
            'edition': 'edition',
            'placeOfProduction': 'placeOfProduction',
            'yearOfPublication': 'yearOfPublication',
            'langCode': 'langCode'
        },
        'BNF': {
            'author': 'author',
            'work': 'expression',
            'workLabel': 'title',
            'edition': 'edition',
            'placeOfProduction': 'placeOfPublication',
            'yearOfPublication': 'yearOfPublication',
            'langCode': 'langCode'
        },
        'BVMC': {
            'author': 'author',
            'work': 'work',
            'workLabel': 'workLabel',
            'edition': 'edition',
            'placeOfProduction': 'placeOfProduction',
            'yearOfPublication': 'yearOfPublication',
            'langCode': 'langCode'
        }
    }

    # Map the query result fields to the corresponding output fields
    for result in results:
        # For each source, apply the field mapping
        author = result.get(field_mapping[source]['author'], {}).get('value', None)
        work = result.get(field_mapping[source]['work'], {}).get('value', None)
        workLabel = result.get(field_mapping[source]['workLabel'], {}).get('value', None)  # Removed comma here
        edition = result.get(field_mapping[source]['edition'], {}).get('value', None)  # Removed comma here
        placeOfProduction = result.get(field_mapping[source]['placeOfProduction'], {}).get('value', None)
        yearOfPublication = result.get(field_mapping[source]['yearOfPublication'], {}).get('value', None)
        langCode = result.get(field_mapping[source]['langCode'], {}).get('value', None)

        # Handle missing or empty fields
        placeOfProduction = placeOfProduction if placeOfProduction else 'Unknown'
        yearOfPublication = yearOfPublication if yearOfPublication else 'Unknown'
        langCode = langCode if langCode else 'Unknown'

        # Append the processed result to the list
        processed_data.append({
            'source': source,
            'author': author,
            'work': work,
            'workLabel': workLabel,
            'edition': edition,
            'placeOfProduction': placeOfProduction,
            'yearOfPublication': yearOfPublication,
            'langCode': langCode,
        })

    return processed_data

In [8]:
# Create an empty list to collect the queries results
queries_results = []

# Execute each query, process the results, and extend to the queries_results list
for source, query in queries.items():
    results = execute_query(query)
    processed_data = process_query_results(results, source)
    queries_results.extend(processed_data)
print(queries_results)
# Create a dataframe from the collected results
df_combined = pd.DataFrame(queries_results, columns=['source', 'author', 'work', 'workLabel', 'edition', 'placeOfProduction', 'yearOfPublication', 'langCode'])

# Preview the results
df_combined.head(10)

[{'source': 'BNE', 'author': 'http://www.wikidata.org/entity/Q5682', 'work': 'https://datos.bne.es/resource/XX3383764', 'workLabel': 'Novelas ejemplares', 'edition': 'https://datos.bne.es/resource/bima0000013178', 'placeOfProduction': 'Paris', 'yearOfPublication': '1809', 'langCode': 'http://id.loc.gov/vocabulary/languages/fre'}, {'source': 'BNE', 'author': 'http://www.wikidata.org/entity/Q5682', 'work': 'https://datos.bne.es/resource/XX3383563', 'workLabel': 'Don Quijote de la Mancha', 'edition': 'https://datos.bne.es/resource/bimo0000398030', 'placeOfProduction': '[Murcia', 'yearOfPublication': 'D.L. 1993', 'langCode': 'http://id.loc.gov/vocabulary/languages/spa'}, {'source': 'BNE', 'author': 'http://www.wikidata.org/entity/Q5682', 'work': 'https://datos.bne.es/resource/XX1924290', 'workLabel': 'La cueva de Salamanca', 'edition': 'https://datos.bne.es/resource/bimo0002046013', 'placeOfProduction': '[Granada]', 'yearOfPublication': '2005', 'langCode': 'http://id.loc.gov/vocabulary/lan

Unnamed: 0,source,author,work,workLabel,edition,placeOfProduction,yearOfPublication,langCode
0,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383764,Novelas ejemplares,https://datos.bne.es/resource/bima0000013178,Paris,1809,http://id.loc.gov/vocabulary/languages/fre
1,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383563,Don Quijote de la Mancha,https://datos.bne.es/resource/bimo0000398030,[Murcia,D.L. 1993,http://id.loc.gov/vocabulary/languages/spa
2,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX1924290,La cueva de Salamanca,https://datos.bne.es/resource/bimo0002046013,[Granada],2005,http://id.loc.gov/vocabulary/languages/spa
3,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383764,Novelas ejemplares,https://datos.bne.es/resource/Mimo0001661915,[Tokio,1993],http://id.loc.gov/vocabulary/languages/jpn
4,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX4894754,El celoso extremeño,https://datos.bne.es/resource/a5599310,[S.l],[1917?],http://id.loc.gov/vocabulary/languages/rus
5,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX1924374,El viejo celoso,https://datos.bne.es/resource/a5610044,[S.l.],[19--],http://id.loc.gov/vocabulary/languages/fre
6,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383764,Novelas ejemplares,https://datos.bne.es/resource/a5612044,Paris,[ca. 1930],http://id.loc.gov/vocabulary/languages/fre
7,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383764,Novelas ejemplares,https://datos.bne.es/resource/bima0000029111,A Amsterdam et Leipzig,1768,http://id.loc.gov/vocabulary/languages/fre
8,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383764,Novelas ejemplares,https://datos.bne.es/resource/a5521349,A Paris,1625,http://id.loc.gov/vocabulary/languages/fre
9,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383764,Novelas ejemplares,https://datos.bne.es/resource/a5521388,A Paris,1640,http://id.loc.gov/vocabulary/languages/fre


In [9]:
# Describe the results
df_combined.describe()

Unnamed: 0,source,author,work,workLabel,edition,placeOfProduction,yearOfPublication,langCode
count,3000,3000,3000,3000,1000,3000,3000,3000
unique,3,3,1815,1441,965,713,670,40
top,BNE,http://www.wikidata.org/entity/Q5682,https://datos.bne.es/resource/XX3383563,Don Quijote de la Mancha,https://datos.bne.es/resource/a7150251,Madrid,1911-1913,es
freq,1000,2294,963,975,4,444,137,971


In [10]:
# Export the results dataframe in .CSV file format
df_combined.to_csv('df_combined.csv', index=False)