### **API SADVR - Portrait statistique**  
https://www.cen.umontreal.ca/espacedoc/sadvr/  

Le Service d’accès aux données de la Vitrine-Recherche (SADVR) est un service institutionnel de partage de données relatives aux profils des professeur·e·s et à leurs activités académiques ainsi qu'aux expertises et disciplines de recherche des facultés, départements et centres de recherche affiliés à l'Université.

---

#### **Professeurs**

In [38]:
import requests
import json
import pandas as pd
from ast import literal_eval
from collections import Counter
import math
from SADVR_infoIndividus import getInfoIndividus

Import des données

In [35]:
baseURI = 'https://www.recherche.umontreal.ca/vitrine/rest/api/1.7/umontreal'

# uri id/individu
uri = f'{baseURI}/id/individu'
res = json.loads(requests.get(uri).text)
dataIndividus = pd.DataFrame(res['data'])

# requête SOLR dans le répertoire des professeurs
index = 0
res = json.loads(requests.get(f'{baseURI}/recherche/professeur/select?q=ID:*&start={index}').text)
nbResults = res['paginationSOLR']['numFound']

dataProfs = []
for i in range(0, nbResults, 20):
    res = json.loads(requests.get(
        f'{baseURI}/recherche/professeur/select?q=ID:*&start={index}&rows=20'
        ).text)['data']
    
    dataProfs += res
    index += 20

dataProfs = pd.DataFrame(dataProfs)

# uri /info/individu
ids_profs = dataProfs['idsadvr'].tolist()


            
#infoProfs = getInfoIndividus(ids_profs)

In [57]:
ids_profs[20:40]

['in13602',
 'in13603',
 'in13604',
 'in13605',
 'in13606',
 'in13608',
 'in13609',
 'in13610',
 'in13611',
 'in13614',
 'in13615',
 'in13616',
 'in13617',
 'in13618',
 'in13619',
 'in13620',
 'in13621',
 'in13622',
 'in13623',
 'in13626']

In [72]:
nb_batches = math.floor(len(ids_profs)/20)
batches = []

for i in range(nb_batches):
    batch = []
    for j in range(0, 20):
        batch.append(f'idsadvr[{j}]=' + ids_profs[i])
    
    batch = "&".join(batch)
    batches.append(batch)
    

In [73]:
batches

['idsadvr[0]=in13580&idsadvr[1]=in13581&idsadvr[2]=in13583&idsadvr[3]=in13584&idsadvr[4]=in13585&idsadvr[5]=in13586&idsadvr[6]=in13587&idsadvr[7]=in13588&idsadvr[8]=in13589&idsadvr[9]=in13590&idsadvr[10]=in13591&idsadvr[11]=in13592&idsadvr[12]=in13593&idsadvr[13]=in13594&idsadvr[14]=in13595&idsadvr[15]=in13596&idsadvr[16]=in13597&idsadvr[17]=in13598&idsadvr[18]=in13599&idsadvr[19]=in13601',
 'idsadvr[0]=in13581&idsadvr[1]=in13583&idsadvr[2]=in13584&idsadvr[3]=in13585&idsadvr[4]=in13586&idsadvr[5]=in13587&idsadvr[6]=in13588&idsadvr[7]=in13589&idsadvr[8]=in13590&idsadvr[9]=in13591&idsadvr[10]=in13592&idsadvr[11]=in13593&idsadvr[12]=in13594&idsadvr[13]=in13595&idsadvr[14]=in13596&idsadvr[15]=in13597&idsadvr[16]=in13598&idsadvr[17]=in13599&idsadvr[18]=in13601&idsadvr[19]=in13602',
 'idsadvr[0]=in13583&idsadvr[1]=in13584&idsadvr[2]=in13585&idsadvr[3]=in13586&idsadvr[4]=in13587&idsadvr[5]=in13588&idsadvr[6]=in13589&idsadvr[7]=in13590&idsadvr[8]=in13591&idsadvr[9]=in13592&idsadvr[10]=in13593&

In [None]:
output = []
for batch in batches:
    data = json.loads(requests.get(f'{baseURI}/info/individu?{batch}').text)['data']
    for x in data:
        output.append(x)

pd.DataFrame(output)

In [26]:
print(batch)

idsadvr[2700]=in35943&idsadvr[2700]=in35947&idsadvr[2700]=in35955&idsadvr[2700]=in22406


In [32]:
baseURI = 'https://www.recherche.umontreal.ca/vitrine/rest/api/1.7/umontreal'
len(json.loads(requests.get(f'{baseURI}/info/individu?idsadvr[0]={ids_profs[0]}&idsadvr[1]={ids_profs[1]}').text)['data'])

2

Normalisation / filtrage

In [None]:
dataIndividus = dataIndividus.explode('etablissementsAffilies').reset_index(drop=True)
dataIndividus['etablissementAffilie'] = dataIndividus['etablissementsAffilies'].apply(
    lambda x: x['nom'] if isinstance(x, dict) else None)

dataIndividus = dataIndividus.drop(columns=
        ['etablissementsAffilies', 'fonction', 'courriel', 'visuelOfficiel', 'visuelOfficielGrand']
    ).drop_duplicates()

dataIndividus = dataIndividus[dataIndividus['nom'] != '?_?']

# Séparer les colonnes qui contiennent des données structurées en JSON en muliples colonnes distinctes
def explodeNormalize(df: pd.DataFrame, columns: list, ):
    """
    Cette fonction prend en paramètre un DataFrame et une liste contenant les noms des colonnes à normaliser.
    Elle retourne le DataFrame modifié
    """
    for col in columns:
        dTypeCol = Counter(df[col].apply(lambda x: type(literal_eval(str(x)))).tolist()).most_common(1)[0][0]
        if dTypeCol == list:
            df = df.explode(col).reset_index(drop=True)

        dfTemp = pd.json_normalize(df[col]).add_prefix(f'{col}.') 
        
        df = pd.concat([df, dfTemp], axis=1).drop(col, axis=1)

    return df

# Colonnes à normaliser
columns = ['affiliations', 'expertise', 'expertise.secteursRecherche',
    'expertise.disciplines', 'expertise.motsCles', 'expertise.periodesChronologiques',
    'expertise.phraseCle', 'expertise.continents']

dataProfs = explodeNormalize(dataProfs, columns)

dataProfs = dataProfs.drop(
    columns=["prenom", "nom", "etablissementsAffilies", "visuelOfficiel", "visuelOfficielGrand",
      "courriels", "telephones", "urlVitrine", "nomSAD", "prenomSAD", "in-memoriam",
      "affiliations.courrielInstitutionnel", "affiliations.local", "affiliations.exclusion", "affiliations.exclusionTel", 
      "affiliations.fonction.nom", "affiliations.telephone.numero", "affiliations.telephone.poste"])

data = dataIndividus.merge(dataProfs, how='right')
data

In [None]:
data.to_csv('./tables/SADVR_professeurs.csv', index=False)