In [2]:
import os
import re
import json
import time
import js2py
import requests
import pandas as pd

# Шанхайский рейтинг университетов

## Общий рейтинг

In [31]:
# Download the overall ranking table from the site's JSON endpoint.
url_ranking = 'https://www.shanghairanking.com/api/pub/v1/inst'
# A timeout keeps the cell from hanging forever if the server stops responding.
resp_ranking = requests.get(url_ranking, timeout=30)
# Stays an empty frame when the request fails.
ranking = pd.DataFrame()

if resp_ranking.status_code == 200:
    json_data = resp_ranking.json()
    ranking = pd.json_normalize(json_data['data'])

    # Export only after a successful download, so that a failed request does
    # not overwrite previously saved files with an empty table.
    os.makedirs('data', exist_ok=True)
    ranking.to_csv('data/shanghai.csv', index=False)
    ranking.to_excel('data/shanghai.xlsx', index=False)
else:
    print(f"{url_ranking} return status code {resp_ranking.status_code}")

ranking.head()

Unnamed: 0,nameEn,univLogo,univUp,region,rankingInfo,ranking
0,Harvard University,logo/032bd1b77.png,harvard-university,United States,ARWU 2022,1
1,Stanford University,logo/13de8913b.png,stanford-university,United States,ARWU 2022,2
2,Massachusetts Institute of Technology (MIT),logo/79165fd8b.png,massachusetts-institute-of-technology-mit,United States,ARWU 2022,3
3,University of Cambridge,logo/8d9861b69.png,university-of-cambridge,United Kingdom,ARWU 2022,4
4,"University of California, Berkeley",logo/0ff179fb8.png,university-of-california-berkeley,United States,ARWU 2022,5


Пока данных немного, можно для примера посмотреть топ-10 🇷🇺 российских университетов в рейтинге.

In [7]:
# First ten rows for Russia, keeping only the name and the rank columns.
ranking.loc[ranking['region'] == 'Russia', ['nameEn', 'ranking']].head(10)

Unnamed: 0,nameEn,ranking
114,Moscow State University,101-150
347,Saint Petersburg State University,301-400
529,Moscow Institute of Physics and Technology,501-600
626,HSE University,601-700
654,Sechenov University,601-700
742,Novosibirsk State University,701-800
753,Skolkovo Institute of Science and Technology,701-800
761,Tomsk State University,701-800
797,Ural Federal University,701-800
847,National Research Nuclear University MEPhI (Mo...,801-900


Для того чтобы получить данные по предметным рейтингам, а также ретроспективные данные, потребуется обойти все страницы университетов на сайте. Рейтинг большой (более 3 тыс. записей), поэтому запасаемся терпением и ждём, периодически проверяя содержимое папки `data/downloads/shanghai`.

In [16]:
data_folder = 'data/downloads/shanghai'


def _fetch_university_data(university_path):
    """Download one institution page and return its ``univData`` payload as a dict.

    The institution HTML page is fetched first; the link to its Nuxt
    ``payload.js`` is located in the markup, that script is downloaded and
    evaluated with js2py, and ``data[0].univData`` is converted to a dict.

    Args:
        university_path: URL slug of the institution (the ``univUp`` column).

    Returns:
        The payload as a dict, or ``None`` when a request returned a non-200
        status or the payload link was not found (a message is printed).

    Raises:
        requests.RequestException: on network errors or timeouts.
    """
    university_url = f'https://www.shanghairanking.com/institution/{university_path}'
    html_resp = requests.get(university_url, timeout=30)
    if html_resp.status_code != 200:
        print(f"{university_url} return status code {html_resp.status_code}")
        return None

    # Raw string so that \d and \. reach the regex engine untouched; the slug is
    # escaped because it is data, not part of the pattern.
    payload_match = re.search(
        rf"/_nuxt/static/\d+/institution/{re.escape(university_path)}/payload\.js",
        html_resp.text,
    )
    if payload_match is None:
        print(f"{university_url} payload.js link not found")
        return None

    # The matched path already starts with "/", so no extra slash after the host.
    js_url = f"https://www.shanghairanking.com{payload_match[0]}"
    js_resp = requests.get(js_url, timeout=30)
    if js_resp.status_code != 200:
        print(f"{js_url} return status code {js_resp.status_code}")
        return None

    # Remove the JSONP wrapper prefix and the last two characters
    # (presumably the wrapper's closing ");" - TODO confirm) so that only
    # the argument expression is evaluated.
    js_source = js_resp.text.replace(f'__NUXT_JSONP__("/institution/{university_path}", ', '')[:-2]
    # NOTE(review): this evaluates JavaScript downloaded from the network.
    js_data = js2py.eval_js(js_source)
    return js_data.data[0].univData.to_dict()


os.makedirs(data_folder, exist_ok=True)

for i, r in ranking.iterrows():
    file_path = f"{data_folder}/{i+1}.json"

    # Already downloaded on a previous run - nothing to do and no need to wait.
    if os.path.isfile(file_path):
        continue

    try:
        university_data = _fetch_university_data(r['univUp'])
    except requests.RequestException as e:
        # One failed request should not abort the whole crawl; the missing
        # file will be retried on the next run of this cell.
        print(f"{r['univUp']} request failed: {e!r}")
        university_data = None

    if university_data is not None:
        # Serialize first so that a serialization error cannot leave a
        # truncated file behind; UTF-8 is explicit because ensure_ascii=False
        # may emit non-ASCII characters.
        serialized = json.dumps(university_data, ensure_ascii=False, indent=4)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

    # Be gentle with the data source
    time.sleep(0.5)

Каждый скачанный файл университета содержит намного больше данных, чем просто позиция в рейтинге. Для примера посмотрим на один из скачанных файлов — `493.json`.

In [3]:
# Load one downloaded file; the context manager closes the handle, and UTF-8
# is explicit so the result does not depend on the platform default encoding.
with open(f"{data_folder}/493.json", 'r', encoding='utf-8') as f:
    example = json.load(f)
example

{'address': 'Trg Republike Hrvatske 14',
 'detail': {'arwu': {'datasetId': 1,
   'intro': 'The Academic Ranking of World Universities (ARWU) was first published in June 2003 by the Center for World-Class Universities (CWCU), Graduate School of Education (formerly the Institute of Higher Education) of Shanghai Jiao Tong University, China, and updated on an annual basis. Since 2009 the Academic Ranking of World Universities (ARWU) has been published and copyrighted by ShanghaiRanking Consultancy. ShanghaiRanking Consultancy is a fully independent organization on higher education intelligence and not legally subordinated to any universities or government agencies. ARWU uses six objective indicators to rank world universities, including the number of alumni and staff winning Nobel Prizes and Fields Medals, number of highly cited researchers selected by Clarivate, number of articles published in journals of Nature and Science, number of articles indexed in Science Citation Index Expanded™ a

Для упрощения возьмём только данные, связанные с ретроспективой по основному рейтингу, и предметные рейтинги.

In [30]:
# Collect the numeric stems of "<n>.json" files; anything else in the folder
# (hidden files, partial downloads, ...) is ignored instead of breaking int().
file_numbers = sorted(
    int(stem)
    for stem, ext in map(os.path.splitext, os.listdir(data_folder))
    if ext == '.json' and stem.isdigit()
)

# Per-university frames are gathered in lists and concatenated once at the end,
# which avoids re-copying the growing frame on every iteration.
retro_chunks = []
subject_chunks = []

for fn in file_numbers:
    file_path = f"{data_folder}/{fn}.json"

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Any level may be missing or null; fall back to an empty dict so the
        # history and the subject parts are extracted independently.
        detail = json_data.get('detail') or {}
        rk_history = (detail.get('arwu') or {}).get('rkHistory')
        subj_adva = (detail.get('gras') or {}).get('subjAdva')

        if rk_history:
            retro_chunks.append(
                pd.json_normalize(rk_history)
                .assign(nameEn=json_data['nameEn'], region=json_data['region'])
            )

        if subj_adva:
            subject_chunks.append(
                pd.json_normalize(subj_adva)
                .assign(nameEn=json_data['nameEn'], region=json_data['region'])
            )
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Report the file and the actual error, then carry on with the rest.
        print(file_path, repr(e))

rating_retro = pd.concat(retro_chunks, ignore_index=True) if retro_chunks else pd.DataFrame()
rating_subject = pd.concat(subject_chunks, ignore_index=True) if subject_chunks else pd.DataFrame()

# The retrospective file name carries the covered year range, so it can only be
# built when at least one history record was found.
retro_years = list(rating_retro['yr'].unique()) if 'yr' in rating_retro.columns else []
if retro_years:
    retro_base = f"data/shanghai_retro_{min(retro_years)}-{max(retro_years)}"
    rating_retro.to_csv(f"{retro_base}.csv", index=False)
    rating_retro.to_excel(f"{retro_base}.xlsx", index=False)
else:
    print("No ranking history found, retrospective export skipped")

rating_subject.to_csv('data/shanghai_subject.csv', index=False)
# The subject workbook must contain the subject table (not the retrospective one).
rating_subject.to_excel('data/shanghai_subject.xlsx', index=False)

В качестве результата по ретроспективе давайте посмотрим на количество российских университетов по годам.

In [25]:
# Number of Russian universities present in the ranking history, per year.
(
    rating_retro.loc[rating_retro['region'] == 'Russia', ['yr', 'nameEn']]
    .groupby(by='yr')
    .count()
    .rename(columns={'nameEn': 'count'})
)

Unnamed: 0_level_0,count
yr,Unnamed: 1_level_1
2020,13
2021,13
2022,13


А в предметных рейтингах выделим области, в которых университеты 🇷🇺 России представлены чаще всего.

In [32]:
# Ten subjects in which the largest number of Russian universities appear.
(
    rating_subject.loc[rating_subject['region'] == 'Russia', ['name', 'nameEn']]
    .groupby(by='name')
    .count()
    .rename(columns={'nameEn': 'count'})
    .sort_values(by='count', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Physics,6
Metallurgical Engineering,6
Biological Sciences,4
Nanoscience & Nanotechnology,4
Mathematics,4
Materials Science & Engineering,3
Agricultural Sciences,2
Economics,2
Pharmacy & Pharmaceutical Sciences,2
Mechanical Engineering,2
