**Установка библиотек**

In [9]:
!pip install aiohttp
!pip install nest-asyncio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Импорт библиотек**

In [120]:
import asyncio
import aiohttp
import requests
import time
import nest_asyncio
from bs4 import BeautifulSoup
import json
import re
import sys
import pandas as pd
from collections import Counter
import numpy as np
nest_asyncio.apply()

In [83]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Функции**

In [12]:
class AsyncParser:
    """ Class that parses webpages asynchronously """

    def __init__(self, 
                 urls, 
                 process,
                 n_connections,
                 n_retries = 5,
                 retrywait = 0.5):
        """
        @param urls list of web adresses to parse
        @param process function that accepts url and page text, handles it 
               and returns result
        @param n_connections limit of simultaneously opened connections 
        @param n_retries number of possible retries
        @param retrywait seconds to wait before retrying to reach url
        """
        self.urls = urls
        self.process = process
        self.n_connections = n_connections
        self.n_retries = n_retries
        self.retrywait = retrywait
        self.ranflag = False
        self.notreachedurls = []

    async def geturl(self, session, url):
        """ Processes one url """
        async with session.get(url) as resp:
            if resp.status == 200:
                text_ = await resp.text()
                return self.process(url, text_)
        async with session.get(url) as resp:
            for _ in range(self.n_retries):
                if resp.status == 200:
                    text_ = await resp.text()
                    return self.process(url, text_)
            self.notreachedurls.append(url)
            return None            
    
    async def geturls(self):
        """ Processes all urls """

        connector = aiohttp.TCPConnector(limit=self.n_connections)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = []
            for url in self.urls:
                tasks.append(asyncio.ensure_future(self.geturl(session, url)))

            results = await asyncio.gather(*tasks)
            return results

    def parse(self):
        """ Start parsing job """
        self.ranflag = True
        return asyncio.run(self.geturls())

    def notreached(self):
        """ Returns list of not reached urls """
        if self.ranflag == False:
            raise AttributeError('Didn\'t parse')
        return self.notreachedurls

**Сбор данных**

In [41]:
def get_urls_from_page(url, text):
    '''Собирает все ссылки на курсовые со страницы'''
    urls = []
    soup = BeautifulSoup(text)
    pres = soup.find_all('h3', {'class':'vkr-card__title'})
    for pre in pres:
        url = 'https://www.hse.ru' + str(pre.find('a')['href'])
        urls.append(url)
    return urls


vkrurls = []
bounds = [(1, 1001), (1001, 2001), (2001, 3001), (3001, 4001), 
          (4001, 5001), (5001, 6001), (6001, 7001), (7001, 7291)]
delim = '# # # # # # # # # # # # # # # # # # # # # #'
lensum = 0
for bound in bounds:
    l, r = bound
    urls = [f'https://www.hse.ru/edu/vkr/?page={i}' for i in range(l, r)]
    lensum += len(urls)
print(lensum)

7290


In [42]:
for bound in bounds:
    l, r = bound
    print(bound)
    urls = [f'https://www.hse.ru/edu/vkr/?page={i}' for i in range(l, r)]
    parser = AsyncParser(urls, get_urls_from_page, 20)
    start_time = time.time()
    results = parser.parse()
    seconds = time.time() - start_time
    print(f'Spent: {seconds:.0f} seconds')
    print(f'Not reached: {len(parser.notreached())} urls')
    for res in results:
        vkrurls.extend(res)
    print(delim)

(1, 1001)
Spent: 181 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(1001, 2001)
Spent: 221 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(2001, 3001)
Spent: 225 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(3001, 4001)
Spent: 223 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(4001, 5001)
Spent: 214 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(5001, 6001)
Spent: 214 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(6001, 7001)
Spent: 206 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(7001, 7291)
Spent: 56 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #


In [43]:
with open('vkrurls', 'w') as f:
    json.dump(vkrurls, f)

In [111]:
def get_vkr_dict(url, text):
    '''Собирает словарь со всеми данными о курсовой с ее страницы'''
    a = json.loads(re.findall('"single":({.*}),"footer"', text)[0])
    a['vkrurl'] = url
    return a
    

total_result = []
bounds = [(0, 5001), (5001, 10001), (10001, 15001), (15001, 20001), 
          (20001, 25001), (25001, 30001), (30001, 35001), (35001, 40001), 
          (40001, 45001), (45001, 50001), (50001, 55001), (55001, 60001), 
          (60001, 65001), (65001, 70001), (70001, 72830)]
vkrurls_ed = list(set(vkrurls))
print(len(vkrurls_ed))
delim = '# # # # # # # # # # # # # # # # # # # # # #'
lensum = 0
vkrurls_edsubsum = []
for bound in bounds:
    l, r = bound
    vkrurls_sub = vkrurls_ed[l:r]
    lensum += len(vkrurls_sub)
    vkrurls_edsubsum.extend(vkrurls_sub)
print(lensum)
print(vkrurls_ed == vkrurls_edsubsum)

72830
72830
True


In [112]:
for bound in bounds:
    l, r = bound
    print(bound)
    vkrurls_sub = vkrurls_ed[l:r]
    parser = AsyncParser(vkrurls_sub, get_vkr_dict, 20)
    start_time = time.time()
    results = parser.parse()
    seconds = time.time() - start_time
    print(f'Spent: {seconds:.0f} seconds')
    print(f'Not reached: {len(parser.notreached())} urls')
    total_result.extend(results)
    print(delim)

(0, 5001)
Spent: 229 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(5001, 10001)
Spent: 229 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(10001, 15001)
Spent: 230 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(15001, 20001)
Spent: 230 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(20001, 25001)
Spent: 229 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(25001, 30001)
Spent: 226 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(30001, 35001)
Spent: 227 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(35001, 40001)
Spent: 233 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(40001, 45001)
Spent: 228 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(45001, 50001)
Spent: 231 seconds
Not reached: 0 urls
# # # # # # # # # # # # # # # # # # # # # #
(50001, 55001)
Spent: 231 

In [114]:
with open('vkrs', 'w') as f:
    json.dump(total_result, f)

**Запись данных**

In [115]:
vkrurl = []
abstract = []
has_file = []
hasEnVersion = []
id = []
learnProgram_id = []
learnProgram_title = []
level = []
orgUnit = []
rating = []
student = []
supervisors = []
title = []
year = []
for elem in total_result:
    vkrurl.append(elem['vkrurl'])
    abstract.append(elem['abstract'])
    if elem['file'] == None:
        has_file.append(False)
    else:
        has_file.append(True)
    hasEnVersion.append(elem['hasEnVersion'])
    id.append(elem['id'])
    if elem['learnProgram'] == None:
        learnProgram_id.append(None)
        learnProgram_title.append(None)
    else:
        learnProgram_id.append(elem['learnProgram']['id'])
        learnProgram_title.append(elem['learnProgram']['title'])
    level.append(elem['level'])
    orgUnit.append(elem['orgUnit']['title'])
    rating.append(elem['rating'])
    student.append(elem['student'])
    if len(elem['supervisors']) > 0:
        supervisors.append(elem['supervisors'][0]['name'])
    else:
        supervisors.append(None)
    title.append(elem['title'])
    year.append(elem['year'])
df = pd.DataFrame({'vkrurl': vkrurl, 'abstract':abstract, 'has_file':has_file, 
                   'hasEnVersion':hasEnVersion, 'id':id, 'learnProgram_id': learnProgram_id, 
                   'learnProgram_title':learnProgram_title, 'level':level, 'orgUnit':orgUnit, 
                   'rating':rating, 'student': student, 'title': title, 'year': year})
print(df.shape[0])

72830


In [116]:
datadir = '/content/drive/MyDrive/Colab Notebooks/pet_projects/abstract_sum/results/' + 'vkr.csv'
df.to_csv(datadir, index = False)
pd.read_csv(datadir).head(2)

Unnamed: 0,vkrurl,abstract,has_file,hasEnVersion,id,learnProgram_id,learnProgram_title,level,orgUnit,rating,student,title,year
0,https://www.hse.ru/edu/vkr/206738445,В данной дипломной работе описывается разработ...,False,True,206738445,135181640.0,Информатика и вычислительная техника,Бакалавриат,Московский институт электроники и математики и...,,Зверев Валерий Дмитриевич,Разработка веб-приложения для содания модели у...,2017
1,https://www.hse.ru/edu/vkr/153010267,Выбор между активным и пассивным управлением я...,False,False,153010267,135181656.0,Экономика,Бакалавриат,Факультет экономических наук,,Гильманова Диана Маратовна,Оценка эффективности активно и пассивно управл...,2015


In [118]:
df.sample(2)

Unnamed: 0,vkrurl,abstract,has_file,hasEnVersion,id,learnProgram_id,learnProgram_title,level,orgUnit,rating,student,title,year
41640,https://www.hse.ru/edu/vkr/183222993,Данный проект посвящен разработке автоматическ...,True,True,183222993,135181634.0,Прикладная математика и информатика,Бакалавриат,Факультет компьютерных наук,8.0,Проваторова Вера Сергеевна,Генератор отзывов на русском языке на основе л...,2016
16726,https://www.hse.ru/edu/vkr/219185276,Сопоставительный анализ сериального рынка и зр...,False,True,219185276,141401705.0,Медиапроизводство в креативных индустриях,Магистратура,"Факультет коммуникаций, медиа и дизайна",,Козлова Александра Андреевна,Сопоставительный анализ сериального рынка и зр...,2018
