# Zakupki website scraping for Piotr

The aim of this notebook is to scrape details of each contract hosted on the Russian Zakupki public sector contract awarding website.

The input for this project will be the Zakupki URL. This code can be run on different dates to pull fresh contract data.

Method:
1.   Identify the number of pages of contracts to be scraped (using the contract filters provided).
2.   Iterate through each page, scraping the registration number of each contract.
3.   Access the website for each contract by placing the registraion number in the URL.
4.   Scrape the details for each contract and add them to a list of Contracts dataclasses.
5.   Format these Contract objects as a dataframe and output the dataframe to a csv file.


The output of this project will be the CSV file, with each row representing a new contract from the webstie.


### Section 1: Setup

In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
from datetime import date
from dataclasses import dataclass
from tqdm import tqdm
from dateutil import parser
from threading import Thread
import pandas as pd
from datetime import datetime, date, timedelta
import logging
import http.client
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
import math
from os import walk
import json
import numpy as np
import csv


In [2]:
## Finding the memory leak

from collections import Counter
import linecache
import os
import tracemalloc

def display_top(snapshot, key_type='lineno', limit=3):
    snapshot = snapshot.filter_traces((
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
        tracemalloc.Filter(False, "<unknown>"),
    ))
    top_stats = snapshot.statistics(key_type)

    print("Top %s lines" % limit)
    for index, stat in enumerate(top_stats[:limit], 1):
        frame = stat.traceback[0]
        # replace "/path/to/module/file.py" with "module/file.py"
        filename = os.sep.join(frame.filename.split(os.sep)[-2:])
        print("#%s: %s:%s: %.1f KiB"
              % (index, filename, frame.lineno, stat.size / 1024))
        line = linecache.getline(frame.filename, frame.lineno).strip()
        if line:
            print('    %s' % line)

    other = top_stats[limit:]
    if other:
        size = sum(stat.size for stat in other)
        print("%s other: %.1f KiB" % (len(other), size / 1024))
    total = sum(stat.size for stat in top_stats)
    print("Total allocated size: %.1f KiB" % (total / 1024))


tracemalloc.start()

In [3]:
logging = False

if logging:

    http.client.HTTPConnection.debuglevel = 1

    # You must initialize logging, otherwise you'll not see debug output.
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True

### Section 2: Determine Number of pages to scrape
Test connection to the website and determine number of pages to scrape

In [4]:
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# @lru_cache(maxsize=None)
def getPage(tempURL):
  # If User-Agent is not set to custom, the website will know a Python script is accessing it and block some of the request

  response = session.get(tempURL, headers={'User-Agent': 'Custom'})
  return BeautifulSoup(response.content, "html.parser")

In [5]:
# Getting the dates we want to scrape.

url="https://zakupki.gov.ru/epz/contract/search/results.html?morphology=on&search-filter=%D0%94%D0%B0%D1%82%D0%B5+%D1%80%D0%B0%D0%B7%D0%BC%D0%B5%D1%89%D0%B5%D0%BD%D0%B8%D1%8F&fz44=on&contractStageList_0=on&contractStageList_1=on&contractStageList=0%2C1&contractPriceFrom=1000000&contractCurrencyID=-1&budgetLevelsIdNameHidden=%7B%7D&customerPlace=5277383&customerPlaceCodes=66000000000&contractDateFrom={}&contractDateTo={}&selectedLaws=FZ44&sortBy=UPDATE_DATE&pageNumber={}&sortDirection=false&recordsPerPage=_500&showLotsInfoHidden=false"
startDate = date(2016, 1, 1)
endDate = date(2020, 12, 31)
days = timedelta(days=1)

startDateBefore = startDate

calendar=[]

while startDate<=endDate:
  calendar.append(startDate.strftime('%d.%m.%Y'))
  startDate+=days

print("Created {} dates".format(len(calendar)))

Created 1827 dates



### Section 3: Scrape each registration number

Scrape the reg numbers of each contract, so they can be accessed individually

In [6]:
class Page:

    def __init__(self, day, pageNum, pagefile):

        self.day = day
        self.pageNum = pageNum
        self.pagefile = pagefile


In [7]:
def getContracts(page):

    regNumbersList = []

    # Obtain a list of all the sections of HTML containing a contract in the web page
    listOfContracts = page.find_all("div", {"class": "registry-entry__header-mid__number"})
    
    # Segment the registration number from the URL of each contract
    for contract in listOfContracts:
        href = contract.find("a")['href']
        regNum = href[href.index("Number=")+7:]
        regNumbersList.append(regNum)

    return regNumbersList

In [8]:
def progress(idx, data):

    x_ = int(((idx+1) * 100) / len(data))
    y_ = idx % math.ceil(len(data) / 10)
    
    print(" ----\n{}% completed\n----".format(x_)) if y_ == 0 else None

In [9]:
# Getting the web page for all the contracts for each date in the range we want to scrape.

regNumbersDict = {}

def getRegNumbersForDate(i, day):

  if day in regNumbersDict:
    return

  tempURL = url.format(day, day, 1)

  print(tempURL)

  page = getPage(tempURL)

  # Scrape the max number of pages
  try:
    maxPageNum = int(page.select('a[data-pagenumber]')[-2].find("span").text)
    print("{} pages for this day".format(maxPageNum))
  except:
    maxPageNum = 1


  # Leave my variable names alone :(
  totalRegNumbersForThisDay = 0

  for i in range(1, maxPageNum+1):

    # Creating a temporary URL for each page containing contracts
    tempPageURL = url.format(day, day, i)

    # Request the page and format it as a BeautifulSoup object so that we can perform scrapings
    page = getPage(tempPageURL)

    regNumbersList = getContracts(page)

    totalRegNumbersForThisDay += len(regNumbersList)

    regNumbersDict[day] = regNumbersList


  print("Fetched day {} had {} contracts \n".format(day, totalRegNumbersForThisDay), end='')

  del page
  del regNumbersList

  # progress(i, calendar)


In [10]:

## This now has regNumbers caching too 

cachedRegNums = {}

# load the data from the json file
with open('cachedRegNums.json', 'r') as f:
  cachedRegNums = json.load(f)


with ThreadPoolExecutor(max_workers=50) as ex:
  for i, day in enumerate(calendar):
    if day in cachedRegNums:
      regNumbersDict[day] = cachedRegNums[day]
      print("Cached day {} had {} contracts \n".format(day, len(regNumbersDict[day])), end='')
    else:
      ex.submit(getRegNumbersForDate, i, day)

    
combinedRegNumbersDict = {**cachedRegNums, **regNumbersDict}

# print(combinedRegNumbersDict)


with open('cachedRegNums.json', 'w') as f:
  json.dump(combinedRegNumbersDict, f)


tempRegNumbers = list(regNumbersDict.values())

regNumbers = []

for t in tempRegNumbers:
  regNumbers.extend(t)

# print(regNumbers)

print("------------------- \n {} contracts found in total".format(len(regNumbers)))

Cached day 01.01.2016 had 80 contracts 
Cached day 02.01.2016 had 4 contracts 
Cached day 03.01.2016 had 2 contracts 
Cached day 04.01.2016 had 4 contracts 
Cached day 05.01.2016 had 1 contracts 
Cached day 06.01.2016 had 5 contracts 
Cached day 07.01.2016 had 0 contracts 
Cached day 08.01.2016 had 3 contracts 
Cached day 09.01.2016 had 1 contracts 
Cached day 10.01.2016 had 2 contracts 
Cached day 11.01.2016 had 115 contracts 
Cached day 12.01.2016 had 41 contracts 
Cached day 13.01.2016 had 23 contracts 
Cached day 14.01.2016 had 25 contracts 
Cached day 15.01.2016 had 25 contracts 
Cached day 16.01.2016 had 1 contracts 
Cached day 17.01.2016 had 0 contracts 
Cached day 18.01.2016 had 40 contracts 
Cached day 19.01.2016 had 34 contracts 
Cached day 20.01.2016 had 19 contracts 
Cached day 21.01.2016 had 21 contracts 
Cached day 22.01.2016 had 20 contracts 
Cached day 23.01.2016 had 3 contracts 
Cached day 24.01.2016 had 1 contracts 
Cached day 25.01.2016 had 48 contracts 
Cached day 2

The below is just for testing labels etc.

In [11]:
# url="https://zakupki.gov.ru/epz/contract/contractCard/common-info.html?reestrNumber=3662310492216000051"
# soup=getPage(url)
# secondtab=soup.findAll("span", {"class":"grey-main-light"})[-2]
# print(secondtab)
# # # # products=soup.findAll("span",{"class":"section__info"})
# # # product=products[24].text.strip()
# # print(product)

# # # # # ### first tab info ###
# # table=soup.findAll("td",{"class":"tableBlock__col"})
# # # # cardinfo=soup.findAll("span",{"class":"cardMainInfo__content"})
# # # signed=cardinfo[3].text.strip()
# # # deadline=cardinfo[4].text.strip()
# # # # price=soup.find("span", {"class":"cardMainInfo__content cost"}).text.replace("\xa0","").replace(",",".").replace("₽","").strip()
# # print(signed)
# # # org=table[0].text.strip().split("\n")
# # # procurer=org[0]
# # # INN=org[9]
# # # KPP=org[13]
# # # registered=org[17]

# try:
#     address=table[3].text.strip()
#     numbermail=table[3].text.strip().split("\n")
# #     number=numbermail[0]
# #     mail=numbermail[1].strip()

# # except:
# #     address=table[2].text.strip()
# #     numbermail=table[2].text.strip().split("\n")
# #     number=numbermail[0]
# #     mail=numbermail[1].strip()

# table=soup.findAll("section",{"class":"blockInfo__section section"})
# # # check=table[6]
# # # print(check)

# # section=soup.findAll("span",{"class":"section__info"})
# # # method=section[3].text.strip()
# # # print(method)
# # # # print(method)
# # # # # # ### second tab info ###
# # print((table))

# try:
#   check=table[3]
#   method=check.findAll('span', {'class':'section__info'})[0].text.strip()

# except:
#   method=section[6].text.strip()

# print(method)

# # # secondtab=soup.findAll("div", {"class": "col"})
# # # # print(secondtab)
# # # tables = []
# for tab in secondtab:
#     tables.append(tab.findAll("tbody"))

# object=tables[-1][0].findAll("td",{"class":"tableBlock__col"})
# code=object[1].text.strip().split("\n")[-1].strip()
# print(code)


### Section 4: Details scraping

The Contract Dataclass will store the information during scraping.
If any information can't be scraped, default values have been provided in their place

In [12]:
@dataclass
class Contract:

  # TODO: Add reg number to class

  # Main Section
  id: float = 0
  price: float = 0.0
  signed: date = None
  deadline: date = None

  # Tab 1
  method: str = "none"
  procurer: str = "none"
  supplier: str = "none"
  proinn: str = "none"
  supinn: str = "none"
  registered: date = None
  address: str = "none"
  number: str = "none"
  mail: str = "none"

  # Tab 2
  code: float = 0.0
  product: str = "none"
  

  def __repr__(self):
    return "\nContract id= {} \n First tab: price={}, signed={}, deadline={}, method={}, procurer={}, supplier={}, proinn={}, supinn={}, registered={}, address={}, number={}, mail={} \n  Second tab: code={}, product={})".format(self.id, self.price, self.signed, self.deadline, self.method, self.procurer, self.supplier, self.proinn, self.supinn, self.registered, self.address, self.number, self.mail, self.code,  self.product)

  

Method for scraping the data from each contract

In [13]:
def getSectionDict(page):

    sections=page.findAll("section",{"class":"blockInfo__section section"})

    # print([key.findAll("span") for key in sections])

    # Turning the sections into a dictionary that will be easier to work with.
    pairs = [key.findAll("span") for key in sections]

    pairs = list(filter(None, pairs))


    titles = []
    values = []

    for x in pairs:
        if len(x) > 1:
            try:
                titles.append(x[0])
                values.append(x[1])
            except:
                pass

    sectionDict = {titles[i].text.strip() : values[i].text.strip() for i in range(len(titles))}

    return sectionDict


def getTableDict(page, secondTab=False):


    if secondTab:
        sectionOfInterest = page.findAll("div", {"class": "col"})[-1]
    else:
        sectionOfInterest = page

    table = sectionOfInterest.findAll("tr",{"class":"tableBlock__row"})

    # print(table[3])

    headers = [i.text.strip() for i in table[0].findAll("th", {"class":"tableBlock__col tableBlock__col_header"})]
    data = [list(filter(None, [j.strip() for j in i.text.split("\n")])) for i in table[1].findAll("td")]

    if len(headers) == 0:
        headers = [i.text.strip() for i in table[2].findAll("th", {"class":"tableBlock__col tableBlock__col_header"})]
        data = [list(filter(None, [j.strip() for j in i.text.split("\n")])) for i in table[3].findAll("td")]

    if len(headers) == 0:
        headers = [i.text.strip() for i in table[3].findAll("th", {"class":"tableBlock__col tableBlock__col_header"})]
        data = [list(filter(None, [j.strip() for j in i.text.split("\n")])) for i in table[4].findAll("td")]



    # This is hacky.
    if len(data) < len(headers):
        data = [[[]] for i in range(len(headers))]

        

    tableDict = {headers[i] : data[i] for i in range(len(headers))}

    return tableDict

In [14]:
def scrapeData(reg):

  try:
    # Input: reg = one registration number.

    # Different URL from the one above, this accesses more information from Zakupki.
    dir = "https://zakupki.gov.ru/epz/contract/contractCard/{}.html?reestrNumber={}"

    # Getting the web page for the given contract
    tempDir = dir.format("common-info", reg)
    page = getPage(tempDir)

    # We probably don't need this with the method I've used below.
    contractTypeTwo = False

    # Enter the text here that should be present to signify the second type of contract.
    if page.findAll(text="Основание заключения контракта с единственным поставщиком"):
      contractTypeTwo = True
      
    id = reg  
    sectionDict = getSectionDict(page)
    firstTableDict = getTableDict(page)

    # print(sectionDict)
    # print(firstTableDict)

    # print(sectionDict, firstTableDict)
    # try:
    #   price=sectionDict["Цена контракта"].replace("\xa0","").replace(",",".").replace("₽","").strip().split()[0]
    # except:
    #   price=sectionDict["Ориентировочное значение цены контракта"].replace("\xa0","").replace(",",".").replace("₽","").strip().split()[0]
    #   try:
    #     price=sectionDict["Максимальное значение цены контракта"].replace("\xa0","").replace(",",".").replace("₽","").strip().split()[0]
    #   except:
    price=page.find('span', {'class':'cardMainInfo__content cost'}).text.strip().replace("₽","").replace(" ","")
      
    signed=sectionDict["Дата заключения контракта"].split()[0]
    deadline=sectionDict["Дата окончания исполнения контракта"].split()[0]
    
    ### fixed issue with method ### 
    try:
      method = sectionDict["Способ определения поставщика (подрядчика, исполнителя)"]
    except:
      if page.findAll(text="Основание заключения контракта с единственным поставщиком"):
            method="Закупка у единственного поставщика (подрядчика, исполнителя)"
            
    procurer=sectionDict["Полное наименование заказчика"]
    supplier=firstTableDict["Организация"][0]

    proinn=sectionDict["ИНН"]

    ### fixed issues for missing values sometimes in the table ###
    
    registered=firstTableDict["Организация"][-1]
    
    ### fixed, testing ###   
    try: 
      if firstTableDict["Организация"][-4]=="КПП:":
          supinn=firstTableDict["Организация"][-5]
      else:
          supinn=firstTableDict["Организация"][-3]
    except:
      supinn=""

    ### fixed issues in lower table ### 
    
    try:
      address=firstTableDict["Адрес места нахождения"]
    except:
      address=firstTableDict["Адрес в стране регистрации"]
    number=firstTableDict["Телефон, электронная почта"][0]
    mail=firstTableDict["Телефон, электронная почта"][1]

    ### details about winner - ALSO, THERE'S OPTION TO SCRAPE SUBCONTRACTORS ### 

    page.decompose()

    ### Second tab ###
    tempDir = dir.format("payment-info-and-target-of-order", reg)
    page = getPage(tempDir)

    ### code stands for the product code, which can be later identified to return industry type ### 
    
    secondTableDict = getTableDict(page, True)
    
    try:
      code = secondTableDict["Позиции по КТРУ, ОКПД2"][1][1:-1]
    except:
      code = ''
        
    product = secondTableDict["Наименование объекта закупки и его характеристики"][0]
    
    # Create the Contract dataclass object and append it to a list of objects.
    # This method means that missing data can be accounted for.
    # print(method)

    contract = Contract(id=id, price=price, signed=signed, deadline=deadline, method=method, procurer=procurer, supplier=supplier, proinn=proinn, supinn=supinn, registered=registered, address=address, number=number, mail=mail, code=code, product=product)
    
    # contracts.append(contract)
    # print('Completed {}'.format(id))

    page.decompose()

    return contract
  except Exception as e:
    failedRegNumbers.append(reg)
    print("Failed to scrape {}".format(reg))
    print(e)

### Section 5: Starting execution
Scrape the contracts themselves using threading

In [15]:
def scrape(reg):
    
    try:
        _ = int(reg)
        # print("Scraping {}".format(reg))
        return [scrapeData(reg)]
    except TypeError:
        
        # TODO make 500 contracts change here.
        
        contracts = []

        

        for idx, r in enumerate(reg):
            # print("Scraping {}".format(r))
            contracts.append(scrapeData(r))
            # progress(i + idx, regNumbers)

        return contracts

In [16]:
progressNum = 0

failedRegNumbers = []

# regNumbers = [3666404326716000002, 3666404326716000002, 1722300731616000034, 1722300731616000034, 2666001041516000469]

print("Starting scrape with {} reg numbers\n".format(len(regNumbers)))

# scrape(regNumbers[:10])

# for contract in contracts:
#     print(contract)

# for regNumber in tqdm(regNumbers[:50]):
#   thread = Thread(target = scrapeData, args = (regNumber,))
#   thread.start()

# regNumbers = ['3662502457421000001']
# regNumbers = ['1665800691921000016']

threading = True

if threading:

    interval = 1

    with ThreadPoolExecutor(max_workers=20) as ex:
        threads = []

        cachedContracts = {}

        with open('cachedContracts.csv', encoding="utf-8") as f:
            cachedContracts = list(csv.reader(f))
        
        cachedContracts = list(filter(None, cachedContracts))
        
        cachedContractRegNums = [row[0] for row in cachedContracts]

        uncachedRegNumbers = list(set(regNumbers) - set(cachedContractRegNums))

        print("{} of {} contracts are uncached. Fetching...".format(len(uncachedRegNumbers), len(regNumbers)))
        
        for i in range(0, len(uncachedRegNumbers), interval):
            tempNumbers = uncachedRegNumbers[i:i+interval]
            # print(tempNumbers)
            threads.append(ex.submit(scrape, tempNumbers))
        

        completed = 0

        for result in tqdm(as_completed(threads)):

            try:
                contracts = result.result()

                completed += interval

                # progress(completed, uncachedRegNumbers)

                formattedContracts = [list(contract.__dict__.values()) for contract in contracts]

                # newCachedContracts = {**cachedContracts, **formattedContracts}

                with open('cachedContracts.csv', 'a', encoding="utf-8", newline='') as f:
                    writer = csv.writer(f)
                    writer.writerows(formattedContracts)
            except Exception as e:
                print(e)
                
            # print("{} finished".format(interval))

else:

    scrape(regNumbers)

# print("Scraped {} contracts".format(len(contracts)))
print("Failed to scrape {} contracts".format(len(failedRegNumbers)))
print(failedRegNumbers) if len(failedRegNumbers) > 0 else None

snapshot = tracemalloc.take_snapshot()
display_top(snapshot)

Starting scrape with 41177 reg numbers

23831 of 41177 contracts are uncached. Fetching...


110it [01:17,  1.63it/s]

Failed to scrape 3660601895220000071
'Организация'


116it [01:23,  1.16it/s]

Failed to scrape 3666100466118000147
'Наименование объекта закупки и его характеристики'


156it [01:50,  1.64it/s]

Failed to scrape 3666000588819000002
'Наименование объекта закупки и его характеристики'


167it [01:56,  1.66it/s]

Failed to scrape 2661900214920000296
'Организация'


231it [02:40,  1.52it/s]

'NoneType' object has no attribute '__dict__'


244it [02:50,  1.53it/s]

Failed to scrape 2661900214920000262
'Организация'


251it [02:55,  1.79it/s]

'NoneType' object has no attribute '__dict__'


338it [03:48,  2.04it/s]

'NoneType' object has no attribute '__dict__'


358it [04:03,  1.46it/s]

'NoneType' object has no attribute '__dict__'


448it [05:02,  1.62it/s]

Failed to scrape 1665801738920000114
'Наименование объекта закупки и его характеристики'


471it [05:17,  1.46it/s]

Failed to scrape 2666100878620000122
'Организация'


535it [06:02,  1.59it/s]

'NoneType' object has no attribute '__dict__'


617it [06:56,  1.55it/s]

Failed to scrape 3666000798020000347
'Наименование объекта закупки и его характеристики'


700it [07:56,  1.79it/s]

Failed to scrape 1666101464520000297
'Организация'


734it [08:21,  1.58it/s]

Failed to scrape 1665808957720000109
list index out of range


839it [09:30,  1.28it/s]

Failed to scrape 2662309705520000308
'Организация'


966it [10:53,  1.65it/s]

Failed to scrape 1772905090120000118
'Наименование объекта закупки и его характеристики'


980it [11:03,  2.01it/s]

'NoneType' object has no attribute '__dict__'


1034it [11:39,  1.37it/s]

'NoneType' object has no attribute '__dict__'


1159it [13:05,  1.55it/s]

Failed to scrape 3660200715619000001
'Наименование объекта закупки и его характеристики'


1337it [15:15,  1.32it/s]

Failed to scrape 3666000798021000034
'Наименование объекта закупки и его характеристики'


1355it [15:30,  1.26it/s]

Failed to scrape 2666100878620000126
'Организация'


1364it [15:36,  1.20it/s]

'NoneType' object has no attribute '__dict__'


1529it [17:30,  1.28it/s]

Failed to scrape 3667808274920000002
'Наименование объекта закупки и его характеристики'


1562it [17:52,  1.83it/s]

'NoneType' object has no attribute '__dict__'


1646it [18:42,  1.47it/s]

'NoneType' object has no attribute '__dict__'


1668it [18:57,  1.19it/s]

Failed to scrape 3666005552917000016
'Наименование объекта закупки и его характеристики'


1787it [20:14,  1.49it/s]

Failed to scrape 2667138083420000438
'Организация'


1879it [21:12,  1.56it/s]

'NoneType' object has no attribute '__dict__'


1940it [21:57,  1.16it/s]

Failed to scrape 2667114714720000013
'Организация'


1976it [22:21,  1.69it/s]

Failed to scrape 2661800178420000132
'Организация'


2155it [24:17,  1.54it/s]

'NoneType' object has no attribute '__dict__'


2159it [24:20,  1.51it/s]

Failed to scrape 3662501736918000006
'Наименование объекта закупки и его характеристики'


2170it [24:27,  1.62it/s]

Failed to scrape 3666000798021000033
'Наименование объекта закупки и его характеристики'


2215it [24:58,  1.77it/s]

Failed to scrape 3666000798020000269
'Наименование объекта закупки и его характеристики'


3691it [26:06, 1408.54it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


4700it [26:06, 2513.61it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


5163it [26:08, 706.55it/s] 

Failed to scrape 3660800191517000008
'Наименование объекта закупки и его характеристики'
'NoneType' object has no attribute '__dict__'


5221it [26:27, 54.82it/s] 

Failed to scrape 3664500283620000005
'Наименование объекта закупки и его характеристики'
'NoneType' object has no attribute '__dict__'


5931it [31:35,  1.40it/s]

Failed to scrape 3666100466119000006
'Наименование объекта закупки и его характеристики'


5964it [31:57,  1.27it/s]

Failed to scrape 1666101464520000321
'Наименование объекта закупки и его характеристики'


6035it [32:48,  1.65it/s]

Failed to scrape 2665804173720000304
'Организация'


6237it [35:07,  1.59it/s]

'NoneType' object has no attribute '__dict__'


6310it [36:00,  1.42it/s]

'NoneType' object has no attribute '__dict__'


6393it [36:57,  1.34it/s]

Failed to scrape 1665808957719000131
list index out of range


6467it [37:55,  1.30it/s]

Failed to scrape 2666100878620000127
'Организация'


6478it [38:03,  1.19it/s]

'NoneType' object has no attribute '__dict__'


6497it [38:16,  1.44it/s]

Failed to scrape 3661900735520000025
'Наименование объекта закупки и его характеристики'


6980it [43:37,  1.63it/s]

Failed to scrape 1666100166019000069
Failed to scrape 3662501738320000005'Организация'
'Наименование объекта закупки и его характеристики'



7001it [43:53,  1.27it/s]

Failed to scrape 2665808158520000020
'Организация'


7103it [45:02,  1.55it/s]

Failed to scrape 1666001327920000180
'Организация'


7137it [45:31,  1.27it/s]

Failed to scrape 3666600829019000001
'Наименование объекта закупки и его характеристики'


7276it [47:15,  1.45it/s]

Failed to scrape 1665902945020000053
'Телефон, электронная почта'


7312it [47:39,  1.76it/s]

'NoneType' object has no attribute '__dict__'


7364it [48:18,  1.27it/s]

Failed to scrape 2661900214920000465
'Организация'


7502it [49:57,  1.22it/s]

'NoneType' object has no attribute '__dict__'


7542it [50:24,  1.68it/s]

Failed to scrape 1666100918720000043
'Организация'


7554it [50:33,  1.23it/s]

Failed to scrape 1660900206920000209
'Организация'


7574it [50:48,  1.25it/s]

'NoneType' object has no attribute '__dict__'


7714it [52:30,  1.31it/s]

Failed to scrape 1665802145918000128
'Наименование объекта закупки и его характеристики'


7748it [52:53,  1.37it/s]

Failed to scrape 2667422224020000184
'Организация'


8021it [56:13,  1.32it/s]

Failed to scrape 1668503165720000001
'Организация'


8052it [56:36,  1.42it/s]

Failed to scrape 3662310492216000051
'Наименование объекта закупки и его характеристики'


8194it [58:21,  1.30it/s]

Failed to scrape 3660601895220000076
'Организация'


8245it [58:53,  1.26it/s]

Failed to scrape 2667143939720000061
'Наименование объекта закупки и его характеристики'


8360it [1:00:09,  1.93it/s]

Failed to scrape 3660601895220000036
list index out of range


8445it [1:01:09,  1.44it/s]

Failed to scrape 3662502483118000001
'Наименование объекта закупки и его характеристики'


8557it [1:02:30,  1.77it/s]

Failed to scrape 2666100460819000023
list index out of range


8598it [1:02:58,  2.00it/s]

Failed to scrape 1666001327920000181
'Организация'


8635it [1:03:22,  1.47it/s]

Failed to scrape 3666000798020000303
'Наименование объекта закупки и его характеристики'
'NoneType' object has no attribute '__dict__'


8636it [1:03:22,  1.59it/s]

'NoneType' object has no attribute '__dict__'


8693it [1:04:03,  1.47it/s]

'NoneType' object has no attribute '__dict__'


8726it [1:04:29,  1.11it/s]

Failed to scrape 1666001327920000327
'Организация'


8805it [1:05:30,  1.38it/s]

Failed to scrape 2665808158520000023
'Организация'


8922it [1:06:56,  1.85it/s]

Failed to scrape 2665804173720000285
'Организация'


8924it [1:06:57,  1.39it/s]

'NoneType' object has no attribute '__dict__'


8947it [1:07:15,  1.02it/s]

Failed to scrape 3660601895220000089
'Организация'


9018it [1:08:07,  1.22it/s]

'NoneType' object has no attribute '__dict__'


10250it [1:08:40, 1108.88it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


11494it [1:08:40, 2294.73it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


13582it [1:08:40, 4581.53it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


14279it [1:12:48,  1.62it/s]  

Failed to scrape 2665801087320000058
'Организация'


14345it [1:13:34,  1.46it/s]

Failed to scrape 3660601895220000075
'Организация'


14364it [1:13:49,  1.16it/s]

Failed to scrape 2661900214920000260
'Организация'


14388it [1:14:05,  1.83it/s]

'NoneType' object has no attribute '__dict__'


14458it [1:14:53,  1.18it/s]

Failed to scrape 1722300731619000182
'Наименование объекта закупки и его характеристики'


14485it [1:15:15,  1.14it/s]

Failed to scrape 1666101464520000113
'Наименование объекта закупки и его характеристики'


14522it [1:15:39,  1.52it/s]

Failed to scrape 3666005552920000027
'Наименование объекта закупки и его характеристики'


14540it [1:15:52,  1.41it/s]

'NoneType' object has no attribute '__dict__'


14565it [1:16:09,  1.26it/s]

Failed to scrape 3661201332819000002
'Наименование объекта закупки и его характеристики'


14586it [1:16:26,  1.15it/s]

'NoneType' object has no attribute '__dict__'


14602it [1:16:36,  1.37it/s]

Failed to scrape 3660600930918000001
'Наименование объекта закупки и его характеристики'


14628it [1:16:55,  1.40it/s]

Failed to scrape 3668100596020000035
'Наименование объекта закупки и его характеристики'


14714it [1:17:59,  1.18it/s]

Failed to scrape 3666100295320000131
'Наименование объекта закупки и его характеристики'


14795it [1:19:02,  2.19it/s]

'NoneType' object has no attribute '__dict__'


14835it [1:19:29,  1.46it/s]

Failed to scrape 2667030834520000209
'Организация'


14865it [1:19:51,  1.86it/s]

'NoneType' object has no attribute '__dict__'


14883it [1:20:05,  1.25it/s]

Failed to scrape 3663201775419000008
'Наименование объекта закупки и его характеристики'


14912it [1:20:27,  1.55it/s]

Failed to scrape 3666005552918000262
'Наименование объекта закупки и его характеристики'


14942it [1:20:46,  1.83it/s]

'NoneType' object has no attribute '__dict__'


15002it [1:21:33,  1.52it/s]

Failed to scrape 1666001327920000324
'Организация'


15044it [1:22:02,  1.75it/s]

'NoneType' object has no attribute '__dict__'


15055it [1:22:09,  1.65it/s]

Failed to scrape 3662600920119000001
'Наименование объекта закупки и его характеристики'


15126it [1:23:03,  1.08it/s]

'NoneType' object has no attribute '__dict__'


15190it [1:23:50,  1.28it/s]

'NoneType' object has no attribute '__dict__'


15392it [1:26:18,  1.48it/s]

'NoneType' object has no attribute '__dict__'


16716it [1:28:34, 1210.94it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


17657it [1:30:55,  4.13it/s]  

Failed to scrape 1668609305320000080
'Наименование объекта закупки и его характеристики'
'NoneType' object has no attribute '__dict__'
Failed to scrape 1666001346221000001
'Наименование объекта закупки и его характеристики'
'NoneType' object has no attribute '__dict__'


17769it [1:31:31,  3.38it/s]

Failed to scrape 2666001041519000928
'Наименование объекта закупки и его характеристики'
'NoneType' object has no attribute '__dict__'


18184it [1:35:50,  1.86it/s]

Failed to scrape 3666600818719000003
'Наименование объекта закупки и его характеристики'


18189it [1:35:55,  1.01it/s]

Failed to scrape 1666202396320000063
'Организация'


18246it [1:36:33,  1.60it/s]

Failed to scrape 3661700353120000005
list index out of range


18250it [1:36:36,  1.30it/s]

Failed to scrape 1665802857420000053
list index out of range


18574it [1:40:25,  1.09it/s]

'NoneType' object has no attribute '__dict__'


18589it [1:40:35,  1.76it/s]

'NoneType' object has no attribute '__dict__'


18594it [1:40:39,  1.47it/s]

Failed to scrape 2666901486220000129
'Наименование объекта закупки и его характеристики'


18644it [1:41:16,  1.46it/s]

Failed to scrape 1660900206920000156
'Организация'


18710it [1:42:03,  1.31it/s]

'NoneType' object has no attribute '__dict__'


18718it [1:42:09,  1.81it/s]

'NoneType' object has no attribute '__dict__'


18906it [1:44:24,  1.08it/s]

Failed to scrape 1666001327920000323
'Организация'


19281it [1:48:49,  1.67it/s]

Failed to scrape 2668000042220000084
'Организация'


19470it [1:51:06,  1.54it/s]

'NoneType' object has no attribute '__dict__'


19580it [1:52:30,  1.68it/s]

'NoneType' object has no attribute '__dict__'


19605it [1:52:52,  1.04s/it]

Failed to scrape 1665802857420000020
list index out of range


19882it [1:56:18,  1.99it/s]

Failed to scrape 3666100224820000246
'Организация'


19920it [1:56:46,  1.35it/s]

Failed to scrape 3662503996520000010
'Наименование объекта закупки и его характеристики'


19967it [1:57:22,  1.47it/s]

Failed to scrape 1666001327920000175
'Организация'


20060it [1:58:31,  1.25it/s]

Failed to scrape 1666001327917000279
'Наименование объекта закупки и его характеристики'


20160it [1:59:42,  1.68it/s]

'NoneType' object has no attribute '__dict__'


20246it [2:00:45,  1.65it/s]

Failed to scrape 1665808957720000246
list index out of range


20256it [2:00:52,  1.38it/s]

Failed to scrape 3667144244920000097
'Наименование объекта закупки и его характеристики'


20289it [2:01:20,  1.39it/s]

Failed to scrape 2661900214920000263
'Организация'


20325it [2:01:49,  1.63it/s]

Failed to scrape 1660900206920000211
'Организация'


20435it [2:03:07,  1.11it/s]

Failed to scrape 2666001041519000955
'Наименование объекта закупки и его характеристики'


20447it [2:03:15,  1.19it/s]

Failed to scrape 3663300876020000002
list index out of range


20517it [2:04:08,  1.10it/s]

Failed to scrape 1666101464520000164
'Организация'


21694it [2:04:28, 1344.04it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


22828it [2:04:28, 2635.43it/s]

'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'


23831it [2:04:28,  3.19it/s]  


'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
'NoneType' object has no attribute '__dict__'
Failed to scrape 93 contracts
['3660601895220000071', '3666100466118000147', '3666000588819000002', '2661900214920000296', '2661900214920000262', '1665801738920000114', '2666100878620000122', '3666000798020000347', '1666101464520000297', '1665808957720000109', '2662309705520000308', '1772905090120000118', '3660200715619000001', '3666000798021000034', '2666100878620000126', '3667808274920000002', '3666005552917000016', '2667138083420000438', '2667114714720000013', '2661800178420000132', '3662501736918000006', '3666000798021000033', '3666000798020000269', '3660800191517000008', '3664500283620000005', '3666100466119000006', '1666101464520000321', '2665804173720000304', '1665808957719000131', '2666100878620000127', '3661900735520000025', '1666100166019000069', '3662501738320000005', '2665808158520000020', '1666001327920000180', '3666600829019000001', '16

### Now that the data is saved to the hard disk, we can run the below code without needing to rerun the scraping process

In [19]:
cachedContracts = []

with open('cachedContracts.csv', encoding="utf-8") as f:
    cachedContracts = list(csv.reader(f))


df = pd.DataFrame(columns=Contract().__dict__.keys(), data=cachedContracts)

df['signed'] = pd.to_datetime(df['signed'])
df['deadline'] = pd.to_datetime(df['deadline'])

start = datetime.fromordinal(startDateBefore.toordinal()).strftime("%Y-%m-%d")
end = datetime.fromordinal(endDate.toordinal()).strftime("%Y-%m-%d")

print(start, end)

mask = (df['signed'] >= start) & (df['signed'] <= end)

selectedDatesDF = df.loc[mask]

selectedDatesDF.head()

2016-01-01 2020-12-31


Unnamed: 0,id,price,signed,deadline,method,procurer,supplier,proinn,supinn,registered,address,number,mail,code,product
0,3660800365518000015,"1 974 925,54",2018-07-30,2018-12-31,Электронный аукцион,ЕКАТЕРИНБУРГСКОЕ МУНИЦИПАЛЬНОЕ УНИТАРНОЕ ПРЕДП...,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""СПЕЦ...",6608003655,6664079721,01.09.2011,"['620010, ОБЛ СВЕРДЛОВСКАЯ 66, Г ЕКАТЕРИНБУРГ,...",7-343-2260122,tender@specregion.ru,14.12.30.190,Белье
1,1770989550919007394,"1 460 823,00",2019-05-12,2020-06-30,Закупка у единственного поставщика (подрядчика...,МЕЖРЕГИОНАЛЬНЫЙ ФИЛИАЛ ФЕДЕРАЛЬНОГО КАЗЕННОГО ...,ПЫТАЛЕВ АНДРЕЙ СТЕПАНОВИЧИндивидуальный предпр...,7709895509,773671146984,20.07.2018,['Г МОСКВА 77'],8-912-4150862,pzs_arenda@mail.ru,68.20.12.000,Передача во временное владение и пользование ч...
2,3666005552917000109,"5 347 130,00",2017-09-13,2017-12-31,Закупка у единственного поставщика (подрядчика...,"МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ УЧРЕЖДЕНИЕ ""ЕКАТЕРИНБУ...","Общество с ограниченной ответственностью ""МО ""...",6660055529,6658126476,08.02.2001,"['620102, г. Екатеринбург, ул. Ясная, 46']",(343) 231-80-10,omt@omt-ural.ru,26.60.13.190,"Устройство терморегулирующее ""Аллон 2001"""
3,3668200734219000001,"3 361 200,00",2019-08-04,2019-09-30,Электронный аукцион,"МУНИЦИПАЛЬНОЕ КАЗЕННОЕ УЧРЕЖДЕНИЕ ""УПРАВЛЕНИЕ ...","ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЕКАТ...",6682007342,6672249991,01.01.2012,"['620026, ОБЛ СВЕРДЛОВСКАЯ 66, Г ЕКАТЕРИНБУРГ,...",7-343-3787200,102@eta-info.ru,86.90.19.140,Предоставление услуг по организации отдыха и о...
4,3661100106218000013,"1 200 000,00",2018-03-29,2018-12-31,Закупка у единственного поставщика (подрядчика...,"МУНИЦИПАЛЬНОЕ УНИТАРНОЕ ПРЕДПРИЯТИЕ ""КОМБИНАТ ...","ОТКРЫТОЕ АКЦИОНЕРНОЕ ОБЩЕСТВО ""ЭНЕРГОСБЫТ ПЛЮС...",6611001062,5612042824,22.05.2014,"['143421, ОБЛ МОСКОВСКАЯ 50, Р-Н КРАСНОГОРСКИЙ...",8-34355-52497,sesb@ies-holding.com,35.11.10.115,"Электроэнергия, произведенная атомными электро..."


In [22]:
selectedDatesDF.shape

(41098, 15)

### Section 6: Output

Convert the list of contract classes to a dataframe so that they can be exported to a csv file

In [20]:
selectedDatesDF.to_csv("zakupki{}to{}.csv".format(startDateBefore, endDate))