# Zakupki website scraping for Piotr

The aim of this notebook is to scrape details of each contract hosted on the Russian Zakupki public sector contract awarding website.

The input for this project will be the Zakupki URL. This code can be run on different dates to pull fresh contract data.

Method:
1.   Identify the number of pages of contracts to be scraped (using the contract filters provided).
2.   Iterate through each page, scraping the registration number of each contract.
3.   Access the website for each contract by placing the registraion number in the URL.
4.   Scrape the details for each contract and add them to a list of Contracts dataclasses.
5.   Format these Contract objects as a dataframe and output the dataframe to a csv file.


The output of this project will be the CSV file, with each row representing a new contract from the webstie.


### Section 1: Setup

In [9]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
from datetime import date
from dataclasses import dataclass
from tqdm import tqdm
from dateutil import parser
from threading import Thread
import pandas as pd
import datetime
from datetime import date, timedelta
import logging
import http.client
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import math

In [10]:
logging = False

if logging:

    http.client.HTTPConnection.debuglevel = 1

    # You must initialize logging, otherwise you'll not see debug output.
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True

### Section 2: Determine Number of pages to scrape
Test connection to the website and determine number of pages to scrape

In [11]:
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

@lru_cache(maxsize=50)
def getPage(tempURL):
  # If User-Agent is not set to custom, the website will know a Python script is accessing it and block some of the request
  
  response = session.get(tempURL, headers={'User-Agent': 'Custom'})
  return BeautifulSoup(response.content, "html.parser")

In [12]:
# Getting the dates we want to scrape.

url="https://zakupki.gov.ru/epz/contract/search/results.html?morphology=on&search-filter=%D0%94%D0%B0%D1%82%D0%B5+%D1%80%D0%B0%D0%B7%D0%BC%D0%B5%D1%89%D0%B5%D0%BD%D0%B8%D1%8F&fz44=on&contractStageList_0=on&contractStageList_1=on&contractStageList=0%2C1&contractPriceFrom=1000000&contractCurrencyID=-1&budgetLevelsIdNameHidden=%7B%7D&customerPlace=5277383&customerPlaceCodes=66000000000&contractDateFrom={}&contractDateTo={}&selectedLaws=FZ44&sortBy=UPDATE_DATE&pageNumber={}&sortDirection=false&recordsPerPage=_500&showLotsInfoHidden=false"
start = date(2016, 1, 1)
end = date(2016, 12, 31)
days = timedelta(days=1)

calendar=[]

while start<=end:
  calendar.append(start.strftime('%d.%m.%Y'))
  start+=days

print("Created {} dates".format(len(calendar)))

Created 366 dates



### Section 3: Scrape each registration number

Scrape the reg numbers of each contract, so they can be accessed individually

In [13]:
class Page:

    def __init__(self, day, pageNum, pagefile):

        self.day = day
        self.pageNum = pageNum
        self.pagefile = pagefile


In [14]:
def getContracts(page):

    regNumbersList = []

    # Obtain a list of all the sections of HTML containing a contract in the web page
    listOfContracts = page.find_all("div", {"class": "registry-entry__header-mid__number"})
    
    # Segment the registration number from the URL of each contract
    for contract in listOfContracts:
        href = contract.find("a")['href']
        regNum = href[href.index("Number=")+7:]
        regNumbersList.append(regNum)

    return regNumbersList

In [15]:
def progress(idx, data):

    x_ = int(((idx+1) * 100) / len(data))
    y_ = idx % math.ceil(len(data) / 10)
    
    print(" ----\n{}% completed\n----".format(x_)) if y_ == 0 else None

In [16]:
# Getting the web page for all the contracts for each date in the range we want to scrape.


regNumbers = []

def getRegNumbersForDate(i, day):

  tempURL = url.format(day, day, 1)

  page = getPage(tempURL)

  # Scrape the max number of pages
  try:
    maxPageNum = int(page.select('a[data-pagenumber]')[-2].find("span").text)
    print("{} pages for this day".format(maxPageNum))
  except:
    maxPageNum = 1


  # Leave my variable names alone :(
  totalRegNumbersForThisDay = 0

  for i in range(1, maxPageNum+1):

    # Creating a temporary URL for each page containing contracts
    tempPageURL = url.format(day, day, i)

    # Request the page and format it as a BeautifulSoup object so that we can perform scrapings
    page = getPage(tempPageURL)

    regNumbersList = getContracts(page)

    totalRegNumbersForThisDay += len(regNumbersList)

    regNumbers.extend(regNumbersList) 


  print("{} had {} contracts \n".format(day, totalRegNumbersForThisDay), end='')
  
  # progress(i, calendar)




with ThreadPoolExecutor(max_workers=40) as ex:
  for i, day in enumerate(calendar):
      ex.submit(getRegNumbersForDate, i, day)


  

print("------------------- \n {} contracts found in total".format(len(regNumbers)))


02.01.2016 had 4 contracts 
10.01.2016 had 2 contracts 
04.01.2016 had 4 contracts 
09.01.2016 had 1 contracts 
03.01.2016 had 2 contracts 
08.01.2016 had 3 contracts 
05.01.2016 had 1 contracts 
16.01.2016 had 1 contracts 
23.01.2016 had 3 contracts 
07.01.2016 had 0 contracts 
13.01.2016 had 23 contracts 
17.01.2016 had 0 contracts 
06.01.2016 had 5 contracts 
21.01.2016 had 21 contracts 
13.02.2016 had 1 contracts 
10.02.2016 had 23 contracts 
29.01.2016 had 22 contracts 
14.02.2016 had 1 contracts 
20.01.2016 had 19 contracts 
22.01.2016 had 20 contracts 
25.01.2016 had 48 contracts 
18.01.2016 had 40 contracts 
16.02.2016 had 20 contracts 
12.02.2016 had 10 contracts 
14.01.2016 had 25 contracts 
24.01.2016 had 1 contracts 
22.02.2016 had 3 contracts 
11.02.2016 had 15 contracts 
21.02.2016 had 1 contracts 
17.02.2016 had 22 contracts 
15.01.2016 had 25 contracts 
26.01.2016 had 35 contracts 
19.01.2016 had 34 contracts 
20.02.2016 had 16 contracts 
01.01.2016 had 80 contracts 
18

The below is just for testing labels etc.

In [35]:
url="https://zakupki.gov.ru/epz/contract/contractCard/common-info.html?reestrNumber=3664700268716000001"
soup=getPage(url)
secondtab=page.findAll("span", {"class": "grey-main-light"})[1]
print(secondtab)
# # # products=soup.findAll("span",{"class":"section__info"})
# # # product=products[24].text.strip()
# # print(product)

# # # # # ### first tab info ###
# # table=soup.findAll("td",{"class":"tableBlock__col"})
# # # # cardinfo=soup.findAll("span",{"class":"cardMainInfo__content"})
# # # signed=cardinfo[3].text.strip()
# # # deadline=cardinfo[4].text.strip()
# # # # price=soup.find("span", {"class":"cardMainInfo__content cost"}).text.replace("\xa0","").replace(",",".").replace("₽","").strip()
# # print(signed)
# # # org=table[0].text.strip().split("\n")
# # # procurer=org[0]
# # # INN=org[9]
# # # KPP=org[13]
# # # registered=org[17]

# try:
#     address=table[3].text.strip()
#     numbermail=table[3].text.strip().split("\n")
# #     number=numbermail[0]
# #     mail=numbermail[1].strip()

# # except:
# #     address=table[2].text.strip()
# #     numbermail=table[2].text.strip().split("\n")
# #     number=numbermail[0]
# #     mail=numbermail[1].strip()

# table=soup.findAll("section",{"class":"blockInfo__section section"})
# # # check=table[6]
# # # print(check)

# # section=soup.findAll("span",{"class":"section__info"})
# # # method=section[3].text.strip()
# # # print(method)
# # # # print(method)
# # # # # # ### second tab info ###
# # print((table))

# try:
#   check=table[3]
#   method=check.findAll('span', {'class':'section__info'})[0].text.strip()

# except:
#   method=section[6].text.strip()

# print(method)

# # # secondtab=soup.findAll("div", {"class": "col"})
# # # # print(secondtab)
# # # tables = []
# for tab in secondtab:
#     tables.append(tab.findAll("tbody"))

# object=tables[-1][0].findAll("td",{"class":"tableBlock__col"})
# code=object[1].text.strip().split("\n")[-1].strip()
# print(code)


<span class="grey-main-light">Дата постановки на учет:</span>


### Section 4: Details scraping

The Contract Dataclass will store the information during scraping.
If any information can't be scraped, default values have been provided in their place

In [18]:
@dataclass
class Contract:

  # TODO: Add reg number to class

  # Main Section
  id: float = 0
  price: float = 0.0
  signed: date = None
  deadline: date = None

  # Tab 1
  method: str = "none"
  procurer: str = "none"
  supplier: str = "none"
  proinn: str = "none"
  supinn: str = "none"
  registered: date = None
  address: str = "none"
  number: str = "none"
  mail: str = "none"

  # Tab 2
  code: float = 0.0
  product: str = "none"
  

  def __repr__(self):
    return "\nContract id= {} \n First tab: price={}, signed={}, deadline={}, method={}, procurer={}, supplier={}, proinn={}, supinn={}, registered={}, address={}, number={}, mail={} \n  Second tab: code={}, product={})".format(self.id, self.price, self.signed, self.deadline, self.method, self.procurer, self.supplier, self.proinn, self.supinn, self.registered, self.address, self.number, self.mail, self.code,  self.product)

  

Method for scraping the data from each contract

In [19]:
def getSectionDict(page):

    sections=page.findAll("section",{"class":"blockInfo__section section"})

    # Turning the sections into a dictionary that will be easier to work with.
    titles = [key.findAll("span")[0] for key in sections]
    values = [key.findAll("span")[1] for key in sections]

    sectionDict = {titles[i].text.strip() : values[i].text.strip() for i in range(len(titles))}

    return sectionDict


def getTableDict(page, secondTab=False):


    if secondTab:
        sectionOfInterest = page.findAll("div", {"class": "col"})[-1]
    else:
        sectionOfInterest = page

    table = sectionOfInterest.findAll("tr",{"class":"tableBlock__row"})

    headers = [i.text.strip() for i in table[0].findAll("th", {"class":"tableBlock__col tableBlock__col_header"})]
    data = [list(filter(None, [j.strip() for j in i.text.split("\n")])) for i in table[1].findAll("td")]
  
    tableDict = {headers[i] : data[i] for i in range(len(headers))}

    return tableDict

In [2]:
def scrapeData(reg):

  try:

    # Input: reg = one registration number.

    # Different URL from the one above, this accesses more information from Zakupki.
    dir = "https://zakupki.gov.ru/epz/contract/contractCard/{}.html?reestrNumber={}"

    # Getting the web page for the given contract
    tempDir = dir.format("common-info", reg)
    page = getPage(tempDir)

    # We probably don't need this with the method I've used below.
    contractTypeTwo = False

    # Enter the text here that should be present to signify the second type of contract.
    if page.findAll(text="Основание заключения контракта с единственным поставщиком"):
      contractTypeTwo = True
      
    id = reg  
    sectionDict = getSectionDict(page)
    firstTableDict = getTableDict(page)

    # print(sectionDict)
    # print(firstTableDict)

    # print(sectionDict, firstTableDict)
    try:
      price=sectionDict["Цена контракта"].replace("\xa0","").replace(",",".").replace("₽","").strip().split()[0]
    except:
      price=sectionDict["Ориентировочное значение цены контракта"].replace("\xa0","").replace(",",".").replace("₽","").strip().split()[0]
    else:
      price=sectionDict["Максимальное значение цены контракта"].replace("\xa0","").replace(",",".").replace("₽","").strip().split()[0]
      
    signed=sectionDict["Дата заключения контракта"].split()[0]
    deadline=sectionDict["Дата окончания исполнения контракта"].split()[0]
    
    ### fixed issue with method ### 
    try:
      method = sectionDict["Способ определения поставщика (подрядчика, исполнителя)"]
    except:
      if page.findAll(text="Основание заключения контракта с единственным поставщиком"):
            method="Закупка у единственного поставщика (подрядчика, исполнителя)"
            
    procurer=sectionDict["Полное наименование заказчика"]
    supplier=firstTableDict["Организация"][0]

    proinn=sectionDict["ИНН"]

    ### fixed issues for missing values sometimes in the table ###
    try:
      supinn=firstTableDict["Организация"][4]
    except:
      if secondtab==page.findAll("span", {"class": "grey-main-light"})[1]=="Дата постановки на учет:":
        supinn=firstTableDict["Организация"][1]
    else:
      supinn=firstTableDict["Организация"][2]

    ### fixed issues in lower table ### 
    registered=firstTableDict["Организация"][-1]
    try:
      address=firstTableDict["Адрес места нахождения"]
    except:
      address=firstTableDict["Адрес в стране регистрации"]
    number=firstTableDict["Телефон, электронная почта"][0]
    mail=firstTableDict["Телефон, электронная почта"][1]

    ### details about winner - ALSO, THERE'S OPTION TO SCRAPE SUBCONTRACTORS ### 

    ### Second tab ###
    tempDir = dir.format("payment-info-and-target-of-order", reg)
    page = getPage(tempDir)

    ### code stands for the product code, which can be later identified to return industry type ### 
    
    secondTableDict = getTableDict(page, True)
    
    code = secondTableDict["Позиции по КТРУ, ОКПД2"][1][1:-1]
    product = secondTableDict["Позиции по КТРУ, ОКПД2"][0]

    # Create the Contract dataclass object and append it to a list of objects.
    # This method means that missing data can be accounted for.
    # print(method)

    contract = Contract(id=id, price=price, signed=signed, deadline=deadline, method=method, procurer=procurer, supplier=supplier, proinn=proinn, supinn=supinn, registered=registered, address=address, number=number, mail=mail, code=code, product=product)
    contracts.append(contract)
    print('Completed {}'.format(id))
    progress(len(contracts), regNumbers)
  except Exception as e:
    failedRegNumbers.append(reg)
    print("Failed to scrape {}".format(reg))
    print(e)

### Section 5: Starting execution
Scrape the contracts themselves using threading

In [21]:
def scrape(reg):
    
    try:
        _ = int(reg)
        # print("Scraping {}".format(reg))
        scrapeData(reg)
    except TypeError:
        for r in (reg):
            # print("Scraping {}".format(r))
            scrapeData(r)

In [44]:
contracts = []

failedRegNumbers = []

print("Starting scrape with {} reg numbers\n".format(len(regNumbers)))

# scrape(regNumbers[:10])

# for contract in contracts:
#     print(contract)

# for regNumber in tqdm(regNumbers[:50]):
#   thread = Thread(target = scrapeData, args = (regNumber,))
#   thread.start()

interval = 10

with ThreadPoolExecutor(max_workers=10) as ex:
    for i in range(0, len(regNumbers), interval):
        tempNumbers = regNumbers[i:i+interval]
        # print("Scraping ", tempNumbers)
        ex.submit(scrape, tempNumbers)

print("Scraped {} contracts".format(len(contracts)))
print("Failed to scrape {} contracts".format(len(failedRegNumbers)))
print(failedRegNumbers)


Starting scrape with 6766 reg numbers

Failed to scrape 2665808158516000042
'Максимальное значение цены контракта'
Failed to scrape 3662502866716000002
'Максимальное значение цены контракта'
Failed to scrape 3663302000516000002
'Максимальное значение цены контракта'
Failed to scrape 3662100819416000002Failed to scrape 1666101464516000030
'Максимальное значение цены контракта'

'Максимальное значение цены контракта'
Failed to scrape 3666301998416000002
'Максимальное значение цены контракта'
Failed to scrape 3663100607616000001
'Максимальное значение цены контракта'
Failed to scrape 3665803002016000003
'Максимальное значение цены контракта'
Failed to scrape 3666000798016000027
'Максимальное значение цены контракта'
Failed to scrape 2667016956416000002
'Максимальное значение цены контракта'
Failed to scrape 3666801180516000003
'Максимальное значение цены контракта'
Failed to scrape 1666001346216000069
'Максимальное значение цены контракта'
Failed to scrape 3667129017916000001
'Максимально

### Section 6: Output

Convert the list of contract classes to a dataframe so that they can be exported to a csv file

In [23]:
contract = contracts[0]
print(contract.__dict__.keys())
df = pd.DataFrame(columns=contract.__dict__.keys())
pd.options.display.float_format = '{:.2f}'.format


for contract in tqdm(contracts):
  df = df.append(contract.__dict__, ignore_index=True)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
  

dict_keys(['id', 'price', 'signed', 'deadline', 'method', 'procurer', 'supplier', 'proinn', 'supinn', 'registered', 'address', 'number', 'mail', 'code', 'product'])


  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.append(contract.__dict__, ignore_index=True)
  df = df.

                       id         price      signed    deadline  \
0     3663302000516000002    1502655.00  10.02.2016  31.12.2016   
1     1666101464516000030    5657589.00  10.02.2016  31.07.2016   
2     2665808158516000042    3850500.00  13.01.2016  31.12.2016   
3     3666801180516000003    1707000.00  13.02.2016  31.12.2016   
4     3666000798016000027    7319054.16  21.01.2016  31.12.2016   
5     3666301998416000002    1262500.00  13.01.2016  31.12.2016   
6     3662100819416000002    1656320.00  09.01.2016  30.06.2016   
7     3665803002016000003    5925232.43  02.01.2016  31.12.2016   
8     3662502866716000002    8804457.60  23.01.2016  31.12.2016   
9     1666001346216000069    1411771.97  10.02.2016  31.12.2016   
10    2666401415316000004    1530000.00  10.02.2016  31.12.2016   
11    3663100607616000001    1040148.13  21.01.2016  28.02.2017   
12    2666001041516000069    6398630.29  10.02.2016  01.04.2016   
13    2666202298416000026    1789096.32  13.01.2016  31.12.201

In [24]:
df.to_csv("zakupki.csv")