# Zakupki website scraping for Piotr

The aim of this notebook is to scrape details of each contract hosted on the Russian Zakupki public sector contract awarding website.

The input for this project will be the Zakupki URL. This code can be run on different dates to pull fresh contract data.

Method:
1.   Identify the number of pages of contracts to be scraped (using the contract filters provided).
2.   Iterate through each page, scraping the registration number of each contract.
3.   Access the website for each contract by placing the registraion number in the URL.
4.   Scrape the details for each contract and add them to a list of Contracts dataclasses.
5.   Format these Contract objects as a dataframe and output the dataframe to a csv file.


The output of this project will be the CSV file, with each row representing a new contract from the webstie.


### Section 1: Setup

In [169]:
import requests
from bs4 import BeautifulSoup
from datetime import date
from dataclasses import dataclass
from tqdm import tqdm
from dateutil import parser
from threading import Thread
import pandas as pd
import datetime
from datetime import date, timedelta
import logging
import http.client

In [170]:
logging = False

if logging:

    http.client.HTTPConnection.debuglevel = 1

    # You must initialize logging, otherwise you'll not see debug output.
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True

### Section 2: Determine Number of pages to scrape
Test connection to the website and determine number of pages to scrape

In [171]:
# Getting the dates we want to scrape.

url="https://zakupki.gov.ru/epz/contract/search/results.html?morphology=on&search-filter=%D0%94%D0%B0%D1%82%D0%B5+%D1%80%D0%B0%D0%B7%D0%BC%D0%B5%D1%89%D0%B5%D0%BD%D0%B8%D1%8F&fz44=on&contractStageList_0=on&contractStageList_1=on&contractStageList=0%2C1&contractPriceFrom=1000000&contractCurrencyID=-1&budgetLevelsIdNameHidden=%7B%7D&customerPlace=5277383&customerPlaceCodes=66000000000&contractDateFrom={}&contractDateTo={}&selectedLaws=FZ44&sortBy=UPDATE_DATE&pageNumber={}&sortDirection=false&recordsPerPage=_500&showLotsInfoHidden=false"
start = date(2016, 6, 1)
end = date(2016, 6, 1)
days = timedelta(days=1)

calendar=[]

### this actually works, set start & end date, it creates search results urls for each days with some criteria to limit the number (10 mln RUB, finished)
while start<=end:
  # tempURL=url.format(start.strftime('%d.%m.%Y'),start.strftime('%d.%m.%Y'))
  calendar.append(start.strftime('%d.%m.%Y'))
  start+=days

print(calendar)

['01.06.2016']



### Section 3: Scrape each registration number

Scrape the reg numbers of each contract, so they can be accessed individually

In [172]:
def getPage(tempURL):
  # If User-Agent is not set to custom, the website will know a Python script is accessing it and block some of the request
  return BeautifulSoup(requests.get(tempURL, headers={'User-Agent': 'Custom'}).content, "html.parser")

In [173]:
class Page:

    def __init__(self, day, pageNum, pagefile):

        self.day = day
        self.pageNum = pageNum
        self.pagefile = pagefile


In [174]:
# Getting the web page for all the contracts for each date in the range we want to scrape.

pages = []

for day in calendar:

  tempURL = url.format(day, day, 1)

  page = getPage(tempURL)

  # Scrape the max number of pages
  try:
    maxPageNum = int(page.select('a[data-pagenumber]')[-2].find("span").text)
  except:
    maxPageNum = 1

  print(maxPageNum)
    
  for i in range(1, maxPageNum+1):

    contracts = []

    # Creating a temporary URL for each page containing contracts
    tempPageURL = url.format(day, day, i)
    print(tempPageURL)

    # Request the page and format it as a BeautifulSoup object so that we can perform scrapings
    page = getPage(tempPageURL)

    pages.append(Page(day, i, page))

    # print(page)

print(str(len(pages)), " pages found")


1
https://zakupki.gov.ru/epz/contract/search/results.html?morphology=on&search-filter=%D0%94%D0%B0%D1%82%D0%B5+%D1%80%D0%B0%D0%B7%D0%BC%D0%B5%D1%89%D0%B5%D0%BD%D0%B8%D1%8F&fz44=on&contractStageList_0=on&contractStageList_1=on&contractStageList=0%2C1&contractPriceFrom=1000000&contractCurrencyID=-1&budgetLevelsIdNameHidden=%7B%7D&customerPlace=5277383&customerPlaceCodes=66000000000&contractDateFrom=01.06.2016&contractDateTo=01.06.2016&selectedLaws=FZ44&sortBy=UPDATE_DATE&pageNumber=1&sortDirection=false&recordsPerPage=_500&showLotsInfoHidden=false
1  pages found


In [175]:
regNumbers = []


for page in pages:


  # Obtain a list of all the sections of HTML containing a contract in the web page
  listOfContracts = page.pagefile.find_all("div", {"class": "registry-entry__header-mid__number"})
  

  # Segment the registration number from the URL of each contract
  for contract in listOfContracts:
    href = contract.find("a")['href']
    regNum = href[href.index("Number=")+7:]
    regNumbers.append(regNum)

  # print(regNumbers)

  print("done", i, len(listOfContracts), '\n')

done 1 40 



The below is just for testing labels etc.

In [182]:
url="https://zakupki.gov.ru/epz/contract/contractCard/common-info.html?reestrNumber=2665807811020000160"
soup=getPage(url)
# products=page.findAll("span",{"class":"section__info"})
# product=products[25].text.strip()
# print(product)

### first tab info ###
table=soup.findAll("td",{"class":"tableBlock__col"})
cardinfo=soup.findAll("span",{"class":"cardMainInfo__content"})
signed=cardinfo[3].text.strip()
deadline=cardinfo[4].text.strip()
# # price=soup.find("span", {"class":"cardMainInfo__content cost"}).text.replace("\xa0","").replace(",",".").replace("₽","").strip()
print(signed)
# # org=table[0].text.strip().split("\n")
# # procurer=org[0]
# # INN=org[9]
# # KPP=org[13]
# # registered=org[17]

# # address=table[2].text.strip()
# # numbermail=table[4].text.strip().split("\n")
# # number=numbermail[0]
# # mail=numbermail[1].strip()
# # section=soup.findAll("span",{"class":"section__info"})
# # method=section[6].text.strip()
# # ### second tab info ###

# secondtab=soup.findAll("div", {"class": "col"})
# # print(secondtab)
# tables = []
# for tab in secondtab:
#     tables.append(tab.findAll("tbody"))

# object=tables[-1][0].findAll("td",{"class":"tableBlock__col"})
# code=object[2].text.strip().split("\n")[-1].strip()
# print(code)


07.09.2020


### Section 4: Details scraping

The Contract Dataclass will store the information during scraping.
If any information can't be scraped, default values have been provided in their place

In [177]:
@dataclass
class Contract:

  # TODO: Add reg number to class

  # Main Section
  ID: str = "none"
  price: float = 0.0
  signed: date = None
  deadline: date = None
  # organisationName: str = "none"

  # Tab 1
  method: str = "none"
  product: str = "none"
  procurer: str = "none"
  inn: str = "none"
  kpp: str = "none"
  registered: date = None
  address: str = "none"
  number: str = "none"
  mail: str = "none"

  # Tab 2
  code: float = 0.0
  

Method for scraping the data from each contract

In [178]:
def scrapeData(reg):

  dir = "https://zakupki.gov.ru/epz/contract/contractCard/{}.html?reestrNumber={}"

  # TODO Add function to retry contracts that have their connection aborted.

  # Getting the web page for the given contract
  tempDir = dir.format("common-info", reg)
  print(tempDir)
  page = getPage(tempDir)

  ### Basic info - Unique ID, Initial price, publication date and deadline 

  # ID = page.find("span",{"class":"navBreadcrumb__text"}).text.strip()  ## Returns unique ID with No at the start from each particular tender page (not the results list)
  ID = reg

  ### first tab info ###
  table=page.findAll("td",{"class":"tableBlock__col"})
  cardinfo=page.findAll("span",{"class":"cardMainInfo__content"})
  signed=cardinfo[3].text.strip()
  deadline=cardinfo[4].text.strip()
  price=page.find("span", {"class":"cardMainInfo__content cost"}).text.replace("\xa0","").replace(",",".").replace("₽","").strip()
  section=page.findAll("span",{"class":"section__info"})
  method=section[6].text.strip()
  products=page.findAll("span",{"class":"section__info"})
  product=products[25].text.strip()

  ### details about winner - ALSO, THERE'S OPTION TO SCRAPE SUBCONTRACTORS ### 
  org=table[0].text.strip().split("\n")
  procurer=org[0]
  inn=org[9]
  kpp=org[13]
  registered=org[17]

  ### contractor deets ### 
  address=table[2].text.strip()
  numbermail=table[4].text.strip().split("\n")
  number=numbermail[0]
  mail=numbermail[1].strip()

  ### second tab ###
  tempDir = dir.format("payment-info-and-target-of-order", reg)
  print(tempDir)
  page = getPage(tempDir)

  ### code stands for the product code, which can be later identified to return industry type ### 
  secondtab=page.findAll("div", {"class": "col"})
  tables = []
  for tab in secondtab:
      tables.append(tab.findAll("tbody"))
  object=tables[-1][0].findAll("td",{"class":"tableBlock__col"})
  code=object[2].text.strip().split("\n")[-1].strip()

  # Create the Contract dataclass object and append it to a list of objects.
  # This method means that missing data can be accounted for.
  contract = Contract(ID, price, signed, deadline, method, product, procurer, inn, kpp, registered, address, number, code)
  contracts.append(contract)


### Section 5: Starting execution
Scrape the contracts themselves using threading

In [181]:
from tqdm import tqdm

contracts = []

print("Starting scrape with {} reg numbers".format(len(regNumbers)))

# Ensuring that hte code works without threading first
scrapeData(regNumbers) #it was regNumbers[]

# # print(contracts)

# for regNum in tqdm(regNumbers):
#     scrapeData(regNum)


# for regNumber in tqdm(regNumbers[:50]):
# # #  scrapeData(regNumber)
#   thread = Thread(target = scrapeData, args = (regNumber,))
#   thread.start()

# print(contracts)


# # For each contract reg number, start a thread for its execution and display the progress.
# for reg in tqdm(regNumbers):
#   thread = Thread(target = scrapeData, args = (reg,))
#   thread.start()


# # The contracts list now contains only successfully scraped contracts
# print(len(contracts))

Starting scrape with 40 reg numbers
https://zakupki.gov.ru/epz/contract/contractCard/common-info.html?reestrNumber=['3662902195116000023', '2665807811016000068', '3661901766717000002', '3661901766717000001', '1665800456616000013', '3660200986916000052', '1666100166016000053', '3663400781816000003', '1668600001016000017', '2666001041516000251', '2666001041516000249', '1666202396316000052', '3667319873216000019', '3660700596316000039', '1667033496216000260', '1667033496216000255', '2666001041516000256', '2666001041516000254', '2666001041516000253', '2666001041516000252', '3662305565816000016', '3660700596316000037', '3660700596316000036', '2666001041516000245', '2666001041516000257', '3660700596316000038', '3660700258516000045', '3660700596316000040', '3666403957416000005', '1666303221616000047', '2666001041516000255', '2666001041516000250', '2666001041516000248', '3663202282816000015', '2666001041516000246', '2666001041516000258', '2666001041516000244', '2666001041516000247', '266550052

IndexError: list index out of range

Usually about 9/10 contracts are scraped successfully. This depends on how the Zakupki website is feeling that day. Sometimes erroneous webpages are returned instead of the webpage for the requested contract.

### Section 6: Output

Convert the list of contract classes to a dataframe so that they can be exported to a csv file

In [None]:
contract = contracts[0]
print(contract.__dict__.keys())
df = pd.DataFrame(columns=contract.__dict__.keys())

for contract in contracts:
  df = df.append(contract.__dict__, ignore_index=True)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)

IndexError: list index out of range

In [None]:
df.to_csv("zakupki.csv")