# Zakupki website scraping for Piotr

The aim of this notebook is to scrape details of each contract hosted on the Russian Zakupki public sector contract awarding website.

The input for this project will be the Zakupki URL. This code can be run on different dates to pull fresh contract data.

Method:
1.   Identify the number of pages of contracts to be scraped (using the contract filters provided).
2.   Iterate through each page, scraping the registration number of each contract.
3.   Access the web page for each contract by placing the registration number in the URL.
4.   Scrape the details for each contract and add them to a list of Contracts dataclasses.
5.   Format these Contract objects as a dataframe and output the dataframe to a csv file.


The output of this project will be the CSV file, with each row representing one contract from the website.


### Section 1: Setup

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import date
from dataclasses import dataclass
from tqdm import tqdm
from dateutil import parser
from threading import Thread
import pandas as pd
import datetime
from datetime import date, timedelta
import logging
import http.client



In [None]:
# Set to True to trace every HTTP request/response while debugging.
# The flag has its own name on purpose: calling it `logging` would rebind the
# imported `logging` module to a bool and break the calls below.
ENABLE_HTTP_DEBUG = False

if ENABLE_HTTP_DEBUG:

    http.client.HTTPConnection.debuglevel = 1

    # You must initialize logging, otherwise you'll not see debug output.
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True

### Section 2: Determine Number of pages to scrape
Test connection to the website and determine number of pages to scrape

In [None]:
# Getting the dates we want to scrape.

# Search-results URL template. It has three positional placeholders, in this
# order: contractDateFrom, contractDateTo, pageNumber.
# The fixed filters narrow the result set (e.g. contractPriceFrom=1000000,
# contractStageList=0,1, selectedLaws=FZ44).
url="https://zakupki.gov.ru/epz/contract/search/results.html?morphology=on&search-filter=%D0%94%D0%B0%D1%82%D0%B5+%D1%80%D0%B0%D0%B7%D0%BC%D0%B5%D1%89%D0%B5%D0%BD%D0%B8%D1%8F&fz44=on&contractStageList_0=on&contractStageList_1=on&contractStageList=0%2C1&contractPriceFrom=1000000&contractCurrencyID=-1&budgetLevelsIdNameHidden=%7B%7D&customerPlace=5277383&customerPlaceCodes=66000000000&contractDateFrom={}&contractDateTo={}&selectedLaws=FZ44&sortBy=UPDATE_DATE&pageNumber={}&sortDirection=false&recordsPerPage=_10&showLotsInfoHidden=false"
start = date(2016, 1, 1)
end = date(2021, 12, 31)
days = timedelta(days=1)

# One 'dd.mm.YYYY' string per day from `start` to `end` inclusive.
# Built without mutating `start`, so the configured range stays inspectable.
# An `end` earlier than `start` yields an empty list.
calendar = [
  (start + offset * days).strftime('%d.%m.%Y')
  for offset in range((end - start).days + 1)
]

# Print a summary rather than every date in the range.
if calendar:
  print("{} days to scrape: {} - {}".format(len(calendar), calendar[0], calendar[-1]))
else:
  print("0 days to scrape")
  







### Section 3: Scrape each registration number

Scrape the reg numbers of each contract, so they can be accessed individually

In [None]:
def getPage(tempURL, timeout=30):
  """Download a URL and return it parsed as a BeautifulSoup document.

  Args:
    tempURL: address of the page to download.
    timeout: seconds to wait for the server before giving up. Without a
      timeout, a stalled connection would block the scrape indefinitely.

  Returns:
    BeautifulSoup object built from the response body with "html.parser".

  Raises:
    requests.exceptions.RequestException: on connection failure or timeout.
  """
  # If User-Agent is not set to custom, the website will know a Python script is accessing it and block some of the request
  response = requests.get(tempURL, headers={'User-Agent': 'Custom'}, timeout=timeout)
  return BeautifulSoup(response.content, "html.parser")

In [None]:
class Page:
    """One downloaded search-results page.

    Attributes are set verbatim from the constructor arguments:
    day      -- the day the page was requested for
    pageNum  -- the page's number within that day's results
    pagefile -- the downloaded page content
    """

    def __init__(self, day, pageNum, pagefile):
        self.day, self.pageNum, self.pagefile = day, pageNum, pagefile


In [None]:
# Getting the web page for all the contracts for each date in the range we want to scrape.

pages = []

for day in tqdm(calendar, desc="days"):

  # The URL template's placeholders are, in order:
  # contractDateFrom, contractDateTo, pageNumber.
  firstPage = getPage(url.format(day, day, 1))

  # Scrape the max number of pages: the second-to-last pagination link is
  # read as the highest page number. When there are fewer than two such
  # links, or the link holds no numeric <span>, fall back to a single page.
  try:
    maxPageNum = int(firstPage.select('a[data-pagenumber]')[-2].find("span").text)
  except (IndexError, AttributeError, ValueError):
    maxPageNum = 1

  for i in range(1, maxPageNum+1):
    # Page 1 is already downloaded; only request the remaining pages.
    pageSoup = firstPage if i == 1 else getPage(url.format(day, day, i))
    pages.append(Page(day, i, pageSoup))

print("{} pages found".format(len(pages)))


In [None]:
regNumbers = []

# Text in a contract link's href that immediately precedes the registration number.
REG_MARKER = "Number="

for page in pages:

  # Obtain a list of all the sections of HTML containing a contract in the web page
  listOfContracts = page.pagefile.find_all("div", {"class": "registry-entry__header-mid__number"})

  # Segment the registration number from the URL of each contract
  for contract in listOfContracts:
    link = contract.find("a")
    href = link.get("href", "") if link is not None else ""
    markerPos = href.find(REG_MARKER)
    if markerPos == -1:
      # No usable link in this header; skip it rather than abort the whole run.
      continue
    # Cut at "&" so any further query parameters are not glued onto the number.
    regNum = href[markerPos + len(REG_MARKER):].split("&")[0]
    regNumbers.append(regNum)

  # Report progress from the page's own metadata, not a loop variable left over from another cell.
  print("done", page.day, page.pageNum, len(listOfContracts), '\n')

In [None]:
# Exploratory check of one hard-coded contract card.
# The address is kept in its own variable so the search-results template
# stored in `url` is not overwritten (re-running the page download needs it).
sampleCardURL="https://zakupki.gov.ru/epz/contract/contractCard/common-info.html?reestrNumber=2665807811020000160"
soup=getPage(sampleCardURL)
table=soup.find_all("td",{"class":"tableBlock__col"})

# The first cell's text is split by line; the fixed offsets below pick the
# individual fields out of it.
org=table[0].text.strip().split("\n")
procurer=org[0]
INN=org[9]
KPP=org[13]
registered=org[17]

address=table[2].text.strip()

# The fifth cell is split into two lines, read here as number and mail.
numbermail=table[4].text.strip().split("\n")
number=numbermail[0]
mail=numbermail[1].strip()

#productCode=table[1].text.strip()
# #Contracted=tables[3].find_all("td",{"class":"tableBlock__col"})  ### that's about finding the winner and winning bid, the real of the contract
# #Supplier=Contracted[2].text # WORKS
# #FinalPrice=Contracted[3].text  # works
#print(table[0])
print(number)


### Section 4: Details scraping

The Contract Dataclass will store the information during scraping.
If any information can't be scraped, default values have been provided in their place

In [None]:
@dataclass
class Contract:
  """Container for the details scraped for a single contract.

  Every field has a default, so a Contract can be created even when some
  values could not be scraped. Field order matters: instances are built
  positionally.
  """

  # TODO: Add reg number to class

  # --- Header of the notice page ---
  ID: str = "none"
  price: float = 0.0
  published: date = None
  deadline: date = None
  # organisationName: str = "none"

  # --- First tab ---
  method: str = "none"
  tenderObject: str = "none"
  organisationName: str = "none"
  address: str = "none"
  official: str = "none"
  productCode: str = "none"

  # --- Second tab (information here may be missing) ---
  participants: str = "none"
  bids: float = 0.0
  

Method for scraping the data from each contract

In [None]:
def scrapeData(reg):
  """Scrape one contract notice and append it to the global `contracts` list.

  Downloads the "common-info" and "supplier-results" views of the notice,
  picks fields out by fixed positions, and builds a Contract from them.

  Args:
    reg: registration number placed in the notice URL; also stored as the ID.

  Returns:
    The Contract that was appended to `contracts`.

  Raises:
    IndexError: when a page has fewer matching elements than the fixed
      positions read below.
  """

  # Named noticeURL rather than `dir`, which would shadow the builtin.
  noticeURL = "https://zakupki.gov.ru/epz/order/notice/ea44/view/{}.html?regNumber={}"


  # TODO Add function to retry contracts that have their connection aborted.


  # Getting the web page for the given contract
  tempDir = noticeURL.format("common-info", reg)
  print(tempDir)
  page = getPage(tempDir)

  ### Basic info - Unique ID, Initial price, publication date and deadline

  ID = reg
  # Only a missing price element falls back to the default; other errors are not swallowed.
  priceTag = page.find("span", {"class":"cardMainInfo__content cost"})
  if priceTag is None:
    price = 0.0
  else:
    # NOTE(review): the cleaned price stays a string although the field is annotated float - confirm the site's number format before converting.
    price = priceTag.text.replace("\xa0",",").replace(" ","").replace("₽","").strip()
  cardInfo=page.find_all("span",{"class":"cardMainInfo__content"}) ## there's list inside, with procurer and price if needed
  published=str(cardInfo[3].text).strip()
  deadline=str(cardInfo[4].text).strip()

  ### First part info - tender type, object, procurer, address and official in charge

  sectionInfo=page.find_all("span",{"class":"section__info"})
  method=sectionInfo[0].text
  tenderObject=sectionInfo[4].text
  organisationName=sectionInfo[8].text
  address=str(sectionInfo[9].text).strip()
  official=str(sectionInfo[11].text).strip()

  ### Data taken from purchase order, it might contain multiple codes, to be tested!
  tableBlock=page.find_all("td",{"class":"tableBlock__col"})
  productCode=tableBlock[1].text.strip()

  ### Second part info - participants and bids

  tempDir = noticeURL.format("supplier-results", reg)
  print(tempDir)
  page = getPage(tempDir)

  tableBlock2=page.find_all("td",{"class":"tableBlock__col"})
  participants=tableBlock2[2].text.strip()
  bids=tableBlock2[3].text.strip()

  print(participants, bids)

  # Create the Contract dataclass object and append it to a list of objects.
  # participants and bids are passed too, so the second-tab data is not discarded.
  contract = Contract(ID, price, published, deadline, method, tenderObject,
                      organisationName, address, official, productCode,
                      participants, bids)
  contracts.append(contract)
  return contract


### Section 5: Starting execution
Scrape the contracts themselves using threading

In [None]:
### this soup is used later on to extract values from tender page
#soup1 = BeautifulSoup(requests.get(url1, headers={'User-Agent': 'Custom'}).content, "html.parser")

In [None]:
# Results are collected here: scrapeData appends every scraped Contract to this list.
contracts = []


print("Starting scrape with {} reg numbers".format(len(regNumbers)))

# Ensuring that the code works without threading first: scrape a single contract.
# Guarded so that an empty search result does not raise an IndexError.
if regNumbers:
  scrapeData(regNumbers[0])
else:
  print("No registration numbers were collected - nothing to scrape")


# Full sequential run (not enabled yet):
# for regNum in tqdm(regNumbers):
#     scrapeData(regNum)

# Threaded run (not enabled yet): for each contract reg number, start a thread
# for its execution and display the progress.
# for reg in tqdm(regNumbers):
#   thread = Thread(target = scrapeData, args = (reg,))
#   thread.start()


# The contracts list now contains only successfully scraped contracts
#print(len(contracts))

Usually about 9/10 contracts are scraped successfully. This depends on how the Zakupki website is feeling that day. Sometimes erroneous webpages are returned instead of the webpage for the requested contract.

### Section 6: Output

Convert the list of Contract objects to a dataframe so that it can be exported to a CSV file.

In [None]:
# Column names come from a default Contract instance, so they are available
# even when nothing was scraped.
columns = list(vars(Contract()))
print(columns)

# Build the frame in one step from a list of records. DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every row.
df = pd.DataFrame([vars(contract) for contract in contracts], columns=columns)

# Show all columns, but only the first rows rather than the whole frame.
with pd.option_context('display.max_columns', None):
    print(df.head(20))

In [None]:
# Write the scraped contracts to a CSV file in the working directory.
df.to_csv(path_or_buf="zakupki.csv")