# The task is to scrape certain information for about 500 companies (with some filters) from the Skolkovo website

In [None]:
import os
import pickle
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

### Parse websites with information on all companies

In [None]:
# Navigator listing URL; `q` carries an opaque encoded query
# (presumably the company filters mentioned in the task description - verify).
myUrl = "https://navigator.sk.ru/?q=N4IgZiBcoC4IYHMDOB9GBPADgUyiA9gE4gA0IAloQDZShiH4C2epIM%2BLAvmY9beA2aQQrdlzIw%2B0AUxYSOwkNxBIArgCNG5GAGF8qgHYwoARjIBjJpjgH0KQtgBu2A6tzT6syACYADP-95cRBzKTpBOTYFEU5lJDwAWkZsRnVsQhQkcgNzXDJMU04gA"

In [None]:
# Start a Chrome session and open the navigator listing URL in it.
driver = webdriver.Chrome()
driver.get(myUrl)

# define a function that simulates pressing the "More" button

def click_load_more_button(pause_seconds=40):
    """Click the "More" button until the page stops offering it.

    Uses the module-level ``driver``. After every click the function waits
    ``pause_seconds`` so the next batch of company cards can load.

    Args:
        pause_seconds: Delay after each click, in seconds (default 40).

    The loop ends when the button can no longer be found, or when the
    browser reports any other WebDriver error while locating or clicking it
    (reported on stdout). Unrelated errors and Ctrl+C are deliberately not
    swallowed.
    """
    # Imported here so the function is self-contained within this cell.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    while True:
        try:
            button = driver.find_element(By.CLASS_NAME, "load-more__button")
            button.click()
        except NoSuchElementException:
            # No button left: the listing is fully expanded.
            break
        except WebDriverException as error:
            # Best effort: keep whatever has been loaded so far.
            print(f"Stopped clicking the 'More' button: {error!r}")
            break
        time.sleep(pause_seconds)  # so that the page has time to load


# Expand the whole listing, then collect the link of every company card.
loading_start = time.time()
click_load_more_button()

# Snapshot the rendered HTML, then close the browser: nothing below needs it,
# and an unclosed session leaves a Chrome process running.
page_source = driver.page_source
driver.quit()

soup = BeautifulSoup(page_source, "html.parser")

# href=True keeps only anchors that actually carry a link, so reading
# company["href"] cannot raise KeyError.
companies_div = soup.find_all(
    "a",
    class_="card company-card card_navigator w-inline-block",
    href=True,
)
company_hrefs = [company["href"] for company in tqdm(companies_div)]

loading_time = time.time() - loading_start # just for fun, let's see how long the whole procedure takes

len(company_hrefs)

In [None]:
# Turn every collected href into an absolute URL on the navigator site.
base_url = "https://navigator.sk.ru{}"

company_urls = [base_url.format(href) for href in tqdm(company_hrefs)]

In [None]:
# Save each company's URL so the slow browser step need not be repeated every time.
urls_path = "skolkovo_parce/skolkovo_urls.pkl"
# open(..., "wb") does not create missing folders; make sure the target exists
# so the collected links are not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(urls_path), exist_ok=True)
with open(urls_path, "wb") as file:
    pickle.dump(company_urls, file)

### Parse the data of interest for each company

In [None]:
# Read the pickled list of company URLs back from disk.
with open("skolkovo_parce/skolkovo_urls.pkl", "rb") as urls_file:
    skolkovo_urls = pickle.load(urls_file)

In [None]:
# Scrape every company page into one dict per company.
#
# START_INDEX allows resuming an interrupted run. Keep it at 0 for a full
# scrape: company_info starts empty, so any other value leaves the earlier
# companies out of the saved result.
START_INDEX = 0
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests may wait forever

company_info = []

for url in tqdm(skolkovo_urls[START_INDEX:]):
    d = {}
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail clearly instead of parsing an error page
    soup = BeautifulSoup(response.text, "html.parser")
    d["Название"] = soup.find("p", class_="page__subtitle").text

    # Website and description are optional; a placeholder marks their absence.
    website = soup.find("a", class_="sidebar-item web-alpha-icon")
    d["Сайт"] = website["href"] if website else "Сайт отсутствует"

    description = soup.find("div", class_="page-section__main-text")
    d["Описание"] = description.text if description else "Описание отсутствует"

    # First table on the page: header cells plus the first two body rows.
    inv_thead = [cell.text for cell in soup.find("thead").find_all("th")]
    inv_tbody = []
    for row in soup.find("tbody").find_all("tr")[:2]:
        cells = row.find_all("td")
        # Cell 0 is the row label; the remaining cells hold amounts whose
        # spaces and ruble sign are stripped before conversion to int.
        num_cells = [
            int(cell.find("span", class_="number").text.replace(" ", "").replace("₽", ""))
            for cell in cells[1:]
        ]
        inv_tbody.append([cells[0].text] + num_cells)
    d["Инвестиции"] = [inv_thead, inv_tbody]

    # `string=` is the current name of Beautiful Soup's deprecated `text=` filter.
    d["ОГРН"] = int(soup.find("div", string="ОГРН").find_parent("div").find("a").text)

    # Each founder block holds two <p> elements: the name and the share.
    founders = soup.find_all("div", class_="contact-item__text-2 contact-item__row uppercase")
    formatted_founders = []
    for founder in founders:
        p_elements = founder.find_all("p")
        fio = p_elements[0].text
        percent = p_elements[1].text
        formatted_founders.append(f"{fio} ({percent})")
    d["Учредители"] = ", ".join(formatted_founders)

    company_info.append(d)

In [None]:
# Persist the scraped records of all companies as a pickle file.
with open("skolkovo_parce/final_list.pkl", "wb") as records_file:
    pickle.dump(company_info, records_file)

---
---

### Work with a dataframe

In [None]:
# Load the company records from the same path the scraping step wrote them to
# (the file is saved as "skolkovo_parce/final_list.pkl", not "final_list.pkl").
with open("skolkovo_parce/final_list.pkl", "rb") as file:
    final_list = pickle.load(file)

In [None]:
# Build the table from the loaded records and show it.
# NOTE(review): `display` is not defined in this file - presumably the
# notebook (IPython) built-in; confirm before running as a plain script.
df = pd.DataFrame(final_list)
display(df)

In [None]:
# Flatten the nested "Инвестиции" cells into one column per year.
#
# Each cell is [header, body]; body[0] and body[1] are table rows whose first
# element is the row label and whose remaining elements are the yearly values.
# NOTE(review): the column names assume the values are ordered 2022, 2021,
# 2020; the scraped header is not checked - verify against the site.
rev_columns = ["Rev 2022", "Rev 2021", "Rev 2020"]
ni_columns = ["NI 2022", "NI 2021", "NI 2020"]
n_years = len(rev_columns)


def _year_values(row):
    """Return exactly ``n_years`` values from ``row``, skipping its label.

    Short rows are padded at the end with "--". Extra values are dropped so
    the result always fits the target columns.
    """
    values = list(row[1:1 + n_years])
    return values + ["--"] * (n_years - len(values))


flat_rows = [
    _year_values(body[0]) + _year_values(body[1])
    for _header, body in df["Инвестиции"]
]
# Assign whole columns at once: writing "--" row by row into freshly created
# numeric columns depends on silent dtype upcasting, which pandas deprecates.
flat = pd.DataFrame(flat_rows, columns=rev_columns + ni_columns, index=df.index)
for column in flat.columns:
    df[column] = flat[column]
del df["Инвестиции"]

#### Save as Excel 

In [None]:
# Write the final table to an Excel workbook, leaving out the index column.
df.to_excel("scrapTable_Lena.xlsx", index=False)