In [None]:
%%time

import json
import os
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup

# Create a timestamped folder that will contain every file written by this run.
# Parent directory path (relative to the current working directory).
parent_dir_path = "../data/areas_of_interest/"
# Current UTC time at minute resolution. A timezone-aware "now" is used because
# datetime.utcnow() is deprecated since Python 3.12; the formatted string is
# the same.
utcnow = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M")
# Directory for this run. Kept as a string with a trailing slash so file names
# can be appended to it directly.
directory_path = f"{parent_dir_path}{utcnow}/"
# Create the directory (and any missing parents). exist_ok=True means a second
# run within the same minute reuses the folder instead of failing.
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Read in the areas and their hemnet area code.
# The result is used as a mapping: area name -> {"area_code": ...}.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("../configs/area_ids.yaml", "r", encoding="utf-8") as stream:
    area_info = yaml.safe_load(stream)

# Loop through all the areas one by one: fetch the apartment listings for each
# area from hemnet.se, save them as one parquet file per area and record the
# number of listings in area_info.

# Both requests of every iteration send the same browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}
# Seconds to wait for the server before giving up. requests has no default
# timeout, so without this a stalled connection would hang the run forever.
request_timeout = 30

listings_count = 0

for area, area_details in area_info.items():
    search_id = area_details["area_code"]

    # Step 1: request the search results page for this area. Its HTML carries,
    # on the element with id "results-map", a "data-initial-data" attribute
    # holding JSON that includes the "search_key" needed for step 2.
    url = "https://www.hemnet.se/bostader"
    params = {
        'housing_form_groups': 'apartments',
        'location_ids': search_id,
        'item_types': 'bostadsratt',
        'rooms_min': 0,
        'living_area_min': 0,
        'new_construction': 'include',
    }
    response = requests.get(
        url, headers=headers, params=params, timeout=request_timeout
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    map_results = soup.find(id='results-map')
    if map_results is None:
        # Without this guard the failure would surface as an opaque
        # AttributeError on None a line later.
        raise RuntimeError(
            f"No 'results-map' element found in the search page for {area}; "
            "the page layout may have changed or the request was blocked."
        )
    initial_data = map_results.attrs['data-initial-data']
    json_data = json.loads(initial_data)

    # Step 2: fetch the listings belonging to that search key as JSON.
    url = "https://www.hemnet.se/bostader/search/" + json_data['search_key']
    response = requests.get(url, headers=headers, timeout=request_timeout)
    response.raise_for_status()

    r_json = response.json()
    r_properties = r_json['properties']

    # Store all the listings in that area as a dataframe
    # (nested fields are flattened one level deep).
    df = pd.json_normalize(r_properties, max_level=1)
    n_listings = df.shape[0]
    print(f"{area} has {n_listings} listings and {df.shape[1]} columns.")
    listings_count += n_listings

    # Update the area_info dictionary
    area_details["listings"] = n_listings

    # Tag every row with its area so the per-area files can be combined later.
    df["area"] = area
    df.to_parquet(f"{directory_path}{area}.parquet.gzip", compression="gzip")

# "\n" (not "/n") so the summary is preceded by an actual blank line.
print(f"\n There was a total of {listings_count} listings in the areas of interest today!")

# Save the area_info mapping inside the run directory in two formats:
# block-style YAML (area_info.yml) and a single-line JSON dump (area_info.txt).
run_dir = Path(directory_path)

with (run_dir / "area_info.yml").open("w") as yaml_file:
    yaml.dump(area_info, stream=yaml_file, default_flow_style=False)

with (run_dir / "area_info.txt").open("w") as json_file:
    serialized_info = json.dumps(area_info)
    json_file.write(serialized_info)
    