In [12]:
#! pip install html5lib
#! pip install bs4

In [13]:
# Builtins
from datetime import datetime, date
import requests
import string
import time
import re
import base64

# External Libraries
import numpy as np
import pandas as pd
import bs4
import html5lib
from kaggle_secrets import UserSecretsClient


In [14]:
# Read the GitHub access token from the Kaggle secrets store (label
# "github_rent_price_monitoring") so the credential is never hardcoded here.
# The token is later passed to GithubApi as its `token` argument.
user_secrets = UserSecretsClient()
github_access_token = user_secrets.get_secret("github_rent_price_monitoring")

In [15]:
class GithubApi():
    """Small client that updates a tabular file in a GitHub repository.

    It talks to the repository "contents" endpoint
    (``https://api.github.com/repos/{owner}/{repo}/contents/{path}``): the
    current file is located with a GET, optionally downloaded and extended
    with new rows, and written back with a PUT that carries the blob SHA of
    the version being replaced.

    Args:
        token: GitHub access token sent in the ``Authorization`` header.
        owner: Account that owns the repository.
        repo: Repository name.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, token:str, owner:str, repo:str, timeout:float=30) -> None:
        self.token = token
        self.owner = owner
        self.repo = repo
        self.timeout = timeout
        self.base_url = f'https://api.github.com/repos/{owner}/{repo}'

    def _auth_headers(self) -> dict:
        """Return the request headers carrying the access token."""
        return {'Authorization': f'token {self.token}'}

    def get_url(self, file_path:str) -> str:
        """Return the contents-API URL for ``file_path`` inside the repository."""
        return f'{self.base_url}/contents/{file_path}'

    def get_file_info(self, file_url) -> tuple:
        """Fetch the metadata of the file at ``file_url``.

        Returns:
            A ``(download_url, sha, file_url)`` tuple, where ``download_url``
            and ``sha`` come from the API response and ``file_url`` is the
            argument echoed back.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.get(url=file_url, headers=self._auth_headers(), timeout=self.timeout)
        response.raise_for_status()
        response_json = response.json()
        return response_json['download_url'], response_json['sha'], file_url

    def _download_current_content(self, download_url) -> pd.DataFrame:
        """Load the file behind ``download_url`` into a DataFrame.

        The format is chosen from the file extension; only ``csv`` and
        ``parquet`` are supported.

        Raises:
            TypeError: If the extension is neither ``csv`` nor ``parquet``.
        """
        # Ignore any query string (e.g. "?token=...") when looking at the extension.
        extension = download_url.split('?', 1)[0].split('.')[-1]
        if extension == 'csv':
            return pd.read_csv(download_url)
        elif extension == 'parquet':
            return pd.read_parquet(download_url)
        else:
            raise TypeError(f'file format {extension} is not supported')

    def _append_new_content(self, current_content:pd.DataFrame, new_content:pd.DataFrame) -> pd.DataFrame:
        """Return ``current_content`` followed by the rows of ``new_content``.

        Both frames must have the same set of column names; the new rows are
        reordered to the column order of the current content.

        Raises:
            ValueError: If the two frames do not share the same columns.
        """
        current_columns = list(current_content.columns)
        new_columns = list(new_content.columns)
        if set(current_columns) != set(new_columns):
            missing = [col for col in current_columns if col not in new_columns]
            unexpected = [col for col in new_columns if col not in current_columns]
            raise ValueError(
                'Cannot append: column mismatch between current and new content '
                f'(missing in new content: {missing}; not in current content: {unexpected}).'
            )
        return pd.concat([current_content, new_content[current_columns]], ignore_index=True)

    def _get_encoded_content(self, appended_content:pd.DataFrame, file_format='csv') -> str:
        """Serialize ``appended_content`` and return it base64-encoded.

        The index is not written, so the file keeps the same columns when it
        is read back with ``pd.read_csv``.

        Raises:
            TypeError: If ``file_format`` is not ``'csv'``.
        """
        if file_format == 'csv':
            csv_data = appended_content.to_csv(index=False)
            return base64.b64encode(csv_data.encode('utf-8')).decode('utf-8')
        else:
            raise TypeError('Unsuported file type in _get_encoded_content function')

    def _put_content(self, headers, data, url) -> 'requests.models.Response | None':
        """PUT ``data`` as JSON to ``url`` and report the outcome on stdout.

        Returns:
            The response when the request completed without an HTTP error,
            otherwise ``None`` (the failure is printed, not raised).
        """
        try:
            response = requests.put(url=url, headers=headers, json=data, timeout=self.timeout)
            response.raise_for_status()
        except requests.HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
            return None
        except requests.RequestException as err:
            print(f'Other error occurred: {err}')
            return None
        # The contents endpoint answers 200 for an update and 201 for a creation.
        if response.status_code in (200, 201):
            print("File updated successfully.")
        else:
            print(f"Failed to update file. Status code: {response.status_code}")
        return response

    def update_file_content(self, file_path, new_content, method='append') -> None:
        """Write ``new_content`` to ``file_path`` in the repository.

        Args:
            file_path: Path of the file inside the repository.
            new_content: DataFrame with the rows to store.
            method: ``'append'`` adds the rows to the existing file,
                ``'overwrite'`` replaces the file with ``new_content``.

        Raises:
            TypeError: If ``method`` is not ``'append'`` or ``'overwrite'``.
            ValueError: In append mode, if the columns do not match.
            requests.HTTPError: If the file metadata cannot be fetched.
        """
        # Validate before doing any network I/O.
        if method not in ('append', 'overwrite'):
            raise TypeError('"method" must be one of "append" or "overwrite".')
        file_url = self.get_url(file_path=file_path)
        download_url, current_sha, file_url = self.get_file_info(file_url)
        if method == 'append':
            current_content = self._download_current_content(download_url)
            updated_content = self._append_new_content(current_content, new_content)
        else:
            # Overwriting needs only the SHA, not the current file body.
            updated_content = new_content
        encoded_content = self._get_encoded_content(updated_content)
        commit_message = "Automatically updated via Kaggle script"
        data = {"message": commit_message,"content": encoded_content,"sha": current_sha}
        self._put_content(headers=self._auth_headers(), data=data, url=file_url)
        

In [16]:
def extract_address(string):
    """Heuristically split a free-text address into street, number and district.

    Every ``-``, ``,``, ``/``, ``;``, ``|`` and ``.`` is treated as a term
    separator. The parts are then picked by position:

    * ``rua``: the first term, only when the address has more than one term;
    * ``bairro``: the third term counting from the end, when it exists;
    * ``numero``: the first term made only of digits; failing that, a number
      at the very end of the first term (e.g. ``"Rua X 123"``).

    Args:
        string: Address text; ``None`` or an empty string is accepted.

    Returns:
        dict with keys ``bairro`` (str or None), ``numero`` (int or None) and
        ``rua`` (str or None). Text values are stripped of surrounding
        whitespace and empty terms are reported as ``None``.
    """
    if not string:
        return dict(bairro=None, numero=None, rua=None)

    normalized = re.sub(r'[-,/;|.]', ',', string)
    terms = [term.strip() for term in normalized.split(',')]

    bairro = (terms[-3] or None) if len(terms) >= 3 else None
    rua = (terms[0] or None) if len(terms) > 1 else None

    # Use a standalone numeric term rather than gluing together every digit in
    # the address, which turned "Rua 7 de Setembro, 100" into 7100.
    numero = None
    for term in terms:
        if re.fullmatch(r'\d+', term):
            numero = int(term)
            break
    if numero is None:
        trailing_number = re.search(r'(\d+)$', terms[0])
        if trailing_number:
            numero = int(trailing_number.group(1))

    return dict(
        bairro = bairro,
        numero = numero,
        rua = rua
    )

In [17]:
class VivaRealApi():
    """Scraper for rental listings published on vivareal.com.br.

    Listing pages for one city (under the ``santa-catarina`` path) are
    fetched one at a time, each listing card is parsed into a row and the
    rows are accumulated in ``self.result_set``.

    Attributes:
        type: Source label stored in the ``fonte`` column.
        current_page: Number of the next page to request (starts at 1).
        city: City slug used in the URL and stored in the ``cidade`` column.
        endpoint: Listing URL up to and including ``?pagina=``.
        result_set: DataFrame holding one row per scraped listing.
    """

    def __init__(self, cidade:str) -> None:
        self.type = 'Viva Real'
        self.current_page = 1
        self.city = cidade
        self.endpoint = f'https://www.vivareal.com.br/aluguel/santa-catarina/{self.city}/?pagina='
        self.result_set = pd.DataFrame(
            {'data': pd.Series(dtype='datetime64[ns]'),
             'fonte': pd.Series(dtype='str'),
             'descricao': pd.Series(dtype='str'),
             'endereco': pd.Series(dtype='str'),
             'rua': pd.Series(dtype='str'),
             'numero': pd.Series(dtype='int'),
             'bairro': pd.Series(dtype='str'),
             'cidade': pd.Series(dtype='str'),
             'valor': pd.Series(dtype='float'),
             'periodicidade': pd.Series(dtype='str'),
             'condominio': pd.Series(dtype='float'),
             'area': pd.Series(dtype='float'),
             'qtd_banheiros': pd.Series(dtype='int'),
             'qtd_quartos': pd.Series(dtype='int'),
             'qtd_vagas': pd.Series(dtype='int'),
             'url': pd.Series(dtype='str')
                  }
        )

    def _get_endpoint(self) -> str:
        """Return the URL of the page numbered ``self.current_page``."""
        # The query parameter is the page number, not the city slug.
        return f'{self.endpoint}{self.current_page}'

    def _extract_current_page(self, max_retries=5, backoff_factor=2, timeout=30) -> 'requests.models.Response | None':
        """Request the current page, retrying with exponential backoff on HTTP 429.

        Args:
            max_retries: Maximum number of attempts while rate limited.
            backoff_factor: Base of the wait time; attempt ``i`` waits
                ``backoff_factor ** i`` seconds.
            timeout: Seconds to wait for the server on each attempt.

        Returns:
            The response when its status code is below 300, otherwise ``None``
            (the reason is printed).
        """
        for attempt in range(max_retries):
            try:
                response = requests.get(self._get_endpoint(), timeout=timeout)
            except requests.RequestException as err:
                print(f'Request failed: {err}')
                return None
            if response.status_code < 300:  # Successful response
                return response
            if response.status_code == 429:  # Too many requests
                wait_seconds = backoff_factor ** attempt
                print(f"Rate limited. Retrying in {wait_seconds} seconds.")
                time.sleep(wait_seconds)
                continue
            print(f"Request failed with status code {response.status_code}")
            return None
        print(f'Still rate limited after {max_retries} attempts; giving up on this page.')
        return None

    def _parse_html_response(self, response=None) -> 'bs4.BeautifulSoup | None':
        """Parse a successful response into a BeautifulSoup tree.

        Returns ``None`` (after printing why) when there is no response or its
        status code is 300 or above.
        """
        # Compare with None explicitly: a Response object is falsy for error statuses.
        if response is None:
            print('Empty request')
            return None
        if response.status_code >= 300:
            print(f'No valid response to be parsed. Status code {response.status_code}.')
            return None
        return bs4.BeautifulSoup(response.text, features="html5lib")

    @property
    def _result_count(self) -> int:
        """Total number of listings reported by the site, or 0 when not found.

        Note that reading this property performs an HTTP request.
        """
        response = self._extract_current_page()
        soup = self._parse_html_response(response=response)
        try:
            return int(soup.find('strong',{'class':'results-summary__count'}).text.replace('.',''))
        except (AttributeError, ValueError):
            print('No houses were found for the given page.\n the HTML structure of the page might have been altered...')
            return 0

    def _extract_listings_from_soup(self, soup) -> 'bs4.element.ResultSet':
        """Return every listing card (``article`` element) found in ``soup``."""
        return soup.find_all('article', {'class': 'property-card__container js-property-card'})

    @staticmethod
    def _tag_text(listing, tag, css_class):
        """Return the raw text of the first matching element, or None if absent."""
        if listing is None:
            return None
        node = listing.find(tag, {'class': css_class})
        return node.text if node is not None else None

    @staticmethod
    def _leading_int(text):
        """Return the integer the stripped text starts with, or None."""
        if text is None:
            return None
        match = re.match(r'\d+', text.strip())
        return int(match.group()) if match else None

    def _format_listing(self, listing=None) -> list:
        """Turn one listing card into a row matching ``result_set``'s columns.

        Fields that cannot be found in the card are returned as ``None``
        (``endereco`` falls back to an empty string).
        """
        data = datetime.now()
        fonte = self.type
        cidade = self.city

        descricao = self._tag_text(listing, 'span', 'js-card-title')
        if descricao is not None:
            descricao = descricao.strip()

        endereco = self._tag_text(listing, 'span', 'property-card__address')
        endereco = endereco.replace('-',',').replace('|','').strip() if endereco is not None else ''
        # Parse the address once and reuse the three parts.
        address_parts = extract_address(endereco)
        rua = address_parts['rua']
        numero = address_parts['numero']
        bairro = address_parts['bairro']

        # The price text is split on '/': amount first, then the billing period.
        valor = None
        periodicidade = None
        price_text = self._tag_text(listing, 'div', 'property-card__price')
        if price_text is not None:
            price_parts = price_text.replace('R$','').replace('.','').split('/')
            valor = price_parts[0].strip() or None
            if len(price_parts) > 1:
                period_words = price_parts[1].split()
                periodicidade = period_words[0] if period_words else None

        condominio = self._tag_text(listing, 'strong', 'js-condo-price')
        if condominio is not None:
            condominio = condominio.replace('R$','').strip()

        area = self._tag_text(listing, 'span', 'js-property-card-detail-area')
        if area is not None:
            area = area.strip()

        # Read the whole leading number so counts of 10 or more are not cut to one digit.
        qtd_banheiros = self._leading_int(self._tag_text(listing, 'li', 'property-card__detail-bathroom'))
        qtd_quartos = self._leading_int(self._tag_text(listing, 'li', 'property-card__detail-room'))
        qtd_vagas = self._leading_int(self._tag_text(listing, 'li', 'property-card__detail-garage'))

        anchor = listing.find('a', {'class': 'property-card__labels-container'}) if listing is not None else None
        href = anchor.get('href') if anchor is not None else None
        link = 'https://vivareal.com.br' + href if href else None

        return [data,fonte,descricao,endereco,rua,numero,bairro,cidade,valor,periodicidade,condominio,area,qtd_banheiros,qtd_quartos,qtd_vagas,link]

    def _append_formatted_listing(self, listing=None) -> None:
        """Append one formatted row to ``result_set``; failures are printed, not raised."""
        try:
            self.result_set.loc[self.result_set.shape[0]] = listing
        except Exception as err:
            print(f'Error appending the following listing ({err}):\n{listing}')

    def _ingest_current_page(self) -> int:
        """Fetch, parse and store the listings of the current page.

        ``current_page`` is advanced only when the page was processed.

        Returns:
            The number of rows added to ``result_set``, or 0 when the page
            could not be fetched or processed.
        """
        print(f'Current_page: {self.current_page}')
        rows_before = self.result_set.shape[0]
        try:
            response = self._extract_current_page()
            soup = self._parse_html_response(response=response)
            if soup is None:
                return 0
            for listing in self._extract_listings_from_soup(soup=soup):
                formatted = self._format_listing(listing=listing)
                self._append_formatted_listing(listing=formatted)
        except Exception as err:
            print(f'Something went wrong while formating the listings of the page {self.current_page}: {err}')
            return 0
        self.current_page += 1
        return self.result_set.shape[0] - rows_before

    def ingest_listings(self, all=True, pages_number=None) -> None:
        """Scrape listing pages into ``result_set``.

        Args:
            all: When True, keep reading pages from ``current_page`` until
                ``result_set`` holds as many rows as the site reports.
            pages_number: Used when ``all`` is not True; pages 1 to
                ``pages_number`` are read.

        In both modes ingestion stops early at the first page that fails or
        adds no rows, so the call cannot loop forever.

        Raises:
            TypeError: If ``all`` is not True and ``pages_number`` is not a
                positive int.
        """
        if all == True:  # kept as an equality test so non-True values still select page mode
            # Ask the site for the total once instead of on every loop check.
            expected_total = self._result_count
            while self.result_set.shape[0] < expected_total:
                if self._ingest_current_page() == 0:
                    print(f'Stopping at page {self.current_page}: no listings were added.')
                    break
        else:
            if type(pages_number) != int or pages_number <= 0:
                raise TypeError('pages_number: This parameter only accepts numbers above zero.')
            self.current_page = 1
            while self.current_page <= pages_number:
                if self._ingest_current_page() == 0:
                    print(f'Stopping at page {self.current_page}: no listings were added.')
                    break

    def dump_result_set(self, path=None, format='csv') -> None:
        """Write ``result_set`` to ``{path}{city}_{today}.{format}``.

        Args:
            path: Prefix placed before the file name; ``None`` means no prefix.
            format: ``'csv'`` or ``'parquet'``; anything else is reported on
                stdout and nothing is written.
        """
        prefix = '' if path is None else path
        target = f'{prefix}{self.city}_{date.today()}'
        if format=='csv':
            self.result_set.to_csv(f'{target}.csv')
        elif format=='parquet':
            self.result_set.to_parquet(f'{target}.parquet')
        else:
            print(f'Option not allowed: {format}')

In [18]:
# Build a scraper for the 'florianopolis' city slug and ingest its rental listings
# page by page (all=True selects the "every page" mode of VivaRealApi.ingest_listings).
# Each processed page prints a "Current_page: N" progress line.
api = VivaRealApi('florianopolis')
api.ingest_listings(all=True)

Current_page: 1
Current_page: 2
Current_page: 3
Current_page: 4
Current_page: 5
Current_page: 6
Current_page: 7
Current_page: 8
Current_page: 9
Current_page: 10
Current_page: 11
Current_page: 12
Current_page: 13
Current_page: 14
Current_page: 15
Current_page: 16
Current_page: 17
Current_page: 18
Current_page: 19
Current_page: 20
Current_page: 21
Current_page: 22
Current_page: 23
Current_page: 24
Current_page: 25
Current_page: 26
Current_page: 27
Current_page: 28
Current_page: 29
Current_page: 30
Current_page: 31
Current_page: 32
Current_page: 33
Current_page: 34
Current_page: 35
Current_page: 36
Current_page: 37
Current_page: 38
Current_page: 39
Current_page: 40
Current_page: 41
Current_page: 42
Current_page: 43
Current_page: 44
Current_page: 45
Current_page: 46
Current_page: 47
Current_page: 48
Current_page: 49
Current_page: 50
Current_page: 51
Current_page: 52
Current_page: 53
Current_page: 54
Current_page: 55
Current_page: 56
Current_page: 57
Current_page: 58
Current_page: 59
Curren

In [19]:
# Display the DataFrame of scraped listings accumulated by the scraper above.
api.result_set

Unnamed: 0,data,fonte,descricao,endereco,rua,numero,bairro,cidade,valor,periodicidade,condominio,area,qtd_banheiros,qtd_quartos,qtd_vagas,url
0,2023-10-26 14:03:29.900975,Viva Real,"Apartamento com 3 Quartos para Aluguel, 110m²","Rua Lauro Linhares, 1520 , Trindade, Florianóp...",Rua Lauro Linhares,1520,Trindade,florianopolis,2900,Mês,750,110,2,3,1,https://vivareal.com.br/imovel/apartamento-3-q...
1,2023-10-26 14:03:29.905241,Viva Real,"Apartamento com Quarto para Aluguel, 58m²","Rua Osvaldo Cruz, 137 , Balneario do Estreito,...",Rua Osvaldo Cruz,137,Balneario do Estreito,florianopolis,1700,Mês,390,58,1,1,1,https://vivareal.com.br/imovel/apartamento-1-q...
2,2023-10-26 14:03:29.911438,Viva Real,"Casa com 3 Quartos para Aluguel, 245m²","Rua da Corticeira da Serra, 51 , Cachoeira do ...",Rua da Corticeira da Serra,51,Cachoeira do Bom Jesus,florianopolis,2400,Dia,,245,4,3,,https://vivareal.com.br/imovel/casa-3-quartos-...
3,2023-10-26 14:03:29.916984,Viva Real,"Casa com 2 Quartos para Aluguel, 110m²","Rua Recanto do Beija,Flor, 128 , Campeche, Flo...",Rua Recanto do Beija,128,Campeche,florianopolis,4700,Mês,,110,6,2,2,https://vivareal.com.br/imovel/casa-2-quartos-...
4,2023-10-26 14:03:29.922078,Viva Real,"Apartamento com Quarto para Aluguel, 35m²","Rua Maria Eduarda, 238 , Pantanal, Florianópol...",Rua Maria Eduarda,238,Pantanal,florianopolis,2950,Mês,610,35,1,1,,https://vivareal.com.br/imovel/apartamento-1-q...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,2023-10-26 14:12:11.976096,Viva Real,"Apartamento com Quarto para Aluguel, 41m²","Avenida Professor Othon Gama D'Eça , Centro, F...",Avenida Professor Othon Gama D'Eça,,Centro,florianopolis,1800,Mês,405,41,1,1,,https://vivareal.com.br/imovel/apartamento-1-q...
7376,2023-10-26 14:12:11.983936,Viva Real,"Apartamento com 3 Quartos para Aluguel, 85m²","Rua Professor Clementino de Brito, 205 , Capoe...",Rua Professor Clementino de Brito,205,Capoeiras,florianopolis,3300,Mês,650,85,2,3,1,https://vivareal.com.br/imovel/apartamento-3-q...
7377,2023-10-26 14:12:11.991590,Viva Real,"Apartamento com 2 Quartos para Aluguel, 44m²","Avenida Marinheiro Max Schramm, 2428 , Jardim ...",Avenida Marinheiro Max Schramm,2428,Jardim Atlântico,florianopolis,1100,Mês,345,44,1,2,1,https://vivareal.com.br/imovel/apartamento-2-q...
7378,2023-10-26 14:12:11.999070,Viva Real,"Casa com 2 Quartos para Aluguel, 65m²","Rua Professor João José Cabral, 0 , Balneário,...",Rua Professor João José Cabral,0,Balneário,florianopolis,1600,Mês,,65,1,2,,https://vivareal.com.br/imovel/casa-2-quartos-...


In [20]:
# GitHub client for the repository that stores the dataset, authenticated with
# the access token read from the Kaggle secrets store earlier in the notebook.
git = GithubApi(
    token=github_access_token,
    owner='strangercacaus',
    repo='florianopolis_rent_pricing_monitoring')

In [21]:
# Push the scraped listings to 'dataset.csv' in the repository. With
# method='append' the rows are added to the file's existing content
# rather than replacing it.
file_path = 'dataset.csv'
new_content = api.result_set
method = 'append'
git.update_file_content(file_path, new_content, method)