In [278]:
import datetime
import os

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [279]:
base_url = 'https://www.castorus.com'

### Get search url based on parameter search

radius_dic = {0: '', 5: 1, 10: 2, 15: 2, 30: 4}
type_dic = {'house': 1, 'farm': 5, 'immeuble': 9}

def build_search_url(location, radius, n_rooms, type_, price_min, price_max, min_m_2=0):
    type_code = type_dic[type_]
    radius_code = radius_dic[radius]
    
    return f"https://www.castorus.com/s/{location[0]},{location[1]},{type_code}-{radius_code}-{min_m_2}--{price_min}-{price_max}-{n_rooms}----------------------"

### Get dataframe based on search url

In [301]:
def get_dataframe_from_url(url, center, type_):
    ppties_dic = {'price': [], 'price_m2': [], 'date_in': [], 'title': [], 'link_castorus': [], 'n_rooms': [],
                  'surface': [], 'evolution': [], 'n_days_up': [], 'link_ad': [], 'center': [], 'type': []}
    
    res = requests.get(url)
    soup = BeautifulSoup(res.content)
    soup_res = soup.find('table', id='myTableResult')
    
    if soup_res == None:
        print('No data available...')
        return pd.DataFrame(ppties_dic)
    soup_res = soup_res.find('tbody')

    
    for res in tqdm(soup_res, desc='Iterating over properties'):
        ppties_dic['price'] += [int(res.find('td', {"class":'price'}).string)]

        pm2_classes = iter(['hide_mobile price', 'hide_mobile price green', 'hide_mobile price red'])
        pm2 = None
        while pm2 == None:
            curr_class = next(pm2_classes)
            pm2 = res.find('td', {"class": curr_class})
        ppties_dic['price_m2'] += [int(pm2.string)]

        parsed_date = res.find('span').string.split('/')
        d = [int(i) for i in parsed_date]
        ppties_dic['date_in'] += [datetime.datetime(year=int(d[2]), month=int(d[1]), day=int(d[0]))]

        title_field = res.find('td', {"class": 'title'})
        ppties_dic['title'] += [title_field.string]
        castorus_url = base_url + title_field.find('a')['href']
        ppties_dic['link_castorus'] += [castorus_url] 
        
        
        next_castorus_soup = BeautifulSoup(requests.get(castorus_url).content)
        link_element = next_castorus_soup.find('a', {'id': 'Redir_A'})
        if link_element == None:
            print('Link broken at castorus_url', castorus_url)
            print(ppties_dic['price'][-1], ppties_dic['title'][-1])
            ppties_dic['link_ad'] += [None]
        else:
            url_ad_redir = link_element['href']
            ppties_dic['link_ad'] += [base_url + '/' + url_ad_redir]
        

        ppties_dic['n_rooms'] += [res.find('td', {"class":'hide_mobile piece'}).string]

        ppties_dic['surface'] += [res.find('td', {"class":'surf'}).string]

        evol = res.find('td', {"class":'hide_mobile evol'}).string
        if evol != None and evol != 'stable':
            for char in ['(', ')', '%']:
                evol = evol.replace(char, '')
            evol = float(evol)
        else:
            evol = np.nan
        ppties_dic['evolution'] += [evol]

        ppties_dic['n_days_up'] += [res.find('td', {"class":'since'}).string]
        
        ppties_dic['center'] += [center]
        ppties_dic['type'] += [type_]
        
    return pd.DataFrame(ppties_dic)


### Create Dataframe for multiple centers

In [302]:
def build_df_from_centers(centers, radius):
    types = ['house', 'immeuble', 'farm']
    df = pd.DataFrame()
    for c in tqdm(centers, desc='Iterating over centers'):
        print('Current center:', c)
        for type_ in types:
            target_url = build_search_url(c, radius=radius, n_rooms=10, type_=type_, 
                                          price_min=200000, price_max=1200000, min_m_2=200)
            print('Current url:', target_url)
            df_ = get_dataframe_from_url(target_url, c, type_)
            df = df.append(df_)
    return df 

### Scrap execution

In [303]:
centers = [('Angers', 19447), ('Nantes', 17969), ('Le+Mans', 31646), ('Tours', 15280)]
df = build_df_from_centers(centers, radius=30)

HBox(children=(IntProgress(value=0, description='Iterating over centers', max=4, style=ProgressStyle(descripti…

Current center: ('Angers', 19447)
Current url: https://www.castorus.com/s/Angers,19447,1-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=225, style=ProgressStyle(desc…


Current url: https://www.castorus.com/s/Angers,19447,9-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=1, style=ProgressStyle(descri…


Current url: https://www.castorus.com/s/Angers,19447,5-4-200--200000-1200000-10----------------------
No data available...
Current center: ('Nantes', 17969)
Current url: https://www.castorus.com/s/Nantes,17969,1-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=208, style=ProgressStyle(desc…


Current url: https://www.castorus.com/s/Nantes,17969,9-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=2, style=ProgressStyle(descri…


Current url: https://www.castorus.com/s/Nantes,17969,5-4-200--200000-1200000-10----------------------
No data available...
Current center: ('Le+Mans', 31646)
Current url: https://www.castorus.com/s/Le+Mans,31646,1-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=168, style=ProgressStyle(desc…


Current url: https://www.castorus.com/s/Le+Mans,31646,9-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=3, style=ProgressStyle(descri…


Current url: https://www.castorus.com/s/Le+Mans,31646,5-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=2, style=ProgressStyle(descri…


Current center: ('Tours', 15280)
Current url: https://www.castorus.com/s/Tours,15280,1-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=274, style=ProgressStyle(desc…


Current url: https://www.castorus.com/s/Tours,15280,9-4-200--200000-1200000-10----------------------


HBox(children=(IntProgress(value=0, description='Iterating over properties', max=5, style=ProgressStyle(descri…


Current url: https://www.castorus.com/s/Tours,15280,5-4-200--200000-1200000-10----------------------
No data available...



### Postprocess

In [318]:
qu = df[df['n_days_up'].astype(int) > 365].sort_values('evolution')
qu.to_csv('365days_worstevo_castorus.csv')
qu

Unnamed: 0,price,price_m2,date_in,title,link_castorus,n_rooms,surface,evolution,n_days_up,link_ad
76,850000.0,1704.0,2020-02-10,"Propriété - 21 pièces, 499m2","https://www.castorus.com/bouchemaine,d60261194",21,499,-28.57,555,https://www.castorus.com/r.php?redirect=d60261194
270,680000.0,1943.0,2020-02-06,"Propriété - 10 pièces, 350m2","https://www.castorus.com/neuille-pont-pierre,d...",10,350,-28.42,616,https://www.castorus.com/r.php?redirect=d47800769
4,884000.0,598.0,2020-01-29,"Vends - 40 pièces, 1480m2",https://www.castorus.com/chanceaux-sur-choisil...,40,1480,-22.59,379,https://www.castorus.com/r.php?redirect=d65098198
39,378500.0,1262.0,2020-02-11,"Propriété - 14 pièces, 300m2","https://www.castorus.com/chalonnes-sur-loire,d...",14,300,-19.98,466,https://www.castorus.com/r.php?redirect=d62701046
230,224000.0,688.0,2020-02-16,"Propriété - 12 pièces, 326m2","https://www.castorus.com/luynes,d64165598",12,326,-18.55,429,https://www.castorus.com/r.php?redirect=d64165598
125,516000.0,1362.0,2020-02-03,"Propriété - 13 pièces, 379m2","https://www.castorus.com/tours,d53513439",13,379,-18.1,810,https://www.castorus.com/r.php?redirect=d53513439
133,635000.0,1717.0,2020-01-31,"Propriété - 15 pièces, 370m2","https://www.castorus.com/tours,d65372101",15,370,-15.33,754,https://www.castorus.com/r.php?redirect=d65372101
71,519950.0,1188.0,2020-02-13,"Propriété - 17 pièces, 438m2","https://www.castorus.com/yvre-l-eveque,d61423178",17,438,-13.04,507,https://www.castorus.com/r.php?redirect=d61423178
173,219200.0,1049.0,2020-02-14,"Propriété - 14 pièces, 209m2","https://www.castorus.com/angers,d65701693",14,209,-12.28,370,https://www.castorus.com/r.php?redirect=d65701693
91,470000.0,1045.0,2020-02-16,"Propriété - 10 pièces, 450m2","https://www.castorus.com/nantes,d65641400",10,450,-10.48,370,https://www.castorus.com/r.php?redirect=d65641400


In [None]:
df.sort_values()

In [311]:
df['n_days_up'].astype(int)

0    146
1    102
2    101
3    199
4    360
    ... 
0    173
1     37
2     33
3     82
4    379
Name: n_days_up, Length: 888, dtype: int64

In [319]:
df.to_csv('full_castorus.csv')