# Import libraries

In [None]:
import requests
import pandas as pd
import plotly.express as px
import folium
import plotly.io as pio
import time
import random
from tqdm import tqdm

# API CEREMA

In [None]:
# Root URL prepended to every API path; note the hostname points at a "preprod" host.
BASE_URL_API = "https://apidf-preprod.cerema.fr"

In [None]:
def apidf(url_endpoint, token=None, timeout=(5, 30)):
    """Send a GET request to an API endpoint and return its decoded JSON body.

    Args:
        url_endpoint: Full URL to request.
        token: Optional API token. When truthy it is sent as an
            ``Authorization: Token <token>`` header.
        timeout: Timeout forwarded to ``requests.get``, either a single number
            or a ``(connect, read)`` tuple in seconds. Without one, a stalled
            server would block the call indefinitely.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        "Content-Type": "application/json",
    }
    if token:
        headers["Authorization"] = "Token " + token
    response = requests.get(
        url_endpoint,
        headers=headers,
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()
    return None

In [None]:
# Pool of browser User-Agent strings (all Chrome 4.0.x builds on Linux, Windows
# and Mac OS X) from which a value is drawn to fill the User-Agent request header.
user_agents =[
    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.0 Safari/532.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.0 Safari/532.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.0 Safari/532.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.212.1 Safari/532.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.212.1 Safari/532.1",
    "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.1",
    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0",
    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0"]

def get_random_user_agent():
    """Return one entry of ``user_agents`` picked at random."""
    picked = random.choice(user_agents)
    return picked

In [None]:
# INSEE codes of the 20 Paris arrondissements
codinsee = ['75101','75102','75103','75104','75105','75106','75107','75108','75109','75110','75111','75112','75113','75114','75115','75116','75117','75118','75119','75120']
donnees_mutations = []       # one dict per distinct mutation
visited_mutations = set()    # idmutinvar values already stored in donnees_mutations
x, y = 2.340976, 48.862033
# folium expects [latitude, longitude], hence [y, x]
m = folium.Map(location=[y, x], zoom_start=16)

# Maximum number of consecutive failures tolerated for one page before the
# current INSEE code is abandoned; without a bound, a persistent error
# (bad URL, changed schema, ...) would retry forever.
MAX_RETRIES_PER_PAGE = 10


def _fetch_geomutations_page(page_url):
    """Download one result page and return its decoded JSON payload.

    A randomly chosen User-Agent header is sent with the request.
    Raises requests.RequestException on network/HTTP errors and ValueError
    when the body is not valid JSON.
    """
    reply = requests.get(page_url,
                         headers={'User-Agent': get_random_user_agent()},
                         timeout=(5, 30))
    # Fail fast on 4xx/5xx instead of trying to parse an error page
    reply.raise_for_status()
    return reply.json()


def _extract_mutation(feature):
    """Return the flat record kept for one GeoJSON feature of the API."""
    properties = feature['properties']
    return {
        "idmutinvar": properties["idmutinvar"],
        "libtypbien": properties["libtypbien"],
        "anneemut": properties["anneemut"],
        "datemut": properties["datemut"],
        "coddep": properties["coddep"],
        "l_codinsee": properties.get("l_codinsee"),
        "valeurfonc": properties["valeurfonc"],
        "l_idpar": properties.get("l_idpar"),  # associated cadastral parcel(s)
        "sbati": properties.get("sbati"),  # built surface, in m2
    }


with tqdm(total=len(codinsee), desc='Overall Progress') as overall_progress:
    for code in codinsee:
        url = BASE_URL_API + f"/dvf_opendata/geomutations/?code_insee={code}&anneemut_min=2020&codtypbien=12"
        failures = 0

        # Follow the "next" links until the last page of this INSEE code
        while url:
            try:
                page = _fetch_geomutations_page(url)
                features = page['features']
                # Parse the whole page before touching any shared state, so a
                # retry never leaves a half-processed page behind.
                page_mutations = [_extract_mutation(feature) for feature in features]
                next_url = page.get("next")
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                failures += 1
                print(f"Erreur pour le code INSEE {code}: {e!r} (tentative {failures}/{MAX_RETRIES_PER_PAGE})")
                if failures >= MAX_RETRIES_PER_PAGE:
                    print(f"Abandon du code INSEE {code} après {MAX_RETRIES_PER_PAGE} échecs consécutifs: {url}")
                    break
                # Random back-off before retrying the same page
                time.sleep(random.uniform(1, 10))
                continue

            failures = 0

            # The layer is added only once the page is known to be valid, so
            # the same page can never be drawn twice. An empty page has
            # nothing to draw.
            if features:
                folium.GeoJson(page,
                               name="mutations",
                               popup=folium.GeoJsonPopup(fields=["datemut", "valeurfonc","sbati"])).add_to(m)

            for mutation_info in page_mutations:
                mutation_id = mutation_info["idmutinvar"]
                if mutation_id not in visited_mutations:
                    donnees_mutations.append(mutation_info)
                    visited_mutations.add(mutation_id)

            url = next_url

        # Advance the bar once the code has actually been processed
        overall_progress.update(1)

Overall Progress:  20%|██        | 4/20 [00:59<04:43, 17.69s/it]

Erreur pour le code INSEE 75104: 0aa837a3484d055a282d832697c09c53


Overall Progress:  30%|███       | 6/20 [02:09<06:24, 27.46s/it]

Erreur pour le code INSEE 75106: d2f57b47848e9d2a20698a652437616b
Erreur pour le code INSEE 75106: 6b07a1fa06d8709feb47c66ec2ae4639


Overall Progress:  55%|█████▌    | 11/20 [06:17<07:30, 50.03s/it]

Erreur pour le code INSEE 75111: a7dff6bf2cb2401dcb30d5a9502a63fd
Erreur pour le code INSEE 75111: 5ee0932ce8cc0b78b6f7431bccad1c76
Erreur pour le code INSEE 75111: 1e7927ddfb8ee17a32f69baa7f1ca683
Erreur pour le code INSEE 75111: 87e83b8a79b199710c09ee6e3ba635c0
Erreur pour le code INSEE 75111: 1114fd108608b851699c6887bcd4485d
Erreur pour le code INSEE 75111: d70ccdce8a4e6666db88a3371aa63cf6


Overall Progress:  60%|██████    | 12/20 [09:13<11:45, 88.22s/it]

Erreur pour le code INSEE 75112: 69add37ecb16b6599abb7c71818ee9d8
Erreur pour le code INSEE 75112: dc2ba288783d9401736c1bd9bd30ef59
Erreur pour le code INSEE 75112: 7c14c098f48c9a4c0b66260590c0d1a5


Overall Progress:  65%|██████▌   | 13/20 [11:01<10:59, 94.23s/it]

Erreur pour le code INSEE 75113: 03a3a567d31765309ae65bd32d7d8ead


Overall Progress:  75%|███████▌  | 15/20 [13:23<06:50, 82.19s/it]

Erreur pour le code INSEE 75115: 634b83a7d9550ebf029261f3dd83a724
Erreur pour le code INSEE 75115: d29af583bc29ae2d93c36d299e87bdc5
Erreur pour le code INSEE 75115: ddb4684ea47c8e61729a9d3fdd161e42
Erreur pour le code INSEE 75115: 729e7cfcc4041f30f12c4d7ab90ec049
Erreur pour le code INSEE 75115: 115172aa1477ef552ec53679481a0e0b
Erreur pour le code INSEE 75115: 0db67b5657a227ef59eb8e28080a7f20


Overall Progress:  80%|████████  | 16/20 [17:01<08:12, 123.19s/it]

Erreur pour le code INSEE 75116: a371f4468d4ca4869ee166c689815b1d
Erreur pour le code INSEE 75116: 9e4e5eeed871500ff76a078e98a5333e
Erreur pour le code INSEE 75116: 021fe66a673d3d06db4a47be8e0ee410
Erreur pour le code INSEE 75116: d3656bc13269fd29e3362be08c7081cc


Overall Progress:  85%|████████▌ | 17/20 [19:46<06:46, 135.50s/it]

Erreur pour le code INSEE 75117: 679815d3ae883af732b4d82fb7072c7f
Erreur pour le code INSEE 75117: 70a81176ea238e863a1ec665db224010
Erreur pour le code INSEE 75117: d383db6e2d3bd4ded23dde957cad52b3
Erreur pour le code INSEE 75117: 22ce8a28d98c2f5416d0c9813f821ec3
Erreur pour le code INSEE 75117: 879df6754ea1fa4995e6251aa0c0a6ff
Erreur pour le code INSEE 75117: 3b213bf01449ff48bba22b968f63fd07


Overall Progress:  90%|█████████ | 18/20 [23:00<05:06, 153.15s/it]

Erreur pour le code INSEE 75118: febe0f9ea94bbf27316a2e960db6df51
Erreur pour le code INSEE 75118: 50b44850a6b080bcb9cca9170f13d881
Erreur pour le code INSEE 75118: 2177e9c5a95d340cf55f322cc2ba2d51
Erreur pour le code INSEE 75118: bcea5877d06525f4aa6718a0ee1598cf


Overall Progress:  95%|█████████▌| 19/20 [26:18<02:46, 166.81s/it]

Erreur pour le code INSEE 75119: 3324a6e716c0871604952aceb7d91401
Erreur pour le code INSEE 75119: 1011fd229429c20b1405858473639717
Erreur pour le code INSEE 75119: f962d72dcee8c5670a86a1c3788fe3c9
Erreur pour le code INSEE 75119: 42104162e00c2fbea453119c5c04a661


Overall Progress: 100%|██████████| 20/20 [28:18<00:00, 152.47s/it]

Erreur pour le code INSEE 75120: 4af1f05b9791e69cf768e72d4d6ac732
Erreur pour le code INSEE 75120: 9da57840319a8d7da0f810442d9f2dae


Overall Progress: 100%|██████████| 20/20 [30:07<00:00, 90.36s/it] 


In [None]:
# Build the mutations table from the collected records. Columns are listed
# explicitly (same names and order as the record keys) so the frame keeps its
# schema even when no record was collected.
df_mutations = pd.DataFrame(
    donnees_mutations,
    columns=[
        "idmutinvar",
        "libtypbien",
        "anneemut",
        "datemut",
        "coddep",
        "l_codinsee",
        "valeurfonc",
        "l_idpar",
        "sbati",
    ],
)

# Verification and data preprocessing

In [None]:
# Replace list-valued cells by their first element (the remaining elements are discarded)
def _first_element(value):
    """Return the first item of a list, None for an empty list, else the value itself."""
    if isinstance(value, list):
        # Guard against an empty list, which would otherwise raise IndexError
        return value[0] if value else None
    return value


for _column in ('l_codinsee', 'l_idpar'):
    df_mutations[_column] = df_mutations[_column].apply(_first_element)

In [None]:
# Duplicate check: keep the rows flagged as exact repeats of an earlier row (none found)
masque_doublons = df_mutations.duplicated()
doublons = df_mutations[masque_doublons]
print("Lignes en double :\n", doublons)

Lignes en double :
 Empty DataFrame
Columns: [idmutinvar, libtypbien, anneemut, datemut, coddep, l_codinsee, valeurfonc, l_idpar, sbati]
Index: []


In [None]:
# Missing values per column: some valeurfonc are missing --> to be deleted
df_mutations.isnull().sum()

idmutinvar     0
libtypbien     0
anneemut       0
datemut        0
coddep         0
l_codinsee     0
valeurfonc    46
l_idpar        0
sbati          0
dtype: int64

# Add the coordinates of each parcel

In [None]:
import gzip
import json

chemin_general = "drive/MyDrive/Colab Notebooks/Data science project/"

# Open the gzip archive in text mode and parse it directly: no intermediate
# copy of the decoded file is left alive in a global variable.
with gzip.open(chemin_general + "BDD_input/cadastre-75-parcelles.json.gz", 'rt', encoding='utf-8') as f:
    json_data = json.load(f)

In [None]:
from shapely.geometry import Polygon

def _centre_parcelle(parcelle):
    """Return [centroid.y, centroid.x] of the first coordinate ring of a feature."""
    contour = parcelle['geometry']['coordinates'][0]
    centre = Polygon(contour).centroid
    return [centre.y, centre.x]


# Parcel identifiers and the matching centroid of each parcel, in file order
id_parcelles = [parcelle['id'] for parcelle in json_data['features']]
mean_coor_parcelles = [_centre_parcelle(parcelle) for parcelle in json_data['features']]

df_parcelles = pd.DataFrame({
    'id_parcelles': id_parcelles,
    'coor_parcelles': mean_coor_parcelles,
})

In [None]:
# Attach the parcel coordinates to each mutation (left join on the parcel id),
# then drop the now redundant join key coming from the mutations side.
mutations_avec_parcelles = df_mutations.merge(
    df_parcelles,
    how='left',
    left_on='l_idpar',
    right_on='id_parcelles',
)
mutation_final = mutations_avec_parcelles.drop(columns='l_idpar')

In [None]:
# Missing values per column after the merge
mutation_final.isnull().sum()

idmutinvar        0
libtypbien        0
anneemut          0
datemut           0
coddep            0
l_codinsee        0
valeurfonc        0
sbati             0
id_parcelles      0
coor_parcelles    0
dtype: int64

In [None]:
# Discard every row that has at least one missing value
mutation_final = mutation_final.dropna(axis=0, how='any')

In [None]:
# coor_parcelles is stored as [centroid.y, centroid.x]; with GeoJSON's
# (longitude, latitude) axis order that is [latitude, longitude], so the
# longitude is at index 1 and the latitude at index 0.
mutation_final['long_parcelles'] = mutation_final['coor_parcelles'].map(lambda coor: coor[1])
mutation_final['lat_parcelles'] = mutation_final['coor_parcelles'].map(lambda coor: coor[0])
mutation_final = mutation_final.reset_index(drop=True)

# Save data

In [None]:
# Write the folium map to an HTML file
fichier_carte = 'mutations.html'
m.save(fichier_carte)

KeyboardInterrupt: 

In [None]:
# Export the final table as CSV, without the index column
chemin_general = "drive/MyDrive/Colab Notebooks/Data science project/"
chemin_csv = chemin_general + "BDD_output/mutations.csv"
mutation_final.to_csv(chemin_csv, index=False)