In [129]:
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from sklearn.model_selection import train_test_split
import geopandas as gpd

In [2]:

HOST = 'elasticsearch-master.projet-ssplab'

def elastic():
    """Connection avec Elastic sur le data lab"""
    es = Elasticsearch([{'host': HOST, 'port': 9200, 'scheme': 'http'}], http_compress=True, request_timeout=200)
    return es

es = elastic()

In [3]:
# Importation des bases
import functions as fc
dict_data = fc.read_all_raw(fc.list_bases)
dict_data.keys()

dict_keys(['rejets', 'etablissements', 'emissions', 'Trait_dechets_non_dangereux', 'Trait_dechets_dangereux', 'Prod_dechets_non_dangereux', 'Prod_dechets_dangereux', 'Prelevements'])

In [157]:
df = dict_data["etablissements"]
df = df.rename({'numero_siret': "numero_siret_true"}, axis = 1)
df["numero_siret_true"] = df["numero_siret_true"].astype(str)

In [158]:
df[["coordonnees_x", "coordonnees_y", "code_epsg"]].head()

Unnamed: 0,coordonnees_x,coordonnees_y,code_epsg
0,672948.91,1813634.0,27572.0
1,-1.498014,43.50209,4326.0
2,2.856548,50.43594,4326.0
3,2.14059,48.9841,4326.0
4,6.861504,47.62541,4326.0


In [159]:
df["code_epsg"].value_counts()

2154.0     7520
4326.0      882
27572.0     368
4559.0       33
2971.0        8
3727.0        4
Name: code_epsg, dtype: int64

In [160]:
df["code_apet"] = df["code_ape"].str[:4]

In [161]:
etab_not_null = df.dropna(subset = ['code_epsg'])
etab_null = df.loc[df['code_epsg'].isnull()]
gb = etab_not_null.groupby("code_epsg")
gb = [gb.get_group(x) for x in gb.groups]

def transform_wgs84(df, epsg):
    etab = gpd.GeoDataFrame(
       df,
        geometry=gpd.points_from_xy(
            df['coordonnees_x'],
            df['coordonnees_y']
        ),
        crs = epsg)
    etab = etab.to_crs(4326)
    etab['x'] = etab['geometry'].x 
    etab['y'] = etab['geometry'].y
    etab = pd.DataFrame(etab)
    return etab

In [162]:
df.columns

Index(['identifiant', 'nom_etablissement', 'numero_siret_true', 'adresse',
       'code_postal', 'commune', 'departement', 'region', 'coordonnees_x',
       'coordonnees_y', 'code_epsg', 'code_ape', 'libelle_ape', 'code_eprtr',
       'libelle_eprtr', 'code_apet'],
      dtype='object')

In [172]:
temp = [
    transform_wgs84(
        gb[idx],
        gb[idx]['code_epsg'].iloc[0]
    ) for idx in range(len(gb)) 
]
temp2 = pd.concat(
    temp
)
temp3 = pd.concat(
    [temp2, etab_null]
)



In [173]:
temp3.columns

Index(['identifiant', 'nom_etablissement', 'numero_siret_true', 'adresse',
       'code_postal', 'commune', 'departement', 'region', 'coordonnees_x',
       'coordonnees_y', 'code_epsg', 'code_ape', 'libelle_ape', 'code_eprtr',
       'libelle_eprtr', 'code_apet', 'geometry', 'x', 'y'],
      dtype='object')

In [171]:
temp2.columns

Index(['identifiant', 'nom_etablissement', 'numero_siret_true', 'adresse',
       'code_postal', 'commune', 'departement', 'region', 'coordonnees_x',
       'coordonnees_y', 'code_epsg', 'code_ape', 'libelle_ape', 'code_eprtr',
       'libelle_eprtr', 'code_apet', 'geometry', 'x', 'y'],
      dtype='object')

In [174]:
temp3.shape

(8963, 19)

In [175]:
temp3[['x', "y"]].head()

Unnamed: 0,x,y
11,5.645619,45.253497
12,5.1377,45.658263
13,-1.583039,47.190602
16,7.405579,47.775791
34,-3.964126,48.587616


In [176]:
X = temp3

In [177]:
X_train, X_test = train_test_split(
     X, test_size=0.20, random_state=42)

In [178]:
X_train.columns

Index(['identifiant', 'nom_etablissement', 'numero_siret_true', 'adresse',
       'code_postal', 'commune', 'departement', 'region', 'coordonnees_x',
       'coordonnees_y', 'code_epsg', 'code_ape', 'libelle_ape', 'code_eprtr',
       'libelle_eprtr', 'code_apet', 'geometry', 'x', 'y'],
      dtype='object')

In [209]:
requete_type = '''{{ 
  "query": {{
    "bool": {{
      "should": [
        {{ "match": {{ "rs_denom":   "{nom_etablissement}" }}}}
      ],
      "filter": [
        {{ "prefix":  {{ "apet": "{code_apet}" }}}},
        {{"geo_distance": {{"distance": "10km","location": {{"lat": "{x}","long": "{y}"}}}}}}
      ]
    }}
  }},
  "size": 1
}}'''

In [180]:
df.shape

(8963, 16)

In [191]:
X_train["y"].isna().sum()

107

In [193]:
X_train = X_train.loc[~X_train["x"].isna() & ~X_train["y"].isna(), :]

In [194]:
X_train.shape

(7063, 19)

In [210]:
# Il est nécessaire de spécifier l'index associé à chaque requête
header = '{"index" : "sirus_2020"}'

multiple_requetes = ""

# On itère sur le dataframe d'établissements polluants pour ajouter une requête spécifique à chacun d'entre eux
n_etab = X_train.shape[0] # Pour l'exemple, on prend les 10 premiers

for index, row in X_train.iloc[0:n_etab][['nom_etablissement', 'code_apet', 'x', 'y']].iterrows():
    
    multiple_requetes+= header
    multiple_requetes+= '\n'
    multiple_requetes+= requete_type.format_map(row).replace("\n","")
    multiple_requetes+= '\n'

In [211]:
res = es.msearch(body = multiple_requetes)



RequestError: RequestError(400, 'x_content_parse_exception', '[geo_distance] query does not support [location]')

In [204]:
len(res['responses'])

7063

In [205]:
res['responses']

[{'took': 5,
  'timed_out': False,
  '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
  'hits': {'total': {'value': 0, 'relation': 'eq'},
   'max_score': None,
   'hits': []},
  'status': 200},
 {'took': 7,
  'timed_out': False,
  '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
  'hits': {'total': {'value': 0, 'relation': 'eq'},
   'max_score': None,
   'hits': []},
  'status': 200},
 {'took': 6,
  'timed_out': False,
  '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
  'hits': {'total': {'value': 0, 'relation': 'eq'},
   'max_score': None,
   'hits': []},
  'status': 200},
 {'took': 4,
  'timed_out': False,
  '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
  'hits': {'total': {'value': 0, 'relation': 'eq'},
   'max_score': None,
   'hits': []},
  'status': 200},
 {'took': 5,
  'timed_out': False,
  '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
  'hits': {'total': {'value': 0, 'rela

In [199]:
X_train["siret_elastic"] = [res['responses'][i]['hits']['hits'][0]["_source"]["siret_id"] if \
    res['responses'][i]['hits']['hits'] else np.NaN for i in range(X_train.shape[0]) ]

KeyError: 'hits'

In [118]:
X_train[["siret_elastic", "numero_siret_true"]].head(20)

Unnamed: 0,siret_elastic,numero_siret_true
543,31650260800029.0,30957502500031
2133,38972750400026.0,38972750400018
2683,,31446627700048
5473,,45236852500013
4793,31732564500024.0,31732564500024
5824,37844298200021.0,37844298200021
5581,,30582329600077
3298,32127621400012.0,32127621400012
8268,24670048800017.0,21670482500019
8525,37941102800044.0,37941102800044


In [119]:
X_train["match"] = (X_train["numero_siret_true"] == X_train["siret_elastic"])

In [120]:
X_train["match"].value_counts()/X_train.shape[0]

True     0.505718
False    0.494282
Name: match, dtype: float64