In [2]:
import json

from elasticsearch import Elasticsearch
HOST = 'elasticsearch-master.projet-ssplab'


def elastic():
    """Connect to Elasticsearch on the data lab.

    Builds a client for the node named by ``HOST`` on port 9200 over plain
    HTTP, with HTTP compression enabled and ``request_timeout=200``
    (presumably seconds — confirm against the client version in use).

    Returns:
        The configured ``Elasticsearch`` client instance.
    """
    node = {'host': HOST, 'port': 9200, 'scheme': 'http'}
    return Elasticsearch(
        [node],
        http_compress=True,
        request_timeout=200,
    )


es = elastic()

In [3]:
# Load the raw source tables.
# NOTE(review): `functions` is a project-local module not shown here; judging by
# the keys displayed below, `read_all_raw` returns a mapping from table name to
# table — confirm against its definition.
import functions as fc
dict_data = fc.read_all_raw(fc.list_bases)
dict_data.keys()

dict_keys(['rejets', 'etablissements', 'emissions', 'Trait_dechets_non_dangereux', 'Trait_dechets_dangereux', 'Prod_dechets_non_dangereux', 'Prod_dechets_dangereux', 'Prelevements'])

In [23]:
# Work on the "etablissements" table. The source SIRET column is renamed to
# `numero_siret_true` so it can be told apart from the SIRET returned by
# Elasticsearch (`siret_elastic`) further down.
df = dict_data["etablissements"].rename(columns={'numero_siret': "numero_siret_true"})

In [25]:
# Preview the first rows of the `adresse` column.
df["adresse"].head()

0                                  Plaine Saint Pierre
1                                 17 avenue de l'adour
2                                      1 route de lens
3                                               BP 104
4    Boulevard Dunant (entrée par Allée des Grands ...
Name: adresse, dtype: object

In [5]:

# Query template (filled with str.format-style substitution; doubled braces are
# literal JSON braces). It is a bool query that must match at least 2 of the 3
# `should` clauses (name, address, commune) and filters on the postal code,
# returning a single hit ("size": 1).
# NOTE(review): a later cell reassigns `requete_type`, and the request-building
# loop only supplies `nom_etablissement`, `code_postal` and `code_apet` — so the
# `{adresse}` and `{commune}` placeholders here would not be filled by it.
requete_type = '''{{ 
  "query": {{
    "bool": {{
      "should": [
        {{ "match": {{ "rs_denom":   "{nom_etablissement}" }}}},
        {{ "match": {{ "geo_adresse": "{adresse}" }}}},
        {{ "match": {{ "sir_adr_et_com_lib": "{commune}" }}}}
      ],
      "minimum_should_match": 2,
      "filter": [
        {{ "match":  {{ "adr_et_post": "{code_postal}" }}}}
      ]
    }}
  }},
  "size": 1
}}'''

In [31]:
# Keep the first four characters of the APE code; this shortened code feeds the
# `prefix` filter on `apet` in the query template.
df["code_apet"] = df["code_ape"].str.slice(stop=4)
df["code_apet"].head()

0    3700
1    3700
2    3700
3    3700
4    3700
Name: code_apet, dtype: object

In [32]:
# Query template used to build each search (filled with str.format-style
# substitution; doubled braces are literal JSON braces). The establishment name
# is a `should` match on `rs_denom`; the postal code (`adr_et_post`) and the
# APE-code prefix (`apet`) are filters. Only the top hit is returned.
# NOTE(review): Elasticsearch documents `minimum_should_match` as defaulting to 0
# when a bool query has a `filter` clause, so the name match would only affect
# ranking, not which documents qualify — confirm this is intended.
requete_type = '''{{ 
  "query": {{
    "bool": {{
      "should": [
        {{ "match": {{ "rs_denom":   "{nom_etablissement}" }}}}
      ],
      "filter": [
        {{ "match":  {{ "adr_et_post": "{code_postal}" }}}},
        {{ "prefix":  {{ "apet": "{code_apet}" }}}}
      ]
    }}
  }},
  "size": 1
}}'''

In [18]:
# Dimensions of the establishments frame (rows, columns).
df.shape

(8963, 15)

In [22]:
# Count missing values in the `adresse` column.
df["adresse"].isna().sum()

0

In [None]:
# Inspect the `code_postal` column, which the query template filters on.
df["code_postal"]

In [34]:
# Every search in a multi-search body must be preceded by a header line naming
# the index it targets.
header = '{"index" : "sirus_2020"}'

# Number of establishments to query: currently the whole frame. Lower it to
# try the pipeline on a small sample.
n_etab = df.shape[0]


def _json_escape(value):
    """Return ``str(value)`` escaped for insertion between JSON double quotes."""
    # json.dumps adds the surrounding quotes; the template already provides them.
    return json.dumps(str(value), ensure_ascii=False)[1:-1]


# One (header, query) pair per establishment, each on its own line. Field
# values are JSON-escaped first: a raw double quote or backslash in an
# establishment name would otherwise produce an invalid query.
query_columns = ['nom_etablissement', 'code_postal', 'code_apet']
lines = []
for record in df.iloc[0:n_etab][query_columns].to_dict("records"):
    values = {field: _json_escape(value) for field, value in record.items()}
    lines.append(header)
    # The template is written over several lines; each query must fit on one.
    lines.append(requete_type.format_map(values).replace("\n", ""))

# Every line, including the last one, is newline-terminated.
multiple_requetes = "".join(line + "\n" for line in lines)

In [35]:
# Send all the searches in a single multi-search request. The cells below read
# `res['responses']` and pair its i-th entry with the i-th row of `df`.
res = es.msearch(body = multiple_requetes)



In [36]:
# Inspect the hits returned for the first search.
res['responses'][0]['hits']['hits']

[{'_index': 'sirus_2020_e_3_ngr_bool',
  '_type': '_doc',
  '_id': '24340076900044',
  '_score': 1.0,
  '_source': {'sirus_id': '243400769',
   'nic': '00044',
   'ape': '8411Z',
   'apet': '3700Z',
   'eff_3112_et': '17.0',
   'eff_etp_et': '15.0',
   'eff_et_effet_daaaammjj': '20181231',
   'enseigne_et1': 'SERVICE ASSAINISSEMENT',
   'nom_comm_et': '',
   'adr_et_loc_geo': '3403202144',
   'adr_et_compl': 'QUAI OUEST',
   'adr_et_voie_num': '39',
   'adr_et_voie_repet': '',
   'adr_et_voie_type': 'BD',
   'adr_et_voie_lib': 'DE VERDUN',
   'adr_et_cedex': '',
   'adr_et_distsp': '',
   'sir_adr_et_com_lib': 'BEZIERS',
   'adr_et_post': '34500',
   'adr_et_l1': 'COMMUNAUTE AGGLO BEZIERS MEDITERRANEE',
   'adr_et_l2': 'SERVICE ASSAINISSEMENT',
   'adr_et_l3': 'QUAI OUEST',
   'adr_et_l4': '39 BD DE VERDUN',
   'adr_et_l5': '',
   'adr_et_l6': '34500 BEZIERS',
   'adr_et_l7': '',
   'nic_siege': '00093',
   'unite_type': '1',
   'region': '76',
   'adr_depcom': '34032',
   'region_impl

In [9]:
# Retrieve the SIRET found for the first search (top hit).
res['responses'][0]['hits']['hits'][0]["_source"]["siret_id"]

'81162101000024'

In [37]:
# Number of responses returned by the multi-search.
len(res['responses'])

8963

In [38]:
def _top_hit_siret(response):
    """Return the ``siret_id`` of a response's first hit, or NaN if it has none.

    A multi-search item that failed carries an ``error`` object instead of
    ``hits``; it is treated like an empty result rather than raising KeyError.
    """
    hits = response.get('hits', {}).get('hits')
    if not hits:
        # Plain float NaN: no dependency on numpy or its removed `np.NaN` alias.
        return float("nan")
    return hits[0]["_source"]["siret_id"]


# Responses are paired with rows by position; only the first len(df) are used.
df["siret_elastic"] = [
    _top_hit_siret(response) for response in res['responses'][:df.shape[0]]
]

In [41]:
# Show the SIRET found by Elasticsearch next to the reference SIRET.
df[["siret_elastic", "numero_siret_true"]].head(20)

Unnamed: 0,siret_elastic,numero_siret_true
0,24340076900044.0,41003460701688
1,,20006710600241
2,21620523700086.0,57202552600813
3,,25755000400044
4,20006905200070.0,20006905200070
5,,57202552601142
6,20006755100156.0,25740097800017
7,,85069047000016
8,50424344500013.0,24440064400039
9,30821885800170.0,20006925000021


In [47]:
# Check the dtypes of the two SIRET columns that get compared.
df[["siret_elastic", "numero_siret_true"]].dtypes

siret_elastic        object
numero_siret_true    object
dtype: object

In [46]:
# Cast the reference SIRET to str so it can be compared with the string
# `siret_id` values taken from the Elasticsearch hits.
# NOTE(review): astype(str) may stringify missing values (e.g. as "nan") and
# keeps whatever formatting the source dtype had — confirm the column holds
# no missing or numeric-typed values.
df["numero_siret_true"] = df["numero_siret_true"].astype(str)

In [48]:
# Flag rows where the SIRET found by Elasticsearch equals the reference SIRET.
df["match"] = df["numero_siret_true"].eq(df["siret_elastic"])

In [49]:
# Tally matched vs. unmatched establishments.
df["match"].value_counts()

False    4533
True     4430
Name: match, dtype: int64