# Imports and Data Read

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy import distance
from sklearn.neighbors import BallTree
from tqdm import tqdm
import plotly.express as px
import re
from haversine import haversine_vector, Unit

In [2]:
ct_data = pd.read_csv('CT_asset_cement_emissions.csv')
ets_data = pd.read_csv('EUETS_acquiring_accounts_bytransactions.csv')

cols = ['Acquiring.Holder.MainAddressLine', 'Acquiring.Holder.SecondaryAddressLine', 'Acquiring.Holder.City','Acquiring.Holder.ZipCode','Acquiring.Holder.Country']
ets_data['Address'] = ets_data[cols].apply(lambda row: ', '.join(row.values.astype(str)), axis=1)
ets_data['Address'] = ets_data['Address'].str.replace('nan, ','')
ct_data['st_astext'] = ct_data['st_astext'].astype(str)

ct_data[['lng', 'lat']] = ct_data['st_astext'].str.split(' ', 1, expand=True)
ct_data['lng'] = ct_data['lng'].str.replace('POINT(','',regex=False)
ct_data['lat'] = ct_data['lat'].str.replace(')','',regex=False)
ct_data['lat'] = ct_data['lat'].astype(float)
ct_data['lng'] = ct_data['lng'].astype(float)
codes = pd.read_excel('CountryCodes.xlsx')


  ets_data = pd.read_csv('EUETS_acquiring_accounts_bytransactions.csv')
  ct_data[['lng', 'lat']] = ct_data['st_astext'].str.split(' ', 1, expand=True)


# Filtering CT data for countries that are in ETS data

In [3]:
ct_data_2digcode = pd.merge(ct_data,codes,how='left',left_on='iso3_country',right_on='Code3')
ct_data_filterETS = ct_data_2digcode[ct_data_2digcode['Code2'].isin(ets_data['Acquiring.Holder.CountryCode'].unique())]
#ct_data_filterETS.to_csv('filteredETS_CTdata.csv')

In [4]:
ets_data.head()

Unnamed: 0,TransactionID,NbOfUnits,Acquiring.AccountIDRegistryCode,Acquiring.AccountID,Acquiring.RegistryCode,Acquiring.NationalAdministrator,Acquiring.AccountStatus,Acquiring.AccountOpeningDate,Acquiring.AccountType,Acquiring.RelatedInstallationAircraftOperatorID,...,Acquiring.CommitmentPeriod,Acquiring.Holder.Name,Acquiring.Holder.City,Acquiring.Holder.SecondaryAddressLine,Acquiring.Holder.RelationshipType,Acquiring.Holder.CountryCode,Acquiring.Holder.Country,Acquiring.Holder.ZipCode,Acquiring.Holder.MainAddressLine,Address
0,FR21168,2000,AT10621,10621.0,AT,Austria,closed,2005-12-29 00:00:00.0,Former Operator Holding Account,,...,Supplementary Program Commitment Period (2005 ...,Martin Pichler Ziegelwerk GmbH.,Aschach an der Donau,Ziegeleistraße 14,Account holder,AT,Austria,4082,,"Ziegeleistraße 14, Aschach an der Donau, 4082,..."
1,AT8881,13646,AT10621,10621.0,AT,Austria,closed,2005-12-29 00:00:00.0,Former Operator Holding Account,,...,Supplementary Program Commitment Period (2005 ...,Martin Pichler Ziegelwerk GmbH.,Aschach an der Donau,Ziegeleistraße 14,Account holder,AT,Austria,4082,,"Ziegeleistraße 14, Aschach an der Donau, 4082,..."
2,AT13722,13646,AT10621,10621.0,AT,Austria,closed,2005-12-29 00:00:00.0,Former Operator Holding Account,,...,Supplementary Program Commitment Period (2005 ...,Martin Pichler Ziegelwerk GmbH.,Aschach an der Donau,Ziegeleistraße 14,Account holder,AT,Austria,4082,,"Ziegeleistraße 14, Aschach an der Donau, 4082,..."
3,AT7617,13646,AT10621,10621.0,AT,Austria,closed,2005-12-29 00:00:00.0,Former Operator Holding Account,,...,Supplementary Program Commitment Period (2005 ...,Martin Pichler Ziegelwerk GmbH.,Aschach an der Donau,Ziegeleistraße 14,Account holder,AT,Austria,4082,,"Ziegeleistraße 14, Aschach an der Donau, 4082,..."
4,AT17534,13646,AT10621,10621.0,AT,Austria,closed,2005-12-29 00:00:00.0,Former Operator Holding Account,,...,Supplementary Program Commitment Period (2005 ...,Martin Pichler Ziegelwerk GmbH.,Aschach an der Donau,Ziegeleistraße 14,Account holder,AT,Austria,4082,,"Ziegeleistraße 14, Aschach an der Donau, 4082,..."


# Geocoding all ETS addresses to Lat,Lng

In [None]:
locator = Nominatim(user_agent="myGeocoder")
unique_locs=ets_data['Address'].unique()

latlngs = np.zeros((len(unique_locs),2))

for idx,loc in enumerate(tqdm(unique_locs)):
    try:
        gc = locator.geocode(loc)
        latlngs[idx,0]=gc.latitude
        latlngs[idx,1]=gc.longitude
    except:
        latlngs[idx,0]=91
        latlngs[idx,1]=91
#ets_data['point'] = ets_data['location'].apply(lambda loc: tuple(loc.point) if loc else None)
#ets_data[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['point'].tolist(), index=df.index)

In [None]:
np.savetxt("GeocodedETSLatlngs.csv", latlngs, delimiter=",")

In [None]:
latlngs = np.loadtxt('GeocodedETSLatlngs.csv',delimiter=',')
fig = px.scatter_geo(lat=latlngs[:,0], lon=latlngs[:,1],hover_name=unique_locs)

## TODO:
* Reverse geocode all CT points
* Match CT and ETS on country AND zip code
* Generate distances between all matches pairs
* For distances below 5km

# Reverse Geocode all CT points to get Country,Zip

In [7]:
ct_addresses = np.zeros(len(ct_data.lat),dtype=str)

for idx,loc in enumerate(tqdm(ct_data.lat)):
    try:
        ct_addresses[idx]=(locator.reverse(str(ct_data.lat[idx])+','+str(ct_data.lng[idx])))
    except:
        ct_addresses[idx]='NA'
    print(ct_addresses)
    break
#ets_data['point'] = ets_data['location'].apply(lambda loc: tuple(loc.point) if loc else None)
#ets_data[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['point'].tolist(), index=df.index)

  0%|                                                                                                                                             | 0/992200 [00:00<?, ?it/s]

['N' '' '' ... '' '' '']





# Generate distance between all pairs of CT lat,lng and ETS lat,lng in same country

## TODO:
* Split CT and ETS into country blocks
* For each matching country block, calculate haversine

In [None]:
ets_country_blocks = []
ct_country_blocks = []

all_countries_ets = ct_data_filterETS['Code2'].unique()
all_countries_ct = ets_data['Acquiring.Holder.CountryCode'].unique()

for country in all_countries_ets:
    

In [34]:
haversine_vector(ETS_countryblock, CT_countryblock, Unit.KILOMETERS, comb=True)
len(ct_data_filterETS)*len(latlngs)

923560

13201