In [101]:
import pandas as pd
import numpy as np
import re
import json

In [102]:
COL_DICT = {'jahr': int,
 'kennnummer': str,
 'betriebsname': str,
 'betriebsname_2': str,
 'plz': str,
 'ort': str,
 'strasse': str,
 'hausnr': str,
 'bundesland': str,
 'flusseinzugsgebiet': str,
 'geo_lat_wgs84': np.float64,
 'geo_long_wgs84': np.float64,
 'taet_nr': str,
 'taetigkeit': str,
 'activity': str,
 'haupttaetigkeit': str,
 'branche': str,
 'sector': str,
 'nace_id': int,
 'nace_wirtschaftszweig': str,
 'nace_sector': str,
 'stoffgruppe': str,
 'substances_group': str,
 'schadstoff': str,
 'pollutant': str,
 'umweltkompartiment': str,
 'releases_to': str,
 'jahresfracht_freisetzung': float,
 'versehentliche_freisetzung': float,
 'schadstoff_schwellenwert': float,
 'einheit': str,
 'unit': str,
 'bestimmungsmethode': str,
 'determination_method': str,
 'schutzgrund_fracht': str,
 'confidential_reason_release': str,
 'schutzgrund_betrieb': str,
 'confidential_reason_facility': str}

DROP_LIST = ['flusseinzugsgebiet', 'schutzgrund_fracht', 'confidential_reason_release', 'schutzgrund_betrieb', 'confidential_reason_facility', 'versehentliche_freisetzung', 'bestimmungsmethode', 'determination_method', 'einheit', 'umweltkompartiment', 'haupttaetigkeit', 'taetigkeit', 'nace_wirtschaftszweig', 'schadstoff', 'stoffgruppe', 'branche', 'schadstoff_schwellenwert', 'blockid', 'geo_lat_wgs84', 'geo_long_wgs84']

In [103]:
raw0 = pd.read_csv("../data/2020-05-19_PRTR-Deutschland_Freisetzungen.csv", encoding="iso-8859-1", sep=";", dtype=COL_DICT)

In [104]:
raw1 = raw0.rename(columns={"jahr": "year", "betriebsname_2": "plantname", "betriebsname": "company", "kennnummer": "plantid", "ort": "place", "strasse": "street", "bundesland": "federalstate", "jahresfracht_freisetzung": "amount"})

In [105]:
raw1['plantid'] = raw1['plantid'].apply(lambda x: str(x).replace('/', '_'))

In [106]:
#raw0 = pd.read_csv("../data/2019-11-25_PRTR-Deutschland_Freisetzungen.csv", encoding="iso-8859-1", error_bad_lines=False, engine="c", sep=";", low_memory=False)

In [107]:
#raw0

In [108]:
bpm = pd.read_csv("../basic/block_plant_mapper.csv")
bpm['plantid'] = bpm['plantid'].apply(lambda x: str(x).replace('/', '_'))
bpm.drop_duplicates(["plantid"], inplace=True)

In [109]:
#raw1.loc[raw1.plantid == "03-01-01012110180"]

In [110]:
raw2 = pd.merge(raw1, bpm, how="inner", on="plantid")

In [111]:
raw2.year = raw2.year.astype(int)

In [112]:
raw3 = raw2.drop(DROP_LIST, axis=1)

In [113]:
#raw3.loc[raw3.plantid == "03-01-01012110180"]

In [114]:
p = "PCDD + PCDF (dioxins + furans) (as Teq)"

In [115]:
q = "Trichlorobenzenes (TCBs) (all isomers)"
z = "Total organic carbon (TOC) (as total C or COD/3)"

In [116]:
re.findall("([A-Z]{2,}[A-Za-z]*?)\)",q)[-1]

'TCBs'

In [117]:
t = "Kraftwerk Voerde OHG der STEAG GmbH und RWE"

In [118]:
match_list = ["Vattenfall", "RWE", "Uniper", "LEAG", "EnBW", "Steag"]

In [119]:
raw4 = raw3.sort_values(by="amount", ascending=False)

In [120]:
raw3.shape

(11072, 19)

In [121]:
raw4.shape

(11072, 19)

In [122]:
def fix_company(company):
    for name in match_list:
        if name in company:
            return name
        else:
            return company

In [123]:
def fix_pollutant(pol):
    match = re.findall("([A-Z]{1,}[0-9]*[A-Za-z]+[0-9]*?)\)",pol)
    if match:
        return match[-1]
    else:
        return pol

In [124]:
fix_pollutant(q)

'TCBs'

In [125]:
unit_2_dict = {9: "Mio. t", 6: "Tsd. t", 3: "t", 1: "kg", -3: "g", -6: "mg"}
amount_dict = {"CO2": 9, "CO": 6, "NO2": 6, "Cl": 6, "SO2": 6, "Total nitrogen": 6, "PM10": 6, "HCl": 3, "HF": 3, "N2O": 6, "Benzene": 3, "Pb": 3, "Zn": 3, "Benzene": 3, "Fluorides (as total F)": 3, "NMVOC": 3, "NH3": 3, "TOC": 3, "CH4": 3, "AOX": 3, "Total phosphorus": 3, "HCFCs": 3, "HFCs": 3, "DCE": 3, "Phenols (as total C)": 3, "Cu": 3, "PER": 3, "Cr": 3, "Trichloroethylene": 3, "Trichloromethane": 3, "TCM": 3, "CN": 3, "Ni": 3, "DCM": 3, "HCN": 3, "As": 3, "Hg": 3}
neg_dict = {"Teq": 6}

In [126]:
raw4['group'] = raw4['company'].apply(lambda x: fix_company(x))
raw4['pollutant'] = raw4['pollutant'].apply(lambda x: fix_pollutant(x))

In [127]:
raw5 = raw4.copy()

In [128]:
#raw4

In [85]:
raw5['potency'] = raw5['pollutant'].apply(lambda x: amount_dict.get(x, 1))

In [86]:
raw5['neg_potency'] = raw5['pollutant'].apply(lambda x: neg_dict.get(x, 0))

In [87]:
raw5['unit_2'] = raw5['potency'].apply(lambda x: unit_2_dict[x])

In [88]:
raw5['amount_2'] = raw5['amount'] / 10**(raw5['potency'])
raw5['amount_2'] = raw5['amount_2'] * 10**(raw5['neg_potency'])

In [89]:
raw5.loc[raw5['pollutant'] == "Teq"] = raw5.loc[raw5['pollutant'] == "Teq"].replace("kg", "mg")

In [90]:
#raw5

In [91]:
DROP_LIST = ['plz', 'company','plantname', 'group', 'federalstate',  'place', 'street', 'hausnr', 'taet_nr', 'nace_id', 'substances_group', 'unit', 'activity', 'sector', 'nace_sector', 'neg_potency']

In [92]:
raw6 = raw5.drop(DROP_LIST, axis=1)

In [93]:
raw6['pollutant2'] = raw6['pollutant'] + ' [' + raw6['unit_2'] + ']'

In [94]:
raw6

Unnamed: 0,year,plantid,pollutant,releases_to,amount,potency,unit_2,amount_2,pollutant2
2343,2013,06-05-100-0248923,CO2,Air,3.330000e+10,9,Mio. t,33.3,CO2 [Mio. t]
2354,2014,06-05-100-0248923,CO2,Air,3.240000e+10,9,Mio. t,32.4,CO2 [Mio. t]
2396,2018,06-05-100-0248923,CO2,Air,3.220000e+10,9,Mio. t,32.2,CO2 [Mio. t]
2365,2015,06-05-100-0248923,CO2,Air,3.210000e+10,9,Mio. t,32.1,CO2 [Mio. t]
3613,2007,06-05-300-0326774,CO2,Air,3.130000e+10,9,Mio. t,31.3,CO2 [Mio. t]
...,...,...,...,...,...,...,...,...,...
529,2015,03-03-03030273580,Teq,Water,1.310000e-04,1,mg,13.1,Teq [mg]
9218,2015,12-40710010000,Teq,Air,1.300000e-04,1,mg,13.0,Teq [mg]
2580,2012,06-05-100-0431554,Teq,Air,1.300000e-04,1,mg,13.0,Teq [mg]
2589,2013,06-05-100-0431554,Teq,Air,1.160000e-04,1,mg,11.6,Teq [mg]


In [95]:
m1 = raw6.duplicated(['year', 'pollutant', 'plantid', 'releases_to'])
m2 = raw6.duplicated(['year', 'pollutant', 'plantid'])
m3 = raw6.duplicated(['year', 'pollutant', 'plantid', 'releases_to'])

In [96]:
raw7 = raw6.sort_values(['plantid', 'year', 'potency'], ascending=[True, False, False])

In [41]:
#raw7

In [42]:
raw7.to_csv("pollutants_pg.csv", index=True, header=False)
raw7.to_csv("pollutants.csv", index=False)
raw7.to_csv("pollutants_nh.csv", index=False, header=False)

In [43]:
raw7.loc[raw7.plantid == "14-70-46630660001"]

Unnamed: 0,year,plantid,pollutant,releases_to,amount,potency,unit_2,amount_2,pollutant2
10555,2018,14-70-46630660001,NO2,Air,13400000.0,6,Tsd. t,13.400,NO2 [Tsd. t]
10552,2018,14-70-46630660001,SO2,Air,10900000.0,6,Tsd. t,10.900,SO2 [Tsd. t]
10553,2018,14-70-46630660001,CO,Air,5780000.0,6,Tsd. t,5.780,CO [Tsd. t]
10563,2018,14-70-46630660001,PM10,Air,350000.0,6,Tsd. t,0.350,PM10 [Tsd. t]
10554,2018,14-70-46630660001,N2O,Air,205000.0,6,Tsd. t,0.205,N2O [Tsd. t]
...,...,...,...,...,...,...,...,...,...
10431,2008,14-70-46630660001,N2O,Air,169000.0,6,Tsd. t,0.169,N2O [Tsd. t]
10437,2008,14-70-46630660001,HCl,Air,129000.0,3,t,129.000,HCl [t]
10436,2008,14-70-46630660001,Pb,Air,243.0,3,t,0.243,Pb [t]
10434,2008,14-70-46630660001,Hg,Air,122.0,3,t,0.122,Hg [t]


In [44]:
bpm.loc[bpm.plantid == "14-70-46630660001"]

Unnamed: 0,plantid,blockid
5,14-70-46630660001,BNA0124


In [45]:
#raw2.loc[raw2.plantid == "06-04-11_2039669_2_0"]

In [46]:
#raw6[m2].groupby("pollutant").max()

In [47]:
#raw6[m2].groupby("pollutant").min()

In [48]:
#raw5.loc[raw5["pollutant"] == "Teq"].sort_values("amount", ascending=False)

In [49]:
#raw4.loc[raw4['year'] == 2017].loc[raw4['plantid'] == '06-05-100-0248923'].sort_values("amount", ascending=False)

In [50]:
#raw4.loc[raw4['year'] == 2017].loc[raw4['plantid'] == '12-40710010000'].sort_values("amount", ascending=False) #12-40710010000 #06-05-300-0923949

In [51]:
#raw4.loc[raw4['year'] == 2009].loc[raw4['plantid'] == '06-05-100-0431554'].sort_values("amount", ascending=False)

In [52]:
#raw4.groupby("pollutant").sum()

In [53]:
#raw3.groupby("pollutant").sum()

In [54]:
#raw4

In [55]:
raw7.groupby('releases_to').min()

Unnamed: 0_level_0,year,plantid,pollutant,amount,potency,unit_2,amount_2,pollutant2
releases_to,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Air,2007,01-50-01000004838,As,0.000115,1,Mio. t,0.01,As [t]
Land,2008,06-90031210632,AOX,15.0,3,Tsd. t,0.015,AOX [t]
Water,2007,03-01-01241117210,AOX,0.000131,1,Tsd. t,0.00102,AOX [t]


In [56]:
raw7

Unnamed: 0,year,plantid,pollutant,releases_to,amount,potency,unit_2,amount_2,pollutant2
44,2018,01-50-01000004838,CO2,Air,602000000.0,9,Mio. t,0.602000,CO2 [Mio. t]
46,2018,01-50-01000004838,SO2,Air,208000.0,6,Tsd. t,0.208000,SO2 [Tsd. t]
45,2018,01-50-01000004838,NO2,Air,196000.0,6,Tsd. t,0.196000,NO2 [Tsd. t]
47,2018,01-50-01000004838,HCl,Air,46100.0,3,t,46.100000,HCl [t]
40,2017,01-50-01000004838,CO2,Air,597000000.0,9,Mio. t,0.597000,CO2 [Mio. t]
...,...,...,...,...,...,...,...,...,...
10194,2008,18042,Zn,Water,272.0,3,t,0.272000,Zn [t]
10195,2008,18042,Pb,Water,32.5,3,t,0.032500,Pb [t]
10191,2007,18042,CO2,Air,209038000.0,9,Mio. t,0.209038,CO2 [Mio. t]
10190,2007,18042,Cl,Water,77671600.0,6,Tsd. t,77.671600,Cl [Tsd. t]
