In [89]:
import pandas as pd
import numpy as np
import re
import json

In [90]:
# Column name -> dtype mapping handed to ``pd.read_csv(dtype=...)`` when the
# release export is loaded. Identifier-like columns (postal code, plant id,
# activity number) stay strings; only the year, the NACE id and the measured
# quantities are numeric.
COL_DICT = {
    "jahr": int,
    "kennnummer": str,
    "betriebsname": str,
    "betriebsname_2": str,
    "plz": str,
    "ort": str,
    "strasse": str,
    "hausnr": str,
    "bundesland": str,
    "flusseinzugsgebiet": str,
    "geo_lat_wgs84": np.float64,
    "geo_long_wgs84": np.float64,
    "taet_nr": str,
    "taetigkeit": str,
    "activity": str,
    "haupttaetigkeit": str,
    "branche": str,
    "sector": str,
    "nace_id": int,
    "nace_wirtschaftszweig": str,
    "nace_sector": str,
    "stoffgruppe": str,
    "substances_group": str,
    "schadstoff": str,
    "pollutant": str,
    "umweltkompartiment": str,
    "releases_to": str,
    "jahresfracht_freisetzung": float,
    "versehentliche_freisetzung": float,
    "schadstoff_schwellenwert": float,
    "einheit": str,
    "unit": str,
    "bestimmungsmethode": str,
    "determination_method": str,
    "schutzgrund_fracht": str,
    "confidential_reason_release": str,
    "schutzgrund_betrieb": str,
    "confidential_reason_facility": str,
}

# Columns removed from the merged frame in the first drop step. 'blockid' is
# not a key of COL_DICT, so it has to come from the frame merged in later.
DROP_LIST = [
    "flusseinzugsgebiet",
    "schutzgrund_fracht",
    "confidential_reason_release",
    "schutzgrund_betrieb",
    "confidential_reason_facility",
    "versehentliche_freisetzung",
    "bestimmungsmethode",
    "determination_method",
    "einheit",
    "umweltkompartiment",
    "haupttaetigkeit",
    "taetigkeit",
    "nace_wirtschaftszweig",
    "schadstoff",
    "stoffgruppe",
    "branche",
    "schadstoff_schwellenwert",
    "blockid",
    "geo_lat_wgs84",
    "geo_long_wgs84",
]

In [91]:
# Load the raw release export: semicolon-separated, ISO-8859-1 encoded, with
# the column dtypes pinned by COL_DICT.
raw0 = pd.read_csv(
    "../data/2019-11-25_PRTR-Deutschland_Freisetzungen.csv",
    sep=";",
    encoding="iso-8859-1",
    dtype=COL_DICT,
)

In [92]:
# Give the columns used downstream the English names the rest of the notebook
# refers to; all other columns keep their original names.
raw1 = raw0.rename(
    columns={
        "jahr": "year",
        "betriebsname_2": "plantname",
        "betriebsname": "company",
        "kennnummer": "plantid",
        "ort": "place",
        "strasse": "street",
        "bundesland": "federalstate",
        "jahresfracht_freisetzung": "amount",
    }
)

In [93]:
#raw0 = pd.read_csv("../data/2019-11-25_PRTR-Deutschland_Freisetzungen.csv", encoding="iso-8859-1", error_bad_lines=False, engine="c", sep=";", low_memory=False)

In [94]:
#raw0

In [95]:
# Block/plant lookup table, reduced to one row per plant (first occurrence
# wins). ``plantid`` is read as text on purpose: the release table's key is
# loaded with dtype ``str`` (see COL_DICT['kennnummer']), and letting pandas
# infer the dtype here could turn purely numeric ids into integers that would
# not line up with the string keys in the later merge.
# The de-duplication is chained instead of done with ``inplace=True`` so that
# re-running the cell always starts from the file contents.
bpm = pd.read_csv(
    "../basic/block_plant_mapper.csv",
    dtype={"plantid": str},
).drop_duplicates(["plantid"])

In [96]:
# Inner join: only releases whose plant id appears in the lookup table survive.
raw2 = raw1.merge(bpm, on="plantid", how="inner")

In [97]:
# Cast the reporting year to a plain integer column.
raw2["year"] = raw2["year"].astype(int)

In [98]:
# Remove the columns named in DROP_LIST; raw2 itself is left untouched.
raw3 = raw2.drop(columns=DROP_LIST)

In [99]:
# Sample pollutant name with two parenthesised groups; not referenced by any
# other cell visible in this notebook.
p = "PCDD + PCDF (dioxins + furans) (as Teq)"

In [100]:
# Sample pollutant names for trying out the abbreviation regex.
# ``q`` is the input of the regex check and of the fix_pollutant check below;
# ``z`` is not referenced by any other cell visible in this notebook.
q = "Trichlorobenzenes (TCBs) (all isomers)"
z = "Total organic carbon (TOC) (as total C or COD/3)"

In [101]:
# Scratch check of the abbreviation pattern on the sample name ``q``: take the
# last run of two or more capitals (plus optional trailing letters) that sits
# directly before a closing parenthesis.
# The pattern is a raw string so that ``\)`` reaches the regex engine without
# Python flagging it as an invalid string escape.
re.findall(r"([A-Z]{2,}[A-Za-z]*?)\)", q)[-1]

'TCBs'

In [102]:
# Sample company name that mentions two operators, one of them ("STEAG") in
# upper case; not referenced by any other cell visible in this notebook.
t = "Kraftwerk Voerde OHG der STEAG GmbH und RWE"

In [103]:
# Operator names that fix_company looks for inside a company string; the list
# order is the order in which they are tried.
match_list = [
    "Vattenfall",
    "RWE",
    "Uniper",
    "LEAG",
    "EnBW",
    "Steag",
]

In [104]:
# Largest releases first.
raw4 = raw3.sort_values("amount", ascending=False)

In [105]:
def fix_company(company, names=None):
    """Map a free-text company name to one of the known operator names.

    Parameters
    ----------
    company : str or any
        Company name as found in the data. Anything that is not a string
        (e.g. a missing value) yields ``None`` instead of raising.
    names : sequence of str, optional
        Operator names to look for, tried in order. Defaults to the
        module-level ``match_list``.

    Returns
    -------
    str or None
        The first operator name found in ``company``, or ``None`` when none
        matches.
    """
    if not isinstance(company, str):
        return None
    candidates = match_list if names is None else names

    # Pass 1: exact, case-sensitive substring match.
    for name in candidates:
        if name in company:
            return name

    # Pass 2: case-insensitive, whole-word match. This catches spellings such
    # as "STEAG" for "Steag". Whole words only, because a plain
    # case-insensitive substring test would also hit e.g. "rwe" inside
    # "Klärwerk".
    for name in candidates:
        if re.search(r"\b" + re.escape(name) + r"\b", company, flags=re.IGNORECASE):
            return name
    return None

In [106]:
# Abbreviation that sits directly before a closing parenthesis: one or more
# capitals, optional digits, at least one letter, optional trailing digits
# (e.g. "CO2", "PM10", "TCBs", "Teq"). Written as a raw string so ``\)`` is
# not treated as an invalid Python string escape.
_POLLUTANT_ABBR_RE = re.compile(r"([A-Z]{1,}[0-9]*[A-Za-z]+[0-9]*?)\)")


def fix_pollutant(pol):
    """Shorten a pollutant name to the abbreviation given in parentheses.

    Parameters
    ----------
    pol : str or any
        Full pollutant name, e.g. ``"Trichlorobenzenes (TCBs) (all isomers)"``.

    Returns
    -------
    The last abbreviation that directly precedes a ``)`` in ``pol``
    (``"TCBs"`` for the example above). When there is no such abbreviation,
    or when ``pol`` is not a string (e.g. a missing value), ``pol`` is
    returned unchanged.
    """
    if not isinstance(pol, str):
        return pol
    found = _POLLUTANT_ABBR_RE.findall(pol)
    return found[-1] if found else pol

In [107]:
# Quick check of fix_pollutant on the sample name ``q``.
fix_pollutant(q)

'TCBs'

In [108]:
# Potency code -> unit label shown next to the rescaled amount.
# NOTE(review): code 1 is labelled "kg" although 10**1 == 10; check how the
# cells that divide by 10**potency treat this code.
unit_2_dict = {
    9: "Mio. t",
    6: "Tsd. t",
    3: "t",
    1: "kg",
    -3: "g",
    -6: "mg",
}

# Pollutant (after abbreviation) -> potency code, i.e. the key into
# unit_2_dict. Pollutants not listed here get a default at lookup time.
amount_dict = {
    # code 9 -> "Mio. t"
    "CO2": 9,
    # code 6 -> "Tsd. t"
    "CO": 6,
    "NO2": 6,
    "Cl": 6,
    "SO2": 6,
    "Total nitrogen": 6,
    "PM10": 6,
    "N2O": 6,
    # code 3 -> "t"
    "HCl": 3,
    "HF": 3,
    "Benzene": 3,
    "Pb": 3,
    "Zn": 3,
    "Fluorides (as total F)": 3,
    "NMVOC": 3,
    "NH3": 3,
    "TOC": 3,
    "CH4": 3,
    "AOX": 3,
    "Total phosphorus": 3,
    "HCFCs": 3,
    "HFCs": 3,
    "DCE": 3,
    "Phenols (as total C)": 3,
    "Cu": 3,
    "PER": 3,
    "Cr": 3,
    "Trichloroethylene": 3,
    "Trichloromethane": 3,
    "TCM": 3,
    "CN": 3,
    "Ni": 3,
    "DCM": 3,
    "HCN": 3,
    "As": 3,
    "Hg": 3,
}

# Pollutant -> power of ten by which its rescaled amount is multiplied again.
neg_dict = {"Teq": 6}

In [109]:
# Derive the operator group from the company name and replace the long
# pollutant names by their abbreviations (element-wise on each column).
raw4['group'] = raw4['company'].apply(fix_company)
raw4['pollutant'] = raw4['pollutant'].apply(fix_pollutant)

In [110]:
# Work on an independent copy so the unit columns added below do not end up
# in raw4.
raw5 = raw4.copy(deep=True)

In [111]:
#raw4

In [112]:
# Potency code per row; pollutants missing from amount_dict get code 1.
raw5['potency'] = raw5['pollutant'].map(lambda name: amount_dict.get(name, 1))

In [113]:
# Extra power of ten per row; 0 for every pollutant not listed in neg_dict.
raw5['neg_potency'] = raw5['pollutant'].map(lambda name: neg_dict.get(name, 0))

In [114]:
# Unit label for each potency code; an unknown code raises KeyError.
raw5['unit_2'] = raw5['potency'].map(unit_2_dict.__getitem__)

In [115]:
# Express ``amount`` in the unit named by ``unit_2``: divide by 10**potency.
# The labels for codes 3/6/9 ("t", "Tsd. t", "Mio. t") imply ``amount`` is in
# kg. Potency code 1 is the label for plain "kg" (and the default for unlisted
# pollutants), i.e. "no rescaling", so it must act as exponent 0 -- dividing
# by 10**1 would report every kg value ten times too small.
scale_exponent = raw5['potency'].replace(1, 0)
raw5['amount_2'] = raw5['amount'] / 10**scale_exponent
# Pollutants with a non-zero neg_potency are scaled up again by that power of
# ten (all other rows are multiplied by 10**0 == 1).
raw5['amount_2'] = raw5['amount_2'] * 10**(raw5['neg_potency'])

In [116]:
# "Teq" rows carry an extra 10**6 factor in amount_2, so their display unit is
# relabelled from "kg" to "mg". Only the ``unit_2`` label is rewritten: a
# blanket ``.replace("kg", "mg")`` over whole rows would also alter any other
# column that happens to hold the value "kg" (e.g. the source ``unit`` column,
# which describes the untouched ``amount``).
teq_rows = raw5['pollutant'] == "Teq"
raw5.loc[teq_rows & (raw5['unit_2'] == "kg"), 'unit_2'] = "mg"

In [135]:
#raw5

In [136]:
# Columns removed in the second drop step, leaving only the fields written to
# the export.
# NOTE(review): this rebinds the DROP_LIST name used for the first drop step,
# so re-running that earlier cell after this one would use this list instead.
DROP_LIST = [
    'plz',
    'company',
    'plantname',
    'group',
    'federalstate',
    'place',
    'street',
    'hausnr',
    'taet_nr',
    'nace_id',
    'substances_group',
    'unit',
    'activity',
    'sector',
    'nace_sector',
    'neg_potency',
]

In [137]:
# Drop the columns currently named in DROP_LIST; raw5 is left untouched.
raw6 = raw5.drop(columns=DROP_LIST)

In [138]:
# Boolean masks flagging rows that repeat an earlier row on the given key.
# m1: same year, pollutant, plant and release medium.
m1 = raw6.duplicated(subset=['year', 'pollutant', 'plantid', 'releases_to'])
# m2: same year, pollutant and plant, regardless of release medium.
m2 = raw6.duplicated(subset=['year', 'pollutant', 'plantid'])
# m3: computed on the same key as m1.
m3 = raw6.duplicated(subset=['year', 'pollutant', 'plantid', 'releases_to'])

In [139]:
# Final ordering: by plant, newest year first, pollutants alphabetically
# within each year.
raw7 = raw6.sort_values(
    by=['plantid', 'year', 'pollutant'],
    ascending=[True, False, True],
)

In [140]:
#raw7

In [123]:
# Write the same frame three times, differing only in index/header handling:
#   pollutants_pg.csv - row index kept, no header line
#   pollutants.csv    - header line, no row index
#   pollutants_nh.csv - neither header line nor row index
raw7.to_csv("pollutants_pg.csv", header=False, index=True)
raw7.to_csv("pollutants.csv", index=False)
raw7.to_csv("pollutants_nh.csv", header=False, index=False)

In [124]:
#raw6[m2].groupby("pollutant").max()

In [125]:
#raw6[m2].groupby("pollutant").min()

In [126]:
#raw5.loc[raw5["pollutant"] == "Teq"].sort_values("amount", ascending=False)

In [127]:
#raw4.loc[raw4['year'] == 2017].loc[raw4['plantid'] == '06-05-100-0248923'].sort_values("amount", ascending=False)

In [128]:
#raw4.loc[raw4['year'] == 2017].loc[raw4['plantid'] == '12-40710010000'].sort_values("amount", ascending=False) #12-40710010000 #06-05-300-0923949

In [129]:
#raw4.loc[raw4['year'] == 2009].loc[raw4['plantid'] == '06-05-100-0431554'].sort_values("amount", ascending=False)

In [130]:
#raw4.groupby("pollutant").sum()

In [131]:
#raw3.groupby("pollutant").sum()

In [132]:
#raw4

In [133]:
# Column-wise minimum per release medium; each column is minimised on its
# own, so a displayed row does not correspond to a single record.
raw7.groupby(by='releases_to').min()

Unnamed: 0_level_0,year,plantid,pollutant,amount,potency,unit_2,amount_2
releases_to,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Air,2007,01-50-01000004838,As,0.000115,1,Mio. t,0.01
Land,2008,06-90031210632,AOX,15.0,3,Tsd. t,0.015
Water,2007,03-01-01241117210,AOX,0.000131,1,Tsd. t,0.00102


In [134]:
# Display the exported frame for a visual check.
raw7

Unnamed: 0,year,plantid,pollutant,releases_to,amount,potency,unit_2,amount_2
40,2017,01-50-01000004838,CO2,Air,597000000.0,9,Mio. t,0.597000
43,2017,01-50-01000004838,HCl,Air,33600.0,3,t,33.600000
41,2017,01-50-01000004838,NO2,Air,174000.0,6,Tsd. t,0.174000
42,2017,01-50-01000004838,SO2,Air,189000.0,6,Tsd. t,0.189000
36,2016,01-50-01000004838,CO2,Air,633000000.0,9,Mio. t,0.633000
...,...,...,...,...,...,...,...,...
7998,2008,18042,Pb,Water,32.5,3,t,0.032500
7997,2008,18042,Zn,Water,272.0,3,t,0.272000
7994,2007,18042,CO2,Air,209038000.0,9,Mio. t,0.209038
7993,2007,18042,Cl,Water,77671600.0,6,Tsd. t,77.671600
