In [5]:
import pandas as pd
from pandas.io.parsers import ParserError
import numpy as np
from helper import get_mapper
import json
from sqlalchemy import create_engine
import os
import re

In [6]:
from os import listdir, stat
from os.path import isfile, join

In [7]:
import datetime
from calendar import isleap

In [8]:
# Size threshold compared against os.stat().st_size in get_files_from_folder:
# only files strictly larger than this are kept.
MIN_SIZE = 512
# Directory that is scanned for the data sub-folders (see FOLDERS).
BASE_DIR = "."
# Directory listed by get_fix_files to find the cleaned CSV files.
FIX_DIR = "fix"

In [9]:
def get_fix_files(BASE_DIR):
    """Return the sorted names of the regular files directly inside ``BASE_DIR``.

    Sub-directories are skipped; the returned entries are bare file names,
    not paths.
    """
    return sorted(
        entry
        for entry in os.listdir(BASE_DIR)
        if os.path.isfile(join(BASE_DIR, entry))
    )

In [10]:
def get_files_from_folder(folder):
    """List the files in ``folder`` that are larger than ``MIN_SIZE`` bytes.

    Returns sorted paths of the form ``folder + "/" + name``; directories and
    files whose size is not strictly above ``MIN_SIZE`` are left out.
    """
    candidates = sorted(
        folder + "/" + entry
        for entry in listdir(folder)
        if isfile(join(folder, entry))
    )
    return [path for path in candidates if stat(path).st_size > MIN_SIZE]

In [11]:
# Data folders are named after the period they cover ("2015", "2016_17", ...).
# Select them by name instead of by position: a positional slice of the sorted
# directory listing silently picks the wrong folders as soon as any other
# directory is added to or removed from BASE_DIR.
FOLDERS = sorted(
    os.path.join(BASE_DIR, o)
    for o in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, o)) and re.fullmatch(r"\d{4}(_\d{2})?", o)
)


In [12]:
# Reference timeline: one row per hour starting 2015-01-01 00:00 and spanning
# five years plus one leap day.
hours = 24 * (5 * 365 + 1)
base = datetime.datetime(2015, 1, 1)
date_list = list(map(lambda x: base + datetime.timedelta(hours=x), range(hours)))
COMPLETE = pd.DataFrame({'produced_at': date_list})
COMPLETE['produced_at'] = pd.to_datetime(COMPLETE['produced_at'])

In [13]:
FOLDERS

['./2015', './2016_17', './2018_19']

In [14]:
# Plant/block lookup used by get_columns as mapper[plant][block].
# NOTE(review): presumably parsed from the given JSON file by helper.get_mapper - confirm.
mapper = get_mapper('plantmapper.json')

In [15]:
# FILES_L holds one list of data-file paths per folder; FILES is the same
# content flattened into a single list, folder order preserved.
FILES_L = list(map(get_files_from_folder, FOLDERS))
FILES = sum(FILES_L, [])

In [16]:
#FILES

In [17]:
def get_block(col):
    """Extract the block name from a generation column header.

    Takes the text following the first ``"Generation_DE "`` marker and cuts it
    at the first ``"[MW]"`` (if present). Raises ``IndexError`` when the marker
    is missing.
    """
    pieces = col.split("Generation_DE ")
    after_marker = pieces[1]
    return after_marker.split('[MW]')[0]

In [18]:
def gen_powers(dirs):
    """Build one combined power frame from all data folders in ``dirs``.

    For every folder the data files are listed with ``get_files_from_folder``
    and converted with ``gen_power``; the per-folder frames are concatenated
    with a fresh index.

    Parameters
    ----------
    dirs : iterable of str
        Folder paths to process.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-folder results; an empty frame when ``dirs``
        is empty (``pd.concat`` would otherwise raise on an empty list).
    """
    frames = []
    for d in dirs:
        print(d)  # progress: which folder is being processed
        frames.append(gen_power(get_files_from_folder(d)))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [19]:
def get_msg(tup):
    """Turn the ``(corr_dates, corr_power)`` flags of ``is_valid_df`` into a message.

    Each flag is ``True`` when the corresponding check passed, so a problem is
    reported for every flag that is ``False``.

    Parameters
    ----------
    tup : tuple of (bool, bool)
        ``(corr_dates, corr_power)`` as returned by ``is_valid_df``.

    Returns
    -------
    str
        ``"Dates wrong"``, ``"Power wrong"``, both joined by ``", "``, or an
        empty string when both checks passed.
    """
    corr_dates, corr_power = tup
    problems = []
    if not corr_dates:
        problems.append("Dates wrong")
    if not corr_power:
        problems.append("Power wrong")
    return ", ".join(problems)

In [20]:
def validate_dirs(dirs):
    """Validate every data file in the given folders.

    Each file is read as a ``;``-separated CSV and checked with
    ``is_valid_df``. Files that cannot be read are recorded instead of
    aborting the run.

    Parameters
    ----------
    dirs : iterable of str
        Folder paths to scan with ``get_files_from_folder``.

    Returns
    -------
    list of (str, str)
        ``(file, message)`` pairs for every file that failed; the message is
        ``"ParserError"``, ``"UnicodeDecodeError"`` or the text produced by
        ``get_msg``.
    """
    errored_files = []
    for d in dirs:
        print(d)  # progress: which folder is being validated
        for file in get_files_from_folder(d):
            try:
                df = pd.read_csv(file, sep=";")
                result, errtup = is_valid_df(df)
            except ParserError:
                errored_files.append((file, "ParserError"))
                continue
            except UnicodeDecodeError:
                # gen_df treats badly encoded files as a per-file failure too;
                # one such file must not stop the whole validation run.
                errored_files.append((file, "UnicodeDecodeError"))
                continue
            if not result:
                errored_files.append((file, get_msg(errtup)))
    return errored_files

In [21]:
def df_correct_dates(df, name):
    """Check that every value in column ``name`` has exactly three dot-separated parts.

    Returns ``True`` when no row deviates, ``False`` otherwise.
    """
    part_counts = df[name].apply(lambda x: len(x.split(".")))
    malformed = df.loc[part_counts != 3]
    return malformed.shape[0] == 0

In [22]:
def df_correct_power(df):
    """Check that no value column contains a ``":"``.

    Every column from the third one onwards is cast to ``str``; a column is
    clean when none of its values splits into more than one part on ``":"``.
    Returns ``True`` only if all of those columns are clean.
    """
    value_cols = list(df)[2:]
    as_text = df.astype({col: str for col in value_cols})

    def _is_clean(col):
        part_counts = as_text[col].apply(lambda x: len(x.split(":")))
        return as_text.loc[part_counts != 1].shape[0] == 0

    # Evaluate every column (no short-circuit), then combine the verdicts.
    verdicts = [_is_clean(col) for col in value_cols]
    return all(verdicts)

In [23]:
def is_valid_df(df):
    """Run the date and power checks on a raw data frame.

    The date column is ``"Date"`` when present, otherwise ``"Datum"``.

    Returns
    -------
    tuple
        ``(overall, (corr_dates, corr_power))`` where ``overall`` is true only
        if both individual checks passed.
    """
    date = "Date" if "Date" in list(df) else "Datum"

    corr_dates = df_correct_dates(df, date)
    corr_power = df_correct_power(df)

    return (corr_dates and corr_power, (corr_dates, corr_power))

In [24]:
def get_name_from_file(s):
    """Return the part of ``s`` before its last ``'_2018'``; ``s`` itself if absent."""
    head, marker, _ = s.rpartition('_2018')
    return head if marker else s

In [25]:
def get_plant(f):
    """Derive the plant name from a data-file path.

    Uses the third ``/``-separated path component and strips its last three
    ``_``-separated fields.
    """
    basename = f.split("/")[2]
    pieces = basename.rsplit("_", 3)
    return pieces[0]

In [26]:
def get_columns(plant, cols):
    """Split ``cols`` into columns that map to a block id and columns that do not.

    For each column the block name (``get_block``) is looked up in
    ``mapper[plant]``. A missing key is reported on stdout and the column is
    treated as unmapped, as is any column whose mapped value is empty.

    Returns
    -------
    tuple
        ``(rename_map, unmapped)``: a dict from column name to block id, and
        the list of columns to drop.
    """
    rename_map = {}
    unmapped = []
    for column in cols:
        try:
            block_id = mapper[plant][get_block(column)]
        except KeyError:
            print("KeyError")
            print(plant)
            print(cols)
            print(column)
            block_id = ""

        if block_id:
            rename_map[column] = block_id
        else:
            unmapped.append(column)

    return rename_map, unmapped

In [27]:
def conv_to_dt(df, dayfirst=False):
    """Replace the date and time-of-day columns by a parsed ``produced_at`` column.

    The two text columns (``"Date"``/``"Time of day"`` or, when ``"Date"`` is
    absent, ``"Datum"``/``"Uhrzeit"``) are joined with a space and parsed with
    ``pd.to_datetime(errors='coerce')``. Rows with a duplicated timestamp keep
    only their first occurrence and rows whose timestamp could not be parsed
    are removed. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from a data file.
    dayfirst : bool, default False
        Passed to ``pd.to_datetime``. The default keeps the previous parsing.
        NOTE(review): ``df_correct_dates`` expects dot-separated dates; if they
        are ``DD.MM.YYYY``, ambiguous days (1-12) need ``dayfirst=True`` - confirm
        against the raw files.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by ``produced_at`` (last column).
    """
    date, tod = ("Date", "Time of day")
    if "Date" not in list(df):
        date, tod = ("Datum", "Uhrzeit")

    stamps = pd.to_datetime(df[date] + " " + df[tod], errors='coerce', dayfirst=dayfirst)
    # assign() works on a copy, so the caller's frame keeps its original columns
    converted = df.drop(columns=[date, tod]).assign(produced_at=stamps)
    deduplicated = converted.drop_duplicates(["produced_at"])
    # remove coerced errors (NaT); previously this result was computed but not returned
    return deduplicated.dropna(subset=['produced_at'])

In [28]:
def rename_to_blockid(df, name):
    """Rename the value columns of ``df`` to block ids for plant ``name``.

    All columns except the last one are passed to ``get_columns``; mapped
    columns are renamed and unmapped ones are dropped.
    """
    value_cols = list(df)[:-1]
    rename_map, unmapped = get_columns(name, value_cols)
    return df.rename(columns=rename_map).drop(columns=unmapped)

In [29]:
def fix_path(file):
    """Map a data-file path to its output path: ``"fix/"`` plus the third ``/``-separated component."""
    segments = file.split("/")
    filename = segments[2]
    return "fix/" + filename

In [30]:
def get_str_dict(array):
    """Return a dict mapping every element of ``array`` to the ``str`` type."""
    return dict.fromkeys(array, str)

def get_int_dict(array):
    """Return a dict mapping every element of ``array`` to the ``int`` type."""
    return dict.fromkeys(array, int)

In [31]:
def fix_num(x):
    """Normalise a single cell value.

    Floats are truncated to ``int``; values whose string form is numeric are
    returned unchanged; anything else is reduced to the concatenation of its
    digit characters.
    """
    if isinstance(x, float):
        return int(x)
    if not str(x).isnumeric():
        digits = re.findall(r"\d", x)
        return "".join(digits)
    return x

In [32]:
def gen_date_df(years):
    """Build an hourly ``produced_at`` frame covering the given years.

    The timeline starts on 1 January of ``years[0]`` and contains as many
    hourly steps as the listed years have hours (leap years counted with 366
    days).
    """
    first_year = years[0]
    total_hours = sum(24 * (366 if isleap(year) else 365) for year in years)
    start = datetime.datetime(first_year, 1, 1)
    stamps = [start + datetime.timedelta(hours=offset) for offset in range(total_hours)]
    frame = pd.DataFrame({'produced_at': stamps})
    frame['produced_at'] = pd.to_datetime(frame['produced_at'])
    return frame

In [33]:
def melt_to_power(df):
    """Reshape a wide frame (one column per block) into long form.

    Rows without ``produced_at`` are dropped, the remaining columns are melted
    into ``blockid``/``power`` pairs, missing power values become 0 and the
    ``power`` column is cast to ``int``.
    """
    long_form = (
        df.dropna(subset=["produced_at"])
          .melt(id_vars=["produced_at"], var_name='blockid', value_name='power')
    )
    long_form['power'] = long_form['power'].fillna(0)
    return long_form.astype({"power": int})

In [34]:
def gen_power(filelist):
    """Build the long-form power frame for the given data files.

    The files are combined into one wide frame by ``gen_df`` and reshaped with
    ``melt_to_power``.

    NOTE(review): this previously called ``conv_to_power``, which is not
    defined anywhere in the visible notebook; ``melt_to_power`` performs the
    wide-to-long step on the same input - confirm it is the intended
    replacement.
    """
    df = gen_df(filelist)
    return melt_to_power(df)

In [35]:
def gen_power3(files):
    """Merge the cleaned per-plant files onto the ``COMPLETE`` timeline.

    For every plant in ``FILES_DICT`` the cleaned CSVs under ``fix/`` are read,
    each one is right-merged onto the date frame at the same position in
    ``DATEDF_LIST``, and the pieces are stacked. The stacked plant frame is
    then left-merged onto the running result.

    Parameters
    ----------
    files
        Unused; the files are taken from the global ``FILES_DICT``. The
        parameter is kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The merged frame sorted by ``produced_at``.
    """
    result = COMPLETE
    # plant_files: do not reuse the name of the (unused) `files` parameter
    for plant, plant_files in FILES_DICT.items():
        frames = []
        for idx, f in enumerate(plant_files):
            df = pd.read_csv("fix/" + f, parse_dates=["produced_at"])
            # idx selects the date frame, i.e. the i-th file of a plant is
            # aligned with the i-th period in DATEDF_LIST
            frames.append(df.merge(DATEDF_LIST[idx], on='produced_at', how='right'))
        if not frames:
            continue  # nothing to merge for this plant
        # pd.concat replaces DataFrame.append, which newer pandas no longer provides
        final = pd.concat(frames, sort=False)
        result = pd.merge(result, final, how='left', on=['produced_at'])
    return result.sort_values(by=['produced_at'])

In [36]:
def gen_power2(files):
    """Stack cleaned files from ``fix/`` after aligning each with its date frame.

    The i-th file is right-merged onto ``DATEDF_LIST[i]`` so that timestamps
    missing from the file appear as rows with NaN values.

    Parameters
    ----------
    files : sequence of str
        File names relative to ``fix/``.

    Returns
    -------
    pandas.DataFrame
        All pieces stacked and sorted by ``produced_at``; an empty frame when
        ``files`` is empty.
    """
    frames = []
    for idx, f in enumerate(files):
        df = pd.read_csv("fix/" + f, parse_dates=["produced_at"])
        # use left for not filling missing values with na
        frames.append(df.merge(DATEDF_LIST[idx], on='produced_at', how='right'))

    if not frames:
        return pd.DataFrame()
    # pd.concat replaces DataFrame.append (no longer provided by newer pandas);
    # sorting once at the end gives the same order as sorting after every step
    return pd.concat(frames).sort_values(by=['produced_at'])

In [37]:
def get_year(f):
    """Return the second ``/``-separated component of path ``f`` (the period folder)."""
    components = f.split("/")
    return components[1]

In [38]:
def get_date_df_from_file(f):
    """Build the hourly date frame for the period folder that file ``f`` lives in."""
    period = get_year(f)
    years = get_years(period)
    return gen_date_df(years)

In [39]:
def get_years(yearstr):
    """Translate a period folder name into the list of years it covers.

    Raises ``ValueError`` for any name other than the three known periods.
    """
    known_periods = {
        "2015": (2015,),
        "2016_17": (2016, 2017),
        "2018_19": (2018, 2019),
    }
    if yearstr in known_periods:
        # hand out a fresh list on every call
        return list(known_periods[yearstr])
    raise ValueError("wrong data " + yearstr)

In [40]:
def get_fix_filename(f):
    """Return the second ``/``-separated component of ``f``."""
    components = f.split("/")
    return components[1]

In [41]:
def fill_to_int(df):
    """Fill missing values with 0 and cast every column but ``produced_at`` to ``int``.

    The NaN fill is written back into ``df`` itself; the integer cast is
    returned as a new frame. Raises ``ValueError`` if ``produced_at`` is not a
    column.
    """
    value_cols = list(df)
    value_cols.remove('produced_at')
    df[value_cols] = df[value_cols].fillna(0)
    return df.astype(get_int_dict(value_cols))

In [42]:
def fix_df(df):
    """Clean every column except ``produced_at`` and cast it to ``int``.

    Works on a copy of ``df``. The value columns are cast to ``str`` and then
    passed through three element-wise steps whose order matters:

    1. non-numeric values containing ``":"`` become ``"0"``;
    2. other non-numeric values are reduced to their digit characters, or 0
       when no digit is left;
    3. numeric strings shorter than four characters are converted to ``int``.

    Finally the columns are cast to ``int`` via ``get_int_dict``.
    Raises ``ValueError`` if ``produced_at`` is not a column.
    """
    headers = list(df)
    headers.remove('produced_at')
    #print(headers)
    powers2 = df.copy()
    powers2 = powers2.astype(get_str_dict(headers))
    powers2[headers] = powers2[headers].fillna("0")
    #powers2[headers] = powers2[headers].applymap(lambda x: int(x) if str(x).isnumeric() else x) # cast floats to int
    powers2[headers] = powers2[headers].applymap(lambda x: "0" if not str(x).isnumeric() and ":" in x else x) # remove dates from int column
    powers2[headers] = powers2[headers].applymap(lambda x: x if str(x).isnumeric() else "".join(re.findall(r"\d", x)) or 0) # remove . in ints
    # NOTE(review): the original comment here said "trunc to first 4 digits", but the
    # condition is len < 4, so only short numeric strings are touched (converted to
    # int) and longer ones pass through unchanged - confirm the intended behaviour.
    powers2[headers] = powers2[headers].applymap(lambda x: x if not (str(x).isnumeric() and len(str(x)) < 4) else int(str(x)[0:4]))
    powers2[headers] = powers2[headers].fillna(0)
    #powers2[headers] = powers2[headers].replace(r'^\s*$', 0, regex=True) # replace emptystrings with zero
    powers3 = powers2.copy()
    powers3 = powers3.astype(get_int_dict(headers))
    #powers3['produced_at'] = pd.to_datetime(powers3['produced_at'])
    return powers3

In [43]:
def gen_df(filelist):
    """Clean each data file, write it below ``fix/`` and merge all onto ``COMPLETE``.

    NOTE: a later cell defines ``gen_df(filelist, err_bl=True)``; once that
    cell has run it replaces this definition.

    Per file: read the CSV twice (first to learn the column names, then with
    every column after the first two forced to ``str``), build ``produced_at``
    with ``conv_to_dt``, rename columns with ``rename_to_blockid``, left-merge
    onto the file's date frame, clean with ``fix_df``, write the result to
    ``fix_path(file)`` and left-merge it onto the running frame.

    Files that raise ``ParserError``/``UnicodeDecodeError`` while reading are
    printed and skipped; files that raise ``ParserError``/``TypeError`` in
    ``conv_to_dt`` are skipped silently.

    ``tmp2`` is only assigned inside the loop, so an empty ``filelist`` (or a
    list in which every file is skipped) raises ``UnboundLocalError`` at the
    ``return``.
    """
    #powers = pd.DataFrame()
    tmp = COMPLETE
    for file in filelist[:]:
        
        refdf = get_date_df_from_file(file)
        
        #print(file)
        try:
            # first read: only used to obtain the column names
            df = pd.read_csv(file, sep=";", na_values=["-", ''])#, na_values=0)
            cols = list(df)
            str_cols = cols[2:len(cols)]
            dtdict = {}
            for s in str_cols:
                dtdict[s] = str
            # second read: value columns are kept as text for fix_df
            df = pd.read_csv(file, sep=";", na_values=["-", ''], dtype=dtdict)#, na_values=0)
        except (ParserError, UnicodeDecodeError):
            print(file)
            continue
        name = get_plant(file)
        try:
            df2 = conv_to_dt(df)
        except (ParserError, TypeError) as e:
            continue
        df3 = rename_to_blockid(df2, name)
        df4 = pd.merge(refdf, df3, how='left', on=['produced_at']) # fill nan values
        df5 = fix_df(df4)
        #return df5
        fixp = fix_path(file)
        df6 = df5.sort_values(by=['produced_at'])
        df6.to_csv(fixp, index=False)
        if tmp.empty:
            tmp = df6 # merge with itself if no other exists
        tmp = pd.merge(tmp, df6, how='left', on=['produced_at'])
        tmp2 = fill_to_int(tmp)
        #old = tmp.copy()
    return tmp2

In [46]:
def gen_df(filelist, err_bl=True):
    """Clean each data file, write it below ``fix/`` and merge all onto ``COMPLETE``.

    Per file: read the header to learn the column names, re-read the file with
    every column after the first two forced to ``str``, build ``produced_at``
    with ``conv_to_dt``, rename columns with ``rename_to_blockid``, left-merge
    onto the file's date frame, clean with ``fix_df``, write the result to
    ``fix_path(file)`` and left-merge it onto the running frame.

    Parameters
    ----------
    filelist : sequence of str
        Paths of the raw data files.
    err_bl : bool, default True
        Forwarded to ``pd.read_csv(error_bad_lines=...)``.
        NOTE(review): newer pandas versions replaced ``error_bad_lines`` with
        ``on_bad_lines``; this keyword ties the function to the pandas version
        the notebook was written for.

    Returns
    -------
    pandas.DataFrame
        ``produced_at`` plus one integer column per block, missing values
        filled with 0. When no file could be processed, a copy of the bare
        timeline is returned instead of failing on an unbound variable.
    """
    tmp = COMPLETE
    merged_any = False
    for file in filelist:
        refdf = get_date_df_from_file(file)

        try:
            # peek at the first rows only to obtain the column names
            head = pd.read_csv(file, sep=";", na_values=["-", ''], error_bad_lines=err_bl, nrows=5)
            # keep the value columns as text; fix_df cleans and converts them
            dtdict = {col: str for col in list(head)[2:]}
            df = pd.read_csv(file, sep=";", na_values=["-", ''], dtype=dtdict, error_bad_lines=err_bl)
        except (ParserError, UnicodeDecodeError) as e:
            print(e)
            print(file)
            continue

        name = get_plant(file)
        try:
            df2 = conv_to_dt(df)
        except (ParserError, TypeError) as e:
            # report the skipped file instead of dropping it silently
            print(e)
            print(file)
            continue

        df3 = rename_to_blockid(df2, name)
        df4 = pd.merge(refdf, df3, how='left', on=['produced_at'])  # fill nan values
        df5 = fix_df(df4)
        df6 = df5.sort_values(by=['produced_at'])
        df6.to_csv(fix_path(file), index=False)

        if tmp.empty:
            tmp = df6  # merge with itself if no other exists
        tmp = pd.merge(tmp, df6, how='left', on=['produced_at'])
        merged_any = True

    if not merged_any:
        # nothing was merged: hand back the timeline without touching COMPLETE
        return tmp.copy()
    # filling once at the end gives the same values as filling after every merge
    return fill_to_int(tmp)

In [47]:
# Re-load the plant/block lookup (same call as in the earlier cell), e.g. after editing the JSON file.
mapper = get_mapper('plantmapper.json')

In [None]:
#test = gen_df(FILES, True)

In [58]:
parta = gen_df(FILES)

KeyboardInterrupt: 

In [None]:
partb = melt_to_power(parta)

In [51]:
# Group the cleaned file names by plant: the key is the part of the file name
# before the first "_20", the value the list of that plant's files in
# listing order.
FILES_DICT = {}
for f in testfiles:
    name = f.split("_20")[0]
    FILES_DICT.setdefault(name, []).append(f)
    #FILES_DICT[name] = []
    #print(name)

NameError: name 'testfiles' is not defined

In [68]:
# One hourly date frame per data period, in period order.
DATEDF_LIST = list(map(gen_date_df, ([2015], [2016, 2017], [2018, 2019])))

In [53]:
CDF = gen_power3(FILES)

In [None]:
#CDF.sort_values(by='produced_at')

In [71]:
#CDF.loc[CDF['BNA1404']]

In [None]:
CDF2

In [None]:
partb

In [None]:
F2 = ["./2016_17/Buschhaus_201601010000_201712312345_146.csv", "./2016_17/Brokdorf_201601010000_201712312345_150.csv"]

In [None]:
test1 = gen_df(F2)

In [None]:
test1.sort_values(by="BNA0439", ascending=False)

In [None]:
test1.sort_values(by="BNA0439", ascending=False)

In [None]:
#pd.concat([DATEDF_LIST[0], test1, test1], sort=False).drop_duplicates(subset=['produced_at'], keep=False)

In [None]:
#FILES[0:3]

In [52]:
testfiles = get_fix_files(FIX_DIR)

In [65]:
#FILES_DICT

In [None]:
#testfiles

In [None]:
#DATEDF_LIST[2]

In [None]:
CDF = parta

In [74]:
CDF = pd.read_csv("CDF.csv", parse_dates=['produced_at'])

In [66]:
CDF2 = melt_to_power(CDFS)

In [67]:
#CDF = pd.read_csv("produced_power.csv", parse_dates=['produced_at'])

In [68]:
CDF.dtypes

produced_at    object
BNA0067         int64
BNA1404         int64
BNA0124         int64
BNA0123         int64
                ...  
BNA0413c        int64
BNA1071         int64
BNA1092         int64
BNA1091         int64
BNA1093         int64
Length: 250, dtype: object

In [69]:
CDF

Unnamed: 0,produced_at,BNA0067,BNA1404,BNA0124,BNA0123,BNA0122,BNA0115,BNA0116,BNA0157,BNA0172a,...,BNA1025,BNA0413b,BNA0415,BNA0414,BNA0413a,BNA0413c,BNA1071,BNA1092,BNA1091,BNA1093
0,2015-01-01 00:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015-01-01 01:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015-01-01 02:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015-01-01 03:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2015-01-01 04:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,2019-12-31 19:00:00,0,0,0,475,490,808,0,1326,0,...,306,0,0,0,0,0,0,0,0,0
43820,2019-12-31 20:00:00,0,0,0,478,500,850,0,1325,0,...,305,0,0,0,0,0,0,0,0,0
43821,2019-12-31 21:00:00,0,0,0,477,502,857,0,1326,0,...,306,0,0,0,0,0,0,0,0,0
43822,2019-12-31 22:00:00,0,63,0,477,503,858,0,1326,0,...,307,0,0,0,0,0,0,0,0,0


In [None]:
df = CDF2

In [None]:
df.to_csv("produced_power_pg.csv", index=True, header=False)
df.to_csv("produced_power.csv", index=False)
df.to_csv("produced_power_nh.csv", index=False, header=False)

In [64]:
CDF2

Unnamed: 0,produced_at,blockid,power
0,2015-01-01 00:00:00,BNA0067,0
1,2015-01-01 01:00:00,BNA0067,0
2,2015-01-01 02:00:00,BNA0067,0
3,2015-01-01 03:00:00,BNA0067,0
4,2015-01-01 04:00:00,BNA0067,0
...,...,...,...
10912171,2019-12-31 19:00:00,BNA1093,0
10912172,2019-12-31 20:00:00,BNA1093,0
10912173,2019-12-31 21:00:00,BNA1093,0
10912174,2019-12-31 22:00:00,BNA1093,0


In [None]:
#CDF

In [70]:
#CDF.to_csv("CDF.csv", index=False)

In [71]:
list(CDF2)

['produced_at', 'blockid', 'power']

In [75]:
#CDF.dtypes

In [76]:
# Per-block sums over yearly ('1Y') and monthly ('1M') bins of produced_at.
gy, gm = (CDF.resample(rule, on='produced_at').sum() for rule in ('1Y', '1M'))

In [77]:
# Turn the bin timestamp back into a column and seed 'year'/'month' with it;
# both are reduced to their actual values in the next cell.
gm2 = gm.reset_index()
gm2 = gm2.assign(year=gm2['produced_at'], month=gm2['produced_at'])

In [78]:
# Split the timestamp text on "-": the first field is the year (kept as a
# string), the second the month (stored as an integer).
gm2['year'] = [str(stamp).split("-")[0] for stamp in gm2['year']]
gm2['month'] = [int(str(stamp).split("-")[1]) for stamp in gm2['month']]
gm2['month'] = gm2['month'].astype(int)

In [76]:
# Order chronologically, then drop the timestamp now that year/month exist.
gm3 = gm2.sort_values(["year", "month"])
gm4 = gm3.drop('produced_at', axis=1)

In [77]:
# Long form: one row per (year, month, block) with an integer power value.
gm5 = (
    gm4.melt(id_vars=["year", "month"], var_name='blockid', value_name='power')
       .astype({'power': int})
)

In [78]:
gm5

Unnamed: 0,year,month,blockid,power
0,2015,1,BNA0067,70621
1,2015,2,BNA0067,68088
2,2015,3,BNA0067,66709
3,2015,4,BNA0067,53328
4,2015,5,BNA0067,76938
...,...,...,...,...
14935,2019,8,BNA1093,58135
14936,2019,9,BNA1093,58514
14937,2019,10,BNA1093,110960
14938,2019,11,BNA1093,151811


In [79]:
gm5.to_csv("monthly.csv", header=False)

In [None]:
gm4.to_csv("monthly.csv", header=False)

In [None]:
gm2[headers] = powers2[headers].applymap(lambda x: "0" if not str(x).isnumeric() and ":" in x else x) # remove dates from int column
powers2[headers] = powers2[headers].applymap(lambda x: x if str(x).isnumeric() else "".join(re.findall(r"\d", x)) or 0)

In [None]:
gm2

In [None]:
gm2 = gm.copy()

In [None]:
gm

In [None]:
# The expression was pasted twice on one line (plus a trailing comma), which
# is a syntax error; keep a single group-by on produced_at.
CDF2.groupby(CDF2['produced_at'])

In [None]:
CDF2

In [None]:
COMPLETE.shape

In [None]:
#FILES

In [None]:
#mask = CDF.duplicated(subset=['produced_at'])
#CDF[mask].sort_values(by=['produced_at'])

In [None]:
FT = testfiles[15:18]
testdf = gen_power2(FT)
#testdf

In [None]:
#set subtraction
#pd.concat([COMPLETE, df, df]).drop_duplicates(subset=['produced_at'], keep=False)

In [None]:
#a = gen_df([FILES[0]])

In [None]:
failed = gen_power(["./2016_17/Heizkraftwerk_Altbach_Deizisau_201601010000_201712312345_10.csv"])

In [None]:
mq = failed.duplicated(subset=['produced_at', 'blockid'])

In [None]:
#failed[mq]

In [None]:
a = (365 * 5 + 1) * 24

In [None]:
a

In [None]:
COMPLETE

In [None]:
base = datetime.datetime(2015, 1, 1)
date_list = [base + datetime.timedelta(hours=x) for x in range(0, a)]

In [None]:
date_list[a-1]

In [None]:
# Column name fixed ('producet_at' -> 'produced_at') so it matches the key
# used for every merge in this notebook.
datedf = pd.DataFrame(data={'produced_at': date_list})

In [None]:
datedf

In [None]:
tdf = pd.read_csv("./fix/./2015/Boxberg_201501010000_201512312345_71.csv")

In [None]:
tdf.dtypes

In [None]:
TEST = "1.234"

In [None]:
r1 = r"\d"
r2 = r"\\d"
r3 = r"\\\d"
r4 = "\\d"
r5 = "\\\\d"

In [None]:
test = re.findall(r"\d", TEST)

In [None]:
"".join(test)

In [None]:
alist = ['BNA0104', 'BNA0124']

In [None]:
tdf

In [None]:
tdf[alist] = tdf[alist].replace(['-'], [0])

In [None]:
errs = validate_dirs(FOLDERS)

In [None]:
errs

In [None]:
#df = gen_powers(FOLDERS)

In [None]:
mask = (df['produced_at'] > "2015-01-01 00:00:00") & (df['produced_at'] <= "2017-01-01 00:00:00")

In [None]:
df.loc[mask]

In [None]:
df = df.sort_values(by=["produced_at", "blockid"]).reset_index(drop=True)

In [None]:
mask = df.duplicated(subset=["produced_at", "blockid"])

In [None]:
df[mask]

In [None]:
fs = get_files_from_folder(FOLDERS[0])

In [None]:
bk = fs[1]

In [None]:
tf = pd.read_csv(bk, sep=";")#, na_values=0)

In [None]:
date, tod = ("Datum", "Uhrzeit")
tf["produced_at"] = tf[date] + " " + tf[tod]

In [None]:
tf.loc[tf.duplicated(['produced_at'], keep=False)]

In [None]:
tf.sort_values(by=['Datum'])

In [None]:
tf["produced_at"] = pd.to_datetime(tf['produced_at'])

In [None]:
tf

In [None]:
tf2 = conv_to_dt(tf)

In [None]:
# FOLDERS is sorted ('./2015', './2016_17', './2018_19'), so the period
# shortcuts follow the list order; the previous indices pointed F15 at the
# 2018_19 folder, F16 at 2015 and F18 at 2016_17.
F15 = FOLDERS[0]
F16 = FOLDERS[1]
F18 = FOLDERS[2]

In [None]:
F15

In [None]:
df.sort_values(by=['produced_at', 'blockid'])

In [None]:
df.loc[mask]

In [None]:
df

In [None]:
ft = pd.read_csv("faulty.csv", sep=';')

In [None]:
errs = validate_dirs(FOLDERS)

In [None]:
#errs

In [None]:
is_valid_df(ft)

In [None]:
ft.loc[ft['Datum'].apply(lambda x: len(x.split("."))) != 3].shape[0]

In [None]:
df = gen_powers(FOLDERS)

In [None]:
df = pd.read_csv("./2015/Bergkamen_201501010000_201512312345_28.csv", sep=";")#, na_values=0)

In [None]:
#df

In [None]:
files = get_files_from_folder(F15)
final = gen_df(files)

In [None]:
'''
KeyError
Heizkraftwerk_Dresden-Nossener_Br_cke
['Generation_DE Heizkraftwerk Dresden-Nossener Brücke ']
Generation_DE Heizkraftwerk Dresden-Nossener Brücke 
ParserError
./2015/Kraftwerk_BASF_Ludwigshafen_Mitte_201501010000_201512312345_20.csv
KeyError
Kraftwerk_BASF_Ludwigshafen_Mitte
['Generation_DE Koepchenwerk[MW]']
Generation_DE Koepchenwerk[MW]
KeyError
Kraftwerk_BASF_Ludwigshafen_S_d
['Generation_DE GUD C 200']
Generation_DE GUD C 200
KeyError
Kraftwerk_West
['Generation_DE West 2[MW]', 'Generation_DE West 1[MW]']
Generation_DE West 2[MW]
KeyError
Kraftwerk_West
['Generation_DE West 2[MW]', 'Generation_DE West 1[MW]']
Generation_DE West 1[MW]

'''

In [None]:
#files

In [None]:
name = get_name_from_file(FN)

In [None]:
onlyfiles.sort()

In [None]:
#onlyfiles

In [None]:
dq = pd.read_csv(files[2], sep=";")

In [None]:
#final = gen_df(F15)

In [None]:
p2 = conv_to_power(final)

In [None]:
powers3.shape

In [None]:
p2

In [None]:
final = final.dropna(subset=["produced_at"])

In [None]:
powers = final.melt(id_vars=["produced_at"], var_name='blockid', value_name='power')

In [None]:
powers2 = powers.copy()

In [None]:
powers2['power'] = powers2['power'].fillna(0)

In [None]:
powers2.power.replace(['-'], [0], inplace=True)

In [None]:
powers3 = powers2.astype({"power": int})

In [None]:
powers3.dtypes

In [None]:
powers3.to_csv("produced_power_pg.csv", index=True, header=False)
powers3.to_csv("produced_power.csv", index=False)
powers3.to_csv("produced_power_nh.csv", index=False, header=False)

In [None]:
list(powers3)

In [None]:
powers.blockid.str.len().drop_duplicates()

In [None]:
mask = powers['blockid'].str.len() == 8

In [None]:
# Credentials must not be stored in the notebook. The full SQLAlchemy URL
# (postgresql://user:password@host:port/dbname) is read from the POWER_DB_URL
# environment variable; a missing variable raises KeyError before any
# connection is attempted.
engine = create_engine(os.environ["POWER_DB_URL"])

powers.to_sql("power", engine, if_exists="replace", method="multi")

In [None]:
powers.loc[mask]

In [None]:
powers

In [None]:
#powers.groupby("Blockid")['Blockid'].apply(lambda x: x.str.len(x).count())

In [None]:
powers.dtypes

In [None]:
df

In [None]:
dq = pd.read_csv(files[1], sep=";")
df = pd.read_csv(files[2], sep=";")

In [None]:
files[2]

In [None]:
files[1]

In [None]:
df2 = conv_to_dt(df)
df3 = rename_to_blockid(df2, "Braunkohlekraftwerk_Lippendorf")
dq2 = conv_to_dt(dq)
dq3 = rename_to_blockid(dq2, "Boxberg")

In [None]:
ids = dq3["Datetime"]

In [None]:
dq3[ids.isin(ids[ids.duplicated()])]

In [None]:
#dq3.duplicated(["Datetime"])

In [None]:
dq3.drop_duplicates(["Datetime"])

In [None]:
tot = pd.merge(dq3, df3, how='outer', on=['Datetime'])

In [None]:
tot.drop_duplicates(["Datetime"])

In [None]:
tot = pd.concat([dq3, df3], axis=1, sort=False)

In [None]:
tot

In [None]:
'''
Error tokenizing data. C error: Expected 3 fields in line 2355, saw 4

./2016_17/Cuno_Heizkraftwerk_Herdecke_201601010000_201712312345_17.csv
Error tokenizing data. C error: Expected 4 fields in line 7786, saw 9

./2016_17/Duisburg_Heizkraftwerk_III_201601010000_201712312345_18.csv
Error tokenizing data. C error: Expected 3 fields in line 5981, saw 4

./2016_17/Gemeinschaftskraftwerk_Kiel_201601010000_201712312345_7.csv
Error tokenizing data. C error: Expected 4 fields in line 5861, saw 5

./2016_17/Huckingen_201601010000_201712312345_6.csv
Error tokenizing data. C error: Expected 5 fields in line 13947, saw 7

./2016_17/Kraftwerk_BASF_Ludwigshafen_Mitte_201601010000_201712312345_20.csv
Error tokenizing data. C error: Expected 5 fields in line 7403, saw 9

./2016_17/Kraftwerk_Mittelsb_ren_201601010000_201712312345_4.csv
Error tokenizing data. C error: Expected 5 fields in line 8430, saw 6

./2016_17/Kraftwerk_Werdohl-Elverlingsen_201601010000_201712312345_15.csv
Error tokenizing data. C error: Expected 3 fields in line 15454, saw 5

./2016_17/Kraftwerk_Wilhelmshaven_201601010000_201712312345_13.csv
Error tokenizing data. C error: Expected 4 fields in line 1721, saw 8

./2016_17/Reuter_West_201601010000_201712312345_16.csv
Error tokenizing data. C error: Expected 4 fields in line 7454, saw 11

./2016_17/Tiefstack_201601010000_201712312345_31.csv
Error tokenizing data. C error: Expected 3 fields in line 14633, saw 4

./2016_17/Trianel_Kohlekraftwerk_L_nen_201601010000_201712312345_26.csv
Error tokenizing data. C error: Expected 4 fields in line 5877, saw 6

./2016_17/Waldeck_2_201601010000_201712312345_32.csv
Error tokenizing data. C error: Expected 3 fields in line 7138, saw 10

./2016_17/Wehr_201601010000_201712312345_3.csv
Error tokenizing data. C error: Expected 3 fields in line 3689, saw 4

./2016_17/Weiher_201601010000_201712312345_21.csv

'''