In [1]:
# rank the obtained results using the *.log files
import os
import pandas as pd
import numpy as np

In [2]:
source = "10"
targetdir = '../../data/' + source + "/"
filelist = sorted(os.listdir(targetdir))

In [3]:
filelist

['1.data', '2.data', '3.data', '4.data', '5.data', '6.data', '7.data']

In [4]:
# Create dataframe from files
df = pd.DataFrame()

for file in filelist:
    filename = targetdir+file
    col_name = [file]
    temp_df = pd.read_csv(filename,names=col_name)    
    df = pd.concat([df, temp_df], axis=1)
    
# Look at the data
df.head()

Unnamed: 0,1.data,2.data,3.data,4.data,5.data,6.data,7.data
0,0.7981,0.9064,1.0,1.0,0.9103,1.0,0.6492
1,0.7981,0.9064,1.0001,-1.0,0.9103,-1.0,0.6492
2,0.7981,0.9064,1.0002,-1.0,0.9103,-1.0,0.6492
3,0.6604,0.7501,0.8219,0.8276,0.7533,0.5203,0.5373
4,0.6604,0.7501,0.8219,0.8276,0.7533,0.5203,0.5373


In [5]:
# Clip values > 1 with 1 and ignore 0s
df.mask(df > 1, 1, inplace=True)
df.mask(df == 0, np.NaN, inplace=True)

In [6]:
# Count NaN values
df.isna().sum()

1.data    0
2.data    0
3.data    0
4.data    0
5.data    0
6.data    0
7.data    0
dtype: int64

In [7]:
# Ignore invalid values by dropping them from the dataframe
df = df.dropna()

In [8]:
df

Unnamed: 0,1.data,2.data,3.data,4.data,5.data,6.data,7.data
0,0.7981,0.9064,1.0000,1.0000,0.9103,1.0000,0.6492
1,0.7981,0.9064,1.0000,-1.0000,0.9103,-1.0000,0.6492
2,0.7981,0.9064,1.0000,-1.0000,0.9103,-1.0000,0.6492
3,0.6604,0.7501,0.8219,0.8276,0.7533,0.5203,0.5373
4,0.6604,0.7501,0.8219,0.8276,0.7533,0.5203,0.5373
...,...,...,...,...,...,...,...
724,0.7393,0.7869,0.9528,-1.0000,0.7734,0.5320,0.5525
725,0.7393,0.7869,0.9528,-1.0000,0.7734,0.5320,0.5525
726,0.6306,0.6712,0.7977,1.0000,0.6597,0.4538,0.4713
727,0.6306,0.6712,0.7977,1.0000,0.6597,0.4538,0.4713


In [9]:
# Save processed dataframe as csv file
df.to_csv("../../data/processed/" + source +".csv",index=False)

In [10]:
# Creating ranked dataframe
ranked_df = pd.DataFrame()
stats_df = pd.DataFrame()

In [11]:
# Creating scenario quantity variable
tao = len(df)
tao

729

## Ranking of WWTP

In [12]:
for column in df:    
    wwtp = column[0]
    
    # TODO: get original (pre-analysis) value
    # pending
    
    # calculate mean 
    avg_eff = round(df[column].mean(),3)
    
    # calculate max
    max_eff = round(df[column].max(),3)
    
    # calculate min
    min_eff = round(df[column].min(),3)
    
    # calculate amplitude 
    amplitude = round((max_eff - min_eff)*100,2)
    
    amp_str = "Amplitude (max-min)(%)"
    
    # print stats results
    print("WWTP", wwtp,"Mean =",avg_eff,"Maximum =",max_eff,"Minimum =",min_eff, amp_str,"=",amplitude)
    stats_df = stats_df.append({ 'WWTP': wwtp, "Mean": avg_eff, "Maximum" : max_eff, "Minimum": min_eff, amp_str: amplitude},ignore_index=True)
    
    
    # TODO: Populate statistics dataframe using pd.df.append
    
    # Calculating Sk sum of factors
    Sk = round(df[column].sum(),3)    
    
    # Calculating ek sum of factors of 1 (or above if errors in calculation)
    ek = df[column] >= 1
    ek = ek.sum()
    print("ek =",ek)
    
    # Calculating R1k ek/tao
    R1k = round(ek/tao,3)
    
    # Calculate R2k
    if tao != ek:
        R2k = (Sk - ek)/(tao - ek)
    elif R1k == 1:
        R2k = 0
        
    R2k = round(R2k,3)
    
    # Printing results
    print("WWTP", wwtp,"| ek =",ek,"| R1k =",R1k, "| Sk =",Sk, "| R2k =",R2k)    
    
    # Populate ranking dataframe using pd.df.append
    # Using unicode to name columns with super and subscripts
    R1k_col = 'R\u00B9\u2096\u2080'
    R2k_col = 'R\u00B2\u2096\u2080'
    ranked_df = ranked_df.append({ R2k_col:R2k, R1k_col: R1k,'WWTP': wwtp},ignore_index=True)

WWTP 1 Mean = 0.714 Maximum = 0.912 Minimum = 0.552 Amplitude (max-min)(%) = 36.0
ek = 0
WWTP 1 | ek = 0 | R1k = 0.0 | Sk = 520.569 | R2k = 0.714
WWTP 2 Mean = 0.784 Maximum = 0.966 Minimum = 0.63 Amplitude (max-min)(%) = 33.6
ek = 0
WWTP 2 | ek = 0 | R1k = 0.0 | Sk = 571.685 | R2k = 0.784
WWTP 3 Mean = 0.876 Maximum = 1.0 Minimum = 0.65 Amplitude (max-min)(%) = 35.0
ek = 216
WWTP 3 | ek = 216 | R1k = 0.296 | Sk = 638.57 | R2k = 0.824
WWTP 4 Mean = 0.041 Maximum = 1.0 Minimum = -1.0 Amplitude (max-min)(%) = 200.0
ek = 108
WWTP 4 | ek = 108 | R1k = 0.148 | Sk = 30.209 | R2k = -0.125
WWTP 5 Mean = 0.779 Maximum = 0.966 Minimum = 0.622 Amplitude (max-min)(%) = 34.4
ek = 0
WWTP 5 | ek = 0 | R1k = 0.0 | Sk = 567.975 | R2k = 0.779
WWTP 6 Mean = 0.232 Maximum = 1.0 Minimum = -1.0 Amplitude (max-min)(%) = 200.0
ek = 18
WWTP 6 | ek = 18 | R1k = 0.025 | Sk = 169.34 | R2k = 0.213
WWTP 7 Mean = 0.556 Maximum = 0.686 Minimum = 0.446 Amplitude (max-min)(%) = 24.0
ek = 0
WWTP 7 | ek = 0 | R1k = 0.0 |

In [13]:
# Reorder columns to be usable as a results table
ranked_df = ranked_df.reindex(columns=['WWTP',R1k_col, R2k_col])

In [14]:
ranked_df

Unnamed: 0,WWTP,R¹ₖ₀,R²ₖ₀
0,1,0.0,0.714
1,2,0.0,0.784
2,3,0.296,0.824
3,4,0.148,-0.125
4,5,0.0,0.779
5,6,0.025,0.213
6,7,0.0,0.556


In [15]:
import os

path = "../../results/" + source + "/"

# Save rankings dataframe as csv file

try:
    ranked_df.to_csv(path + "ranking.csv",index=False)
    print("Save succesful")
except:
    print("Creating folder and saving")
    os.mkdir(path)
    ranked_df.to_csv(path + "ranking.csv",index=False)

Creating folder and saving


## Calculate Descriptive Statistics

In [16]:
# Calculate the mean of every column
mean_mean = round(stats_df.Mean.mean(),3)
mean_max = round(stats_df.Maximum.mean(),3)
mean_min = round(stats_df.Minimum.mean(),3)
mean_amp = round(stats_df[amp_str].mean(),3)

In [17]:
# Add means to stats dataframe
stats_df = stats_df.append({ 'WWTP': "Mean", "Mean" : mean_mean, "Maximum" : mean_max, 
                            "Minimum" : mean_min, amp_str : mean_amp},ignore_index=True)

In [18]:
# Calculate the standard deviation of every column
sd_mean = round(stats_df.Mean.std(),3)
sd_max = round(stats_df.Maximum.std(),3)
sd_min = round(stats_df.Minimum.std(),3)
sd_amp = round(stats_df[amp_str].std(),3)

In [19]:
# Add means to stats dataframe
stats_df = stats_df.append({ 'WWTP': "SD", "Mean" : sd_mean, "Maximum" : sd_max, 
                            "Minimum" : sd_min, amp_str : sd_amp},ignore_index=True)

In [20]:
# Reorder columns
stats_df = stats_df.reindex(columns=["WWTP", "Mean", "Maximum", "Minimum", amp_str])

In [21]:
stats_df

Unnamed: 0,WWTP,Mean,Maximum,Minimum,Amplitude (max-min)(%)
0,1,0.714,0.912,0.552,36.0
1,2,0.784,0.966,0.63,33.6
2,3,0.876,1.0,0.65,35.0
3,4,0.041,1.0,-1.0,200.0
4,5,0.779,0.966,0.622,34.4
5,6,0.232,1.0,-1.0,200.0
6,7,0.556,0.686,0.446,24.0
7,Mean,0.569,0.933,0.129,80.429
8,SD,0.292,0.105,0.717,75.714


In [22]:
# Save statistics dataframe as csv file
stats_df.to_csv(path + "statistics.csv",index=False)

In [23]:
# Convert Jupyter Notebook to PDF LaTeX file
!jupyter-nbconvert --to pdf "clip-max-ignore-zeros-custom.ipynb" --output-dir "../../results/10/"

[NbConvertApp] Converting notebook clip-max-ignore-zeros-custom.ipynb to pdf
[NbConvertApp] Writing 45149 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 47037 bytes to ../../results/10/clip-max-ignore-zeros-custom.pdf
