# Confidence Analysis

## Code for counting number of inputs

This code counts the number of NaN values and values that equal the defaults for each field to quantify the confidence of the results. The fewer default or missing values, the more certain we can be.

In [1]:
import xml.etree.ElementTree as ET

import pandas as pd
import numpy as np

### XML to pandas

In [2]:
def xml_to_pandas(directory):
    """Load the ``<Field>`` elements of an XML file into a DataFrame.

    Parameters
    ----------
    directory : str
        Path of the XML file to read (a file path, despite the name); it is
        passed straight to ``ET.parse``.

    Returns
    -------
    pandas.DataFrame
        One column per ``<Field>`` child of the root (labelled 0, 1, ...) and
        one row per ``name`` attribute seen on the fields' ``<A>`` children.
        Each cell holds the text of the matching ``<A>`` element, or NaN when
        a field has no ``<A>`` with that name.
    """
    root = ET.parse(directory).getroot()

    # One {name: text} record per <Field>, built from its direct <A> children
    records = [
        {attribute.attrib['name']: attribute.text for attribute in field.findall('A')}
        for field in root.findall('Field')
    ]

    # Records are laid out one per row; transpose so that each field is a column
    return pd.DataFrame(records).T

In [3]:
# Convert the global fields XML into a DataFrame and save a CSV copy of it
fields_xml = './data/RMI_v10.2_global_fields.xml'
df = xml_to_pandas(fields_xml)
df.to_csv('./data/xml_to_csv.csv')

In [4]:
# Show every value stored in the column labelled 1 as a plain Python list
df[1].tolist()

['1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 'Canada',
 'Ovintiv Horn River BC',
 '17.0',
 '5733.678005',
 '0.4',
 '8',
 '2.775',
 '6.8',
 '1232.740771',
 '173.2062041',
 '0.0',
 '47.0',
 '2.86',
 '0.33',
 '89.18',
 '5.3',
 '1.62',
 '0.71',
 '0.0',
 '66528000.0',
 '2.025808595',
 '3.025808595',
 '363.8',
 '99792000.0',
 'NG',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '1',
 'None',
 'Acid Wet Gas',
 '1784.830185',
 '0.002',
 '1.0',
 '0.0',
 '1.0',
 '0.0',
 '0.0',
 '8000.0',
 '500.0',
 '1000.0',
 '800.0',
 '100.0',
 'None',
 'Stabilization',
 'All',
 '59',
 'Med carbon',
 'Med',
 '250000.0',
 nan]

In [5]:
# Parse the XML file at top level (the same file and steps as in xml_to_pandas);
# `root` is the element that the next cell iterates over
tree = ET.parse('./data/RMI_v10.2_global_fields.xml')
root = tree.getroot()

In [6]:
# Collect one {name: text} dictionary per <Field> element under the root
data = []
for field in root.findall('Field'):
    # Map each <A> child's 'name' attribute to the text it contains
    field_data = {a.attrib['name']: a.text for a in field.findall('A')}
    data.append(field_data)

In [7]:
# Build the DataFrame (one row per field) and transpose it so that
# the variable names form the index and each field is a column
df = pd.DataFrame(data).T

### Calculating relative confidence

In [8]:
# Read the details about the variables and keep only those that can be set in the csv
attrs = pd.read_csv('./data/attributes_metadata.csv')
settable = attrs['set_in_csv'] == 1
attrs = attrs.loc[settable]

# ocean_tanker_size is not in the attributes table, so its row is built by hand
tanker_size = pd.DataFrame(
    {'name': ['ocean_tanker_size'], 'opgee_type': ['float'], 'default': [250000.0]}
)

# Append the manual row; ignore_index renumbers the rows from 0
attrs = pd.concat([attrs, tanker_size], ignore_index=True)
# attrs

FileNotFoundError: [Errno 2] No such file or directory: './data/attributes_metadata.csv'

In [8]:
# Variable names are the row labels of the fields table
variable_names = list(df.index)

# Keep only the metadata rows for those variables, restricted to the three
# columns of interest, and index the outcome by variable name
is_field_variable = attrs['name'].isin(variable_names)
filtered_defaults = (
    attrs.loc[is_field_variable, ['name', 'opgee_type', 'default']]
    .reset_index(drop=True)
    .set_index('name')
)
# filtered_defaults

In [9]:
# Remove the type column, since only the defaults are used from here on.
# Rebinding with errors='ignore' (rather than an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError for the already removed column.
filtered_defaults = filtered_defaults.drop(columns=['opgee_type'], errors='ignore')

# Join the field values onto the defaults by variable name; 'default' ends up as the first column
result = filtered_defaults.join(df)
# result

In [10]:
# Count the NaN (missing) values in each column
nan_counts = result.isna().sum()

# Store the per-column NaN count as a new row labelled 'count'
result.loc['count'] = nan_counts

# result

In [11]:
# For each column, count the variables that are missing (NaN) or equal to the default.
# The count is rebuilt from the variable rows on every run, so re-executing this cell
# does not add the default matches onto an already updated 'count' row.

# Denominator of the percentage.
# NOTE(review): presumably the number of variable rows in `result` - confirm.
N_VARIABLES = 59

# The summary rows are not variables, so they are left out of the counting
variables = result.drop(index=['count', 'pct_conf'], errors='ignore')

nan_counts = variables.isna().sum()

# NOTE(review): values parsed from the XML are strings, so a default is only
# counted when it is stored in exactly the same form - confirm the defaults' types.
default_matches = variables.eq(variables['default'], axis=0).sum()

result.loc['count'] = nan_counts + default_matches

# Extra row expressing the share of non-missing, non-default variables as a percentage
result.loc['pct_conf'] = (N_VARIABLES - result.loc['count']) / N_VARIABLES * 100
# result

In [175]:
# Summary statistics of 'count' and 'pct_conf' across all fields

# One row per field, without the 'default' column
per_field = result.drop(columns=['default']).T

# Cast to float so that describe() reports the numeric statistics
summary = per_field[['count', 'pct_conf']].astype(float).describe()
print(summary)


name         count     pct_conf
count  3709.000000  3709.000000
mean     27.461580    53.454949
std       3.457781     5.860646
min      18.000000    40.677966
25%      24.000000    47.457627
50%      28.000000    52.542373
75%      31.000000    59.322034
max      35.000000    69.491525
