In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Dataset identifier; reused below to build the processed folder and file names.
dataset_name = "climate_model_simulation_crashes"

In [3]:
# Where the raw CSV is read from (relative path).
input_dir = './raw/'
inp_fname = 'Climate_Model_Simulation_Crashes.csv'

# Processed output: one folder per dataset, containing "<dataset_name>.csv".
output_dir = './../../processed/{}/'.format(dataset_name)
outp_fname = os.path.join(output_dir, dataset_name + '.csv')


## Read Data

In [4]:
# Load the raw CSV from the input folder and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,Study,Run,vconst_corr,vconst_2,vconst_3,vconst_4,vconst_5,vconst_7,ah_corr,ah_bolus,...,efficiency_factor,tidal_mix_max,vertical_decay_scale,convect_corr,bckgrnd_vdc1,bckgrnd_vdc_ban,bckgrnd_vdc_eq,bckgrnd_vdc_psim,Prandtl,outcome
0,1.0,1.0,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,...,0.245675,0.104226,0.869091,0.997518,0.44862,0.307522,0.85831,0.796997,0.869893,b'0'
1,1.0,2.0,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,...,0.61687,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,b'1'
2,1.0,3.0,0.9976,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,b'1'
3,1.0,4.0,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,b'1'
4,1.0,5.0,0.40625,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.37666,0.280098,0.132283,b'1'


In [5]:
# Names of the row-identifier column and of the target column.
id_col, target_col = "id", "outcome"

## Insert Id Column

In [6]:
# Add a sequential integer id (0..N-1) as the first column, unless a column with
# that name already exists -- the guard keeps this cell safe to re-run.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id  Study  Run  vconst_corr  vconst_2  vconst_3  vconst_4  vconst_5  \
0   0    1.0  1.0     0.859036  0.927825  0.252866  0.298838  0.170521   
1   1    1.0  2.0     0.606041  0.457728  0.359448  0.306957  0.843331   
2   2    1.0  3.0     0.997600  0.373238  0.517399  0.504993  0.618903   
3   3    1.0  4.0     0.783408  0.104055  0.197533  0.421837  0.742056   
4   4    1.0  5.0     0.406250  0.513199  0.061812  0.635837  0.844798   

   vconst_7   ah_corr  ...  efficiency_factor  tidal_mix_max  \
0  0.735936  0.428325  ...           0.245675       0.104226   
1  0.934851  0.444572  ...           0.616870       0.975786   
2  0.605571  0.746225  ...           0.679355       0.803413   
3  0.490828  0.005525  ...           0.471463       0.597879   
4  0.441502  0.191926  ...           0.551543       0.743877   

   vertical_decay_scale  convect_corr  bckgrnd_vdc1  bckgrnd_vdc_ban  \
0              0.869091      0.997518      0.448620         0.307522   
1              0.914344   

## Convert byte-string representations to plain strings

In [7]:
# Object-dtype columns are the candidates that may hold "b'...'" text.
byte_string_columns = data.select_dtypes(include='O').columns
list(byte_string_columns)

['outcome']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Decode the textual repr of a bytes literal into plain text.

    Values such as ``"b'0'"`` (a *string* that looks like a Python bytes
    literal) are parsed and decoded as UTF-8, giving ``"0"``. Both quoting
    styles that ``repr(bytes)`` can produce are recognised: ``b'...'`` and
    ``b"..."`` (the latter is used when the payload contains a single quote).

    Parameters
    ----------
    entry : object
        A single cell value. Anything that is not a ``str`` is passed through.

    Returns
    -------
    object
        The decoded text when ``entry`` is a valid bytes-literal repr holding
        valid UTF-8; otherwise ``entry`` itself, unchanged. The function never
        raises for malformed input.
    """
    if not isinstance(entry, str):
        return entry

    # Shortest valid literal is b'' (3 chars); opening and closing quote must match.
    looks_like_bytes_repr = (
        len(entry) >= 3
        and entry[0] == "b"
        and entry[1] in ("'", '"')
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes_repr:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry  # not a parsable literal -> keep the original text

    # Text like "b'a', 'b'" passes the prefix/suffix check but evaluates to a
    # tuple; only a real bytes object can be decoded.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        return entry  # bytes are not valid UTF-8 -> keep the original text

# Decode every candidate column element by element, then normalise to str.
# Series.map is applied per column because DataFrame.applymap is deprecated
# since pandas 2.1 (renamed DataFrame.map); Series.map exists in old and new
# pandas alike, so this runs without a FutureWarning on either.
# NOTE(review): astype(str) is kept from the original; depending on the pandas
# version it may turn missing values into the literal text 'nan' -- confirm
# this is acceptable if a dataset with missing object values is processed.
for column_name in byte_string_columns:
    data[column_name] = data[column_name].map(convert_byte_string_repr).astype(str)
data.head()

Unnamed: 0,id,Study,Run,vconst_corr,vconst_2,vconst_3,vconst_4,vconst_5,vconst_7,ah_corr,...,efficiency_factor,tidal_mix_max,vertical_decay_scale,convect_corr,bckgrnd_vdc1,bckgrnd_vdc_ban,bckgrnd_vdc_eq,bckgrnd_vdc_psim,Prandtl,outcome
0,0,1.0,1.0,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,...,0.245675,0.104226,0.869091,0.997518,0.44862,0.307522,0.85831,0.796997,0.869893,0
1,1,1.0,2.0,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,...,0.61687,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,1
2,2,1.0,3.0,0.9976,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,1
3,3,1.0,4.0,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,1
4,4,1.0,5.0,0.40625,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.37666,0.280098,0.132283,1


## Drop constant columns

In [9]:
# Constant columns: exactly one distinct value (nunique ignores NaN by default).
unique_columns = data.columns[(data.nunique() == 1).to_numpy()].tolist()
print(unique_columns)

data.drop(unique_columns, axis="columns", inplace=True)

[]


## Convert ? to NaN

In [10]:
# Turn every cell that is exactly '?' into NaN, modifying the frame in place.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# to_csv does not create missing parent folders, so make sure the per-dataset
# output directory exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)