In [3]:
import pandas as pd

In [4]:
# Load in data
# Each split is stored as `<in_dir>/<split>.csv`. The files are read in a fixed
# order and unpacked into one DataFrame per split.
in_dir = '../../data_old'

(
    df_train,
    df_train_xclim,
    df_dev_in,
    df_dev_xclim,
    df_dev_out,
    df_eval_in,
    df_eval_out,
) = (
    pd.read_csv(f'{in_dir}/{split}.csv')
    for split in (
        'train',
        'train_xclim',
        'dev_in',
        'dev_xclim',
        'dev_out',
        'eval_in',
        'eval_out',
    )
)
print("Loaded Data")

# Row-wise stack of both training frames; the original row indices are kept
# (no re-indexing), so index labels may repeat in the combined frame.
df_train_cat = pd.concat([df_train, df_train_xclim], axis=0)

Loaded Data


In [None]:
# Identify the categorical features
# A column counts as categorical when it has fewer than 20 distinct non-missing
# values. Missing values are excluded from the count on purpose: converting a
# float column to a Python list yields a separate NaN object per missing cell,
# and NaN never compares equal to itself, so counting uniques through a dict
# would treat every missing cell as its own "category".
cat_features = [
    col
    for col in df_train_cat.columns
    if df_train_cat[col].nunique(dropna=True) < 20
]

In [None]:
# !pip install scipy
from scipy import stats
import numpy as np

# Per-column value used to impute missing entries, computed on the combined
# training data only:
#   * categorical columns -> most frequent non-missing value
#   * all other columns   -> mean of the non-missing values
nan_replacements = {}
for col in df_train_cat.columns:
    observed = df_train_cat[col].dropna()
    if col in cat_features:
        # Series.mode() returns the modes sorted ascending, so on ties the
        # smallest value is chosen. It also works for non-numeric columns and
        # does not depend on the return shape of scipy.stats.mode, which
        # differs between SciPy versions.
        modes = observed.mode()
        # A column with no observed values has no mode; keep NaN as a no-op fill.
        nan_replacements[col] = modes.iloc[0] if not modes.empty else np.nan
    else:
        nan_replacements[col] = np.mean(np.asarray(observed))

In [None]:
# Replaces all NaNs
# Every split is imputed with the replacement values stored in
# `nan_replacements`. Only columns that exist in a given frame and have a
# replacement value are touched, so a split that lacks one of the training
# columns no longer aborts the whole cell with a KeyError.
for frame in (
    df_train,
    df_train_xclim,
    df_dev_in,
    df_dev_xclim,
    df_dev_out,
    df_eval_in,
    df_eval_out,
):
    for col in frame.columns:
        if col in nan_replacements:
            frame[col] = frame[col].fillna(nan_replacements[col])

# Rebuild the combined training frame so it reflects the imputed values.
df_train_cat = pd.concat([df_train, df_train_xclim])
print("Replaced NaNs")

In [None]:
# !pip install scikit-learn

from typing import Dict

import numpy as np
import sklearn.preprocessing


def normalize(
    X: Dict[str, np.ndarray], normalization: str, seed: int, noise: float = 1e-3
) -> Dict[str, np.ndarray]:
    """Fit a feature normaliser on ``X['train_cat']`` and apply it to every split.

    Args:
        X: Mapping from split name to a 2-D feature array. Must contain the key
            ``'train_cat'``, which is the only array used for fitting.
        normalization: ``'standard'`` for ``StandardScaler`` or ``'quantile'``
            for ``QuantileTransformer`` with a normal output distribution.
        seed: Seed for the quantile transformer and for the noise generator.
        noise: Only used for ``'quantile'``. When non-zero, Gaussian noise is
            added to a copy of the fitting data before the transformer is
            fitted; the arrays in ``X`` themselves are not modified.

    Returns:
        A new dict with the same keys as ``X`` whose values are the transformed
        arrays.

    Raises:
        ValueError: If ``normalization`` is not one of the supported names.
    """
    X_train = X['train_cat']
    if normalization == 'standard':
        normalizer = sklearn.preprocessing.StandardScaler()
    elif normalization == 'quantile':
        normalizer = sklearn.preprocessing.QuantileTransformer(
            output_distribution='normal',
            # Roughly one quantile per 30 fitting rows, clamped to [10, 1000].
            n_quantiles=max(min(X_train.shape[0] // 30, 1000), 10),
            # Must be an integer: recent scikit-learn releases validate the
            # parameter type and reject the float literal 1e9.
            subsample=int(1e9),
            random_state=seed,
        )
        if noise:
            # Work on a floating-point copy so the caller's array is untouched
            # and the in-place add below also works for integer input. Float
            # inputs keep their dtype.
            X_train = X_train.astype(
                np.result_type(X_train.dtype, np.float32), copy=True
            )
            stds = np.std(X_train, axis=0, keepdims=True)
            # Per-feature noise scale: noise / std, capped at 1 for features
            # whose std is at or below `noise`.
            noise_std = noise / np.maximum(stds, noise)
            X_train += noise_std * np.random.default_rng(seed).standard_normal(
                X_train.shape
            )
    else:
        raise ValueError(f'unknown normalization: {normalization}')
    normalizer.fit(X_train)
    return {k: normalizer.transform(v) for k, v in X.items()}

In [None]:
# Set Seed
# Random seed for the stochastic preprocessing steps (normalize() takes a
# `seed` argument for its quantile transformer and noise generator).
seed = 100

In [None]:
# Extract the feature matrices that will be normalised.
# For every frame, the columns from position 6 onward are converted to a NumPy
# array; the first six columns are left out of normalisation.
# NOTE(review): presumably the first six columns are identifiers/targets - confirm.
(
    X_train_cat_np,
    X_train_np,
    X_train_xclim_np,
    X_dev_in_np,
    X_dev_xclim_np,
    X_dev_out_np,
    X_eval_in_np,
    X_eval_out_np,
) = (
    np.asarray(frame.iloc[:, 6:])
    for frame in (
        df_train_cat,
        df_train,
        df_train_xclim,
        df_dev_in,
        df_dev_xclim,
        df_dev_out,
        df_eval_in,
        df_eval_out,
    )
)

In [None]:
# Normalize using training stats
# Quantile normalisation is used (maps to a normal distribution). The
# normaliser is fitted on 'train_cat' (both training frames stacked) and then
# applied to every split, including 'train_cat' itself.
X = {
    'train_cat': X_train_cat_np,
    'train': X_train_np,
    'train_xclim': X_train_xclim_np,
    'dev_in': X_dev_in_np,
    'dev_xclim': X_dev_xclim_np,
    'dev_out': X_dev_out_np,
    'eval_in': X_eval_in_np,
    'eval_out': X_eval_out_np,
}
# Use the notebook-level `seed` instead of repeating the literal, so changing
# the seed cell actually changes the run.
X = normalize(X, normalization='quantile', seed=seed)

In [None]:
# Write the normalised feature values back into the dataframes.
# The feature columns are selected by position (everything from column 6
# onward), matching how the arrays were extracted. `.loc[:, 6:]` would slice
# by *label* and fails on string column names. Assigning through the column
# labels replaces the columns outright, so integer-typed columns can take the
# float output of the normaliser.
for frame, split in (
    (df_train, 'train'),
    (df_train_xclim, 'train_xclim'),
    (df_dev_in, 'dev_in'),
    (df_dev_xclim, 'dev_xclim'),
    (df_dev_out, 'dev_out'),
    (df_eval_in, 'eval_in'),
    (df_eval_out, 'eval_out'),
):
    frame[frame.columns[6:]] = X[split]

print('Normalized')

In [None]:
# Save modified dataframes
# Each split is written to `<out_dir>/<split>.csv` without the index column.
import os

out_dir = '../data_preprocessed_xclim'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

# The path is built from the `out_dir` variable defined above; there is no
# `args` object in this notebook.
for frame, split in (
    (df_train, 'train'),
    (df_train_xclim, 'train_xclim'),
    (df_dev_in, 'dev_in'),
    (df_dev_xclim, 'dev_xclim'),
    (df_dev_out, 'dev_out'),
    (df_eval_in, 'eval_in'),
    (df_eval_out, 'eval_out'),
):
    frame.to_csv(f'{out_dir}/{split}.csv', index=False)