## Import libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.impute import SimpleImputer

%matplotlib inline

# Raise both pandas display limits to the same value so wide/long frames
# are shown in full when inspected in the notebook.
DISPLAY_LIMIT = 200
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, DISPLAY_LIMIT)

## Read data

In [None]:
# Each group below loads one kind of parquet file from the three
# directories A, B and C (paths are relative to the working directory).

# Training targets.
y_a, y_b, y_c = (
    pd.read_parquet('A/train_targets.parquet'),
    pd.read_parquet('B/train_targets.parquet'),
    pd.read_parquet('C/train_targets.parquet'),
)

# Training features from the "observed" files.
X_a, X_b, X_c = (
    pd.read_parquet('A/X_train_observed.parquet'),
    pd.read_parquet('B/X_train_observed.parquet'),
    pd.read_parquet('C/X_train_observed.parquet'),
)

# Training features from the "estimated" files.
X_a_estimated, X_b_estimated, X_c_estimated = (
    pd.read_parquet('A/X_train_estimated.parquet'),
    pd.read_parquet('B/X_train_estimated.parquet'),
    pd.read_parquet('C/X_train_estimated.parquet'),
)

# Test features from the "estimated" files.
X_a_test, X_b_test, X_c_test = (
    pd.read_parquet('A/X_test_estimated.parquet'),
    pd.read_parquet('B/X_test_estimated.parquet'),
    pd.read_parquet('C/X_test_estimated.parquet'),
)

## Remove highly correlated features

The features dropped below were identified as highly correlated in **correlation_cleanup.ipynb**.

In [None]:
# Columns to remove from the training feature frames (the section heading
# describes them as highly correlated features).
to_drop = [
    'fresh_snow_12h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_3h:cm',
    'fresh_snow_6h:cm',
    'diffuse_rad:W',
    'direct_rad:W',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'absolute_humidity_2m:gm3',
    'air_density_2m:kgm3',
    'dew_point_2m:K',
    'clear_sky_rad:W',
    'sun_elevation:d',
    'clear_sky_energy_1h:J',
    'is_in_shadow:idx',
    'total_cloud_cover:p',
]

# Drop the columns in place from the observed frames first, then the
# estimated ones. Because the drop is in place, re-running this cell without
# reloading the data raises KeyError (the columns are already gone).
# NOTE(review): X_a_test / X_b_test / X_c_test are not touched here --
# confirm they get the same column drop elsewhere.
for _frame in (X_a, X_b, X_c, X_a_estimated, X_b_estimated, X_c_estimated):
    _frame.drop(columns=to_drop, inplace=True)

## Fix missing values

### Fix snow density

The snow density feature is either 250 or NaN, so I'll map it to a binary indicator: 0 for NaN values and 1 for the 250 values.

In [None]:
def fix_snow_density(df):
    """Replace ``snow_density:kgm3`` with a 0/1 presence indicator, in place.

    Every missing entry becomes 0 and every non-missing entry becomes 1.

    Args:
        df: DataFrame containing a ``snow_density:kgm3`` column. The column
            is overwritten on ``df`` itself.

    Returns:
        None. The frame is modified in place.

    Raises:
        KeyError: if ``df`` has no ``snow_density:kgm3`` column.

    Note:
        Not idempotent: after the first call the column has no missing
        values left, so a second call on the same frame sets every row to 1.
        Reload the data before re-running.
    """
    # notna() is vectorised and treats NaN/None alike, so this avoids the
    # per-element Python call and the TypeError np.isnan raises on None.
    df["snow_density:kgm3"] = df["snow_density:kgm3"].notna().astype("int64")

# Apply the snow-density conversion to the observed frames first, then the
# estimated ones; each frame is modified in place.
# NOTE(review): the X_*_test frames are not converted here -- confirm they
# are handled elsewhere.
for _frame in (X_a, X_b, X_c, X_a_estimated, X_b_estimated, X_c_estimated):
    fix_snow_density(_frame)

### Fix the rest

We'll fill in the two other features that have missing values using multiple imputation.

In [None]:
def multiple_imputation(df):
    # Create a MICE imputer    
    mi = sm.MICE(endog=df, exog=None, nskip=1, niter=10, verbose=1)

    # Fit the MICE model
    results = mi.fit()

    # Obtain the imputed datasets
    imputed_datasets = results.endog

    # Now you have multiple imputed datasets. You can analyze each separately or combine the results.

    # Combine the results, e.g., by averaging or using a weighted combination
    mean_imputation = imputed_datasets.mean(axis=0)

    # Perform your analysis using the imputed datasets or the combined result