## Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import sklearn
import seaborn as sns
from scipy import stats

%matplotlib inline

# Raise pandas' display limits so wide/long frames are shown with up to
# 200 rows and 200 columns before being truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

## Read data

In [None]:
def _read_location_frames(file_name):
    """Read ``<location>/<file_name>`` for locations A, B and C.

    Parameters
    ----------
    file_name : str
        Name of the parquet file that exists in each location directory.

    Returns
    -------
    tuple of pandas.DataFrame
        The frames for locations A, B and C, in that order.
    """
    return tuple(
        pd.read_parquet(f'{location}/{file_name}') for location in ('A', 'B', 'C')
    )


# Targets, estimated features and observed features for each location.
y_a, y_b, y_c = _read_location_frames('train_targets.parquet')
X_est_a, X_est_b, X_est_c = _read_location_frames('X_train_estimated.parquet')
X_obs_a, X_obs_b, X_obs_c = _read_location_frames('X_train_observed.parquet')

# Preview the observed features for location A.
X_obs_a.head()

## Check X data on all locations

In [None]:
# Column-wise mean of the observed data at each location; the
# 'date_forecast' column is dropped before averaging.
mean_a, mean_b, mean_c = (
    observed.drop(columns='date_forecast').mean()
    for observed in (X_obs_a, X_obs_b, X_obs_c)
)

# Bucket the feature names by the magnitude of their mean at location A so
# that each bar chart only contains features on a comparable scale.
huge_indices = mean_a[mean_a.gt(10000)].index
large_indices = mean_a[mean_a.gt(1000) & mean_a.le(10000)].index
medium_indices = mean_a[mean_a.gt(100) & mean_a.le(1000)].index
small_indices = mean_a[mean_a.gt(10) & mean_a.le(100)].index
tiny_indices = mean_a[mean_a.gt(1) & mean_a.le(10)].index
tiniest_indices = mean_a[mean_a.le(1)].index

def display_mean_values(indices, mean_a, mean_b, mean_c, title='Mean Values'):
    """Bar-plot the mean of the selected features for locations A, B and C.

    Parameters
    ----------
    indices : list-like
        Labels used to select entries from each of the three mean Series.
    mean_a, mean_b, mean_c : pandas.Series
        Per-feature means for locations A, B and C.
    title : str, optional
        Title of the figure. Defaults to ``'Mean Values'``.

    Returns
    -------
    None
        The figure is shown as a side effect. If the selection is empty,
        a short message is printed and no figure is created.
    """
    # One column per location, one row per selected feature.
    mean_df = pd.DataFrame({
        'A': mean_a[indices],
        'B': mean_b[indices],
        'C': mean_c[indices],
    })

    # A bar plot of a zero-row frame has nothing to draw (and pandas may
    # raise on it), so bail out with a message instead.
    if mean_df.empty:
        print(f'No features selected; skipping plot "{title}".')
        return

    # Label the Axes returned by pandas rather than relying on pyplot's
    # implicit "current axes" state.
    ax = mean_df.plot(kind='bar')
    ax.set_xlabel('Features')
    ax.set_ylabel('Mean Values')
    ax.set_title(title)
    plt.show()

# Features whose mean at location A is above 10000.
display_mean_values(huge_indices, mean_a, mean_b, mean_c)

In [None]:
# Features whose mean at location A is in (1000, 10000].
display_mean_values(large_indices, mean_a, mean_b, mean_c)

In [None]:
# Features whose mean at location A is in (100, 1000].
display_mean_values(medium_indices, mean_a, mean_b, mean_c)

In [None]:
# Features whose mean at location A is in (10, 100].
display_mean_values(small_indices, mean_a, mean_b, mean_c)

In [None]:
# Features whose mean at location A is in (1, 10].
display_mean_values(tiny_indices, mean_a, mean_b, mean_c)

In [None]:
# Features whose mean at location A is at most 1.
display_mean_values(tiniest_indices, mean_a, mean_b, mean_c)

## Observations

There is not much difference between the locations in the first five plots. However, let's take a closer look at the last one:

In [None]:
# Split the "tiniest" bucket (mean at A <= 1) in two by the mean at
# location C, filtering the selected subset directly.
big_c_indices = mean_c[tiniest_indices].loc[lambda means: means > 0.5].index
small_c_indices = mean_c[tiniest_indices].loc[lambda means: means <= 0.5].index

# Start with the features whose mean at C is above 0.5.
display_mean_values(big_c_indices, mean_a, mean_b, mean_c)

In [None]:
# Features from the "tiniest" bucket whose mean at location C is at most 0.5.
display_mean_values(small_c_indices, mean_a, mean_b, mean_c)

From the graphs we can see that C differs hugely from the other locations in elevation and probability of rime. Additionally, it snows a lot in C, less in B, and least in A. Let's look at what the **elevation** and **prob_rime** features mean:

**elevation:m** = elevation of ground above sea level [m]

**prob_rime** = rime probability [%]

Let's also look closer at the **sun_elevation** feature:

In [None]:
# Compare a single feature across locations; the label is wrapped in a list
# because the helper selects entries with a list-like of labels.
display_mean_values(['sun_elevation:d',], mean_a, mean_b, mean_c)

**sun_elevation:d** = The solar elevation angle (angle between the sun and the horizon) gives the position of the sun above the horizon.

This suggests that locations B and C are close to each other, while location A is somewhere else entirely.

## Check target data on different locations

In [None]:
# Summary statistics of the training targets for location A.
y_a.describe()

In [None]:
# Summary statistics of the training targets for location B.
y_b.describe()

In [None]:
# Summary statistics of the training targets for location C.
y_c.describe()

As we can see, the mean, max and std are much larger for A than for B and C. Also, notice that the data from locations B and C are very similar to each other compared to A.

## Identify outliers

In [None]:
# Absolute z-score of every value, computed per column. NaNs are ignored when
# computing each column's mean/std (nan_policy='omit'); with the default
# 'propagate', a single NaN would turn the whole column's z-scores into NaN.
z_scores = np.abs(
    stats.zscore(X_obs_a.drop('date_forecast', axis=1), nan_policy='omit')
)

# A row is an outlier when ANY of its features lies more than 3 standard
# deviations from that feature's mean. (Requiring ALL features to be extreme
# at once would flag essentially nothing.) NaN z-scores compare as False.
outliers = (z_scores > 3).any(axis=1)
X_obs_a[outliers].head()

## Visualize outliers

### Boxplot A

In [None]:
# Box plot of the observed-data columns for location A, on a wide figure.
X_obs_a.boxplot(figsize=(20, 10))

### Boxplot B

In [None]:
# Box plot of the observed-data columns for location B, on a wide figure.
X_obs_b.boxplot(figsize=(20, 10))

### Boxplot C

In [None]:
# Box plot of the observed-data columns for location C, on a wide figure.
X_obs_c.boxplot(figsize=(20, 10))

## Findings

We can see that three of the features have a lot of outliers at all of the locations. Let's investigate more closely:

#### Finding outlier features

In [None]:
all_features = X_obs_a.columns.tolist()

# The three features that stand out in the boxplots. They are selected by
# name rather than by column position so the selection keeps pointing at the
# same features even if the column order of the data changes.
outlier_features = [
    'clear_sky_energy_1h:J',
    'diffuse_rad_1h:J',
    'direct_rad_1h:J',
]

# Fail early, with a clear message, if the data does not have these columns.
assert set(outlier_features) <= set(all_features), (
    'expected outlier features are missing from X_obs_a'
)
print(outlier_features)

I looked up these features and found the following definitions:

**clear_sky_energy_1h:J** = clear sky energy of previous time period, available up to 24h [J/m2]

**diffuse_rad_1h:J** = accumulated diffuse radiation of previous time period, available up to 24h [J/m2]

**direct_rad_1h:J** = accumulated direct radiation of previous time period, available up to 24h [J/m2]

All of these describe the previous hour, so I decided to investigate the datasets further.

In [None]:
# Inspect the first 100 rows of the observed data for location A.
X_obs_a.head(100)

### Boxplot A (outlier features only)

In [None]:
# Box plot for location A restricted to the columns listed in outlier_features.
X_obs_a.boxplot(outlier_features, figsize=(20, 10))