In [145]:
from pathlib import Path
import pandas as pd
from pyampute.exploration.mcar_statistical_tests import MCARTest
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Curated Penmanshiel dataset: lives in the `curated` folder that sits next to
# the directory the notebook is run from.
dataset_path = Path().resolve().parent.joinpath('curated', 'curated_data_penmanshiel.parquet')

In [147]:
# Load the curated dataset from the parquet file into memory.
df = pd.read_parquet(path=dataset_path)

In [148]:
# Number of rows in the loaded dataset.
len(df)

680680

In [149]:
def filter_dataframe_with_columns_list(df, columns_csv_path):
    """Return only the columns of ``df`` that are listed in a CSV file.

    The CSV is read without a header and only its first row is used; each
    cell of that row is taken as one column name to keep. Names are compared
    after stripping surrounding whitespace and single/double quotes, on both
    the CSV side and the DataFrame side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select columns from.
    columns_csv_path : str or path-like
        Path of the CSV whose first row lists the columns to preserve.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the matching columns, in the order they appear
        in the CSV. Names listed in the CSV that are not present in ``df``
        are ignored.
    """

    def normalize(name):
        # str() so that non-string labels (e.g. integer column names) do not
        # break the comparison.
        return str(name).strip().strip('"').strip("'")

    # dtype=str keeps numeric-looking names as text instead of ints/floats.
    cols_df = pd.read_csv(columns_csv_path, header=None, dtype=str)

    # Empty cells (for instance after a trailing comma) are parsed as NaN;
    # skip them rather than failing on a float that has no .strip().
    clean_preserve = [
        normalize(cell) for cell in cols_df.iloc[0].tolist() if pd.notna(cell)
    ]

    # Map each normalized name back to the original label used by df.
    col_map = {normalize(orig): orig for orig in df.columns}
    matching_cols = [col_map[c] for c in clean_preserve if c in col_map]

    return df[matching_cols]

# CSV whose first row lists the columns to keep.
columns_to_preserve_path = Path().resolve().parent / 'curated' / 'columns_to_preserve.csv'
# The curated parquet file is already loaded as `df` in an earlier cell; reuse
# that frame instead of reading the same file from disk a second time.
df_main = df
df_filtered = filter_dataframe_with_columns_list(df_main, columns_to_preserve_path)


In [150]:
# Count the missing values of every column and list the columns from most to
# least missing.
# First we need to filter for the variables that have a reasonably low number
# of missing values; then we can proceed with checking MCAR, MAR or NMAR.
df_missing = (
    df_filtered.isna()
    .sum()
    .reset_index()
    .set_axis(['variable', 'missing_values'], axis=1)
    .sort_values(by='missing_values', ascending=False)
)
df_missing

Unnamed: 0,variable,missing_values
0,Density adjusted wind speed (m/s),68068
2,Nacelle ambient temperature (°C),68068
3,Nacelle temperature (°C),68068
4,"Nacelle temperature, Max (°C)",68068
5,"Nacelle temperature, Min (°C)",68068
...,...,...
70,Wind speed Sensor 2 (m/s),11761
69,Wind speed (m/s),11761
87,Energy Export (kWh),9443
88,time,0


In [166]:
# Preview the first five rows of the filtered frame (head() defaults to 5).
df_filtered.head()

Unnamed: 0,Density adjusted wind speed (m/s),Vane position 1+2 (°),Nacelle ambient temperature (°C),Nacelle temperature (°C),"Nacelle temperature, Max (°C)","Nacelle temperature, Min (°C)","Temperature motor axis 1, Max (°C)","Temperature motor axis 1, Min (°C)","Temperature motor axis 1, StdDev (°C)","Temperature motor axis 2, Max (°C)",...,Temperature motor axis 1 (°C),Temperature motor axis 2 (°C),Temperature motor axis 3 (°C),CPU temperature (°C),Generator RPM (RPM),Gear oil inlet pressure (bar),Gear oil pump pressure (bar),Energy Export (kWh),time,Long Term Wind (m/s)
1528,5.544045,2.99,9.78,21.389999,21.5,21.200001,46.0,30.0,6.37,50.0,...,38.580002,41.299999,39.23,58.490002,1209.609985,1.6,4.76,33.0,2016-06-17 08:50:00,5.8
1529,7.46637,-1.25,9.63,21.65,21.9,20.049999,41.0,33.0,2.04,43.0,...,36.139999,38.34,36.650002,58.919998,1559.449951,1.96,5.49,90.0,2016-06-17 09:00:00,5.8
1530,7.308725,-4.0,9.44,17.35,20.0,16.049999,33.0,30.0,0.74,35.0,...,31.190001,33.220001,31.790001,58.900002,1551.719971,1.86,5.18,123.0,2016-06-17 09:10:00,5.8
1531,6.989841,-9.03,9.47,15.62,16.5,15.2,30.0,29.0,0.5,32.0,...,29.530001,31.51,30.459999,58.220001,1406.910034,1.72,4.92,65.0,2016-06-17 09:20:00,5.8
1532,7.018885,-0.1,9.57,18.559999,19.6,16.6,29.0,28.0,0.3,31.0,...,28.879999,30.99,29.389999,57.990002,1459.48999,1.83,5.17,80.0,2016-06-17 09:30:00,5.8


In [151]:
# Keep only the numeric columns. The "number" selector is pandas' own alias
# for all numeric dtypes, so this cell does not depend on numpy being
# imported as `np` (it is not imported in this notebook).
df_numeric = df_filtered.select_dtypes(include="number")

In [None]:
# Little's MCAR test on the numeric columns.
# From the result we know that the missing values are not missing completely
# at random (MCAR); from here on they are assumed to be MAR.
mt = MCARTest(method="little")
pval = mt.little_mcar_test(df_numeric)
print("Little's MCAR p-value: " + format(pval, ".4g"))

Little's MCAR p-value: 0


In [None]:
# Fill the missing values of the numeric columns with scikit-learn's
# IterativeImputer; the seed is fixed so that reruns give the same result.
imputer = IterativeImputer(
    max_iter=10,
    random_state=0,
)
# The imputed values are re-wrapped into a DataFrame in a later cell.
# TODO: maybe build a pipeline around this step.
df_imputed = imputer.fit_transform(df_numeric)



In [171]:
# Put the imputed values back into a DataFrame carrying the labels and index
# of the numeric frame they were computed from.
df_x = pd.DataFrame(
    data=df_imputed,
    index=df_numeric.index,
    columns=df_numeric.columns,
)
# Re-attach the `time` column from the filtered frame (aligned on the index).
df_x['time'] = df_filtered['time']
# Persist the imputed dataset next to the curated input data.
df_x.to_parquet(Path().resolve().parent.joinpath('curated', 'imputed_data_penmanshiel.parquet'))

#### Feature Engineering

In [None]:
# Steps in feature engineering:
# 1. Handling missing values: Correcting nulls through imputation, deletion, interpolation or domain-informed filling. Done.
# 2. Understanding the data: Checking distributions, correlations and identifying data types (numeric, categorical, text, time series, etc.)
#   - 2.1 Feature engineering: lag values, moving averages for different timeframes.
# 3. Handling outliers: Detecting and treating outliers.
# 4. Encoding categorical variables. There are no categorical variables.
# 5. Transforming features: Applying log transforms, Box-Cox, or scaling to normalize skewed distributions.
# 6. Feature creation: Deriving new variables from existing ones.
# 7. Handling multicollinearity.
# 8. Feature selection: Keeping only the most relevant predictors based on statistical tests.