### Imports  

In [None]:
from Utils import *

### NPZ to DF and some visualizations

In [2]:
# Build one DataFrame per .npz file found in the data folder (load_npz_files comes from Utils).
folder_path = r"C:\Users\stebi\Desktop\potatoes\1800"
dataframes = load_npz_files(folder_path)
# Publish every loaded frame as a notebook-level variable named after its key;
# later cells reference them directly (df1, df5, df9, ...) or through globals().
globals().update(dataframes)

In [None]:
# 500 steps before and after sprouting: call sprouting_500 on df1..df32,
# looking each frame up by name among the notebook globals.
for idx in range(1, 33):
    label = f'df{idx}'
    sprouting_500(globals()[label], label)

In [None]:
# 100 steps before and after sprouting: call sprouting_100 on df1..df32,
# looking each frame up by name among the notebook globals.
for idx in range(1, 33):
    label = f'df{idx}'
    sprouting_100(globals()[label], label)

In [None]:
# Plot the whole time series: call plot_time_series on df1..df32,
# looking each frame up by name among the notebook globals.
for idx in range(1, 33):
    label = f'df{idx}'
    plot_time_series(globals()[label], label)

In [None]:
# ACF and PACF of df1, restricted to the rows where y <= 0.
filtered_df1 = df1.loc[df1['y'] <= 0]
# Flatten every non-target column, row by row, into a single 1-D array.
combined_signal = filtered_df1.drop(columns='y', errors='ignore').values.flatten()
# Downsample by a factor of 10 before plotting.
# NOTE(review): presumably to keep the lag plots tractable — confirm.
downsampled_signal = downsample_signal(combined_signal, factor=10)
# Label fixed: the original string was 'Signal)' with a stray closing parenthesis.
plot_acf_pacf(downsampled_signal, 'Signal')

In [None]:
# Check stationarity of df1 with check_combined_stationarity_kpss (from Utils)
# and print every entry of the returned mapping, one "key: value" per line.
stationarity_results_df1 = check_combined_stationarity_kpss(df1)
for key, value in stationarity_results_df1.items():
    print(f"{key}: {value}")

In [None]:
# Filter out rows with y > 0.
# NOTE(review): filter_dataframes only receives the count 32, not the frames themselves —
# confirm in Utils where it obtains df1..df32 from.
filtered_dataframes = filter_dataframes(32)

### Windowing

In [None]:
# Window the filtered frames with four window sizes (6, 12, 18 and 24).
(
    processed_dataframes_6,
    processed_dataframes_12,
    processed_dataframes_18,
    processed_dataframes_24,
) = (window(filtered_dataframes, size) for size in (6, 12, 18, 24))

### Normalization

In [None]:
# Normalize each windowed dataset; the 24-window result is named "*_1_day" from here on.
(
    normalized_dataframes_6,
    normalized_dataframes_12,
    normalized_dataframes_18,
    normalized_dataframes_1_day,
) = map(
    normalize,
    (
        processed_dataframes_6,
        processed_dataframes_12,
        processed_dataframes_18,
        processed_dataframes_24,
    ),
)

### Feature generation

In [None]:
# Feature generation on the normalized windows: featgen first, for all four window sizes...
(
    extracted_features_6,
    extracted_features_12,
    extracted_features_18,
    extracted_features_1_day,
) = map(
    featgen,
    (
        normalized_dataframes_6,
        normalized_dataframes_12,
        normalized_dataframes_18,
        normalized_dataframes_1_day,
    ),
)

# ...then pass each extracted set through combined().
(
    combined_features_6,
    combined_features_12,
    combined_features_18,
    combined_features_1_day,
) = map(
    combined,
    (
        extracted_features_6,
        extracted_features_12,
        extracted_features_18,
        extracted_features_1_day,
    ),
)

### Scaling

In [None]:
# Scale the combined (normalized) feature sets, passing 'y' as the target column.
(
    scal_norm_dataframes_6,
    scal_norm_dataframes_12,
    scal_norm_dataframes_18,
    scal_norm_dataframes_1_day,
) = (
    scale(features, target_column='y')
    for features in (
        combined_features_6,
        combined_features_12,
        combined_features_18,
        combined_features_1_day,
    )
)


## Regression
### Define models

In [None]:
# Regressors to evaluate, keyed by the short label used in the results.
# All three are built with their default constructor arguments.
models = dict(
    XGB=XGBRegressor(),
    ADAB=AdaBoostRegressor(),
    LGBM=lgb.LGBMRegressor(),
)

### Modelling with scaling and normalization

In [None]:
# Modelling on the scaled + normalized features: one dataset per window label.
dictionaries = dict(
    zip(
        ('6_h_window', '12_h_window', '18_h_window', '1_d_window'),
        (
            scal_norm_dataframes_6,
            scal_norm_dataframes_12,
            scal_norm_dataframes_18,
            scal_norm_dataframes_1_day,
        ),
    )
)

# regression() returns the summary results plus the individual MAEs and predictions
# that the following cells consume.
results, individual_maes, individual_predictions = regression(models, dictionaries)
pretty_print(results)

In [None]:
# Mean day prediction: feed regression()'s individual predictions to day_predict
# and print the result with prettyp.
mean_days = day_predict(individual_predictions)
prettyp(mean_days)

In [None]:
# Box plots built from the individual MAEs returned by the last regression() call.
create_boxplots(individual_maes)

In [None]:
# Line plot built from the individual MAEs returned by the last regression() call.
plot_lines(individual_maes)

## Comparison of df9 against other dataframes

In [None]:
# Compare raw df5, df6 and df17 against raw df9, which serves as the reference.
labels = ['df5', 'df6', 'df17']
# Resolve each label to the notebook-level frame of the same name,
# so the labels and the frames cannot drift apart.
dfs_to_compare = [globals()[label] for label in labels]
df_reference = df9

comparison(df_reference, dfs_to_compare, labels)

### Modelling with normalization

In [None]:
# Modelling on the normalized-only features (no scaling step applied).
dictionaries = dict(
    zip(
        ('6_h', '12_h', '18_h', '1_d'),
        (
            combined_features_6,
            combined_features_12,
            combined_features_18,
            combined_features_1_day,
        ),
    )
)

# regression() returns the summary results plus the individual MAEs and predictions
# that the following cells consume.
results, individual_maes, individual_predictions = regression(models, dictionaries)
pretty_print(results)

In [None]:
# Mean day prediction for the normalized-only run.
# Fixed: day_predict was being given individual_maes; the other mean-day cells in
# this notebook pass regression()'s individual predictions, which is what
# day_predict is applied to here as well.
mean_days = day_predict(individual_predictions)
prettyp(mean_days)

In [None]:
# Box plots built from the individual MAEs returned by the last regression() call.
create_boxplots(individual_maes)

In [None]:
# Line plot built from the individual MAEs returned by the last regression() call.
plot_lines(individual_maes)

### Modelling without scaling or normalization

In [None]:
# Feature generation on the windowed data WITHOUT normalization.
# This rebinds the extracted_features_* / combined_features_* names produced by the
# normalized run, so the cells below depend on execution order.
(
    extracted_features_6,
    extracted_features_12,
    extracted_features_18,
    extracted_features_1_day,
) = map(
    featgen,
    (
        processed_dataframes_6,
        processed_dataframes_12,
        processed_dataframes_18,
        processed_dataframes_24,
    ),
)

# Pass each extracted set through combined().
(
    combined_features_6,
    combined_features_12,
    combined_features_18,
    combined_features_1_day,
) = map(
    combined,
    (
        extracted_features_6,
        extracted_features_12,
        extracted_features_18,
        extracted_features_1_day,
    ),
)

In [None]:
# Modelling on features with neither scaling nor normalization applied.
dictionaries = dict(
    zip(
        ('6_h', '12_h', '18_h', '1_d'),
        (
            combined_features_6,
            combined_features_12,
            combined_features_18,
            combined_features_1_day,
        ),
    )
)

# regression() returns the summary results plus the individual MAEs and predictions
# that the following cells consume.
results, individual_maes, individual_predictions = regression(models, dictionaries)
pretty_print(results)

In [None]:
# Mean day prediction for the run without scaling or normalization.
# Fixed (1): day_predict was being given individual_maes; the other mean-day cells in
# this notebook pass regression()'s individual predictions.
# Fixed (2): the result was printed with pretty_print, which this notebook otherwise
# uses for regression results; mean-day output is printed with prettyp elsewhere.
mean_days = day_predict(individual_predictions)
prettyp(mean_days)

In [None]:
# Box plots built from the individual MAEs returned by the last regression() call.
create_boxplots(individual_maes)

In [None]:
# Line plot built from the individual MAEs returned by the last regression() call.
plot_lines(individual_maes)

### Modelling with scaling

In [None]:
# Scale the feature sets currently bound to combined_features_*, passing 'y' as the target.
# NOTE(review): these are the non-normalized features only if the "no normalization"
# featgen cell ran last — confirm execution order before trusting the results.
(
    scaled_dataframes_6,
    scaled_dataframes_12,
    scaled_dataframes_18,
    scaled_dataframes_1_day,
) = (
    scale(features, target_column='y')
    for features in (
        combined_features_6,
        combined_features_12,
        combined_features_18,
        combined_features_1_day,
    )
)

In [None]:
# Modelling on the scaled-only features (no normalization step applied).
dictionaries = dict(
    zip(
        ('6_h', '12_h', '18_h', '1_d'),
        (
            scaled_dataframes_6,
            scaled_dataframes_12,
            scaled_dataframes_18,
            scaled_dataframes_1_day,
        ),
    )
)

# regression() returns the summary results plus the individual MAEs and predictions
# that the following cells consume.
results, individual_maes, individual_predictions = regression(models, dictionaries)
pretty_print(results)

In [None]:
# Mean day prediction for the scaled-only run.
# Fixed: day_predict was being given individual_maes; the other mean-day cells in
# this notebook pass regression()'s individual predictions, which is what
# day_predict is applied to here as well.
mean_days = day_predict(individual_predictions)
prettyp(mean_days)

In [None]:
# Box plots built from the individual MAEs returned by the last regression() call.
create_boxplots(individual_maes)

In [None]:
# Line plot built from the individual MAEs returned by the last regression() call.
plot_lines(individual_maes)

## Raw data

In [None]:
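# Group all parquet files for each plant (left commented out; NOTE(review): presumably a one-off step that has already been run — confirm before re-enabling)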
#source_dirs = [r"C:\Users\stebi\Desktop\potatoes\year_2021", r"C:\Users\stebi\Desktop\potatoes\year_2022"]
#target_dir = r"C:\Users\stebi\Desktop\potatoes\organized_stations"
#collect_parquet_files(source_dirs, target_dir)

In [None]:
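# Rename the station folders to 1..32 (left commented out; NOTE(review): presumably a one-off step that has already been run — confirm before re-enabling)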
#base_dir = r"C:\Users\stebi\Desktop\potatoes\organized_stations" 
#rename_station_folders(base_dir)

In [None]:
# Parquet to DataFrames: merge the parquet files under the organized-stations folder.
# Note: this rebinds `dataframes`, replacing the dict loaded from the .npz files earlier.
base_dir = r"C:\Users\stebi\Desktop\potatoes\organized_stations" 
dataframes = merge_parquets_to_dataframes(base_dir) 

In [None]:
# Clean every dataframe (original note: Y,M,D + h,m,s).
# NOTE(review): presumably this normalizes the date/time fields — confirm in Utils.
cleaned_dataframes = clean_all_dataframes(dataframes)

In [None]:
# Gap analysis: analyze_gap returns one stats mapping per dataframe key.
gap_results = analyze_gap(cleaned_dataframes)

# Print a short summary per dataframe from the fields each stats mapping provides:
# total_rows, total_gap_seconds, gap_percentage, avg_gap_length and num_gaps.
for key, stats in gap_results.items():
    print(f"Summary for dataframe {key}:")
    print(f"  Total rows: {stats['total_rows']}")
    print(f"  Total gap seconds: {stats['total_gap_seconds']}")
    print(f"  Gap percentage: {stats['gap_percentage']:.2f}%")
    print(f"  Average gap length: {stats['avg_gap_length']:.2f} seconds")
    print(f"  Number of gaps: {stats['num_gaps']}")
    print()  # blank line between dataframes

In [None]:
# Fill the gaps found in the cleaned dataframes (fill strategy is defined in Utils).
filled_dataframes = fill_gaps(cleaned_dataframes)

In [None]:
# Re-check the filled dataframes for remaining gaps with check_temporal_integrity.
check_temporal_integrity(filled_dataframes)

In [None]:
# Display df1 after gap filling (bare expression, shown as the notebook cell output).
filled_dataframes['df1']

In [None]:
# Add the 'y' column to each dataframe.
# NOTE(review): match_table is not defined anywhere in this notebook — presumably it
# comes from `from Utils import *`; confirm, otherwise this cell raises NameError.
added_dataframes = add_y_column(filled_dataframes, match_table)

In [None]:
# Sanity check: display df1 after adding 'y' (bare expression, shown as the cell output).
added_dataframes['df1']

In [None]:
# Drop the 'timestamp' column from every dataframe.
added_dataframes_nostamp = drop_timestamp_column(added_dataframes)

In [None]:
# Interpolation via interpolate_mv_column.
# NOTE(review): presumably fills missing values in an 'mv' column — confirm in Utils.
# Rebinds `dataframes`, replacing the parquet-loaded dict from the earlier cell.
dataframes = interpolate_mv_column(added_dataframes_nostamp)

In [None]:
# Drop the rows with y > 0; rebinds `dataframes` again with the result.
dataframes = drop_rows_y_greater_than_zero(dataframes)

In [None]:
# Window the raw-derived frames with four window sizes (6, 12, 18, 24 h).
(
    dataframes_6,
    dataframes_12,
    dataframes_18,
    dataframes_24,
) = (window_from_raw(dataframes, size) for size in (6, 12, 18, 24))

In [None]:
# Normalize each windowed dataset; the 24 h result is named "*_1_day" from here on.
(
    normalized_6,
    normalized_12,
    normalized_18,
    normalized_1_day,
) = map(normalize, (dataframes_6, dataframes_12, dataframes_18, dataframes_24))

In [None]:
# Feature generation on the normalized windows: featgen first, for all four window sizes...
(
    extracted_6,
    extracted_12,
    extracted_18,
    extracted_1_day,
) = map(featgen, (normalized_6, normalized_12, normalized_18, normalized_1_day))

# ...then pass each extracted set through combined().
(
    combined_6,
    combined_12,
    combined_18,
    combined_1_day,
) = map(combined, (extracted_6, extracted_12, extracted_18, extracted_1_day))

In [None]:
# Scale the combined feature sets, passing 'y' as the target column.
(
    scal_norm_6,
    scal_norm_12,
    scal_norm_18,
    scal_norm_1_day,
) = (
    scale(features, target_column='y')
    for features in (combined_6, combined_12, combined_18, combined_1_day)
)

In [None]:
# Modelling on the raw-data pipeline output (normalized + scaled), one dataset per window label.
dictionaries = dict(
    zip(
        ('6_h_window', '12_h_window', '18_h_window', '1_d_window'),
        (scal_norm_6, scal_norm_12, scal_norm_18, scal_norm_1_day),
    )
)

# regression() returns the summary results plus the individual MAEs and predictions
# that the following cells consume.
results, individual_maes, individual_predictions = regression(models, dictionaries)
pretty_print(results)

In [None]:
# Mean day prediction: feed regression()'s individual predictions to day_predict
# and print the result with prettyp.
mean_days = day_predict(individual_predictions)
prettyp(mean_days)

In [None]:
# Box plots built from the individual MAEs returned by the last regression() call.
create_boxplots(individual_maes)

In [None]:
# Line plot built from the individual MAEs returned by the last regression() call.
plot_lines(individual_maes)