In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
from sklearn.impute import SimpleImputer


# Read the uploaded data files (the quoted block below is the disabled absolute-path variant)
'''
north_data = pd.read_csv('D:/DS_Prediction/Weather/north_weekly_averages.csv')
south_data = pd.read_csv('D:/DS_Prediction/Weather/south_weekly_averages.csv')
central_data = pd.read_csv('D:/DS_Prediction/Weather/central_weekly_averages.csv')
east_data = pd.read_csv('D:/DS_Prediction/Weather/east_weekly_averages.csv')
fuel_prices = pd.read_csv('D:/DS_Prediction/fuel_prices.csv')
cabbage_prices = pd.read_csv('D:/DS_Prediction/Domestic_Cabbage.csv')
'''

# Weekly weather averages: one CSV per region, read relative to the working directory.
north_data, south_data, central_data, east_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Weather/north_weekly_averages.csv',
        './Weather/south_weekly_averages.csv',
        './Weather/central_weekly_averages.csv',
        './Weather/east_weekly_averages.csv',
    )
)

# Fuel price table.
fuel_prices = pd.read_csv('./fuel_prices.csv')

# Domestic vegetable price tables (cabbage, Chinese cabbage, cauliflower).
cabbage_prices, chinese_cabbage_prices, cauliflower_prices = (
    pd.read_csv(csv_path)
    for csv_path in (
        './vegetable-csv/Domestic_Cabbage.csv',
        './vegetable-csv/Domestic_Chinese_cabbage.csv',
        './vegetable-csv/Domestic_Cauliflower.csv',
    )
)



In [None]:
## 
''' 
Data Processing (2019~2022)

'''
## 

# Stack the four regional tables row-wise into one frame; ignore_index=True
# replaces the per-file indices with a single fresh 0..n-1 index.
regional_data_1 = pd.concat(
    objs=(north_data, south_data, central_data, east_data),
    ignore_index=True,
)


# Check and rename date columns if necessary
def ensure_date_column(df, possible_names):
    """Make sure ``df`` exposes its date information under the name ``'date'``.

    The first column of ``df`` (in column order) whose name appears in
    ``possible_names`` is renamed to ``'date'``. The rename happens in place
    and the same frame is returned so the call can be used in an assignment.

    If ``df`` already has a ``'date'`` column it is left untouched. Renaming
    an alias on top of an existing ``'date'`` column would leave two columns
    with that name, and ``df['date']`` would then return a DataFrame instead
    of a Series.

    Args:
        df: DataFrame to normalise; modified in place.
        possible_names: Collection of column names treated as date aliases.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Guard against creating a duplicate 'date' column.
    if 'date' in df.columns:
        return df

    for col in df.columns:
        if col in possible_names:
            df.rename(columns={col: 'date'}, inplace=True)
            break  # only the first matching alias is renamed
    return df

# Standardise the date column name on each table. The accepted aliases differ
# per source: the fuel table additionally allows 'Date' and '日期'.
regional_data = ensure_date_column(df=regional_data_1, possible_names=['週', 'date'])
fuel_prices = ensure_date_column(df=fuel_prices, possible_names=['Date', 'date', '週', '日期'])
cabbage_prices = ensure_date_column(df=cabbage_prices, possible_names=['週', 'date'])

# Convert date columns to datetime
def parse_date(df, column_name):
    """Convert ``df[column_name]`` to UTC datetimes in place.

    Parsing uses ``errors='coerce'`` and ``utc=True``, so values that cannot
    be parsed end up as ``NaT`` and the resulting column is timezone-aware
    (UTC). A frame that has no column called ``column_name`` is returned
    unchanged.

    Args:
        df: DataFrame to update; modified in place.
        column_name: Name of the column holding date values.

    Returns:
        The same ``df`` object that was passed in.
    """
    if column_name not in df.columns:
        return df

    parsed = pd.to_datetime(df[column_name], errors='coerce', utc=True)
    df[column_name] = parsed
    return df

# Parse the 'date' column of each table into UTC datetimes.
regional_data = parse_date(regional_data, column_name='date')
fuel_prices = parse_date(fuel_prices, column_name='date')
cabbage_prices = parse_date(cabbage_prices, column_name='date')

# For every table that has a 'date' column: drop rows whose date is missing
# (including values that failed to parse and were coerced to NaT), then
# truncate each timestamp to midnight. Both steps mutate the frames in place,
# so the module-level names above keep pointing at the cleaned data.
for df in (regional_data, fuel_prices, cabbage_prices):
    if 'date' not in df.columns:
        continue
    df.dropna(subset=['date'], inplace=True)
    df['date'] = pd.to_datetime(df['date']).dt.normalize()

# Extract additional features from date
def extract_date_features(df, date_column):
    """Add calendar feature columns derived from ``df[date_column]``.

    Four columns are written to ``df`` in place:

    * ``year``        - calendar year
    * ``month``       - calendar month
    * ``week``        - day-of-month integer-divided by 7
    * ``day_of_week`` - pandas ``dayofweek`` value

    NOTE(review): ``day // 7`` maps days 1-6 to 0, 7-13 to 1, 14-20 to 2,
    21-27 to 3 and 28-31 to 4, so the buckets are not uniform 7-day weeks.
    Confirm this is the intended definition of "week".

    Args:
        df: DataFrame to extend; modified in place.
        date_column: Name of a datetime-like column in ``df``.

    Returns:
        The same ``df`` object that was passed in.
    """
    stamps = df[date_column].dt
    derived = {
        'year': stamps.year,
        'month': stamps.month,
        'week': stamps.day // 7,
        'day_of_week': stamps.dayofweek,
    }
    for feature_name, values in derived.items():
        df[feature_name] = values
    return df

# Derive year / month / week / day_of_week on the two tables that are joined
# on those calendar keys below.
cabbage_prices = extract_date_features(cabbage_prices, date_column='date')
fuel_prices = extract_date_features(fuel_prices, date_column='date')

# Build the combined table with left joins, so every cabbage price row is
# kept: regional weather is matched on the exact date, fuel prices on
# (year, month, week).
# NOTE(review): fuel_prices also carries 'date' and 'day_of_week', so the
# second merge is expected to suffix the overlapping names with _x / _y.
try:
    merged_data = cabbage_prices.merge(regional_data, on='date', how='left')
    merged_data = merged_data.merge(fuel_prices, on=['year', 'month', 'week'], how='left')
except KeyError as e:
    raise KeyError(f"Error during merging: {e}. Please check that all dataframes contain a 'date' column.")

# Fill missing fuel prices with the mean of the rows that share the same
# 'month' value. Grouping is by month only, so the mean pools all years.
for fuel_col in ('Fuel_92', 'Fuel_95', 'Fuel_High'):
    merged_data[fuel_col] = (
        merged_data
        .groupby('month')[fuel_col]
        .transform(lambda x: x.fillna(x.mean()))
    )

# Write the merged table to CSV; 'utf-8-sig' prepends a BOM to the file.
output_file = "merged_data_m.csv"
merged_data.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"Merged data saved to {output_file}")

print(merged_data)