# In [5]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

def extract_date(date):
    """Return the calendar-date part of *date* as text.

    Args:
        date: Either a string, in which case everything before the first
            ``'T'`` is returned (a string without ``'T'`` comes back
            unchanged), or an object exposing ``strftime`` such as a
            ``datetime`` or ``pandas.Timestamp``. Missing values
            (``None``, ``NaN``, ``NaT``) are accepted as well.

    Returns:
        The date as a string (``YYYY-MM-DD`` for non-string inputs), or
        ``None`` when *date* is a missing value.
    """
    if isinstance(date, str):
        return date.split('T')[0]
    # Missing cells (None / NaN / NaT) cannot be formatted with strftime;
    # propagate them as missing instead of raising mid-``apply``.
    if pd.isna(date):
        return None
    return date.strftime('%Y-%m-%d')

# Load the source workbooks. The first three are read with pandas' default
# header row; the density and fire workbooks name their header row explicitly.
meteo, solar, tourisme = (
    pd.read_excel(workbook)
    for workbook in (
        './donnees-synop-essentielles-omm.xlsx',
        './rayonnement-solaire-vitesse-vent-tri-horaires-regionaux.xlsx',
        './economicref-france-commune-classement-touristique.xlsx',
    )
)
densite = pd.read_excel('./FET2021-19.xlsx', header=2)
incendies = pd.read_excel('./Incendies.xlsx', header=3)

# Reduce every 'Date' column to its date-only text form so the three tables
# can be joined on it.
for frame in (meteo, solar, incendies):
    frame['Date'] = frame['Date'].apply(extract_date)

# Deduplicate solar data so each (Date, region) key matches at most one solar
# row in the left join below; the first occurrence is kept.
solar.drop_duplicates(subset=["Date", "region (code)"], keep="first", inplace=True)

# Merge df
# Left joins keep every weather row even when a lookup table has no match.
meteo = pd.merge(meteo, solar, on=['Date', 'region (code)'], how='left')
meteo = pd.merge(meteo, tourisme[['communes (name)', 'Type Touristique']], on='communes (name)', how='left')
meteo = pd.merge(meteo, densite[['communes (code)', 'Libellé typologie']], on='communes (code)', how='left')

# Every row of the fire table marks a fire. The join slice is de-duplicated
# first: several fires in the same department on the same day would otherwise
# repeat the matching weather rows once per fire, with identical content.
incendies['Incendie'] = 1
meteo = pd.merge(
    meteo,
    incendies[['department (code)', 'Date', 'Incendie']].drop_duplicates(),
    on=['department (code)', 'Date'],
    how='left',
)

# Rows without a matching fire have no flag after the left join: mark them 0,
# then re-insert the target as the very first column.
last_col = meteo.pop('Incendie').fillna(0)
meteo.insert(0, 'Incendie', last_col)

# Sort, reset index, and select columns
# Rows are ordered oldest-first: the TimeSeriesSplit applied further down
# splits purely by row position (train = earlier rows, test = later rows), so a
# newest-first order would train on the future and evaluate on the past.
# (Assumes 'Date' holds ISO 'YYYY-MM-DD' text, which sorts chronologically.)
meteo = meteo.sort_values('Date', ascending=True).reset_index(drop=True)
# Columns are picked by position and kept in the listed order.
# NOTE(review): these positions depend on the exact column layout produced by
# the merges above — confirm them whenever a source workbook changes.
meteo = meteo.iloc[:, [0, 2, 6, 7, 8, 10, 43, 71, 72, 73, 74, 75, 78, 70, 80, 81, 84, 85, 86, 87]]

# Filter out 80% of non-incendie cases
# The seed is fixed so that repeated runs drop the same rows; without it every
# run would export a different dataset.
meteo = meteo.drop(meteo[meteo['Incendie'] == 0].sample(frac=0.8, random_state=42).index)

# Prepare X and Y
# The target is removed from `meteo`; the listed columns are then one-hot
# encoded while the remaining columns pass through unchanged.
Y = meteo.pop('Incendie')
X = pd.get_dummies(
    meteo,
    columns=[
        'Date',
        'communes (name)',
        'communes (code)',
        'department (name)',
        'region (name)',
        'Type Touristique',
        'Libellé typologie',
    ],
)

# Split data into train and test
# Only the final fold produced by the splitter is used: its index arrays
# define the train and test partitions.
tscv = TimeSeriesSplit(n_splits=5)
train_index, test_index = list(tscv.split(X, Y))[-1]

X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

# Split test data into public and private
# The first third of the test rows (rounded down) forms the public part; the
# remaining rows form the private part.
test_size = len(X_test)
test_public_size = int(test_size * (1/3))

X_test_public, X_test_private = X_test.iloc[:test_public_size], X_test.iloc[test_public_size:]
Y_test_public, Y_test_private = Y_test.iloc[:test_public_size], Y_test.iloc[test_public_size:]

# Export to CSV
# One file per dataset, named after its key and written without the row index.
datasets = {
    'X_train': X_train,
    'Y_train': Y_train,
    'X_test_public': X_test_public,
    'Y_test_public': Y_test_public,
    'X_test_private': X_test_private,
    'Y_test_private': Y_test_private,
}
# to_csv does not create missing directories, so make sure the target exists.
output_dir = Path('./data')
output_dir.mkdir(parents=True, exist_ok=True)
for name, dataset in datasets.items():
    dataset.to_csv(output_dir / f'{name}.csv', index=False)