In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-future-crop-challenge/pr_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/tasmax_maize_train.parquet
/kaggle/input/the-future-crop-challenge/sample_submission.csv
/kaggle/input/the-future-crop-challenge/soil_co2_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/tas_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/rsds_maize_train.parquet
/kaggle/input/the-future-crop-challenge/tasmin_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/tasmax_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/rsds_maize_test.parquet
/kaggle/input/the-future-crop-challenge/soil_co2_maize_test.parquet
/kaggle/input/the-future-crop-challenge/train_solutions_maize.parquet
/kaggle/input/the-future-crop-challenge/pr_maize_test.parquet
/kaggle/input/the-future-crop-challenge/tas_wheat_test.parquet
/kaggle/input/the-future-crop-challenge/tasmax_maize_test.parquet
/kaggle/input/the-future-crop-challenge/pr_maize_train.parquet
/kaggle/input/the-fu

# Load Data with reduction

In [2]:
DEBUG_MODE = False

CROPS = ("maize", "wheat")

MODES = ('train', 'test')

FEATURES_TEMPORAL = {
    # Time series data -- 240 columns reflecting daily values for 30 days before sowing and 210 days after.
    'tas',       # Mean daily temperature
    'tasmax',    # Max daily temperature
    'tasmin',    # Min daily temperature
    'pr',        # precipitation
    'rsds'      # shortwave radiation
}

FEATURES_STATIC = {
    # Static data
    'soil_co2',  # crop, year, lon, lat, texture_class, real_year, co2, nitrogen
    # dominant USDA soil texture class (constant over time), the ambient CO2 concentration (spatially constant), the planting date and the nitrogen application rate (constant over time)
}

FEATURES = set.union(FEATURES_TEMPORAL, FEATURES_STATIC)

COLUMNS_TO_DROP = ['crop','variable']

# Sowing date
INDEX_SOW = 30  # days
# Time series data length
SEASON_LENGTH = 240  # days
# Nr. of soil texture classes
NUM_TEXTURE_CLASSES = 13  

YEAR_TRAIN_MIN = 1982
YEAR_TRAIN_MAX = 2020  # Inclusive
YEAR_TEST_MIN = 2021
YEAR_TEST_MAX = 2098

PATH_INPUT = os.path.abspath(os.path.join(os.sep, 'kaggle', 'input', 'the-future-crop-challenge'))


In [3]:
# Reduce memory usage of a pandas DataFrame
def reduce_memory_usage(df):
    """Reduce memory usage of a pandas DataFrame."""
    # Function to iterate through columns and modify the data types
    start_mem = df.memory_usage().sum() / 1024**2
    #print(f"Memory usage of dataframe: {start_mem} MB")

    for col in df.columns:
        if col in df.index.names:  # Skip index columns, since other formats of index aren't supported by the engine
            continue

    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)  # Keep sufficient precision
            else:
                if col == "year":  # Ensure precision for grouping columns
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    #print(f"Memory usage after optimization: {end_mem} MB")
    #print(f"Decreased by {100 * (start_mem - end_mem) / start_mem}%")
    return df

In [4]:
def load_data(crop: str, # Which crop
              mode: str, # Which dataset (i.e. train/test)
              select_only_features: bool = True, # Drop every other column (crop, year, lon, lat) if not relevant for computation
              take_subset: bool = False,  # If set to true, take a small subset of the data (for debugging purposes)
             ) -> dict:
    assert crop in CROPS
    assert mode in MODES
    
    output = dict()
    
    for f in FEATURES:
        path = os.path.join(PATH_INPUT, f'{f}_{crop}_{mode}.parquet')
        df = reduce_memory_usage(pd.read_parquet(path))

        columns_to_drop_in_df = [col for col in COLUMNS_TO_DROP if col in df.columns] 
        if columns_to_drop_in_df:
            df = df.drop(columns=columns_to_drop_in_df)

        if select_only_features:
            if f in FEATURES_TEMPORAL:  # Select only the time series data -- drop other columns
                df = df[[str(i) for i in range(SEASON_LENGTH)]]
        
        output[f] = df

        # Free up memory after processing each file
        del df  # Explicitly delete the DataFrame
        gc.collect()  # Force garbage collection
        
    if mode == 'train':
        output['target'] = pd.read_parquet(os.path.join(PATH_INPUT, f'{mode}_solutions_{crop}.parquet'))
    
    # If required, only take a subset of the data for debugging purposes -- we don't really care which samples
    if take_subset:
        num_select = 100  # Take only 100 samples from the dataset
        # Select which samples based on the index of some feature
        ixs_selected = output[tuple(FEATURES)[0]].index[:num_select]
        # Filter all dataframes
        output = {
            key: df.loc[ixs_selected] for key, df in output.items()
        }

    '''
    for df in output.values():
        df.sort_index(inplace=True)
    '''
        
    return output

In [5]:
# Load all available data for all crops
crop_data_train = {
    crop: load_data(crop, 'train', take_subset=DEBUG_MODE, select_only_features=False) for crop in CROPS
}


crop_data_test = {
    crop: load_data(crop, 'test', take_subset=DEBUG_MODE, select_only_features=False) for crop in CROPS
}

# Separate data in features and targets (if available)
crop_features_train = {
    crop: {
        k: v for k, v in data.items() if k in FEATURES
    } for crop, data in crop_data_train.items()
}
crop_features_test = {
    crop: {
        k: v for k, v in data.items() if k in FEATURES
    } for crop, data in crop_data_test.items()
}

crop_targets_train = {
    crop: data['target'] for crop, data in crop_data_train.items()
}


# Extra metrics


In [6]:
def calculate_statistics_on_crop(df):    
    #We already have the vars splitte
    #Consider relying on tasmax, tasmin instead of these aggregated stats
    
    # Calculate statistics
    mean_tas = df['tas'].mean(axis=1).rename('mean_tas')
    median_tas = df['tas'].median(axis=1).rename('median_tas')
    sum_tas = df['tas'].sum(axis=1).rename('sum_tas')
    min_tas = df['tas'].min(axis=1).rename('min_tas')
    max_tas = df['tas'].max(axis=1).rename('max_tas')
    
    mean_pr = df['pr'].mean(axis=1).rename('mean_pr')
    median_pr = df['pr'].median(axis=1).rename('median_pr')
    sum_pr = df['pr'].sum(axis=1).rename('sum_pr')
    min_pr = df['pr'].min(axis=1).rename('min_pr')
    max_pr = df['pr'].max(axis=1).rename('max_pr')
    
    mean_rsds = df['rsds'].mean(axis=1).rename('mean_rsds')
    median_rsds = df['rsds'].median(axis=1).rename('median_rsds')
    sum_rsds = df['rsds'].sum(axis=1).rename('sum_rsds')
    min_rsds = df['rsds'].min(axis=1).rename('min_rsds')
    max_rsds = df['rsds'].max(axis=1).rename('max_rsds')

    # Create a DataFrame to store the results
    summary_df = pd.concat([mean_tas, min_tas, max_tas, median_tas, sum_tas,
                            mean_pr, min_pr, max_pr, median_pr, sum_pr,
                            mean_rsds, median_rsds, sum_rsds, min_rsds, max_rsds], axis=1)

    return summary_df

In [7]:
for crop in CROPS:
    crop_features_train[crop]['summary'] = reduce_memory_usage(calculate_statistics_on_crop(crop_features_train[crop]))

In [8]:
print(type(crop_features_train[crop]['summary']))
print(crop_features_train[crop]['summary'])

<class 'pandas.core.frame.DataFrame'>
          mean_tas  min_tas  max_tas  median_tas  sum_tas   mean_pr  min_pr  \
ID                                                                            
1040990   9.289062  -123.25    381.0    8.421875   2258.0  1.245117 -123.25   
1040991   8.929688  -123.25    381.0    7.992188   2170.0  1.247070 -123.25   
1040992   7.812500  -123.25    381.0    6.933594   1899.0  1.249023 -123.25   
1040993   8.726562  -122.75    381.0    7.781250   2122.0  1.247070 -122.75   
1040994   9.335938  -122.75    381.0    8.406250   2270.0  1.249023 -122.75   
...            ...      ...      ...         ...      ...       ...     ...   
1319732  14.609375   -30.25    419.0   11.953125   3550.0  2.226562  -30.25   
1319733  15.632812   -29.25    419.0   13.179688   3798.0  2.230469  -29.25   
1319734  16.906250   -28.75    419.0   14.257812   4108.0  2.232422  -28.75   
1319735  18.906250   -27.75    419.0   16.546875   4592.0  2.236328  -27.75   
1319736  19.21

In [9]:
base_temp = 10

# Function to calculate GDD, heat stress days, and frost days for each group
def calculate_metrics(group):
    #group['GDD'] = np.maximum(group['mean_tas'] - base_temp, 0).cumsum()
    group['heat_stress_days'] = (group['max_tas'] > 30).cumsum()
    group['frost_days'] = (group['min_tas'] < 0).cumsum()
    return group

In [10]:
g = np.maximum(crop_features_train[crop]['summary']['mean_tas'] - base_temp, 0).cumsum()
print(g)

ID
1040990    0.0
1040991    0.0
1040992    0.0
1040993    0.0
1040994    0.0
          ... 
1319732    inf
1319733    inf
1319734    inf
1319735    inf
1319736    inf
Name: mean_tas, Length: 278747, dtype: float16


  return bound(*args, **kwds)


In [11]:
for crop in CROPS:
    crop_features_train[crop]['summary'] = reduce_memory_usage(calculate_metrics(crop_features_train[crop]['summary']))

In [12]:
yield_target_wheat = pd.DataFrame(crop_targets_train['wheat'])
print(yield_target_wheat)
yield_target_maize = pd.DataFrame(crop_targets_train['maize'])
print(yield_target_maize)

         yield
ID            
1040990  4.775
1040991  4.874
1040992  4.701
1040993  4.848
1040994  5.178
...        ...
1319732  1.418
1319733  1.653
1319734  1.271
1319735  0.469
1319736  0.629

[278747 rows x 1 columns]
        yield
ID           
0       5.595
1       5.895
2       3.023
3       2.071
4       2.239
...       ...
349714  6.240
349715  8.926
349716  2.180
349717  7.311
349718  2.118

[349719 rows x 1 columns]


# LSTM train and evaluation

In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [14]:
def prepare_lstm_data(crop_features, crop_targets):
    # Préparer les données pour LSTM
    X_temporal = []
    
    # Variables temporelles
    temporal_features = ['tas', 'tasmax', 'tasmin', 'pr', 'rsds']
    
    for feature in temporal_features:
        # Convertir les données en numpy array 
        feat_array = crop_features[feature].values
        X_temporal.append(feat_array)
    
    # Empiler les caractéristiques temporelles
    X_temporal = np.stack(X_temporal, axis=-1)
    
    # Récupérer les données statiques
    soil_data = crop_features['soil_co2']
    summary = crop_features['summary']
    texture_class = soil_data['texture_class'].values.reshape(-1, 1)
    co2_level = soil_data['co2'].values.reshape(-1, 1)
    heat_level = summary['heat_stress_days'].values.reshape(-1, 1)
    frost_level = summary['frost_days'].values.reshape(-1, 1)
    
    # Normaliser les données statiques
    scaler_texture = StandardScaler()
    scaler_co2 = StandardScaler()
    scaler_heat = StandardScaler()
    scaler_frost = StandardScaler() 
    
    texture_scaled = scaler_texture.fit_transform(texture_class)
    co2_scaled = scaler_co2.fit_transform(co2_level)
    heat_scaled = scaler_co2.fit_transform(heat_level)
    frost_scaled = scaler_co2.fit_transform(frost_level)
    
    # Combiner données statiques et temporelles
    X_static = np.hstack([texture_scaled, co2_scaled, heat_scaled, frost_scaled])
    
    # Préparer les cibles
    y = crop_targets['yield'].values
    
    return X_temporal, X_static, y

In [15]:
def create_lstm_model(input_shape_temporal, input_shape_static):
    model = Sequential([
        # Couche LSTM pour les données temporelles
        LSTM(64, input_shape=input_shape_temporal, return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        
        # Couche pour intégrer les données statiques
        Dense(16, activation='relu'),
        
        # Couche de sortie
        Dense(1)
    ])

    optimizer = tfk.optimizers.Adam(1e-3)
    
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

In [16]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Préparation des données pour chaque culture
crops = ['wheat', 'maize']
models = {}

for crop in crops:
    # Charger les données
    X_temporal, X_static, y = prepare_lstm_data(
        crop_features_train[crop], 
        crop_targets_train[crop]
    )
    
    
    # Diviser les données en train et validation
    X_temporal_train, X_temporal_val, X_static_train, X_static_val, y_train, y_val = train_test_split(
        X_temporal, X_static, y, test_size=0.2, random_state=42
    )
    
    # Créer et entraîner le modèle
    model = create_lstm_model(
        input_shape_temporal=(X_temporal_train.shape[1], X_temporal_train.shape[2]),
        input_shape_static=X_static_train.shape[1]
    )

    
    early_stopping = EarlyStopping(
        monitor='val_loss',   # Surveille la perte sur le set de validation
        patience=8,          # Arrête si aucune amélioration après 10 époques
        restore_best_weights=True  # Restaure les poids avec la meilleure val_loss
    )

    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
        min_lr=1e-5
    )

    callbacks = [early_stopping, reduce_lr]
    
    # Entraînement
    history = model.fit(
        [X_temporal_train, X_static_train], 
        y_train,
        validation_data=([X_temporal_val, X_static_val], y_val),
        epochs=80, 
        batch_size=64, 
        callbacks=callbacks,
        verbose=1
    )
    
    # Sauvegarder le modèle
    models[crop] = model

  super().__init__(**kwargs)


Epoch 1/80
[1m3485/3485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 25ms/step - loss: 1.8761 - mae: 0.9938 - val_loss: 1.1174 - val_mae: 0.7908 - learning_rate: 0.0010
Epoch 2/80
[1m3485/3485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 24ms/step - loss: 1.1590 - mae: 0.8036 - val_loss: 1.0475 - val_mae: 0.7624 - learning_rate: 0.0010
Epoch 3/80
[1m3485/3485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 24ms/step - loss: 1.0763 - mae: 0.7736 - val_loss: 0.9830 - val_mae: 0.7323 - learning_rate: 0.0010
Epoch 4/80
[1m3485/3485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 24ms/step - loss: 1.0191 - mae: 0.7516 - val_loss: 1.0085 - val_mae: 0.7340 - learning_rate: 0.0010
Epoch 5/80
[1m3485/3485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 24ms/step - loss: 0.9815 - mae: 0.7352 - val_loss: 0.9696 - val_mae: 0.7298 - learning_rate: 0.0010
Epoch 6/80
[1m3485/3485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 24ms/step - loss: 0.

  super().__init__(**kwargs)


Epoch 1/80
[1m4372/4372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 25ms/step - loss: 3.5888 - mae: 1.3480 - val_loss: 2.4334 - val_mae: 1.0869 - learning_rate: 0.0010
Epoch 2/80
[1m4372/4372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 24ms/step - loss: 2.4609 - mae: 1.0984 - val_loss: 2.2201 - val_mae: 1.0338 - learning_rate: 0.0010
Epoch 3/80
[1m4372/4372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 24ms/step - loss: 2.2844 - mae: 1.0536 - val_loss: 2.0637 - val_mae: 0.9909 - learning_rate: 0.0010
Epoch 4/80
[1m4372/4372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 24ms/step - loss: 2.1511 - mae: 1.0202 - val_loss: 2.0024 - val_mae: 0.9789 - learning_rate: 0.0010
Epoch 5/80
[1m4372/4372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 25ms/step - loss: 2.0367 - mae: 0.9949 - val_loss: 1.9551 - val_mae: 0.9709 - learning_rate: 0.0010
Epoch 6/80
[1m4372/4372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 25ms/step - lo

In [17]:
# # NO VALIDATION
# Préparation des données pour chaque culture
# crops = ['wheat', 'maize']

# for crop in crops:
#     # Charger les données
#     X_temporal_test, X_static_test, y_test = prepare_lstm_data(
#         crop_features_test[crop], 
#         crop_targets_test[crop]
#     )

In [18]:
# # Évaluation des modèles
# for crop, model in models.items():
#     print(f"Évaluation du modèle pour {crop}:")
#     loss, mae = model.evaluate(
#         [X_temporal_test, X_static_test], 
#         y_test
#     )
#     print(f"Loss: {loss}, MAE: {mae}")
    
#     model_filename = f"model_{crop}.keras"
#     model.save(model_filename)
#     print(f"Model saved to {model_filename}")
    

## Export data to CSV

In [19]:
print(models['wheat'])
print(models['maize'])

<Sequential name=sequential, built=True>
<Sequential name=sequential_1, built=True>


In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

submission = []

# Itérer sur chaque culture (maize, wheat)
for crop in crops:
    print(f"Prédictions pour la culture : {crop}")
    
    # Accéder aux données de test pour la culture
    features = crop_features_test[crop]

    features['summary'] = reduce_memory_usage(calculate_statistics_on_crop(features))
    features['summary'] = reduce_memory_usage(calculate_metrics(features['summary']))
    
    
    # Préparer les données avec la fonction `prepare_lstm_data` pour obtenir X_temporal, X_static et y (même si on ne l'utilise pas ici)
    X_temporal, X_static, _ = prepare_lstm_data(features, crop_targets_train[crop]) 
    
    
    # Charger le modèle pour la culture
    model = models[crop]
    
    # Faire les prédictions
    yield_predictions = model.predict([X_temporal, X_static]).flatten()
    
    # Récupérer les IDs pour la culture
    IDs = features['soil_co2'].index.values  # Les IDs viennent des index des données
    
    # Créer un DataFrame pour la culture
    crop_submission = pd.DataFrame({
        'ID': IDs,
        'yield': yield_predictions
    })
    
    # Ajouter au fichier final
    submission.append(crop_submission)

# Combiner toutes les prédictions
submission_df = pd.concat(submission)

# Sauvegarder dans un fichier CSV
submission_df.to_csv("submission.csv", index=False)
print("Fichier de soumission généré : submission.csv")


Prédictions pour la culture : wheat
[1m17309/17309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 7ms/step
Prédictions pour la culture : maize
[1m21603/21603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 7ms/step
Fichier de soumission généré : submission.csv
