# LGBT

This Python Notebook is divided in three sections, in order to remain inside the RAM constraints given by the Kaggle cell. \
In order to execute this script successfully through "Save Version -> Save and Run All (Commit)" in background without having to avoid the Idle Timeout of 40 minutes given by the platform, you will have to comment out all the cells in the other sections to not execute them.

1. Preprocessing of Training Data and Train the Model:
   importing the DataFrames, compute the stats chosen to train the model, train the model (3h) and save it in '/kaggle/working'
2. Preprocessing of Test Data:
   same features selected on Training Dataset and same stats, this operation has to be done separately since the Test Set is huge.
3. Generate Predictions and Submission.csv
   generating the future yields (3h).

The first three cells **do not have** to be commented in any of these steps. {imports, definitions os global vars, reduce_memory_usage function}

**Scores**
* RMSE [2021-2050]: 1.330
* RMSE [2051-2098]: 1.448

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import re, gc, os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from lightgbm import LGBMRegressor
import lightgbm as lgb
from math import sqrt

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
gc.enable()

# Input data files are available in the read-only "../input/" directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
CROPS = ("maize", "wheat")

MODES = ('train', 'test')

FEATURES_TEMPORAL = {
    # Time series data -- 240 columns reflecting daily values for 30 days before sowing and 210 days after.
    'tas',       # Mean daily temperature
    'tasmax',    # Max daily temperature
    'tasmin',    # Min daily temperature
    'pr',        # precipitation
    'rsds'      # shortwave radiation
}

FEATURES_STATIC = {
    # Static data
    'soil_co2',  # crop, year, lon, lat, texture_class, real_year, co2, nitrogen
    # dominant USDA soil texture class (constant over time), the ambient CO2 concentration (spatially constant), the planting date and the nitrogen application rate (constant over time)
}

FEATURES = set.union(FEATURES_TEMPORAL, FEATURES_STATIC)

COLUMNS_TO_DROP = ['crop','variable']

# Sowing date
INDEX_SOW = 30  # days
# Time series data length
SEASON_LENGTH = 240  # days
# Nr. of soil texture classes
NUM_TEXTURE_CLASSES = 13  

YEAR_TRAIN_MIN = 1982
YEAR_TRAIN_MAX = 2020  # Inclusive
YEAR_TEST_MIN = 2021
YEAR_TEST_MAX = 2098

PATH_INPUT = os.path.abspath(os.path.join(os.sep, 'kaggle', 'input', 'the-future-crop-challenge'))
# PATH_INPUT = os.path.abspath(os.path.join(os.getcwd(), 'data'))  # For running the notebook locally

In [13]:
# Reduce memory usage of a pandas DataFrame
def reduce_memory_usage(df):
    """Reduce memory usage of a pandas DataFrame."""
    # Function to iterate through columns and modify the data types
    start_mem = df.memory_usage().sum() / 1024**2
    #print(f"Memory usage of dataframe: {start_mem} MB")

    for col in df.columns:
        if col in df.index.names:  # Skip index columns, since other formats of index aren't supported by the engine
            continue

    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)  # Keep sufficient precision
            else:
                if col == "year":  # Ensure precision for grouping columns
                    df[col] = df[col].astype(np.float32)
                if col == 'lat' or 'lon':
                    df[col] = df[col].astype(np.float64)
                elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    #print(f"Memory usage after optimization: {end_mem} MB")
    #print(f"Decreased by {100 * (start_mem - end_mem) / start_mem}%")
    return df

# SECTION 1

In [10]:
def load_data(crop: str, # Which crop
              mode: str, # Which dataset (i.e. train/test)
              select_only_features: bool = True # Drop every other column (crop, year, lon, lat) if not relevant for computation
             ) -> dict:
    assert crop in CROPS
    assert mode in MODES
    
    output = dict()
    
    for f in FEATURES:
        path = os.path.join(PATH_INPUT, f'{f}_{crop}_{mode}.parquet')
        df = reduce_memory_usage(pd.read_parquet(path))

        columns_to_drop_in_df = [col for col in COLUMNS_TO_DROP if col in df.columns] 
        if columns_to_drop_in_df:
            df = df.drop(columns=columns_to_drop_in_df)

        if select_only_features:
            if f in FEATURES_TEMPORAL:  # Select only the time series data -- drop other columns
                df = df[[str(i) for i in range(SEASON_LENGTH)]]
        
        output[f] = df

        del df  # Explicitly delete the DataFrame
        gc.collect()  # Force garbage collection
        
    if mode == 'train':
        output['target'] = pd.read_parquet(os.path.join(PATH_INPUT, f'{mode}_solutions_{crop}.parquet'))

    return output

In [11]:
crop_data_train = {
    crop: load_data(crop, 'train', select_only_features=False) for crop in CROPS
}

crop_features_train = {
    crop: {
        k: v for k, v in data.items() if k in FEATURES
    } for crop, data in crop_data_train.items()
}

crop_targets_train = {
    crop: data['target'] for crop, data in crop_data_train.items()
}

In [12]:
columns_to_drop = ['year', 'lon', 'lat']

for crop in CROPS:
    for input_feature in crop_features_train[crop].keys():
        if input_feature == 'soil_co2':
            continue
        crop_features_train[crop][input_feature] = crop_features_train[crop][input_feature].drop(columns=columns_to_drop, errors='ignore')

In [None]:
def calculate_statistics_on_crop(df):    
    #We already have the vars splitte
    #Consider relying on tasmax, tasmin instead of these aggregated stats
    
    # Calculate statistics
    mean_tas = df['tas'].mean(axis=1).rename('mean_tas')
    median_tas = df['tas'].median(axis=1).rename('median_tas')
    sum_tas = df['tas'].sum(axis=1).rename('sum_tas')
    min_tas = df['tas'].min(axis=1).rename('min_tas')
    max_tas = df['tas'].max(axis=1).rename('max_tas')
    
    mean_pr = df['pr'].mean(axis=1).rename('mean_pr')
    median_pr = df['pr'].median(axis=1).rename('median_pr')
    sum_pr = df['pr'].sum(axis=1).rename('sum_pr')
    min_pr = df['pr'].min(axis=1).rename('min_pr')
    max_pr = df['pr'].max(axis=1).rename('max_pr')
    
    mean_rsds = df['rsds'].mean(axis=1).rename('mean_rsds')
    median_rsds = df['rsds'].median(axis=1).rename('median_rsds')
    sum_rsds = df['rsds'].sum(axis=1).rename('sum_rsds')
    min_rsds = df['rsds'].min(axis=1).rename('min_rsds')
    max_rsds = df['rsds'].max(axis=1).rename('max_rsds')

    # Create a DataFrame to store the results
    summary_df = pd.concat([mean_tas, min_tas, max_tas, median_tas, sum_tas,
                            mean_pr, min_pr, max_pr, median_pr, sum_pr,
                            mean_rsds, median_rsds, sum_rsds, min_rsds, max_rsds], axis=1)

    return summary_df

In [None]:
for crop in CROPS:
    crop_features_train[crop]['summary'] = reduce_memory_usage(calculate_statistics_on_crop(crop_features_train[crop]))

In [None]:
def calculate_chunk_statistics(crop_data: dict, x: int) -> pd.DataFrame:
    # Categories and their prefixes
    categories = ['pr', 'tas', 'rsds', 'tasmax', 'tasmin']
    statistics = ['mean', 'median', 'sum', 'max', 'min']
    
    # Initialize an empty DataFrame to store the results
    chunk_summary_df = pd.DataFrame()

    for category in categories:
        
        # Split the columns into chunks of size x
        for i in range(0, crop_data[category].shape[1], x):  # Iterate over column indices
            chunk_columns = crop_data[category].iloc[:, i:i + x]  # Slice columns
            
            if not chunk_columns.empty:
                chunk_stats = {
                    f'{category}_{stat}_chunk_{i//x}': getattr(chunk_columns, stat)(axis=1)
                    for stat in statistics
                }
                
                chunk_summary_df = pd.concat([chunk_summary_df, pd.DataFrame(chunk_stats)], axis=1)
                
                del chunk_columns, chunk_stats
                gc.collect()

    return chunk_summary_df

In [None]:
for crop in CROPS:
    crop_features_train[crop]['chunk_stats'] = reduce_memory_usage(calculate_chunk_statistics(crop_features_train[crop], 30))

In [14]:
#Merging yields
for crop in CROPS:
    crop_features_train[crop]['soil_co2'] = crop_features_train[crop]['soil_co2'].merge(
        crop_targets_train[crop],
        left_index=True,
        right_index=True,
        how="left"
    )

In [15]:
#Adding mean_yield to the Training Features, since it's a strong predictor.

for crop in CROPS:
    crop_features_train[crop]['soil_co2']['mean_yield'] = crop_features_train[crop]['soil_co2'].groupby(['lat', 'lon'])['yield'].transform('mean')

# MODEL 

In [None]:
# Combine features for both crops
X_maize = pd.concat([
    crop_features_train['maize']['soil_co2'].drop(columns=['yield']),
    crop_features_train['maize']['summary'],
    crop_features_train['maize']['chunk_stats']
], axis=1)

X_wheat = pd.concat([
    crop_features_train['wheat']['soil_co2'].drop(columns=['yield']),
    crop_features_train['wheat']['summary'],
    crop_features_train['wheat']['chunk_stats']
], axis=1)

# Combine targets for both crops
y_maize = crop_features_train['maize']['soil_co2']['yield']
y_wheat = crop_features_train['wheat']['soil_co2']['yield']

# Optionally add a crop type column
X_maize['crop_type'] = 'maize'
X_wheat['crop_type'] = 'wheat'

# Combine into a single dataset
X_combined = pd.concat([X_maize, X_wheat], axis=0)
y_combined = pd.concat([y_maize, y_wheat], axis=0)

# Encode crop type as a category
X_combined['crop_type'] = X_combined['crop_type'].astype('category')

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': 100000,
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
}

# Create LightGBM datasets
trainset = lgb.Dataset(X_train, label=y_train)
val_dataset = lgb.Dataset(X_val, label=y_val, reference=trainset)

# Train the model with early stopping
model = lgb.train(
    params,
    trainset,
    valid_sets=[trainset, val_dataset],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50)]
)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate Metrics
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"RMSE: {rmse}")

r2 = r2_score(y_val, y_pred)
print(f"R2: {r2}")

model.save_model('/kaggle/working/lightgbm_model.txt')

In [None]:
# Feature Importances
feature_importance_df = pd.DataFrame()
feature_importance_df['Feature'] = X_train.columns
feature_importance_df['Importance'] = model.feature_importance(importance_type='gain')
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Plotting the top 50 features by importance
top_n = 50
top_features = feature_importance_df.iloc[:top_n]

plt.figure(figsize=(12, 10))
plt.barh(top_features['Feature'], top_features['Importance'], align='center', color='skyblue')
plt.xlabel('Feature Importance (Gain)')
plt.ylabel('Features')
plt.title(f'Top {top_n} Features Importance')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()

In [None]:
del X_maize, X_wheat, y_maize, y_wheat, X_val, X_combined, y_combined, y_train, y_val, trainset, val_dataset, crop_data_train, crop_features_train
gc.collect()

# END OF SECTION 1

# SECTION 2 - Test Set Preprocessing and split into Parquets

Remember that you will need also the first three cells

In [None]:
import os

crop_data_test = {
    crop: load_data(crop, 'test', select_only_features=False) for crop in CROPS
}

crop_features_test = {
    crop: {
        k: v for k, v in data.items() if k in FEATURES
    } for crop, data in crop_data_test.items()
}

# Directory to store intermediate Parquet files
output_dir = "/kaggle/working/"

columns_to_drop = ['year', 'lon', 'lat']

In [None]:
for crop in CROPS:
    for input_feature in crop_features_test[crop].keys():
        if input_feature == 'soil_co2':
            continue
        crop_features_test[crop][input_feature] = crop_features_test[crop][input_feature].drop(columns=columns_to_drop, errors='ignore')

    # Calculate summary statistics and chunk stats
    crop_summary = reduce_memory_usage(calculate_statistics_on_crop(crop_features_test[crop]))

    crop_chunk_stats = reduce_memory_usage(calculate_chunk_statistics(crop_features_test[crop], 30))

    # Combine the features for the crop
    X_test_crop = pd.concat([crop_summary, crop_chunk_stats], axis=1)

    # Save the processed crop data to a Parquet file
    parquet_file = os.path.join(output_dir, f"{crop}_processed.parquet")
    X_test_crop.to_parquet(parquet_file)

    print(f"Saved {crop} data to {parquet_file}")

    del crop_summary, crop_chunk_stats, X_test_crop
    gc.collect()

# SECTION 3 - Generate Predictions

In [14]:
PATH_INPUT = os.path.abspath(os.path.join(os.sep, 'kaggle', 'input', 'the-future-crop-challenge'))
PATH_TRAIN_DATA = os.path.abspath(os.path.join(os.sep, 'kaggle', 'input', 'fcc-train-data-stats'))
PATH_WORKING = os.path.abspath(os.path.join(os.sep, 'kaggle', 'working'))

# Prepare test data for predictions
X_test = pd.DataFrame()

for crop in CROPS:

    crop_soil_co2_test = reduce_memory_usage(pd.read_parquet(os.path.join(PATH_INPUT, f'soil_co2_{crop}_test.parquet')))
    crop_stats = reduce_memory_usage(pd.read_parquet(os.path.join(PATH_TRAIN_DATA, f'{crop}_processed.parquet')))

    # Concatenate features for the crop
    X_test_crop = pd.concat([
        crop_soil_co2_test,
        crop_stats
    ], axis=1)

    del crop_soil_co2_test, crop_stats
    gc.collect()
    
    X_test_crop['crop_type'] = crop
    X_test_crop['crop_type'] = X_test_crop['crop_type'].astype('category')

    X_test = pd.concat([X_test, X_test_crop])

    del X_test_crop
    gc.collect()                    

In [27]:
'''
Here crop_features_train[crop]['soil_co2']['mean_yield'] we have the yield of the crops.
I have to link them with latitude and longitude to the grid cells in X_test, we can merge them on 'lat' and 'lon'
'''
crop_features_train = {
    crop: reduce_memory_usage(pd.read_parquet(os.path.join(PATH_INPUT, f'soil_co2_{crop}_train.parquet')))
    for crop in CROPS
}

# Load training targets
crop_targets_train = {
    crop: pd.read_parquet(os.path.join(PATH_INPUT, f'train_solutions_{crop}.parquet'))
    for crop in CROPS
}

In [28]:
for crop in CROPS:
    crop_features_train[crop] = crop_features_train[crop].merge(
        crop_targets_train[crop],
        left_index=True,
        right_index=True,
        how="left"
    )

for crop in CROPS:
    crop_features_train[crop]['mean_yield'] = crop_features_train[crop].groupby(['lat', 'lon'])['yield'].transform('mean')

In [29]:
# Combine all crop features into one DataFrame
all_crop_features = pd.concat([
    crop_features_train[crop][['lat', 'lon', 'mean_yield']] for crop in CROPS
], ignore_index=True)

# Drop duplicates to avoid redundant data
all_crop_features = all_crop_features.drop_duplicates(subset=['lat', 'lon'])

# Merge with X_test
X_test = X_test.merge(all_crop_features, on=['lat', 'lon'], how='left')

In [30]:
PATH_MODEL = os.path.abspath(os.path.join(os.sep, 'kaggle', 'input', 'fcc-lgbt/scikitlearn/v1.0/1', 'model'))
model = lgb.Booster(model_file=os.path.join(PATH_MODEL, 'lightgbm_model.txt'))

feature_names = model.feature_name()

X_test = X_test[feature_names]

categorical_indices = model.params.get('categorical_feature', [])

categorical_features = [X_test.columns[i] for i in categorical_indices]
print("Categorical features used during training:", categorical_features)

for feature in categorical_features:
    if feature in X_test.columns:
        X_test[feature] = X_test[feature].astype('category')

for col in X_test.columns:
    if X_test[col].dtype == 'object' and col not in categorical_features:
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

Categorical features used during training: ['crop_type']


In [27]:
# Make predictions using the trained model
predictions = model.predict(X_test)

In [None]:
#Number of Rows needed
print(len(predictions) == 1245149)

In [None]:
sample_submission = pd.read_csv(os.path.join(PATH_INPUT, f'sample_submission.csv'))

In [45]:
# Format predictions into submission format
submission = pd.DataFrame({
    'ID': sample_submission['ID'],
    'yield': predictions
})

In [46]:
# Save combined submission to CSV
submission.to_csv('/kaggle/working/lgbt_submission.csv', index=False)