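# FCC (The Future Crop Challenge) - Univariate Linear Regression Model

For every (lat, lon) grid cell, a separate linear regression of `yield` on `year` is fitted on the training data and then used to predict the yield of each test row.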

**Scores**
* RMSE [2021-2050]: 1.414
* RMSE [2051-2098]: 3.072

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import matplotlib.pylab as plt
import gc
import seaborn as sns

plt.style.use('ggplot')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree. The print below is commented out, so
# this loop currently produces no output; the inner `break` leaves each
# directory after looking at its first file.
for dirname,_, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))  # re-enable to list the input files
        break

import tqdm
import datetime

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Passed as `take_subset` when the data is loaded: True -> work on a small subset.
DEBUG_MODE = False

# Input location of the competition files.
PATH_INPUT = os.path.abspath(
    os.path.join(os.sep, 'kaggle', 'input', 'the-future-crop-challenge')
)
# PATH_INPUT = os.path.abspath(os.path.join(os.getcwd(), 'data'))  # For running the notebook locally

# Crops and dataset splits that can be requested from the loader.
CROPS = ('maize', 'wheat')
MODES = ('train', 'test')

# Time series data -- 240 columns reflecting daily values for 30 days before
# sowing and 210 days after.
FEATURES_TEMPORAL = {
    'tas',     # Mean daily temperature
    'tasmax',  # Max daily temperature
    'tasmin',  # Min daily temperature
    'pr',      # precipitation
    'rsds',    # shortwave radiation
}

# Static data.
# soil_co2 columns: crop, year, lon, lat, texture_class, real_year, co2, nitrogen
# i.e. dominant USDA soil texture class (constant over time), the ambient CO2
# concentration (spatially constant), the planting date and the nitrogen
# application rate (constant over time).
FEATURES_STATIC = {'soil_co2'}

# Only the static table is used by this notebook.
#FEATURES = set.union(FEATURES_TEMPORAL, FEATURES_STATIC)
FEATURES = FEATURES_STATIC

# Columns removed from every loaded table when present.
COLUMNS_TO_DROP = ['crop', 'variable']

INDEX_SOW = 30            # Sowing date (days)
SEASON_LENGTH = 240       # Time series data length (days)
NUM_TEXTURE_CLASSES = 13  # Number of soil texture classes

# Year ranges of the two splits.
YEAR_TRAIN_MIN, YEAR_TRAIN_MAX = 1982, 2020  # max is inclusive
YEAR_TEST_MIN, YEAR_TEST_MAX = 2021, 2098
# PATH_INPUT = os.path.abspath(os.path.join(os.getcwd(), 'data'))  # For running the notebook locally

In [3]:
# Reduce memory usage of a pandas DataFrame
def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    The frame is modified in place and the same object is returned.

    Rules per column:

    * Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned 8, 16 or 32 bit integer type whose range (bounds included)
      contains the column's minimum and maximum. Columns that need 64 bits
      are left unchanged.
    * A float column named ``"year"`` is always stored as float32, because it
      is used as a grouping column and must keep sufficient precision.
    * Other float columns are cast to float16 when their minimum and maximum
      lie strictly inside the float16 range, otherwise to float32 under the
      same test. The test looks at the value range only, so values may be
      rounded by the cast. Columns fitting neither (e.g. all-NaN or infinite
      values) are left unchanged.
    * Every other dtype (object, bool, datetime, category, pandas extension
      dtypes such as nullable integers) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer / float columns are downcast. Checking the
        # dtype kind (instead of a name prefix) keeps bool, datetime,
        # categorical and extension dtypes out of the numeric branches.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        if col_type.kind == "f" and col == "year":
            # Ensure precision for grouping columns
            df[col] = df[col].astype(np.float32)
            continue

        # NaN results (e.g. empty column) fail every comparison below, which
        # leaves the column unchanged.
        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            candidates = signed_candidates if col_type.kind == "i" else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    return df

In [5]:
def load_data(crop: str, # Which crop
              mode: str, # Which dataset (i.e. train/test)
              select_only_features: bool = True, # Drop every other column (crop, year, lon, lat) if not relevant for computation
              take_subset: bool = False,  # If set to true, take a small subset of the data (for debugging purposes)
             ) -> dict:
    """Read the parquet tables of one crop / split into a dict of DataFrames.

    One entry is created per name in FEATURES (read from
    ``<feature>_<crop>_<mode>.parquet`` below PATH_INPUT and passed through
    ``reduce_memory_usage``). Columns listed in COLUMNS_TO_DROP are removed
    when present. For temporal features, ``select_only_features`` keeps only
    the SEASON_LENGTH day columns. In 'train' mode the solutions file is added
    under the key ``'target'``. With ``take_subset`` every table is restricted
    to the first 100 index labels of the first feature table.
    """
    assert crop in CROPS
    assert mode in MODES

    frames = {}

    for feature in FEATURES:
        parquet_path = os.path.join(PATH_INPUT, f'{feature}_{crop}_{mode}.parquet')
        frame = reduce_memory_usage(pd.read_parquet(parquet_path))

        droppable = [name for name in COLUMNS_TO_DROP if name in frame.columns]
        if droppable:
            frame = frame.drop(columns=droppable)

        # Temporal tables: keep the daily time series columns only
        if select_only_features and feature in FEATURES_TEMPORAL:
            frame = frame[[str(day) for day in range(SEASON_LENGTH)]]

        frames[feature] = frame

        # Drop the local reference and trigger a collection after each file
        del frame
        gc.collect()

    if mode == 'train':
        target_path = os.path.join(PATH_INPUT, f'{mode}_solutions_{crop}.parquet')
        frames['target'] = pd.read_parquet(target_path)

    if not take_subset:
        return frames

    # Debug subset -- which samples are kept does not matter, so take the
    # first ones of some feature table and apply that selection to all tables.
    num_select = 100
    first_feature = tuple(FEATURES)[0]
    ixs_selected = frames[first_feature].index[:num_select]
    return {key: table.loc[ixs_selected] for key, table in frames.items()}

In [6]:
# Load all available data for all crops
# Read every table of the train split, then of the test split, for all crops.
crop_data_train = dict(
    (crop, load_data(crop, 'train', select_only_features=False, take_subset=DEBUG_MODE))
    for crop in CROPS
)

crop_data_test = dict(
    (crop, load_data(crop, 'test', select_only_features=False, take_subset=DEBUG_MODE))
    for crop in CROPS
)

# Feature tables only (entries whose key is a feature name) ...
crop_features_train = {
    crop: {name: table for name, table in tables.items() if name in FEATURES}
    for crop, tables in crop_data_train.items()
}
crop_features_test = {
    crop: {name: table for name, table in tables.items() if name in FEATURES}
    for crop, tables in crop_data_test.items()
}

# ... and the targets, which exist for the train split only.
crop_targets_train = {
    crop: tables['target'] for crop, tables in crop_data_train.items()
}

In [50]:
# Wrap the training targets of both crops in DataFrames (maize first, then wheat).
yield_maize_df, yield_wheat_df = (
    pd.DataFrame(crop_targets_train[crop_name]) for crop_name in ('maize', 'wheat')
)
print(yield_maize_df)

        yield
ID           
0       5.595
1       5.895
2       3.023
3       2.071
4       2.239
...       ...
349714  6.240
349715  8.926
349716  2.180
349717  7.311
349718  2.118

[349719 rows x 1 columns]


In [86]:
# Show the active feature names, then the test feature tables, one per line.
print(FEATURES, crop_features_test, sep='\n')

{'soil_co2'}
{'maize': {'soil_co2':           year     lon    lat  texture_class  real_year     co2    nitrogen
ID                                                                         
349719   420.0 -122.25  48.25            9.0       2021   418.0  186.125000
349720   420.0 -122.25  48.75            9.0       2021   418.0  186.125000
349721   420.0 -122.25  49.25            9.0       2021   418.0  184.875000
349722   420.0 -119.75  47.75            9.0       2021   418.0  186.125000
349723   420.0 -116.75  43.25            9.0       2021   418.0  186.125000
...        ...     ...    ...            ...        ...     ...         ...
1040985  497.0  132.75  46.75            9.0       2098  1108.0  221.750000
1040986  497.0  132.75  47.25            9.0       2098  1108.0  221.750000
1040987  497.0  133.25  45.25            9.0       2098  1108.0    1.845703
1040988  497.0  133.25  47.25            9.0       2098  1108.0  221.750000
1040989  497.0  137.75  36.75            9.0       2

In [143]:
# Total number of test rows over both crops.
total_rows_to_predict = sum(
    len(crop_features_test[crop_name]['soil_co2']) for crop_name in ('wheat', 'maize')
)
print(f"Number of total rows to predict: {total_rows_to_predict}")

# Rows per value of 'year' in the test set: maize first, then wheat.
maize_test_table = pd.DataFrame(crop_features_test['maize']['soil_co2'])
yearly_counts_df = maize_test_table.groupby('year').size().reset_index(name='count')
print(yearly_counts_df)

wheat_test_table = pd.DataFrame(crop_features_test['wheat']['soil_co2'])
yearly_counts_df = wheat_test_table.groupby('year').size().reset_index(name='count')
print(yearly_counts_df)

Number of total rows to predict: 1245149
     year  count
0   420.0   9068
1   421.0   9010
2   422.0   9047
3   423.0   8994
4   424.0   9054
..    ...    ...
73  493.0   8620
74  494.0   8847
75  495.0   8837
76  496.0   8600
77  497.0   8773

[78 rows x 2 columns]
     year  count
0   420.0   6871
1   421.0   7579
2   422.0   6798
3   423.0   7346
4   424.0   7026
..    ...    ...
73  493.0   7143
74  494.0   7040
75  495.0   6929
76  496.0   7208
77  497.0   7006

[78 rows x 2 columns]


In [146]:
# Attach the training yields to the static soil/CO2 table of each crop (joined on 'ID').
train_data_maize = pd.merge(yield_maize_df, crop_data_train['maize']['soil_co2'], on='ID')
train_data_wheat = pd.merge(yield_wheat_df, crop_data_train['wheat']['soil_co2'], on='ID')

test_data_maize = pd.DataFrame(crop_features_test['maize']['soil_co2'])
test_data_wheat = pd.DataFrame(crop_features_test['wheat']['soil_co2'])

# Distinct (lat, lon) grid cells seen in training, as frames and as tuple lists.
unique_lat_lon_maize = train_data_maize[['lat', 'lon']].drop_duplicates()
unique_lat_lon_wheat = train_data_wheat[['lat', 'lon']].drop_duplicates()

unique_lat_lon_tuples_maize = list(unique_lat_lon_maize.itertuples(index=False, name=None))
unique_lat_lon_tuples_wheat = list(unique_lat_lon_wheat.itertuples(index=False, name=None))

#Safety Checks Section
# For each crop: does the test set cover exactly the training grid cells, and
# is every test grid cell present in that crop's training data?
# NOTE: from here on test_data_maize / test_data_wheat hold only the distinct
# (lat, lon) pairs of the test set.
test_data_maize = test_data_maize[['lat', 'lon']].drop_duplicates() 
test_maize = list(test_data_maize.itertuples(index=False, name=None))
train_cells_maize = set(unique_lat_lon_tuples_maize)
test_cells_maize = set(test_maize)
same_values = train_cells_maize == test_cells_maize

print("Maize: Contain the same values:", same_values)
print(f" {len(train_cells_maize)}, {len(test_cells_maize)}")

# Compare against the MAIZE training cells (the wheat cells were used here by
# mistake, which contradicted the message printed below).
is_subset = test_cells_maize.issubset(train_cells_maize)
print("Test maize is a subset of unique lat/lon tuples for maize:", is_subset)

test_data_wheat = test_data_wheat[['lat', 'lon']].drop_duplicates() 
test_wheat = list(test_data_wheat.itertuples(index=False, name=None))
train_cells_wheat = set(unique_lat_lon_tuples_wheat)
test_cells_wheat = set(test_wheat)
same_values = train_cells_wheat == test_cells_wheat

print("Wheat: Contain the same values:", same_values)
print(f" {len(train_cells_wheat)}, {len(test_cells_wheat)}")

is_subset = test_cells_wheat.issubset(train_cells_wheat)
print("Test wheat is a subset of unique lat/lon tuples for wheat:", is_subset)

Maize: Contain the same values: True
 9303, 9303
Test maize is a subset of unique lat/lon tuples for maize: True
Wheat: Contain the same values: False
 8663, 8102
Test wheat is a subset of unique lat/lon tuples for wheat: True


In [158]:
# Grid cell counts per crop, training first and test second.
print(
    f"Number of gridcells in training for maize: {len(unique_lat_lon_tuples_maize)}",
    f"Number of gridcells in training for wheat: {len(unique_lat_lon_tuples_wheat)}",
    sep="\n",
)

print(
    f"Number of gridcells in test for maize: {len(test_maize)}",
    f"Number of gridcells in test for wheat: {len(test_wheat)}",
    sep="\n",
)

def extract_ids(input_set: pd.DataFrame) -> dict:
    """Group the index labels of ``input_set`` by (lat, lon) grid cell.

    Parameters
    ----------
    input_set : pandas.DataFrame
        Frame with at least the columns ``'lat'`` and ``'lon'``; its index
        labels are the IDs that get grouped.

    Returns
    -------
    dict
        ``{(lat, lon): [id, ...]}`` with the IDs of each cell in row order and
        the cells in sorted order. An empty frame yields an empty dict.
    """
    ids_by_cell = {}
    # Plain Python iteration over the two columns: avoids building a tuple per
    # row through DataFrame.apply and does not depend on groupby keyword
    # arguments that only exist in recent pandas versions.
    cells = zip(input_set['lat'].tolist(), input_set['lon'].tolist())
    for cell, row_id in zip(cells, input_set.index.tolist()):
        ids_by_cell.setdefault(cell, []).append(row_id)

    # Return the cells in sorted order, as a sorted groupby would.
    try:
        ordered_cells = sorted(ids_by_cell)
    except TypeError:  # coordinates that cannot be compared: keep first-seen order
        ordered_cells = list(ids_by_cell)
    return {cell: ids_by_cell[cell] for cell in ordered_cells}

Number of gridcells in training for maize: 9303
Number of gridcells in training for wheat: 8663
Number of gridcells in test for maize: 9303
Number of gridcells in test for wheat: 8102


In [176]:
# Group the training IDs by grid cell.
# For wheat, only cells that also occur in the test set are kept: build the
# (lat, lon) tuple of every training row and keep the rows whose tuple is
# listed in test_wheat.
wheat_row_cells = train_data_wheat[['lat', 'lon']].apply(tuple, axis=1)
train_data_filtered_wheat = train_data_wheat[wheat_row_cells.isin(test_wheat)]

grouped_ids_maize = extract_ids(input_set=train_data_maize)

grouped_ids_wheat = extract_ids(input_set=train_data_filtered_wheat)

In [177]:
# Number of grid cells with training IDs: maize, then wheat.
print(len(grouped_ids_maize), len(grouped_ids_wheat), sep='\n')

9303
8102


In [196]:
from sklearn.linear_model import LinearRegression
import numpy as np

def computeRegression(input_ids: dict, train_data: pd.DataFrame) -> dict:
    """Fit one linear regression of ``yield`` on ``year`` per group of IDs.

    Parameters
    ----------
    input_ids : dict
        Maps a group key (the (lat, lon) tuple of a grid cell) to the list of
        index labels of ``train_data`` belonging to that group.
    train_data : pandas.DataFrame
        Frame indexed by ID with at least the columns ``'year'`` and
        ``'yield'``.

    Returns
    -------
    dict
        ``{group key: {'model': fitted LinearRegression,
        'coefficient': slope, 'intercept': intercept}}``.
    """
    # Regression results for each (lat, lon) group
    regression_results = {}

    for tuple_key, ids in input_ids.items():
        # Rows of the current group, selected by index label
        group_data = train_data.loc[ids]

        # 'year' is the single regressor; sklearn expects a 2-D feature matrix
        X = group_data['year'].values.reshape(-1, 1)
        Y = group_data['yield'].values

        model = LinearRegression()
        model.fit(X, Y)

        regression_results[tuple_key] = {
            'model': model,
            'coefficient': model.coef_[0],  # Slope
            'intercept': model.intercept_,  # Intercept
        }

    return regression_results

In [197]:
# Fit the regression lines per (lat, lon) cell. For wheat the grouped IDs were
# restricted to cells present in the test set, so cells that occur only in the
# training set are ignored.
regression_models_maize = computeRegression(
    input_ids=grouped_ids_maize,
    train_data=train_data_maize,
)
regression_models_wheat = computeRegression(
    input_ids=grouped_ids_wheat,
    train_data=train_data_wheat,
)

In [198]:
# Group the test IDs by grid cell, using only the columns needed downstream.
maize_test_columns = crop_data_test['maize']['soil_co2'][['year', 'lat', 'lon']]
wheat_test_columns = crop_data_test['wheat']['soil_co2'][['year', 'lat', 'lon']]

ids_to_predict_maize = extract_ids(input_set=maize_test_columns)
ids_to_predict_wheat = extract_ids(input_set=wheat_test_columns)

In [163]:
# Example of a subset of IDs: the test IDs of one (lat, lon) grid cell.
example_cell = (-38.75, -60.25)

print(len(ids_to_predict_wheat[example_cell]))
print(ids_to_predict_maize[example_cell])
print(ids_to_predict_wheat[example_cell])

69
[351710, 360771, 369753, 378788, 387835, 396867, 405944, 414940, 423861, 432563, 441538, 450480, 459509, 468478, 477399, 486493, 495557, 504546, 513500, 522464, 531276, 539996, 548998, 557976, 566838, 575842, 584700, 593464, 602471, 611330, 620357, 629197, 638208, 646976, 655786, 664694, 673649, 682398, 691114, 699710, 708650, 717580, 726537, 735499, 744380, 753191, 762172, 770935, 779967, 788684, 797561, 806413, 815369, 824055, 833053, 841956, 850745, 859387, 868282, 877142, 885764, 894590, 903154, 912071, 920778, 929205, 937982, 955703, 964403, 973063, 981714, 990364, 999091, 1007899, 1016718, 1025453, 1034119]
[1321289, 1328178, 1335778, 1342565, 1349835, 1356901, 1363991, 1370970, 1377782, 1391668, 1398854, 1406247, 1413277, 1420283, 1427536, 1434513, 1441854, 1448980, 1456237, 1463391, 1470745, 1478116, 1485035, 1491997, 1498826, 1506337, 1513439, 1520222, 1527849, 1534624, 1541682, 1549044, 1562888, 1569839, 1577087, 1591219, 1620126, 1627219, 1634318, 1641555, 1648769, 165593

In [199]:
# Do the fitted wheat models cover exactly the wheat cells to predict?
model_cells_wheat = regression_models_wheat.keys()
predict_cells_wheat = ids_to_predict_wheat.keys()

result = model_cells_wheat == predict_cells_wheat
print(result)
#print(regression_models_wheat.keys())

# Differences in both directions
missing_in_models = predict_cells_wheat - model_cells_wheat    # cells to predict that have no model
missing_in_predict = model_cells_wheat - predict_cells_wheat   # cells with a model that are not predicted

# Print the differences
#print(f"Keys missing in regression_models_wheat: {missing_in_models}")
#print(f"Keys missing in ids_to_predict_wheat: {missing_in_predict}")

True


In [200]:
def generatePredictions(regression_model: dict,
                        ids_to_predict: dict, 
                        X_test: pd.DataFrame) -> dict:
    """Predict one yield per test ID with the model of the ID's grid cell.

    Each ID is paired with the prediction for its own row's ``'year'`` value.
    The years are looked up by ID, so repeated or unsorted years within a cell
    cannot shift predictions onto the wrong ID or drop IDs.

    Parameters
    ----------
    regression_model : dict
        ``{(lat, lon): {'model': fitted model with a ``predict`` method, ...}}``.
    ids_to_predict : dict
        ``{(lat, lon): [id, ...]}``; the IDs must be index labels of ``X_test``.
    X_test : pandas.DataFrame
        Frame indexed by ID with at least a ``'year'`` column.

    Returns
    -------
    dict
        ``{(lat, lon): [{'id': id, 'predicted_yield': value}, ...]}`` in the
        order of the IDs given for the cell.

    Raises
    ------
    KeyError
        If a cell has no entry in ``regression_model`` or an ID is not in the
        index of ``X_test``.
    """
    diff_counter = 0

    # Dictionary to store future predictions for each (lat, lon) group
    future_predictions = {}

    for tuple_key, ids_for_group in ids_to_predict.items():
        # Get the model associated to the tuple_key
        model = regression_model[tuple_key]['model']

        ids_for_group = list(ids_for_group)
        if not ids_for_group:
            # Nothing to predict for this cell; do not call predict on 0 rows
            future_predictions[tuple_key] = []
            continue

        # Year of every requested row, selected by ID so that the i-th
        # prediction belongs to the i-th ID
        years = X_test.loc[ids_for_group, 'year'].to_numpy().reshape(-1, 1)
        predicted_yields = model.predict(years)

        # Sanity check: differs only if an ID matches several rows of X_test
        if len(ids_for_group) != len(predicted_yields):
            diff_counter += 1

        # Map the predicted yields to the corresponding IDs and store them
        future_predictions[tuple_key] = [
            {'id': id_, 'predicted_yield': predicted_yield}
            for id_, predicted_yield in zip(ids_for_group, predicted_yields)
        ]

    print(f"Warning: The number of retrieved IDs does not match the number of predicted yields {diff_counter} times.")

    return future_predictions

In [201]:
# Years to Predict: X [420.0, 497.0] (X_unique)
# Range of IDs to generate: 1245149 values (wheat + maize)

# Only these columns of the test tables are handed to the prediction step.
maize_prediction_input = crop_data_test['maize']['soil_co2'][['year', 'lat', 'lon']]
wheat_prediction_input = crop_data_test['wheat']['soil_co2'][['year', 'lat', 'lon']]

future_predictions_maize = generatePredictions(
    regression_models_maize, ids_to_predict_maize, maize_prediction_input
)
future_predictions_wheat = generatePredictions(
    regression_models_wheat, ids_to_predict_wheat, wheat_prediction_input
)



In [216]:
import pandas as pd

# Flatten the per-cell prediction lists of both crops (maize first, then
# wheat) into one list of {'id', 'predicted_yield'} rows.
rows_for_csv = [
    {'id': prediction['id'], 'predicted_yield': prediction['predicted_yield']}
    for crop_predictions in (future_predictions_maize, future_predictions_wheat)
    for cell_predictions in crop_predictions.values()
    for prediction in cell_predictions
]

# One row per ID, ordered by ID
df_for_csv = pd.DataFrame(rows_for_csv)
df_for_csv_sorted = df_for_csv.sort_values(by='id')

# Write to CSV in the Kaggle working directory
df_for_csv_sorted.to_csv('/kaggle/working/predicted_yields_linear_regression.csv', index=False)
print("CSV file 'predicted_yields_linear_regression.csv' has been saved to /kaggle/working.")

CSV file 'predicted_yields.csv' has been saved to /kaggle/working.


In [213]:
# Number of exported rows, followed by a preview of the first ones.
num_rows = df_for_csv_sorted.shape[0]
print(f"The file has {num_rows} rows.")

df_for_csv_sorted.head()

The file has 1245149 rows.


Unnamed: 0,id,predicted_yield
577120,349719,4.426638
585874,349720,4.843409
593001,349721,2.897787
568779,349722,1.034766
491417,349723,1.885789


In [215]:
# Display future predictions (it can crash, printing is sloow)
# for key, predictions in future_predictions_maize.items():
#     print(f"Predictions for {key}:")
#     for prediction in predictions:
#         print(f"ID: {prediction['id']}, Predicted Yield: {prediction['predicted_yield']}")

# Remove the CSV of an earlier notebook version if it is still present. This
# notebook writes 'predicted_yields_linear_regression.csv', so on a clean run
# the old file does not exist and an unconditional os.remove would raise
# FileNotFoundError.
STALE_CSV_PATH = "/kaggle/working/predicted_yields.csv"
if os.path.exists(STALE_CSV_PATH):
    os.remove(STALE_CSV_PATH)