In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [96]:
data = pd.read_csv("data.csv")

In [97]:
data.head()

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [98]:
data_sample = data.sample(n=2000, random_state=42)

In [99]:
data_sample.head()

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
25564,Spain,"Rice, paddy",2008,69220,636.0,40719.0,17.21
18113,Madagascar,Wheat,1996,20000,1513.0,152.01,19.71
25607,Spain,Sorghum,2010,51206,636.0,39043.0,16.51
6815,Colombia,Potatoes,2007,166986,3240.0,82439.06,27.45
18144,Madagascar,Sweet potatoes,2000,56319,1513.0,130.46,19.65


In [100]:
data_encoded = pd.get_dummies(data_sample, columns=["Area", "Item"], drop_first=True)

In [101]:
X = data_encoded.drop("hg/ha_yield", axis=1)
y = data_sample['hg/ha_yield']

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state= 42)

In [None]:
models = {
    "Linear Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LinearRegression())
    ]),
    
    "Ridge Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge())
    ]),
    
    "Lasso Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", Lasso(max_iter=50000))
    ]),
    
    "KNN Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsRegressor())
    ]),
    
    "Decision Tree": Pipeline([
        ("model", DecisionTreeRegressor(random_state=42))
    ]),
    
    "Random Forest": Pipeline([
        ("model", RandomForestRegressor(random_state=42))
    ])
}

In [None]:
param_grid = {
    "Ridge Regression": {"model__alpha": [0.1, 1, 10, 50, 100]},
    "Lasso Regression": {"model__alpha": [0.001, 0.01, 0.1, 1, 10]},
    "KNN Regression": {
        "model__n_neighbors": [3, 5, 7, 10, 15],
        "model__weights": ["uniform", "distance"],
        "model__p": [1, 2] 
    },
    "Decision Tree": {
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_split": [2, 5, 10]
    },
    "Random Forest": {
        "model__n_estimators": [50, 100, 200],
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_split": [2, 5, 10]
    }
}

In [115]:
results = {}

for name, pipeline in models.items():
    if name in param_grid:
        grid = GridSearchCV(pipeline, param_grid[name], cv=5, scoring="r2", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        pipeline.fit(X_train, y_train)
        best_model = pipeline
        best_params = "N/A"
    
    y_pred = best_model.predict(X_test)
    
    results[name] = {
        "Best Params": best_params,
        "R²": r2_score(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred))
    }
    
results_df = pd.DataFrame(results).T
print("\nModel Comparison with Hyperparameter Tuning:")
print(results_df)



Model Comparison with Hyperparameter Tuning:
                                                         Best Params  ...          RMSE
Linear Regression                                                N/A  ...  44104.855484
Ridge Regression                                {'model__alpha': 10}  ...  44136.511521
Lasso Regression                                {'model__alpha': 10}  ...  44116.154248
KNN Regression     {'model__n_neighbors': 15, 'model__p': 1, 'mod...  ...  35122.325862
Decision Tree      {'model__max_depth': 20, 'model__min_samples_s...  ...  33390.082342
Random Forest      {'model__max_depth': None, 'model__min_samples...  ...  27476.943759

[6 rows x 3 columns]


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=200,      
    learning_rate=0.1,     
    max_depth=3,           
    random_state=42
)

gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Gradient Boosting Regressor R²:", r2)
print("RMSE:", rmse)

Gradient Boosting Regressor R²: 0.8203961878305807
RMSE: 37995.64561392585


In [None]:
import xgboost as xgb

xgbr = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    objective='reg:squarederror',
    random_state=42
)

xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("XGBoost Regressor R²:", r2)
print("RMSE:", rmse)


XGBoost Regressor R²: 0.8163620829582214
RMSE: 38419.99312857825


In [116]:
rf_pipeline = Pipeline([
    ("model", RandomForestRegressor(random_state=42))
])

param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5]
}

grid = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring="r2", n_jobs=-1)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_

with open('crop_yield_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

model_data = {
    'model': best_model,
    'feature_columns': list(X.columns),
    'categorical_columns': ['Area', 'Item']
}

with open('crop_yield_model_full.pkl', 'wb') as f:
    pickle.dump(model_data, f)