In [19]:
# If these packages are not installed yet, run the following in a terminal first:
# pip install pandas numpy matplotlib seaborn scikit-learn



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
# Read the crop-yield CSV; the file's first column is used as the DataFrame index
df = pd.read_csv("cropdata/yield_df.csv", index_col=0)

# Preview the first 5 rows to confirm the columns loaded as expected
print(df.head(5))

      Area         Item  Year  hg/ha_yield  average_rain_fall_mm_per_year  \
0  Albania        Maize  1990        36613                         1485.0   
1  Albania     Potatoes  1990        66667                         1485.0   
2  Albania  Rice, paddy  1990        23333                         1485.0   
3  Albania      Sorghum  1990        12500                         1485.0   
4  Albania     Soybeans  1990         7000                         1485.0   

   pesticides_tonnes  avg_temp  
0              121.0     16.37  
1              121.0     16.37  
2              121.0     16.37  
3              121.0     16.37  
4              121.0     16.37  


In [21]:
# Features: every column of df except the target column
X = df.drop(columns='hg/ha_yield')

# Target: the 'hg/ha_yield' column
y = df['hg/ha_yield']

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% train); the fixed random_state
# makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

# Input columns, grouped by how they are preprocessed
categorical_features = ["Item", "Area"]
numeric_features = ["average_rain_fall_mm_per_year", "pesticides_tonnes", "avg_temp"]

# One-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown="ignore" lets categories unseen during fit pass through
# without raising; sparse_output=False yields a dense array for the next step.
# Columns of X not listed above (e.g. "Year") are dropped: remainder="drop" is
# the ColumnTransformer default, spelled out here so the omission is visible.
# NOTE(review): confirm that leaving "Year" out of the model is intended.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
        ("num", StandardScaler(), numeric_features),
    ],
    remainder="drop",
)

# End-to-end model: preprocess -> degree-2 polynomial expansion -> linear fit.
# The expansion runs over every preprocessor output column, one-hot indicators
# included, so pairwise products between them are generated as well.
# include_bias=False avoids adding an extra column of ones.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("model", LinearRegression()),
    ]
)

# Train on the training split
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits to compare train vs. test fit
test_score = pipeline.score(X_test, y_test)
train_score = pipeline.score(X_train, y_train)

print("Train Score:", train_score)
print("Test Score:", test_score)


Train Score: 0.9563675839897333
Test Score: 0.9506182023657085


In [31]:
# Predict on the held-out rows and floor the results at 0
# (a crop yield cannot be negative)
y_pred = pipeline.predict(X_test).clip(min=0)

# Show the first 10 clipped predictions as a quick sanity check
print("Predictions (clipped):", y_pred[:10])


Predictions (clipped): [ 71782.56812499  22422.45415615  43769.28581062 177886.65606274
  60244.03542277  33030.97940834  26944.25564726 101922.64480531
 221745.55436275  47593.33629471]


In [32]:
import joblib

# Persist the whole pipeline object (preprocessing steps + model) to disk;
# the call is left as the cell's last expression so its return value is displayed.
joblib.dump(value=pipeline, filename="crop_yield_pipeline.pkl")


['crop_yield_pipeline.pkl']