In [2]:
import pickle

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Read the crop-yield CSV (path is relative to the working directory),
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="india_crop_yield_dataset_extended.csv")
df.head(n=5)

Unnamed: 0,State,Crop,Season,Area,Annual_Rainfall,Fertilizer,Yield
0,Karnataka,soybean,Kharif,13.76,397.6,335.0,38.07
1,Tamil Nadu,groundnut,Kharif,16.57,771.64,183.5,40.96
2,Punjab,groundnut,Kharif,3.97,1639.01,307.75,49.58
3,Tamil Nadu,tea,Zaid,17.58,2801.45,336.66,60.0
4,Punjab,turmeric,Rabi,5.71,529.96,272.77,44.56


In [4]:
# Regression target: the "Yield" column of the loaded frame.
y = df["Yield"]

In [5]:
# Feature frame: every column of df except the target.
X = df.drop(columns=["Yield"])

In [6]:
# Column groups used to route features to the numeric and categorical transformers.
num_cols = [
    "Area",
    "Annual_Rainfall",
    "Fertilizer",
]
cat_cols = [
    "State",
    "Crop",
    "Season",
]

In [21]:
# Numeric features are standardised; categorical features are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present when it was fitted, instead of raising.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group to its transformer. No `remainder` is passed, so
# ColumnTransformer's default handling applies to columns in neither group.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [24]:
# Bundle preprocessing and the linear regressor into one estimator so the
# same transformations are applied at fit time and at predict time.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])

# Fit on the training split only. Kept as the cell's last expression so the
# fitted pipeline is displayed as the cell output.
model.fit(X_train, y_train)

In [25]:
# Predict the target for the held-out test rows with the fitted pipeline.
y_pred = model.predict(X_test)

In [26]:
# Score the test-set predictions against the true values.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. back in the target's units

# Report in the order MAE, RMSE, R2.
for label, score in (("MAE:", mae), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(label, score)

MAE: 2.349186600940407
RMSE: 3.020345385033102
R2 Score: 0.9552872042347988


In [1]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
# This cell needs `model` from the training cell above, so run the notebook
# top to bottom; on a fresh kernel it fails with NameError.
# NOTE: a pickled scikit-learn estimator is tied to the library version that
# produced it. Load it with the same scikit-learn version, and only unpickle
# files from a trusted source (unpickling can execute arbitrary code).
model_path = "crop_yield_model.pkl"

with open(model_path, "wb") as file:
    pickle.dump(model, file)

# Build the message from model_path so it cannot drift from the real file name.
print(f"Model saved successfully as {model_path}")

NameError: name 'model' is not defined

In [1]:
pip install scikit-learn==1.3.2

Collecting scikit-learn==1.3.2Note: you may need to restart the kernel to use updated packages.

  Using cached scikit-learn-1.3.2.tar.gz (7.5 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): still running...
  Preparing metadata (pyproject.toml): still running...
  Preparing metadata (pyproject.toml): finished with status 'error'


  error: subprocess-exited-with-error
  
  Preparing metadata (pyproject.toml) did not run successfully.
  exit code: 1
  
  [649 lines of output]
  Partial import of sklearn during the build process.
  test_program.c
  Generating code
  Finished generating code
  test_program.c
  Generating code
  Finished generating code
  
  Error compiling Cython file:
  ------------------------------------------------------------
  ...
      # particularly tiny on Windows/MSVC.
      # It corresponds to the maximum representable value for
      # 32-bit signed integers (i.e. 2^31 - 1).
      RAND_R_MAX = 2147483647
  
  cpdef sample_without_replacement(cnp.int_t n_population,
                                  ^
  ------------------------------------------------------------
  
  sklearn\utils\_random.pxd:19:33: 'int_t' is not a type identifier
  
  Error compiling Cython file:
  ------------------------------------------------------------
  ...
      # It corresponds to the maximum representable va