In [None]:

import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Unzip Dataset
zip_path = "/archive.zip"
extract_path = "data"

# Open the archive read-only (the default mode) and unpack every member
# into extract_path; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

# Show what ended up in the extraction directory.
extracted_files = os.listdir(extract_path)
print("Files extracted:", extracted_files)

# 3. Load CSV File
# os.listdir() returns entries in arbitrary order and the extraction
# directory may hold non-CSV files (or leftovers from an earlier run), so
# select the CSV explicitly instead of trusting whatever entry comes first.
# Sorting makes the choice deterministic when several CSVs are present.
csv_candidates = sorted(
    entry
    for entry in os.listdir(extract_path)
    if entry.lower().endswith(".csv")
)
if not csv_candidates:
    raise FileNotFoundError(f"No CSV file found in {extract_path!r}")

csv_file = csv_candidates[0]
data = pd.read_csv(os.path.join(extract_path, csv_file))

print("Dataset preview:")
print(data.head())

# 4. Basic Cleaning
# Discard every row that has a missing value in any column.
data = data.dropna()
print("\nAfter cleaning:", data.shape)

# 5. Separate Features & Target
# 'Yield' is the regression target; all remaining columns stay in X.
y = data['Yield']
X = data.drop(columns='Yield')

# 6. Identify Column Types
# Only the columns named here are fed to the models.
# NOTE(review): any other column left in X (e.g. 'Production') is expected
# to be discarded by ColumnTransformer's default remainder handling —
# confirm against the scikit-learn version in use.
numerical_cols = ['Crop_Year', 'Area', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_cols = ['Crop', 'Season', 'State']

# 7. Preprocessing Pipeline
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 8. Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Models
# Candidate regressors keyed by the label printed in the report.
random_forest = RandomForestRegressor(n_estimators=120, max_depth=10, random_state=42)
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": random_forest,
}

# 10. Training & Evaluation
# Each model is wrapped with the shared preprocessor, fitted on the training
# split and scored on the held-out split.
for name, model in models.items():
    steps = [('preprocessing', preprocessor), ('model', model)]
    pipeline = Pipeline(steps=steps)

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # RMSE is taken as the square root of the mean squared error.
    rmse = mean_squared_error(y_test, predictions) ** 0.5
    report = (
        ("MAE :", mean_absolute_error(y_test, predictions)),
        ("RMSE:", rmse),
        ("R²  :", r2_score(y_test, predictions)),
    )

    print("\n", name)
    for label, value in report:
        print(label, value)
    print("-" * 40)

Files extracted: ['crop_yield.csv']
Dataset preview:
           Crop  Crop_Year       Season  State     Area  Production  \
0      Arecanut       1997  Whole Year   Assam  73814.0       56708   
1     Arhar/Tur       1997  Kharif       Assam   6637.0        4685   
2   Castor seed       1997  Kharif       Assam    796.0          22   
3      Coconut        1997  Whole Year   Assam  19656.0   126905000   
4  Cotton(lint)       1997  Kharif       Assam   1739.0         794   

   Annual_Rainfall  Fertilizer  Pesticide        Yield  
0           2051.4  7024878.38   22882.34     0.796087  
1           2051.4   631643.29    2057.47     0.710435  
2           2051.4    75755.32     246.76     0.238333  
3           2051.4  1870661.52    6093.36  5238.051739  
4           2051.4   165500.63     539.09     0.420909  

After cleaning: (19689, 10)

 Linear Regression
MAE : 63.884930308847856
RMSE: 389.6808093285026
R²  : 0.8104798078459079
----------------------------------------

 Random Fores