# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline experiment: fit a Random Forest regressor on the raw crop-price
# data, doing only the minimum work needed to make the data usable, and then
# report RMSE / MAE / R² on a held-out test split.

# Settings shared by the steps below.
DATA_PATH = "crop_price_data.csv"
TARGET_COLUMN = 'Crop Price'
TEST_FRACTION = 0.2
SEED = 42  # reused for the split and the model so runs are repeatable

# Step 1: Load the raw dataset and preview its first rows
dataset = pd.read_csv(DATA_PATH)
print("Raw Dataset:", dataset.head(), sep="\n")

# Step 2: Minimal clean-up of the raw data
# Only rows with a missing target are removed; feature columns are left as-is.
dataset = dataset.dropna(subset=[TARGET_COLUMN])

# Step 3: Separate the target (y) from the features (X)
y = dataset[TARGET_COLUMN]
X = dataset.drop(TARGET_COLUMN, axis="columns")

# Step 4: One-hot encode the categorical features
# drop_first=True omits the first level of every encoded column.
X_encoded = pd.get_dummies(X, drop_first=True)

# Step 5: Hold out part of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=TEST_FRACTION,
    random_state=SEED,
)

# Step 6: Build and train the Random Forest Regressor
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(random_state=SEED).fit(X_train, y_train)

# Step 7: Score the model on the held-out rows
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)

# Report the metrics, one per line
print(
    "\nModel Performance on Raw Dataset:",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)

# Step 8: Print generic talking points about raw data and model performance
# These lines are fixed text; they are not derived from the metrics above.
print(
    "\nPossible Observations:",
    "1. RMSE and MAE values may be high if the raw data contains inconsistencies such as outliers or missing values.",
    "2. The R² score indicates the proportion of variance explained by the model. A low R² suggests the model struggles with unprocessed data.",
    "3. Missing data, unscaled features, and unaddressed outliers can negatively impact model performance.",
    sep="\n",
)
