In [5]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the CSV data
# The long-format export holds one row per (Area, Item, Year, Element) with the
# measurement in 'Value'; the preview below shows the first five rows.
# NOTE(review): the location is an absolute, machine-specific Windows path, so
# this cell only runs where that exact file exists.
print("Loading the CSV data...")
csv_path = r"C:\Users\R.R. Dharun raagav\Desktop\Shanmuga priya projects Ml project\Preprocessed_crop_production_data.csv"
data = pd.read_csv(csv_path)
preview_rows = data.head()
print(preview_rows)

# Step 2: Pivot the data to convert 'Element' values into columns
# Each distinct 'Element' (e.g. 'Area harvested', 'Yield', 'Production') becomes
# its own column, keyed by (Area, Item, Year). aggfunc='first' keeps a single
# value when several rows share the same key.
print("Pivoting the data...")
pivot_data = data.pivot_table(
    index=['Area', 'Item', 'Year'],
    columns='Element',
    values='Value',
    aggfunc='first'
).reset_index()

# pivot_table already labels every value column with its 'Element' name, so the
# columns are NOT relabelled positionally from a hard-coded list: that would
# raise on a different number of elements and silently mislabel columns if the
# set of elements in the CSV ever changed. Only the leftover 'Element' axis
# label is cleared.
pivot_data.columns.name = None

# Fail early, with a clear message, if a column the later steps rely on is absent.
required_columns = ['Area harvested', 'Yield', 'Production']
missing_columns = [name for name in required_columns if name not in pivot_data.columns]
if missing_columns:
    raise KeyError(
        f"Pivoted data is missing expected 'Element' columns: {missing_columns}"
    )

print(pivot_data.head())

# Step 3: Handle missing values
print("Handling missing values...")

# Dropping rows where 'Production' is missing: they have no target to learn from.
# .copy() makes the result an independent frame before columns are reassigned.
pivot_data = pivot_data.dropna(subset=['Production']).copy()

# Filling missing values for 'Area harvested' with 0
pivot_data['Area harvested'] = pivot_data['Area harvested'].fillna(0)

# Step 4: Encode categorical variables
# The fitted encoders are kept in le_area / le_item so the integer codes can be
# mapped back to (or reproduced from) the original names later.
print("Encoding categorical variables...")
le_area = LabelEncoder()
le_item = LabelEncoder()

pivot_data['Area'] = le_area.fit_transform(pivot_data['Area'])
pivot_data['Item'] = le_item.fit_transform(pivot_data['Item'])

# Step 5: Select features and target
features = ['Area', 'Item', 'Year', 'Area harvested', 'Yield']
target = 'Production'

# Step 6: Split Data into Training and Testing Sets
# Row positions are split first so the 'Yield' fill value can be derived from
# training rows only. Using the mean of the whole column would let information
# from the test rows leak into preprocessing and inflate the test score.
# The shuffle depends only on the number of rows and random_state, so the rows
# assigned to each side match a direct split of X and y with the same settings.
train_pos, test_pos = train_test_split(
    np.arange(len(pivot_data)), test_size=0.2, random_state=42
)

# Filling missing values for 'Yield' with the mean of the training rows
yield_fill_value = pivot_data['Yield'].iloc[train_pos].mean()
pivot_data['Yield'] = pivot_data['Yield'].fillna(yield_fill_value)

X = pivot_data[features]
y = pivot_data[target]

X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Step 7: Train a Simple Random Forest Model
# A 100-tree forest with a fixed seed; fit() returns the estimator itself, so
# construction and training are chained.
print("Training the model...")
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 8: Evaluate the Model
# score() is evaluated once on the training split and once on the held-out
# split, in that order.
train_score, test_score = (
    model.score(split_features, split_target)
    for split_features, split_target in ((X_train, y_train), (X_test, y_test))
)

print("Training R² score:", train_score)
print("Test R² score:", test_score)

# Step 9: Save the Model (Optional)
# The model file keeps its original name and content (the fitted regressor).
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
with open("crop_production_model.pkl", "wb") as f:
    pickle.dump(model, f)

# The model was trained on the integer codes produced by le_area / le_item, so
# the fitted encoders are stored next to it; without them, raw 'Area' / 'Item'
# names cannot be converted into the codes the model expects at prediction time.
with open("crop_production_encoders.pkl", "wb") as f:
    pickle.dump({"Area": le_area, "Item": le_item}, f)

print("Model saved successfully!")


Loading the CSV data...
  Domain Code                        Domain  Area Code (M49)         Area  \
0         QCL  Crops and livestock products                4  Afghanistan   
1         QCL  Crops and livestock products                4  Afghanistan   
2         QCL  Crops and livestock products                4  Afghanistan   
3         QCL  Crops and livestock products                4  Afghanistan   
4         QCL  Crops and livestock products                4  Afghanistan   

   Element Code         Element Item Code (CPC)               Item  Year Code  \
0          5312  Area harvested            1371  Almonds, in shell       2019   
1          5412           Yield            1371  Almonds, in shell       2019   
2          5510      Production            1371  Almonds, in shell       2019   
3          5312  Area harvested            1371  Almonds, in shell       2020   
4          5412           Yield            1371  Almonds, in shell       2020   

   Year   Unit    Value Fl

In [3]:
# Show the column labels of the pivoted frame (pivot_data is built in the
# pipeline cell, which must have been run first).
column_labels = pivot_data.columns
print(column_labels)


Index(['Area', 'Item', 'Year', 'Area harvested', 'Laying', 'Milk Animals',
       'Producing Animals/Slaughtered', 'Production', 'Stocks', 'Yield',
       'Yield/Carcass Weight'],
      dtype='object', name='Element')
