In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
# Load the raw crop data set from a relative path (resolved against the
# current working directory).
df = pd.read_csv(filepath_or_buffer="Crop_data.csv")

In [3]:
# Count the null entries in every column before any cleaning is applied.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 N                          0
P                          0
K                          0
Temperature                0
Humidity                   0
pH                         0
Rainfall                   0
Latitude                   0
Longitude                  0
Soil_Type                  0
Crop                       0
Best Sowing Time           0
Duration Time (Approx.)    0
Harvesting Time            0
dtype: int64


In [4]:
# Fill gaps in numeric columns with that column's median. median(numeric_only=True)
# only returns entries for numeric columns, so non-numeric columns are left as-is.
# The result is reassigned instead of using inplace=True: the filled frame is an
# explicit new value and the statement stays chainable.
df = df.fillna(df.median(numeric_only=True))

In [5]:
# Soil type labels, kept in a fixed order.
SOIL_TYPES = [
    "Alluvial",
    "Black (Regur)",
    "Red & Yellow",
    "Laterite",
    "Arid (Desert)",
    "Forest / Mountain",
    "Saline / Alkaline",
    "Peaty / Marshy",
]

In [6]:
# Integer code for each soil type label: its 1-based position in SOIL_TYPES
# ("Alluvial" -> 1 ... "Peaty / Marshy" -> 8). Deriving the mapping from the
# list keeps the labels defined in exactly one place, so the two cannot drift.
soil_type_map = {label: code for code, label in enumerate(SOIL_TYPES, start=1)}

In [7]:
# Translate each soil type label into its integer code from soil_type_map.
soil_type_codes = df['Soil_Type'].map(soil_type_map)

# Series.map() yields NaN for any value that is not a key of the mapping
# (typo, stray whitespace, missing label). Fail loudly here instead of letting
# NaN codes flow silently into the exported ML data set.
unmapped_labels = df.loc[soil_type_codes.isna(), 'Soil_Type'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped Soil_Type values: {unmapped_labels.tolist()}")

# Every row is mapped at this point, so the codes can be stored as integers.
df['Soil_Type_Encoded'] = soil_type_codes.astype("int64")

In [8]:
# Build the modelling frame: drop the raw soil label column together with the
# sowing-time, duration and harvesting-time columns; every other column of df
# is carried over.
df_ml = df.drop(
    labels=[
        'Soil_Type',
        'Best Sowing Time',
        'Duration Time (Approx.)',
        'Harvesting Time',
    ],
    axis="columns",
)

In [9]:
# Encode crop names as integer class ids.
# The encoder is fitted on df['Crop'] (which this notebook never overwrites)
# rather than on df_ml['Crop'], the column being replaced. This makes the cell
# safe to re-run: a second execution would otherwise fit on already-encoded
# integers and crop_le.classes_ would no longer hold the crop names.
crop_le = LabelEncoder()
df_ml['Crop'] = crop_le.fit_transform(df['Crop'])

In [10]:
# Persist the ML-ready frame; index=False keeps the row index out of the file.
df_ml.to_csv(path_or_buf="crop_cleaned.csv", index=False)

In [11]:
# Preview the first rows of the cleaned frame, then list its column names.
# sep="\n" puts each banner and its payload on separate lines.
print("\n--- Cleaned Data Head (ML Ready) ---", df_ml.head(), sep="\n")
print("\n--- Cleaned Data Columns ---", list(df_ml.columns), sep="\n")


--- Cleaned Data Head (ML Ready) ---
     N    P    K  Temperature  Humidity    pH  Rainfall  Latitude  Longitude  \
0  102   79  113        24.35     47.27  5.78    284.33   21.4104    73.4537   
1   71  142   80        42.15     90.88  4.36    278.35   19.1361    74.3164   
2   60   36   35        20.84     14.83  6.86    108.02   20.2069    75.2012   
3   83   43  101        25.77     52.53  8.79     97.64   21.2843    75.9873   
4  106   33   83        28.74     51.31  4.71    136.56   19.9024    76.6478   

   Crop  Soil_Type_Encoded  
0     1                  4  
1     3                  3  
2    14                  1  
3     0                  6  
4     6                  2  

--- Cleaned Data Columns ---
['N', 'P', 'K', 'Temperature', 'Humidity', 'pH', 'Rainfall', 'Latitude', 'Longitude', 'Crop', 'Soil_Type_Encoded']
