In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load datasets
crop_df = pd.read_csv("Crop_recommendation.csv")
fao_df = pd.read_csv("FAOSTAT_data_en_12-8-2025.csv")

print("Crop Dataset Shape:", crop_df.shape)
print("FAOSTAT Dataset Shape:", fao_df.shape)


Crop Dataset Shape: (2200, 8)
FAOSTAT Dataset Shape: (5940, 15)


In [None]:
# Preview datasets
crop_df.head(), fao_df.head()


(    N   P   K  temperature   humidity        ph    rainfall label
 0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
 1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
 2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
 3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
 4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice,
   Domain Code                        Domain  Area Code (M49)   Area  \
 0         QCL  Crops and livestock products              356  India   
 1         QCL  Crops and livestock products              356  India   
 2         QCL  Crops and livestock products              356  India   
 3         QCL  Crops and livestock products              356  India   
 4         QCL  Crops and livestock products              356  India   
 
    Element Code         Element  Item Code (CPC)  \
 0          5312  Area harvested           1654.0   
 1          5412           Yield           1654.0   
 2  

In [None]:
# Dataset info
crop_df.info()
fao_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5940 entries, 0 to 5939
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Domain Code       5940 non-null   object 
 1   Domain            5940 non-null   object 
 2   Area Code (M49)   5940 non-null   int64  
 3   Area              5940 non-null   object 
 4   Element Code    

In [None]:
# Check missing values
print(crop_df.isnull().sum())
print(fao_df.isnull().sum())


N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64
Domain Code            0
Domain                 0
Area Code (M49)        0
Area                   0
Element Code           0
Element                0
Item Code (CPC)        0
Item                   0
Year Code              0
Year                   0
Unit                   0
Value                213
Flag                   0
Flag Description       0
Note                5825
dtype: int64


In [None]:
# Fill missing numerical values with mean
crop_df.fillna(crop_df.mean(numeric_only=True), inplace=True)
fao_df.fillna(fao_df.mean(numeric_only=True), inplace=True)


In [None]:
# Remove duplicates
crop_df.drop_duplicates(inplace=True)
fao_df.drop_duplicates(inplace=True)

print("Crop Dataset After Cleaning:", crop_df.shape)
print("FAOSTAT Dataset After Cleaning:", fao_df.shape)


Crop Dataset After Cleaning: (2200, 8)
FAOSTAT Dataset After Cleaning: (5940, 15)


In [None]:
# Standardize crop names
crop_df['label'] = crop_df['label'].str.strip().str.title()

# FAOSTAT crop column name may vary
# Adjust if column name is different (e.g., 'Item')
fao_df['Item'] = fao_df['Item'].str.strip().str.title()


In [None]:
# Find common crops
common_crops = set(crop_df['label']).intersection(set(fao_yield_df['label']))
print("Common Crops Count:", len(common_crops))


Common Crops Count: 2


Exploding the column(item)

In [None]:
import pandas as pd

fao_df = pd.read_csv("FAOSTAT_data_en_12-8-2025.csv")
print("Original FAOSTAT shape:", fao_df.shape)


Original FAOSTAT shape: (5940, 15)


In [None]:
import pandas as pd

# Ensure Item is string
fao_df['Item'] = fao_df['Item'].astype(str)

# Split on comma OR ' and ' (case-insensitive)
fao_df['Item'] = fao_df['Item'].str.split(r'(?i),|\sand\s')

# Explode into rows
fao_exploded_df = fao_df.explode('Item')

# Clean crop names
fao_exploded_df['Item'] = (
    fao_exploded_df['Item']
    .str.strip()
    .str.title()
)

print("FAOSTAT shape after exploding:", fao_exploded_df.shape)


FAOSTAT shape after exploding: (10519, 15)


In [None]:
fao_exploded_df.to_csv("FAOSTAT_exploded_clean.csv", index=False)
print("✅ Exploded FAOSTAT dataset saved as FAOSTAT_exploded_clean.csv")


✅ Exploded FAOSTAT dataset saved as FAOSTAT_exploded_clean.csv


In [None]:
import pandas as pd
import ast

df = pd.read_csv("FAOSTAT_exploded_clean.csv")
print(df['Item'].head())


0        ['Anise'
1       ' Badian'
2    ' Coriander'
3        ' Cumin'
4      ' Caraway'
Name: Item, dtype: object


In [None]:
df['Item'] = (
    df['Item']
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)   # remove [ ] and '
    .str.strip()
)


In [None]:
df = df[df['Item'] != ""]


In [None]:
df['Item'] = df['Item'].str.title()


In [None]:
df['Item'].head(15)


Unnamed: 0,Item
0,Anise
1,Badian
2,Coriander
3,Cumin
4,Caraway
5,Fennel
6,Juniper Berries
7,Raw
8,Anise
9,Badian


In [None]:
df.to_csv("FAOSTAT_exploded_final.csv", index=False)
print("✅ Clean FAOSTAT dataset saved as FAOSTAT_exploded_final.csv")


✅ Clean FAOSTAT dataset saved as FAOSTAT_exploded_final.csv


Merging the two DATASETS...


In [None]:
import pandas as pd


In [None]:
# Load cleaned FAOSTAT dataset
fao_df = pd.read_csv("FAOSTAT_exploded_final.csv")

# Load crop recommendation dataset
crop_df = pd.read_csv("Crop_recommendation.csv")

print("FAOSTAT shape:", fao_df.shape)
print("Crop dataset shape:", crop_df.shape)


FAOSTAT shape: (10447, 15)
Crop dataset shape: (2200, 8)


In [None]:
# Standardize names for matching
fao_df['Item'] = fao_df['Item'].astype(str).str.strip().str.title()
crop_df['label'] = crop_df['label'].astype(str).str.strip().str.title()


In [None]:
fao_df = fao_df[fao_df['Item'].isin(common_crops)]
crop_df = crop_df[crop_df['label'].isin(common_crops)]


In [None]:
merged_df = pd.merge(
    crop_df,
    fao_df,
    left_on='label',
    right_on='Item',
    how='inner'
)

print("Merged dataset shape:", merged_df.shape)
merged_df.head()


Merged dataset shape: (14400, 23)


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Domain Code,Domain,...,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice,QCL,Crops and livestock products,...,Area harvested,113.0,Rice,2000,2000,ha,44712000.0,A,Official figure,
1,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice,QCL,Crops and livestock products,...,Yield,113.0,Rice,2000,2000,kg/ha,2850.8,A,Official figure,
2,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice,QCL,Crops and livestock products,...,Production,113.0,Rice,2000,2000,t,127464896.0,A,Official figure,
3,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice,QCL,Crops and livestock products,...,Area harvested,113.0,Rice,2001,2001,ha,44900000.0,A,Official figure,
4,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice,QCL,Crops and livestock products,...,Yield,113.0,Rice,2001,2001,kg/ha,3115.8,A,Official figure,


In [54]:
import pandas as pd

df = pd.read_csv("FAOSTAT_exploded_final.csv")


In [55]:
unique_items = sorted(df['Item'].dropna().unique())

print("Total unique items:", len(unique_items))
for item in unique_items:
    print("-", item)


Total unique items: 127
- Anise
- Apples
- Apricots
- Areca Nuts
- Aromatic Crops
- Badian
- Bananas
- Barley
- Beans
- Broccoli
- Cabbages
- Cantaloupes
- Caraway
- Cardamoms
- Carrots
- Cashew Nuts
- Cassava
- Castor Oil Seeds
- Cauliflowers
- Cherries
- Chick Peas
- Chicory
- Chillies
- Clementines
- Cocoa Beans
- Coconuts
- Coffee
- Coir
- Coriander
- Cucumbers
- Cumin
- Dry
- Dry (Capsicum Spp.
- Dry (Excluding Dehydrated)
- Eggplants (Aubergines)
- Excluding Shelled
- Fennel
- Figs
- Fresh
- Fresh N.E.C.
- Fruits Of The Genus Vaccinium N.E.C.
- Gherkins
- Ginger
- Gourds
- Grapefruits
- Grapes
- Green
- Green (Capsicum Spp.
- Green Garlic
- Groundnuts
- Guavas
- In Shell
- Juniper Berries
- Jute
- Kenaf
- Lemons
- Lentils
- Lettuce
- Limes
- Linseed
- Mace
- Maize (Corn)
- Mandarins
- Mangoes
- Mangosteens
- Millet
- Mushrooms
- N.E.C.
- Natural Rubber In Primary Forms
- Nectarines
- Nutmeg
- Okra
- Onions
- Oranges
- Other Beans
- Other Berries
- Other Citrus Fruit
- Other Fruit

In [56]:
import pandas as pd

df = pd.read_csv("Crop_recommendation.csv")


In [57]:
unique_labels = sorted(df['label'].dropna().unique())

print("Total unique labels:", len(unique_labels))
for label in unique_labels:
    print("-", label)


Total unique labels: 22
- apple
- banana
- blackgram
- chickpea
- coconut
- coffee
- cotton
- grapes
- jute
- kidneybeans
- lentil
- maize
- mango
- mothbeans
- mungbean
- muskmelon
- orange
- papaya
- pigeonpeas
- pomegranate
- rice
- watermelon


In [62]:
import pandas as pd

fao_df = pd.read_csv("FAOSTAT_exploded_final.csv")
crop_df = pd.read_csv("Crop_recommendation.csv")


In [63]:
fao_df['item_norm'] = fao_df['Item'].str.lower().str.strip()
crop_df['label_norm'] = crop_df['label'].str.lower().str.strip()


In [70]:
fao_to_crop_mapping = {
    "apples": "apple",
    "bananas": "banana",
    "chick peas": "chickpea",
    "coconuts": "coconut",
    "grapes": "grapes",
    "lentils": "lentil",
    "maize (corn)": "maize",
    "mangoes": "mango",
    "oranges": "orange",
    "papayas": "papaya",
    "watermelons": "watermelon",
    "seed cotton": "cotton",
    "pigeon peas": "pigeonpeas",
    "coffee": "coffee",
    "jute": "jute",
    "rice": "rice",
    "beans": "kidneybeans",
    "other beans": "kidneybeans",
    "other melons": "muskmelon"
}


In [77]:
final_crops = [
    'apple', 'banana', 'chickpea', 'coconut', 'coffee', 'cotton',
    'grapes', 'jute', 'kidneybeans', 'lentil', 'maize', 'mango',
    'muskmelon', 'orange', 'papaya', 'pigeonpeas', 'rice', 'watermelon'
]


In [71]:
fao_df['mapped_crop'] = fao_df['item_norm'].replace(fao_to_crop_mapping)


In [72]:
fao_crops = set(fao_df['mapped_crop'].dropna().unique())
crop_crops = set(crop_df['label_norm'].unique())

common_crops = sorted(fao_crops.intersection(crop_crops))

print("Total common crops:", len(common_crops))
print(common_crops)


Total common crops: 18
['apple', 'banana', 'chickpea', 'coconut', 'coffee', 'cotton', 'grapes', 'jute', 'kidneybeans', 'lentil', 'maize', 'mango', 'muskmelon', 'orange', 'papaya', 'pigeonpeas', 'rice', 'watermelon']


In [73]:
import pandas as pd

fao_df = pd.read_csv("FAOSTAT_exploded_final.csv")
crop_df = pd.read_csv("Crop_recommendation.csv")


In [74]:
fao_df['item_norm'] = fao_df['Item'].str.lower().str.strip()
crop_df['label_norm'] = crop_df['label'].str.lower().str.strip()


In [75]:
fao_to_crop_mapping = {
    "apples": "apple",
    "bananas": "banana",
    "chick peas": "chickpea",
    "coconuts": "coconut",
    "grapes": "grapes",
    "lentils": "lentil",
    "maize (corn)": "maize",
    "mangoes": "mango",
    "oranges": "orange",
    "papayas": "papaya",
    "watermelons": "watermelon",
    "seed cotton": "cotton",
    "pigeon peas": "pigeonpeas",
    "coffee": "coffee",
    "jute": "jute",
    "rice": "rice",
    "beans": "kidneybeans",
    "other beans": "kidneybeans",
    "other melons": "muskmelon"
}
fao_df['mapped_crop'] = fao_df['item_norm'].replace(fao_to_crop_mapping)


In [78]:
fao_final = fao_df[fao_df['mapped_crop'].isin(final_crops)].copy()


In [79]:
crop_final = crop_df[crop_df['label_norm'].isin(final_crops)].copy()


In [80]:
merged_18_df = pd.merge(
    crop_final,
    fao_final,
    left_on='label_norm',
    right_on='mapped_crop',
    how='inner'
)

print("Final merged dataset shape:", merged_18_df.shape)


Final merged dataset shape: (136800, 26)


In [81]:
merged_18_df.to_csv("AgriYield_18Crops_Final.csv", index=False)
print("✅ Final merged dataset saved as AgriYield_18Crops_Final.csv")


✅ Final merged dataset saved as AgriYield_18Crops_Final.csv


In [82]:
import pandas as pd

df = pd.read_csv("AgriYield_18Crops_Final.csv")
print("Original shape:", df.shape)


Original shape: (136800, 26)


  df = pd.read_csv("AgriYield_18Crops_Final.csv")


In [83]:
columns_to_drop = [
    'Domain Code', 'Domain',
    'Area Code', 'Area',
    'Element Code', 'Item Code',
    'Item', 'Year Code',
    'Year', 'Unit',
    'Lag Description', 'Note'
]

# Drop only if column exists (safe)
df.drop(columns=[c for c in columns_to_drop if c in df.columns],
        inplace=True)

print("After dropping columns:", df.shape)


After dropping columns: (136800, 17)


In [84]:
df = df[df['Element'] == 'Production']
print("After keeping only Production rows:", df.shape)


After keeping only Production rows: (45600, 17)


In [85]:
df.reset_index(drop=True, inplace=True)


In [86]:
df.to_csv("AgriYield_18Crops_ProductionOnly.csv", index=False)
print("✅ Clean dataset saved as AgriYield_18Crops_ProductionOnly.csv")


✅ Clean dataset saved as AgriYield_18Crops_ProductionOnly.csv


In [87]:
import pandas as pd

df = pd.read_csv("AgriYield_18Crops_ProductionOnly.csv")
print("Original shape:", df.shape)


Original shape: (45600, 17)


In [97]:
columns_to_drop = [
    'label_norm',
    'Area Code (M49)',
    'Item Code (CPC)',        # handles Itemcode / Item Code variation
    'Flag',
    'Flag Description',
    'item_norm',
    'mapped_crop'
]

# Drop safely (only if column exists)
df.drop(columns=[col for col in columns_to_drop if col in df.columns],
        inplace=True)

print("After dropping columns:", df.shape)


After dropping columns: (45600, 10)


In [98]:
df['label'] = df['label'].str.lower().str.strip()   # safety cleanup

df.sort_values(by='label', ascending=True, inplace=True)

df.reset_index(drop=True, inplace=True)


In [99]:
df['label'].unique()


array(['apple', 'banana', 'chickpea', 'coconut', 'coffee', 'cotton',
       'grapes', 'jute', 'kidneybeans', 'lentil', 'maize', 'mango',
       'muskmelon', 'orange', 'papaya', 'pigeonpeas', 'rice',
       'watermelon'], dtype=object)

In [100]:
df.to_csv("AgriYield_18Crops_Final_Clean.csv", index=False)
print("✅ Final cleaned & sorted dataset saved as AgriYield_18Crops_Final_Clean.csv")


✅ Final cleaned & sorted dataset saved as AgriYield_18Crops_Final_Clean.csv
