# Web Scraping the official AGMARKNET website

In [1]:
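# 🛠️ Install necessary libraries for web scraping and data handling.
# BeautifulSoup (imported as `bs4`) is crucial for parsing HTML content. 🌐
# Its official PyPI package name is `beautifulsoup4`; the `bs4` package on PyPI is only a placeholder that pulls it in.
# %pip install -U beautifulsoup4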

### 👀 Manual Data Verification Instructions 📋🔍

To manually verify whether onion price data for Uttar Pradesh exists on AGMARKNET: 🧅

1. Visit the official AGMARKNET "Search Reports" page: 🔗
   https://agmarknet.gov.in/SearchCmmMkt.aspx

2. In the opened form: 📝
   - For 'Commodity', select "Onion".
   - For 'State', choose "Uttar Pradesh".
   - (Optional: Select District or Market, or leave as 'All' for a broader search.) 🗺️
   - Choose the desired Date Range (for best results, use a recent week or month). 🗓️
   - Click the 'Submit' button to fetch results. ✅

3. The page will display a table with Data (Date, Market, Variety, Min Price, Max Price, Modal Price, Arrival Qty) if available. 📊

4. Interpret the result:
   - ✅ If results appear with valid prices and arrival quantities, the data exists and can be scraped or downloaded. 🎉
   - ❌ If you see "No records found" or empty fields, that period or region has no posted data. 😔

5. You can also use the 'Download CSV' button provided on the result page to save a copy for inspection. 💾

⬆️ These steps let you confirm that real onion price data for UP exists BEFORE running your code or automating scraping. 🚀

#### 💡 Tip: Repeat this for different years or date ranges if you need historical data. 🕰️

In [3]:
# --- Section 1: Setup & Imports 📦 ---

# Let's get our essential tools ready! 🛠️
# We'll need these Python libraries to make web requests, parse HTML,
# and handle data like a pro. 📈

import requests                              # For making HTTP requests to websites 🌐
from bs4 import BeautifulSoup                # For parsing HTML content and navigating the DOM tree 🌳
import pandas as pd                          # For powerful data manipulation and analysis with DataFrames 📊
import os                                    # For interacting with the operating system (e.g., creating folders) 📁
import time                                  # For adding delays (important for polite scraping to avoid overwhelming servers! ⏳)
import urllib.parse                          # For URL encoding/decoding, used in AJAX response parsing and URL construction 🔗
from datetime import datetime, timedelta     # For handling dates and times precisely 📅

# Set pandas display options for clearer output in our notebook. ✨
# Apply every notebook-wide pandas display tweak in one pass:
#   display.max_columns -> show up to 100 columns 🔢
#   display.width       -> widen the console rendering for wide tables 📏
for option_name, option_value in (('display.max_columns', 100), ('display.width', 180)):
    pd.set_option(option_name, option_value)

# Confirmation messages so the reader knows the setup cell ran.
print("🚀 Essential libraries imported successfully!")
print("Pandas display options set for a cleaner view of our data. ✨")


🚀 Essential libraries imported successfully!
Pandas display options set for a cleaner view of our data. ✨


In [7]:
# --- ⚙️ Configuration & Constants ---
# Define the parameters for our data fetching operation. 🎯

BASE_URL_MAIN = "https://agmarknet.gov.in/SearchCmmMkt.aspx" # The main URL for AGMARKNET price search 🌐

# Commodity & state codes specific to AGMARKNET portal 🧅🗺️
COMMODITY_CODE = "23"    # For Onion, 23
STATE_CODE = "UP"        # For Uttar Pradesh, UP

# Set the date range for which we want to fetch the data 📅
# Dates use the 'DD-Mon-YYYY' format (e.g. '01-Jul-2025').
# You can uncomment the lines below to dynamically set the date to "day before yesterday".
# latest_date = (datetime.today() - timedelta(days=2)).strftime("%d-%b-%Y")
# DATE_FROM = latest_date
# DATE_TO = latest_date

# For now, we'll use a fixed date range for consistency. 🗓️
DATE_FROM = '01-Jul-2025'
DATE_TO = '31-Jul-2025'


def format_date_for_filename(date_str):
    """
    Convert a 'DD-Mon-YYYY' date string into the compact 'DDMonYY' form used in file names.

    The date is parsed and re-formatted instead of being edited with string
    replacement: stripping every "20" substring would also eat a day of "20"
    ('20-Jul-2025' -> 'Jul25') or the year in '2020' ('01-Jul-2020' -> '01Jul').

    Parameters:
        date_str: Date in '%d-%b-%Y' format, e.g. '01-Jul-2025'.

    Returns:
        The date as '%d%b%y', e.g. '01Jul25'.

    Raises:
        ValueError: If date_str does not match the '%d-%b-%Y' format.
    """
    # NOTE: %b month abbreviations follow the active locale; the English
    # abbreviations used here assume the default/English locale.
    return datetime.strptime(date_str, "%d-%b-%Y").strftime("%d%b%y")


# Define the data directory and the raw CSV file path 📁💾
DATA_DIR = "data"
# Format dates for filename: e.g., '01Jul25_31Jul25'
formatted_date_from = format_date_for_filename(DATE_FROM)
formatted_date_to = format_date_for_filename(DATE_TO)
RAW_CSV_PATH = os.path.join(DATA_DIR, f"COMMODITY[{COMMODITY_CODE}]_{STATE_CODE}_{formatted_date_from}_{formatted_date_to}.csv")
AJAX_RESPONSE_PATH = os.path.join(DATA_DIR, f"debug_ajax_response_{formatted_date_from}_{formatted_date_to}.txt")

# Ensure the data directory exists; create it if it doesn't. ➕
os.makedirs(DATA_DIR, exist_ok=True)

print(f"""
⚙️ Configuration:
  Commodity: Onion (Code: {COMMODITY_CODE})
  State: Uttar Pradesh (Code: {STATE_CODE})
  Date Range: {DATE_FROM} to {DATE_TO}
  Data will be saved in: {RAW_CSV_PATH}
  AJAX response will be saved in: {AJAX_RESPONSE_PATH}
""")



⚙️ Configuration:
  Commodity: Onion (Code: 23)
  State: Uttar Pradesh (Code: UP)
  Date Range: 01-Jul-2025 to 31-Jul-2025
  Data will be saved in: data/COMMODITY[23]_UP_01Jul25_31Jul25.csv
  AJAX response will be saved in: data/debug_ajax_response_01Jul25_31Jul25.txt



In [41]:
# # --- 🧩 Parse AJAX response function ---
# # This function is designed to extract the relevant HTML fragment from the complex ASP.NET AJAX response. 📦

# def parse_ajax_response(ajax_text: str) -> str:
#     """
#     Parses ASP.NET pipe-delimited AJAX response to extract and decode the HTML fragment.
#     """
#     parts = ajax_text.split('|')
#     i = 0
#     while i < len(parts):
#         try:
#             length = int(parts[i])   # first is length of next block
#             update_type = parts[i+1] # usually '#' or 'updatePanel'
#             control_id = parts[i+2]
#             content = parts[i+3]
#             decoded = urllib.parse.unquote(content)
#             if "cphBody_GridPriceData" in decoded:
#                 return decoded
#             i += 4
#         except Exception:
#             i += 1
#     print("⚠ Could not parse AJAX properly. Returning raw text.")
#     return ajax_text


In [10]:
# --- 🤖 Fetch AGMARKNET data function ---
# This is the core function for scraping onion price data from AGMARKNET. 🧅💰

def fetch_agmarknet_data(commodity_code, state_code, date_from, date_to, verbose=True,
                         commodity_name="Onion", state_name="Uttar Pradesh"):
    """
    Fetch the price table from the AGMARKNET search page and return it as a DataFrame.

    The function first GETs BASE_URL_MAIN to read the hidden ASP.NET form fields
    (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION), then POSTs the search
    form and looks for the table with id "cphBody_GridPriceData" in the response.
    The raw POST response is always written to AJAX_RESPONSE_PATH for debugging.

    Parameters:
        commodity_code: Commodity code sent as "ddlCommodity" / "Tx_Commodity".
        state_code: State code sent as "ddlState" / "Tx_State".
        date_from: Start date string sent as "txtDate" / "DateFrom" / "Fr_Date".
        date_to: End date string sent as "txtToDate" / "DateTo" / "To_Date".
        verbose: When True, print progress and request details.
        commodity_name: Display label sent as "Tx_CommodityHead" (default "Onion").
        state_name: Display label sent as "Tx_StateHead" (default "Uttar Pradesh").

    Returns:
        A DataFrame with one row per table row and the table's header cells as
        column names. An empty DataFrame is returned when the table is missing
        or when any error occurs; errors are printed, not raised.
    """
    session = requests.Session() # Create a session to persist cookies across the GET and the POST 🤝
    df = pd.DataFrame() # Result placeholder; stays empty on failure 📝

    if verbose:
        print(f"\n🚀 Starting data fetch for: {date_from} to {date_to}") # Inform the user about the process start 🚀

    try:
        # Step 1: GET the initial page to extract the hidden ASP.NET form tokens. 🔑
        r = session.get(BASE_URL_MAIN, timeout=15)
        r.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx) 🚨
        soup = BeautifulSoup(r.text, "html.parser")

        def get_val(name):
            """Return the value of the <input> with the given name, or "" if absent."""
            tag = soup.find("input", {"name": name})
            return tag.get("value", "") if tag else ""

        viewstate = get_val("__VIEWSTATE")
        viewstategenerator = get_val("__VIEWSTATEGENERATOR")
        eventvalidation = get_val("__EVENTVALIDATION")

        if verbose:
            print(f"📍 ViewState found: {bool(viewstate)}, EventValidation found: {bool(eventvalidation)}") # Report token discovery 🎯

        # Step 2: Prepare the payload for the POST request. 📤
        # These parameters mimic a form submission on the AGMARKNET website.
        payload = {
            "__EVENTTARGET": "btnSubmit",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": viewstate,
            "__VIEWSTATEGENERATOR": viewstategenerator,
            "__EVENTVALIDATION": eventvalidation,
            "__LASTFOCUS": "",
            "ddlCommodity": commodity_code,
            "ddlState": state_code,
            "txtDate": date_from,
            "txtToDate": date_to,
            "btnSubmit": "Submit"
        }

        # Query parameters appended to the POST URL. 🌐
        query_params = {
            "Tx_Commodity": commodity_code,
            "Tx_State": state_code,
            "Tx_District": "0", # "0" typically means 'All Districts'
            "Tx_Market": "0",   # "0" typically means 'All Markets'
            "DateFrom": date_from,
            "DateTo": date_to,
            "Fr_Date": date_from, # Redundant but included for robustness
            "To_Date": date_to,
            "Tx_Trend": "0",
            # Labels come from the arguments so they stay in sync with the codes above.
            "Tx_CommodityHead": commodity_name,
            "Tx_StateHead": state_name,
            "Tx_DistrictHead": "--Select--",
            "Tx_MarketHead": "--Select--"
        }

        # Construct the full URL for the POST request. 🔗
        full_post_url = BASE_URL_MAIN + "?" + urllib.parse.urlencode(query_params)

        # HTTP headers to mimic a real browser request. 🛡️
        # Named request_headers so it is not confused with the table's column headers below.
        request_headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': BASE_URL_MAIN,
            'Origin': 'https://agmarknet.gov.in',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        }

        if verbose:
            print(f"📤 POST URL: {full_post_url}") # Display the URL for debugging 🖥️
            print(f"🔑 Payload: {payload}")      # Display the payload sent 📦

        # Step 3: Send the POST request to get the data. 🚀
        resp = session.post(full_post_url, data=payload, headers=request_headers, timeout=30)
        resp.raise_for_status() # Check for HTTP errors again 🚨

        if verbose:
            print(f"✅ AJAX POST status: {resp.status_code}") # Show response status code 👍
            print(f"📝 Response length: {len(resp.text)}")     # Show the length of the response text 📏

        # Save raw response for inspection during debugging. 💾
        with open(AJAX_RESPONSE_PATH, "w", encoding="utf-8") as f:
            f.write(resp.text)

        # Step 4: Parse the response HTML and locate the results grid by its element id. 🧐
        soup_result = BeautifulSoup(resp.text, "html.parser")
        table = soup_result.find("table", {"id": "cphBody_GridPriceData"})

        if table: # If the table is found 🎉
            table_rows = table.find_all("tr")
            # The first row carries the column headers (<th> cells). 🏷️
            column_names = (
                [th.get_text(strip=True) for th in table_rows[0].find_all("th")]
                if table_rows else []
            )
            # Remaining rows are data rows (<td> cells). 📝
            parsed_rows = [
                [td.get_text(strip=True) for td in tr.find_all("td")]
                for tr in table_rows[1:]
            ]
            # Keep only rows whose cell count matches the header; a single stray row
            # would otherwise make DataFrame construction fail and lose every row.
            rows = [row for row in parsed_rows if len(row) == len(column_names)]
            skipped = len(parsed_rows) - len(rows)
            if skipped:
                print(f"⚠️ Skipped {skipped} row(s) whose cell count did not match the header.")
            df = pd.DataFrame(rows, columns=column_names)
            print(f"✅ Found table. Rows: {len(df)}") # Report success and row count 👍
        else:
            print("❌ Table not found. Possible reasons: wrong payload, no data, or structure change.") # Log failure reasons 😔
            # Look for the "No records found" message in the raw response text.
            if "No records found" in resp.text:
                print("ℹ️ Detected: No records found message.")

    except Exception as e:
        # Deliberate best-effort: report the problem and fall through to return the empty frame.
        print(f"⚠️ Error during data fetching: {e}")
    finally:
        session.close() # Always close the requests session to release resources. 🧹

    return df # Might be empty if no data was found or an error occurred 🔄

In [11]:
# --- 📝 Run scraper & save to CSV ---
# Execute the data fetching process and handle the results. 🚀💾

# Fetch the configured commodity/state/date range quietly (verbose off for cleaner output 🤫).
df_data = fetch_agmarknet_data(COMMODITY_CODE, STATE_CODE, DATE_FROM, DATE_TO, verbose=False)

if df_data.empty:
    # Nothing came back: tell the user instead of writing an empty file. 😔
    print("⚠️ No data fetched. Please check configuration or website availability.")
else:
    # Persist the scraped rows, confirm the location, then preview the first rows. 📄👀
    df_data.to_csv(RAW_CSV_PATH, index=False, encoding='utf-8')
    print(f"✅ Data saved to: {RAW_CSV_PATH}")
    display(df_data.head())

✅ Found table. Rows: 4965
✅ Data saved to: data/COMMODITY[23]_UP_01Jul25_31Jul25.csv


Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,1,Auraiya,Achalda,Onion,Red,FAQ,1200,1350,1300,01 Jul 2025
1,2,Auraiya,Achalda,Onion,Red,FAQ,1200,1350,1300,02 Jul 2025
2,3,Auraiya,Achalda,Onion,Red,FAQ,1200,1350,1300,15 Jul 2025
3,4,Auraiya,Achalda,Onion,Red,FAQ,1250,1450,1350,22 Jul 2025
4,5,Auraiya,Achalda,Onion,Red,FAQ,1250,1450,1350,30 Jul 2025
