In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import tabula
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
def create_session():
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3829.110 Safari/537.36"
        }
    session.headers.update(headers)
    # Retry mechanism for handling temporary issues like network failures
    retries = Retry(total=5, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries) 
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

In [3]:
def fetch_pdf_link(url):
    session = create_session()
    try:
        response = session.get(url, timeout=5)
        response.raise_for_status() 
    # Raise an error for bad responses (4xx or 5xx) 
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None
    soup = BeautifulSoup (response.text, "html.parser")
    weekly_data = soup.find_all("div", class_="field-content")
    if weekly_data:
        link = weekly_data[0].find("a").get("href")
        pdf_url = urljoin(url, link)
        return pdf_url
    else:
        print("No PDF link found.")
        return None

# Extract data from the pdf directly without downloading the pdf
def extract_data_from_pdf(pdf_url):
    try:
        dfs = tabula.read_pdf(pdf_url, pages='1,2,3,4', multiple_tables=True)
        print("Data extracted")
        return dfs
    except Exception as e:
        print(f"Error extracting data from PDF: {e}")
        return None

def extract_data():
    url = 'https://www.pbs.gov.pk/spi'
    pdf_url = fetch_pdf_link(url)
    if pdf_url:
        dfs = extract_data_from_pdf(pdf_url)
        return dfs
    else:
        print("Failed to fetch the PDF link")
        return None

dfs = extract_data()
dfs[1]

Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Got stderr: Nov 09, 2024 2:28:26 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache
Nov 09, 2024 2:28:26 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Nov 09, 2024 2:28:27 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>



Data extracted


Unnamed: 0.1,1,Wheat Flour Bag 20 Kg,1906.67,1946.49,2000.00,Unnamed: 0,1906.66,1936.81,1960.00,1650.00,...,1730.00,1730.00.1,1730.00.2,1700.00,1700.00.1,1700.00.2,1580.00,1723.46,1800.00.2,1.1
0,2,Rice Basmati Broken (Average Quali1ty K) g,240.0,253.13,270.0,,220.0,233.2,250.0,180.0,...,180.0,193.7,210.0,200.0,200.0,200.0,170.0,170.0,170.0,2
1,3,Rice IRRI-6/9 (Sindh/Punjab) 1 Kg,150.0,171.14,190.0,,140.0,148.93,170.0,0.0,...,0.0,0.0,0.0,160.0,160.0,160.0,160.0,160.0,160.0,3
2,4,Bread plain (Small Size) Each,110.0,110.0,110.0,,110.0,110.0,110.0,110.0,...,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,4
3,5,Beef with Bone (Average Quality) 1 Kg,1200.0,1208.19,1250.0,,1000.0,1128.34,1200.0,1000.0,...,950.0,1018.31,1100.0,1200.0,1200.0,1200.0,900.0,900.0,900.0,5
4,6,Mutton (Average Quality) 1 Kg,2200.0,2224.58,2300.0,,2050.0,2234.64,2300.0,2000.0,...,2200.0,2232.84,2300.0,2200.0,2200.0,2200.0,1800.0,1800.0,1800.0,6
5,7,Chicken Farm Broiler (Live) 1 Kg,370.0,382.28,400.0,,370.0,376.64,380.0,358.0,...,348.0,348.0,348.0,360.0,360.0,360.0,365.0,365.0,365.0,7
6,8,Milk fresh (Un-boiled) 1 Ltr,220.0,221.64,230.0,,200.0,216.86,240.0,200.0,...,170.0,179.46,190.0,200.0,200.0,200.0,160.0,160.0,160.0,8
7,9,Curd (Dahi) Loose 1 Kg,240.0,240.0,240.0,,220.0,239.67,260.0,220.0,...,220.0,220.47,230.0,240.0,240.0,240.0,200.0,200.0,200.0,9
8,10,Powdered Milk NIDO 390 gm PolybaEgach,1030.0,1055.72,1080.0,,1030.0,1047.89,1080.0,1030.0,...,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,10
9,11,Eggs Hen (Farm) 1 Dozen,330.0,336.59,350.0,,326.0,331.44,340.0,330.0,...,325.0,325.0,325.0,330.0,336.63,340.0,325.0,325.0,325.0,11


In [1]:
from datetime import date, timedelta

# Calculate the starting date (5 years ago from today)
start_date = date.today() - timedelta(days=5 * 365)

# Calculate the end date (today)
end_date = date.today()

# Generate all dates from start_date to end_date
current_date = start_date
while current_date <= end_date:
    print(current_date)
    current_date += timedelta(days=1)

2020-01-28
2020-01-29
2020-01-30
2020-01-31
2020-02-01
2020-02-02
2020-02-03
2020-02-04
2020-02-05
2020-02-06
2020-02-07
2020-02-08
2020-02-09
2020-02-10
2020-02-11
2020-02-12
2020-02-13
2020-02-14
2020-02-15
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-02-20
2020-02-21
2020-02-22
2020-02-23
2020-02-24
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31
2020-04-01
2020-04-02
2020-04-03
2020-04-04
2020-04-05
2020-04-06
2020-04-07
2020-04-08
2020-04-09
2020-04-10
2020-04-11
2020-04-12
2020-04-13
2020-04-14
2020-04-15
2020-04-16
2020-04-17
2020-04-18
2020-04-19
2020-04-20
2020-04-21
2020-04-22
2020-04-23
2020-04-24
2020-04-25
2020-04-26
2020-04-27