In [46]:
# Import Libraries
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display

In [47]:
# Load environment variables via python-dotenv, read the OpenAI key from the
# environment, and build the client object used by the cells below.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [48]:
def scrape_tamil_website(url, timeout=10):
    """
    Download a webpage and return its main text as a single string.

    Text is collected from every <p>, <h1>, <h2>, <h3> and <li> element in
    the order BeautifulSoup returns them, and joined with single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block forever on an unresponsive host.

    Returns:
        The extracted text, or an empty string if no matching element
        contains any text.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Errors raised by requests itself (connection failure, timeout)
            propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    fragments = []
    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'li']):
        # separator=' ' keeps words apart when inline tags (<b>, <a>, ...)
        # split an element's text; strip=True alone would glue them together.
        fragment = tag.get_text(separator=' ', strip=True)
        if fragment:  # skip empty elements so they do not add stray spaces
            fragments.append(fragment)
    return ' '.join(fragments)

In [56]:
def summarize_and_translate(text, model="gpt-4o-mini", temperature=0.5):
    """
    Summarize Tamil webpage content in simple English with an OpenAI chat model.

    Args:
        text: Content taken from a Tamil website.
        model: Name of the chat model to call. Defaults to "gpt-4o-mini".
        temperature: Sampling temperature forwarded to the API. Defaults to 0.5.

    Returns:
        The message content of the first choice in the API response.

    Raises:
        ValueError: If text is empty or contains only whitespace. Failing here
            avoids sending a request that has nothing to summarize.
    """
    if not text or not text.strip():
        raise ValueError("No text to summarize: the page content is empty")

    system_prompt = "You are an assistant that understands Tamil webpages and summarizes them in simple english"
    user_prompt = f"""The following content is from a Tamil website. Summarize it in English. Focus only on important high level details.
                        ```{text}```
                   """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature
    )

    return response.choices[0].message.content

In [57]:
def main(url=None):
    """
    Scrape a Tamil webpage, summarize it in English, and render the summary.

    Args:
        url: Address of the page to summarize. When omitted, the user is
            prompted for it with input(), as before. Passing it explicitly
            lets a cell run without interactive input.

    Returns:
        None. The summary is shown with display(Markdown(...)). Any exception
        raised while scraping or summarizing is caught and printed as
        "Error: <message>".
    """
    if url is None:
        url = input("Enter Tamil webpage URL: ")
    try:
        # Pasted URLs often carry surrounding whitespace or a trailing newline.
        url = url.strip()
        if not url:
            raise ValueError("No URL provided")
        tamil_text = scrape_tamil_website(url)
        summary = summarize_and_translate(tamil_text)
        display(Markdown(summary))
    except Exception as e:
        print(f"Error: {e}")

In [58]:
# Example 1: https://maduraimeenakshi.hrce.tn.gov.in
# Interactive run: main() asks for a URL via input(); enter the example URL
# above when prompted.
if __name__ == "__main__":
    main()

Enter Tamil webpage URL:  https://maduraimeenakshi.hrce.tn.gov.in


The text discusses the Arulmigu Meenakshi Sundareswarar Temple, commonly known as the Meenakshi Amman Temple, located in Madurai, on the southern bank of the Vaigai River. The temple is dedicated to the deities Meenakshi, an aspect of Parvati, and Sundareswarar, an aspect of Shiva. It has historical significance, being mentioned in ancient Sangam literature, and features a complex with 14 towering gopurams (temple towers), the tallest being the southern gopuram at 51.9 meters (170 feet).

The temple is known for its rich sculptures and architectural grandeur. A special event, the Thirukkudamuzhuku festival, is scheduled for July 14, 2025, with devotees allowed to visit until the evening of July 13, 2025. The temple is a major pilgrimage site and cultural landmark in Madurai.

In [59]:
# Example 2: https://tamil.indianexpress.com
# Interactive run: main() asks for a URL via input(); enter the example URL
# above when prompted.
if __name__ == "__main__":
    main()

Enter Tamil webpage URL:  https://tamil.indianexpress.com


The Tamil website provides updates on various topics including entertainment, lifestyle, education, job opportunities, sports, business, and technology in Tamil Nadu, India. A recent news update highlights a fire incident at a car spare parts warehouse in Chennai. Additionally, there is a report of monkey troubles at the Tiruvallur government hospital, causing dissatisfaction among patients. In sports, India has recently defeated England by a margin of 336 runs in the second Test match. The site encourages readers to subscribe to their newsletter for more updates.