In [9]:

## krishakjagat
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def scrape_section_page(section_url, class_names):
    response = requests.get(section_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    section_posts = []
    for class_name in class_names:
        if class_name == "none":
            continue
        news_list_section = soup.find_all('div', class_=class_name)
        for news_div in news_list_section:
            news_links = news_div.find_all('a', href=True, title=True)
            for link in news_links:
                post_title = link['title'].strip()
                post_url = urljoin(section_url, link['href'])
                if is_valid_url(post_url):
                    section_posts.append({'title': post_title, 'url': post_url})
                else:
                    print(f"Invalid URL found: {post_url}")

    return section_posts

def scrape_post_content(post_url):
    response = requests.get(post_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Remove unwanted elements
    for auth_div in soup.find_all('div', class_='auth-name-dt'):
        auth_div.decompose()
    for author_div in soup.find_all('div', class_='h-author'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='col-md-4'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-mags'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-social'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-nav-item-info'):
        author_div.decompose()

    # Extract content
    paragraphs = soup.find_all('p')
    content = "\n\n".join([p.text.strip() for p in paragraphs])
    
    return content

def is_valid_url(url):
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

def main(language_urls=language_urls):
    all_results = {}

    for lang, urls in language_urls.items():
        print(f"Scraping data for language: {lang}")
        
        base_url = urls["base"]
        weather_class = urls["weather_class"]
        khati_class = urls["khati_badi_class"]
        main_class = urls["main_class"]
        news_classes = urls["news_classes"]
        government_classes = urls["government_class"]
        
        # Scrape the main page
        main_posts = scrape_section_page(base_url, main_class)
        khabar_posts = scrape_section_page(base_url, news_classes)

        #Scrape the weather page if available
        weather_posts = scrape_section_page(base_url, weather_class)

       # Scrape the Khati Badi page
        khati_posts = scrape_section_page(base_url, khati_class)

        government_post = scrape_section_page(base_url, government_classes)

        # Set to track processed URLs
        processed_urls = set()

        # Dictionary to store the results for this language
        results = {
            "Main Page Posts": [],
            "Khabar Section Posts": [],
            "Weather Posts": [],
            "Khati Badi Posts": [],
            "government posts":[]
        }

        # Process main page posts
        for post in main_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Main Page Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])     

        # Process "Khabar" section posts
        for post in khabar_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Khabar Section Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        #Process weather posts
        for post in weather_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Weather Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        # Process Khati Badi posts
        for post in khati_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Khati Badi Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])
       
        for post in government_post:         
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["government posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        all_results[lang] = results

    return all_results

# Define the base URLs and class names for each language
language_urls = {
    "hindi": {
        "base": "https://www.krishakjagat.org/",
        "main_class": ["cm-first-post","cm-posts"],
        "news_classes": ["cm-post-content","cm-posts"],
        "weather_class": ["weather-home mt-5 mb-3"],
        "khati_badi_class": ["home-2-3-lst"],
        "government_class":["col-xs-12 col-sm-6 col-md-4 col-lg-4 cat-flex"]
     },
    # "english": {
    #     "base": "https://www.en.krishakjagat.org/",
    #     "main_class": ["cm-first-post","cm-posts"],
    #     "news_classes": ["cm-post-content","cm-posts"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["home-2-3-lst"],
    #     "government_class":["none"]
    # },
    #"punjabi": {
    #     "base": "https://punjabi.krishijagran.com/",
    #     "main_class": ["none"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["home-top-l"],
    #     "khati_badi_class": ["three"],
    #     "government_class":["home-2-3-lst"]
    # },
    # "marathi": {
    #     "base": "https://marathi.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["weather-home mt-5 mb-3"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["home-2-3-lst"],
  
    #  },
    # "tamil": {
    #     "base": "https://tamil.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["home-2-3-lst"],
    #     "government_class":["news-list-wide shadow-sm"]
    # },
    # "malayam": {
    #     "base": "https://malayalam.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "bengali": {
    #     "base": "https://bengali.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["weather-home mt-5 mb-3"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "kannada": {
    #     "base": "https://kannada.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "odia": {
    #     "base": "https://odia.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["three-boxes"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "asomiya": {
    #     "base": "https://asomiya.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    #  },
     }



In [10]:
data = main()

Scraping data for language: hindi


In [11]:
data

{'hindi': {'Main Page Posts': [{'title': 'नई दिल्ली में नवोन्मेषी कृषक सम्मेलन आज',
    'link': 'https://www.krishakjagat.org/national-news/innovative-farmers-conference-in-new-delhi-today/',
    'content': '\n\n06 जून 2024, नई दिल्ली: नई दिल्ली में नवोन्मेषी कृषक सम्मेलन आज – भारतीय कृषि अनुसन्धान संस्थान द्वारा नवोन्मेषी कृषक सम्मेलन आज 6 \xa0जून को संस्थान के डॉ बी पी पाल सभागार में प्रातः 10 बजे से किया जा रहा है। \xa0इस अवसर पर चयनित किसानों को भाकृअप -नवोन्मेषी कृषक और भाकृअप – फेलो कृषक के रूप में सम्मानित किया जाएगा। इस आयोजन में \xa0सम्मानित किसानों के विचार विमर्श से किसानों के बीच नवाचारों के प्रसार को बढ़ावा मिलेगा तथा वैज्ञानिकों एवं छात्रों को गांवों के स्थानीय मुद्दों और नवाचार \xa0प्रणालियों को समझने में मदद मिलेगी।\n\nउल्लेखनीय है कि हर वर्ष पूसा संस्थान चयनित किसानों को कृषि में प्रमुख नवाचारों के सृजन, परिशोधन और प्रसार में उनके महत्वपूर्ण योगदान के लिए पूसा कृषि विज्ञान मेले के दौरान भाकृअप -नवोन्मेषी कृषक और भाकृअप – फेलो कृषक के रूप में सम्मानित करता है। \xa0चूँकि

In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def scrape_section_page(section_url, class_names):
    response = requests.get(section_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    section_posts = []
    for class_name in class_names:
        if class_name == "none":
            continue
        news_list_section = soup.find_all('div', class_=class_name)
        for news_div in news_list_section:
            news_links = news_div.find_all('a', href=True, title=True)
            for link in news_links:
                post_title = link['title'].strip()
                post_url = urljoin(section_url, link['href'])
                if is_valid_url(post_url):
                    section_posts.append({'title': post_title, 'url': post_url})
                else:
                    print(f"Invalid URL found: {post_url}")

    return section_posts

def scrape_post_content(post_url):
    response = requests.get(post_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Remove unwanted elements
    for auth_div in soup.find_all('div', class_='auth-name-dt'):
        auth_div.decompose()
    for author_div in soup.find_all('div', class_='h-author'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='col-md-4'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-mags'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-social'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-nav-item-info'):
        author_div.decompose()

    # Extract content
    paragraphs = soup.find_all('p')
    content = "\n\n".join([p.text.strip() for p in paragraphs])
    
    return content

def is_valid_url(url):
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

def main(language_urls):
    all_results = {}

    for lang, urls in language_urls.items():
        print(f"Scraping data for language: {lang}")
        
        base_url = urls["base"]
        weather_class = urls["weather_class"]
        khati_class = urls["khati_badi_class"]
        main_class = urls["main_class"]
        news_classes = urls["news_classes"]
        government_classes = urls["government_class"]
        
        # Scrape the main page
        main_posts = scrape_section_page(base_url, main_class)
        khabar_posts = scrape_section_page(base_url, news_classes)

        # Scrape the weather page if available
        weather_posts = scrape_section_page(base_url, weather_class)

        # Scrape the Khati Badi page
        khati_posts = scrape_section_page(base_url, khati_class)

        government_post = scrape_section_page(base_url, government_classes)

        # Set to track processed URLs
        processed_urls = set()

        # Dictionary to store the results for this language
        results = {
            "Main Page Posts": [],
            "Khabar Section Posts": [],
            "Weather Posts": [],
            "Khati Badi Posts": [],
            "government posts":[]
        }

        # Process main page posts
        for post in main_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Main Page Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])     

        # Process "Khabar" section posts
        for post in khabar_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Khabar Section Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        # Process weather posts
        for post in weather_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Weather Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        # Process Khati Badi posts
        for post in khati_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Khati Badi Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])
       
        for post in government_post:
            if "asomiya" in language_urls or "odia" in language_urls: 
                if post['url'] not in processed_urls:
                     if "https://asomiya.krishijagran.com/government-schemes" in post['url'] or "https://odia.krishijagran.com/government-schemes/" in post['url']:
                    #     content = scrape_post_content(post['url'])
                        results["government posts"].append({
                            "title": post['title'],
                            "link": post['url'],
                            "content": content
                        })
                        processed_urls.add(post['url'])
            else:          
                if post['url'] not in processed_urls:
                    if "government-schemes" in post['url']:
                        content = scrape_post_content(post['url'])
                        results["government posts"].append({
                            "title": post['title'],
                            "link": post['url'],
                            "content": content
                        })
                        processed_urls.add(post['url'])

        all_results[lang] = results

    return all_results

# Define the base URLs and class names for each language
language_urls = {
     "hindi": {
        "base": "https://hindi.krishijagran.com/",
        "main_class": ["home-top-l"],
        "news_classes": ["home-top-news-lst"],
        "weather_class": ["weather-home mt-5 mb-3"],
        "khati_badi_class": ["home-2-3-lst"],
        "government_class":["col-xs-12 col-sm-6 col-md-4 col-lg-4 cat-flex"]
    },
    # "english": {
    #     "base": "https://krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["home-2-3-lst"],
    #     "government_class":["none"]
    # },
    # "punjabi": {
    #     "base": "https://punjabi.krishijagran.com/",
    #     "main_class": ["none"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["home-top-l"],
    #     "khati_badi_class": ["three"],
    #     "government_class":["home-2-3-lst"]
    # },
    # "marathi": {
    #     "base": "https://marathi.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["weather-home mt-5 mb-3"],
    #     "khati_badi_class": ["none"],
    #     "government_base": "https://marathi.krishijagran.com/government-schemes/",
    #     "government_class":["row"],
  
    #  },
    # "tamil": {
    #     "base": "https://tamil.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["home-2-3-lst"],
    #     "government_class":["news-list-wide shadow-sm"]
    # },
    # "malayam": {
    #     "base": "https://malayalam.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "bengali": {
    #     "base": "https://bengali.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["weather-home mt-5 mb-3"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "kannada": {
    #     "base": "https://kannada.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "odia": {
    #     "base": "https://odia.krishijagran.com/",
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["three-boxes"],
    #     "government_class":["h-cat-lst shadow-sm"]
    # },
    # "asomiya": {
    #     "base": "https://asomiya.krishijagran.com/",++++++++++++++++++++++++++++++++++++++++
    #     "main_class": ["home-top-l"],
    #     "news_classes": ["home-top-news-lst"],
    #     "weather_class": ["none"],
    #     "khati_badi_class": ["none"],
    #     "government_class":["h-cat-lst shadow-sm"]
    #  },
}

# Start the scraping process and get the results
scraped_data = main(language_urls)
print(scraped_data)


Scraping data for language: hindi
{'hindi': {'Main Page Posts': [{'title': 'भारत की 5 सबसे लोकप्रिय लीची की किस्में, प्रति पेड़ से मिलेगी 100 किलो तक उपज', 'link': 'https://hindi.krishijagran.com/lekh/gardening/india-is-5-most-popular-litchi-varieties-per-tree-yield-up-to-100-kg/', 'content': 'Litchi Varieties: लीची की खेती मुख्य रुप से भारत, चीन, बांग्लादेश, थाईलैंड और वियतनाम जैसे देशों में की जाती है. लेकिन देश में लीची का सबसे अधिक उत्पादन बिहार में किया जाता है. बिहार में उगाई जाने वाली शाही लीची, कस्बा लीची, चायना लीची, लोंगिया लीची, बेदाना लीची और पूर्वी लीची देश की कुछ प्रमुख किस्मों में शामिल है.\n\nTop 5 Litchi Varieties: लीची खाना तो लगभग सभी को पंसद है, यह एक रसीला होने के साथ-साथ स्वादिष्ट फल भी है. इस फल की खेती मुख्य रुप से भारत, चीन, बांग्लादेश, थाईलैंड और वियतनाम जैसे देशों में की जाती है. लेकिन देश में लीची का सबसे अधिक उत्पादन बिहार में किया जाता है. बिहार में उगाई जाने वाली शाही लीची, कस्बा लीची, चायना लीची, लोंगिया लीची, बेदाना लीची और पूर्वी लीची देश की कुछ प्रमुख

In [14]:

def store_scraped_data(language_urls):
    scraped_data = main(language_urls)

    for lang, results in scraped_data.items():
        # Get or create the Language instance
        language, created = Language.objects.get_or_create(name=lang)

        for section, posts in results.items():
            for post in posts:
                content = post['content']
                post_instance = Post.objects.create(
                    title=post['title'],
                    link=post['url'],
                    content=content,
                    section=section,
                    language=language
                )

# Call the function to store the scraped data
store_scraped_data(language_urls)

{'hindi': {'Main Page Posts': [{'title': 'भारत की 5 सबसे लोकप्रिय लीची की किस्में, प्रति पेड़ से मिलेगी 100 किलो तक उपज',
    'link': 'https://hindi.krishijagran.com/lekh/gardening/india-is-5-most-popular-litchi-varieties-per-tree-yield-up-to-100-kg/',
    'content': 'Litchi Varieties: लीची की खेती मुख्य रुप से भारत, चीन, बांग्लादेश, थाईलैंड और वियतनाम जैसे देशों में की जाती है. लेकिन देश में लीची का सबसे अधिक उत्पादन बिहार में किया जाता है. बिहार में उगाई जाने वाली शाही लीची, कस्बा लीची, चायना लीची, लोंगिया लीची, बेदाना लीची और पूर्वी लीची देश की कुछ प्रमुख किस्मों में शामिल है.\n\nTop 5 Litchi Varieties: लीची खाना तो लगभग सभी को पंसद है, यह एक रसीला होने के साथ-साथ स्वादिष्ट फल भी है. इस फल की खेती मुख्य रुप से भारत, चीन, बांग्लादेश, थाईलैंड और वियतनाम जैसे देशों में की जाती है. लेकिन देश में लीची का सबसे अधिक उत्पादन बिहार में किया जाता है. बिहार में उगाई जाने वाली शाही लीची, कस्बा लीची, चायना लीची, लोंगिया लीची, बेदाना लीची और पूर्वी लीची देश की कुछ प्रमुख किस्मों में शामिल है. शाह

In [2]:

def scrape_post_content(post_url):
    response = requests.get(post_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Remove unwanted elements
    for auth_div in soup.find_all('div', class_='auth-name-dt'):
        auth_div.decompose()
    for author_div in soup.find_all('div', class_='h-author'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='col-md-4'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-mags'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-social'):
        author_div.decompose()
    for author_div in soup.find_all('div', class_='d-nav-item-info'):
        author_div.decompose()

    # Extract content
    paragraphs = soup.find_all('p')
    content = "\n\n".join([p.text.strip() for p in paragraphs])
    
    return content

def is_valid_url(url):
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

def main(language_urls):
    all_results = {}

    for lang, urls in language_urls.items():
        print(f"Scraping data for language: {lang}")
        
        base_url = urls["base"]
        weather_class = urls["weather_class"]
        khati_class = urls["khati_badi_class"]
        main_class = urls["main_class"]
        news_classes = urls["news_classes"]
        government_classes = urls["government_class"]
        
        # Scrape the main page
        main_posts = scrape_section_page(base_url, main_class)
        khabar_posts = scrape_section_page(base_url, news_classes)

        # Scrape the weather page if available
        weather_posts = scrape_section_page(base_url, weather_class)

        # Scrape the Khati Badi page
        khati_posts = scrape_section_page(base_url, khati_class)

        government_post = scrape_section_page(base_url, government_classes)

        # Set to track processed URLs
        processed_urls = set()

        # Dictionary to store the results for this language
        results = {
            "Main Page Posts": [],
            "Khabar Section Posts": [],
            "Weather Posts": [],
            "Khati Badi Posts": [],
            "government posts":[]
        }

        # Process main page posts
        for post in main_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Main Page Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])     

        # Process "Khabar" section posts
        for post in khabar_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Khabar Section Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        # Process weather posts
        for post in weather_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Weather Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])

        # Process Khati Badi posts
        for post in khati_posts:
            if post['url'] not in processed_urls:
                content = scrape_post_content(post['url'])
                results["Khati Badi Posts"].append({
                    "title": post['title'],
                    "link": post['url'],
                    "content": content
                })
                processed_urls.add(post['url'])
       
        for post in government_post:
            if "asomiya" in language_urls or "odia" in language_urls: 
                if post['url'] not in processed_urls:
                     if "https://asomiya.krishijagran.com/government-schemes" in post['url'] or "https://odia.krishijagran.com/government-schemes/" in post['url']:
                    #     content = scrape_post_content(post['url'])
                        results["government posts"].append({
                            "title": post['title'],
                            "link": post['url'],
                            "content": content
                        })
                        processed_urls.add(post['url'])
            else:          
                if post['url'] not in processed_urls:
                    if "government-schemes" in post['url']:
                        content = scrape_post_content(post['url'])
                        results["government posts"].append({
                            "title": post['title'],
                            "link": post['url'],
                            "content": content
                        })
                        processed_urls.add(post['url'])

        all_results[lang] = results

    return all_results



SyntaxError: invalid syntax (516851523.py, line 137)