In [None]:
from datasets import load_dataset

# Load only the first 1% of the train split of the 20231101.en Wikipedia config.
ds = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    split="train[:1%]",
)

# Show one record to see which fields each row carries.
print(ds[0])


In [None]:
import json

# Flatten the FEVER training claims labelled SUPPORTS into {id, url, title, text}
# records. One record is produced per evidence entry, and its "text" is the claim
# itself, so a claim with several evidence entries appears several times.
fever_unified = []

# Pin the encoding so reading the JSON Lines file does not depend on the
# platform's default codec.
with open("train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads

        item = json.loads(line)
        if item['label'] != "SUPPORTS":
            continue

        for eg_idx, evidence_group in enumerate(item['evidence']):
            for ev_idx, ev in enumerate(evidence_group):
                # ev[2] is used as the article title; a missing title becomes "NA".
                article_title = ev[2] if ev[2] is not None else "NA"
                passage_id = f"fever_{item['id']}_{eg_idx}_{ev_idx}"
                entry = {
                    "id": passage_id,
                    "url": f"https://en.wikipedia.org/wiki/{article_title}" if article_title != "NA" else "NA",
                    "title": article_title,
                    "text": item['claim']
                }
                fever_unified.append(entry)

print(f"Total SUPPORTS entries: {len(fever_unified)}")


In [None]:
fever_unified

In [None]:
import pandas as pd
fev_ds = pd.DataFrame(fever_unified)

In [None]:
ds_wiki = ds.to_pandas()

In [None]:
ds_wiki.head()

In [None]:
df_ret = pd.concat([ds_wiki, fev_ds])

In [None]:
df_ret

In [None]:
df_ret_texts_list = df_ret['text'].to_list()

In [None]:
type(df_ret_texts_list)

In [None]:
from dotenv import load_dotenv
import os
import requests

load_dotenv()
API_key = os.getenv("GOOGLE_FACT_CHECK_API")


url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

params = {
    'key': API_key,
    'query': 'covid vaccine', 
    'languageCode': 'en-US',
    'pageSize': 100
}

response = requests.get(url,params=params)
len(response.json()['claims'])

In [None]:
response.json()['claims'][0]
i=0
for claim in response.json()['claims']:
    # print(claim.get('claimReview')[0].get('title',''))
    print(claim
          )
    i+=1
    if i==4:
        break



In [None]:
def fetch_google_facts(query,num_iter = 1, pages = 100):
    """Fetch fact-checked claims from the Google Fact Check Tools ``claims:search`` endpoint.

    Args:
        query: Search text sent as the ``query`` parameter.
        num_iter: Maximum number of result pages to request.
        pages: Value sent as ``pageSize``.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``url``,
        ``Published_Date`` and ``source``. The review-related fields come from
        the first ``claimReview`` entry of each claim; when a claim has none,
        the fields fall back to their defaults instead of aborting the fetch.
        Request or processing errors are printed and end the paging loop; the
        records collected so far are still returned.
    """
    load_dotenv()
    API_key_google = os.getenv("GOOGLE_FACT_CHECK_API")

    url_google = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    ds=[]
    next_page_token = None
    for _ in range(num_iter):

        params = {
        'key': API_key_google,
        'query': query,
        'languageCode': 'en-US',
        'pageSize': pages
        }

        if next_page_token:
            params['pageToken'] = next_page_token

        try:
            # The timeout prevents a stalled connection from blocking forever.
            response = requests.get(url_google, params=params, timeout=30)
            response.raise_for_status()
            payload = response.json()  # parse the body once per page

            # 'claims' may be missing when nothing matches; treat that as empty.
            for claim in payload.get('claims') or []:
                reviews = claim.get('claimReview') or []
                # A claim without any review keeps its text; review fields use defaults.
                review = reviews[0] if reviews else {}
                date = (review.get('reviewDate') or '').split('T')[0]
                ds.append(
                    {'title':review.get('title',''),
                     'text':claim.get('text',''),
                     'url':review.get('url',''),
                     'Published_Date':date if date else "No date available",
                     'source':(review.get('publisher') or {}).get('name',"No source available")
                     })

            next_page_token = payload.get('nextPageToken')
            if not next_page_token:
                break

        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            break
        except Exception as e:
            print(f"Error processing response: {e}")
            break

    print(f"Google fetched {len(ds)} articles")
    return ds


In [None]:
# Fetch fact checks for two queries (one page each, using the function defaults).
ds1 = fetch_google_facts('indian government')
# Echoes the complete result list together with its length.
ds1,len(ds1)

In [None]:
ds2 = fetch_google_facts('india')
len(ds2)

In [None]:
# NOTE(review): this rebinds `ds`, which the first cell assigned to the loaded
# Wikipedia dataset. Re-running `ds.to_pandas()` after this cell would hit this
# plain list instead. Consider a distinct name.
ds = ds1 + ds2
len(ds)

In [None]:
import pandas as pd
# Tabulate the combined fact-check records and preview the first rows.
df = pd.DataFrame(ds)
df.head()

In [None]:

url_news_api = "https://newsapi.org/v2/everything"

load_dotenv()
api_key_news = os.getenv('NEWS_API')

sort_by  =['relevancy', 'popularity', 'publishedAt']
params = {
    "apiKey":api_key_news,
    "q":'indian actors',
    # The documented parameter name is "sortBy" (as fetch_news_org uses below);
    # the previous "sortBY" spelling is not a documented parameter.
    "sortBy":sort_by[0],
    "pageSize":69
}

# The timeout keeps a stalled connection from hanging the kernel.
res = requests.get(url_news_api,params=params, timeout=30)
res.json().get('articles')

In [None]:
# Keep the article list from the previous request for inspection.
arl=res.json().get('articles')
# The next two expressions are evaluated but not displayed: a cell only echoes
# its last expression, which here is the full JSON payload.
arl[0]
len(arl)
res.json()

In [None]:
def fetch_news_org(query: str, page_size: int = 100, num_iter: int = 12, sort_by_index: int = 0):
    """Fetch articles from the NewsAPI ``/v2/everything`` endpoint, page by page.

    Args:
        query: Search text sent as ``q``.
        page_size: Requested results per page; capped at 100 before sending.
        num_iter: Maximum number of pages to request (pages start at 1).
        sort_by_index: Index into ``["relevancy", "popularity", "publishedAt"]``;
            an out-of-range index falls back to 0.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``url``, ``source``
        and ``Published_Date``. Articles without ``content`` are skipped.
        ``text`` is the content followed by the description, separated by a
        space when both are present.

    Raises:
        ValueError: If the ``NEWS_API`` environment variable is not set.
    """
    url_news_api = "https://newsapi.org/v2/everything"

    load_dotenv()
    api_key_news = os.getenv("NEWS_API")
    if not api_key_news:
        raise ValueError("NEWS_API environment variable not set")

    sort_by = ["relevancy", "popularity", "publishedAt"]
    if sort_by_index < 0 or sort_by_index >= len(sort_by):
        sort_by_index = 0

    news_ds = []
    # Articles returned by the API so far, including those skipped for having no
    # content. The stop condition must compare this count, not len(news_ds),
    # against totalResults; otherwise one skipped article keeps it from firing.
    seen = 0

    for page in range(1, num_iter + 1):
        params = {
            "apiKey": api_key_news,
            "q": query,
            "sortBy": sort_by[sort_by_index],
            "pageSize": min(page_size, 100),
            "page": page,
        }

        try:
            res = requests.get(url_news_api, params=params, timeout=30)
            res.raise_for_status()
            payload = res.json()  # parse the body once per page

            if payload.get("status") == "ok":
                print(f"status | {payload['status']}")

            data = payload.get("articles")
            if not data:
                print(f"Could not find any article at page {page}")
                break

            seen += len(data)
            for article in data:
                content = article.get("content")
                if not content:
                    continue

                description = article.get("description") or ""
                news_ds.append({
                    "title": article.get("title", ""),
                    # Join with a space so the last word of the content is not
                    # fused with the first word of the description.
                    "text": " ".join(part for part in (content, description) if part),
                    "url": article.get("url", ""),
                    "source": (article.get("source") or {}).get("name", "No source available"),
                    "Published_Date": article.get("publishedAt", ""),
                })

            total_results = payload.get("totalResults", 0)
            if seen >= total_results:
                print(f"No More Results | reached {total_results} Results")
                break

        except requests.exceptions.RequestException as e:
            print(f"Request error on page(iteration) {page}: {e}")
            break
        except Exception as e:
            print(f"Unexpected error on page(iter) {page}: {e}")
            break

    print(f"Fetched {len(news_ds)} news articles")
    return news_ds


In [None]:
from dotenv import load_dotenv
import os
# Positional arguments: query, page_size=500, num_iter=100, sort_by_index=1
# ("popularity"). fetch_news_org sends min(page_size, 100) as pageSize.
news_db = fetch_news_org('conjuring',500,100,1)

In [None]:
news_db[0]['text']

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Browser-like request headers used for the NDTV archive request below.
# NOTE(review): "Accept-Encoding" advertises "br". Presumably requests can only
# decode Brotli when a Brotli library is installed; confirm, or drop "br".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.ndtv.com/topics",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}



# NOTE(review): `url` and `response` rebind names that the Fact Check probe cell
# assigned earlier.
url="https://archives.ndtv.com/articles/2025-01.html"

response  = requests.get(url,headers=headers)
# Only the HTTP status code is inspected here.
response.status_code

In [None]:

def get_ndtv_rss_feeds():
    """Collect the items of a fixed set of NDTV RSS feeds.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``text`` (the item's
        description), ``Published_Date`` and ``source`` (always ``'NDTV RSS'``).
        A feed that cannot be fetched or parsed is reported and skipped.
    """
    # Local import: the standard-library XML parser, used instead of an HTML parser.
    import xml.etree.ElementTree as ET

    rss_feeds = {
        'top_stories': 'https://feeds.feedburner.com/ndtvnews-top-stories',
        'india': 'https://feeds.feedburner.com/ndtvnews-india-news',
        'world': 'https://feeds.feedburner.com/ndtvnews-world-news',
        'sports': 'https://feeds.feedburner.com/ndtvnews-sports',
        'entertainment': 'https://feeds.feedburner.com/ndtvnews-entertainment',
        'business': 'https://feeds.feedburner.com/ndtvnews-business'
    }

    all_articles = []

    for category, url in rss_feeds.items():
        try:
            print(f"Fetching RSS feed: {category}")
            response = requests.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Failed to fetch RSS feed: {response.status_code}")
                continue

            # RSS is XML and must be parsed as XML. An HTML parser lower-cases tag
            # names, so a lookup for 'pubDate' never matches, and it treats <link>
            # as a void element, so the link text is lost. Both fields came out
            # empty before.
            # NOTE: ElementTree is not hardened against maliciously crafted XML;
            # acceptable here because the feed URLs are fixed.
            root = ET.fromstring(response.content)

            for item in root.iter('item'):
                title = item.findtext('title')
                link = item.findtext('link')
                # Keep only items that have both a title and a link element.
                if title is None or link is None:
                    continue

                all_articles.append({
                    'title': title.strip(),
                    'url': link.strip(),
                    'text': (item.findtext('description') or '').strip(),
                    'Published_Date': (item.findtext('pubDate') or '').strip(),
                    'source': 'NDTV RSS'
                })

        except Exception as e:
            # Best effort per feed: report the failure (including XML parse
            # errors) and move on to the next feed.
            print(f"Error processing RSS feed {category}: {e}")

    print(f"Found {len(all_articles)} articles from RSS feeds")
    return all_articles

# Run the RSS collection and tabulate the resulting records.
rss_articles = get_ndtv_rss_feeds()
rss_df = pd.DataFrame(rss_articles)

In [None]:
# Inspect the first RSS record (raises IndexError if nothing was fetched).
rss_df.iloc[0]

In [None]:
# Exploratory cell: fetch the DD News archive listing and look at the first
# story container to learn its markup.
xml_url = "https://ddnews.gov.in/all-news-archive/" 

res = requests.get(xml_url)

# Not displayed: only the last expression of the cell (`a`) is echoed.
res
soup = BeautifulSoup(res.text , 'html.parser')
a=soup.find('div',class_= 'moreStoriesItem')
# url = a.a['href']
# a.img['alt']
# type(url)
# # res = requests.get(url)
# # res
# url
# soup.find_all('div')
a

In [None]:
# Text of the story's 'moreStoriesText' div, up to the first '|'.
a.find('div',class_='moreStoriesText').text.strip().split('|')[0]

In [None]:
# Second parse of the same archive response under a different name.
soap = BeautifulSoup(res.text , 'html.parser')

In [None]:
def fetch_dd_news_ugly(max_stories = 10):
    """Scrape the DD News archive listing and the body text of each listed story.

    Args:
        max_stories: Maximum number of stories to collect. Values below 1
            collect nothing.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``url``,
        ``Published_Date`` and ``source``. When an article page cannot be
        fetched or parsed, its record is still added with an empty ``text``.
    """
    dd_url = "https://ddnews.gov.in/all-news-archive/"
    d=[]
    try:
        # Timeouts keep a stalled connection from blocking the notebook.
        res = requests.get(dd_url, timeout=15)
        res.raise_for_status()

        soup = BeautifulSoup(res.text , 'html.parser')
        stories=soup.find_all('div',class_= 'moreStoriesItem')

        for idx,story in enumerate(stories):
            # Check the cap before doing any work for this story.
            if idx >= max_stories:
                break

            article_url  = story.a['href']
            title = story.img['alt']
            publishedtime = story.find('div',class_='moreStoriesText').text.strip().split('|')[0]

            # Reset per story. Previously a failed fetch left this name unbound
            # (first story) or still holding the previous story's text.
            full_para = ""
            try:
                print(f'Fetching article {idx+1}')
                res_article = requests.get(article_url, timeout=15)
                res_article.raise_for_status()

                soap = BeautifulSoup(res_article.text , 'html.parser')
                paras = soap.find('div',class_ = 'entry-content').find_all('p')

                full_para = "".join(para.text for para in paras)
            except requests.exceptions.RequestException as e:
                # Use the same 1-based numbering as the progress message above.
                print(f'Could not fetch article {idx+1}')
            except Exception as e:
                print(e)

            d.append({
                'title':title,
                'text':full_para,
                'url':article_url,
                'Published_Date':publishedtime,
                'source':"DD  News"

                })

    except requests.exceptions.RequestException as e:
            print('Could not fetch the link',e)
    except Exception as e:
            print(e)

    print(f"Total Articles Fetched {len(d)}")

    return d

         


In [None]:
from urllib.parse import urljoin
import time

def fetch_dd_news_fixed(max_stories=10, delay=1):
    """Scrape the DD News archive listing and the body text of each listed story.

    Args:
        max_stories: Maximum number of listing entries to process.
        delay: Seconds to sleep before each article request.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``url``,
        ``Published_Date`` and ``source``. A story whose page cannot be fetched
        or parsed is reported and left out.
    """
    base_url = "https://ddnews.gov.in"
    dd_url = "https://ddnews.gov.in/all-news-archive/"
    articles = []

    try:
        # Same timeout as the article requests, so the listing call cannot hang.
        res = requests.get(dd_url, timeout=15)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        stories = soup.find_all('div', class_='moreStoriesItem')

        for idx, story in enumerate(stories):
            if idx >= max_stories:
                break

            try:
                # Resolve the link against the site root in case it is relative.
                article_url = urljoin(base_url, story.a['href']) if story.a else None
                title = story.img.get('alt', 'No title') if story.img else 'No title'

                # Look the metadata div up once; the date is the text before the first '|'.
                meta = story.find('div', class_='moreStoriesText')
                publishedtime = meta.text.strip().split('|')[0] if meta else "Unknown date"

                if not article_url:
                    continue

                # Fetch article page
                time.sleep(delay)  # be nice to server
                res_article = requests.get(article_url, timeout=15)
                res_article.raise_for_status()
                soap = BeautifulSoup(res_article.text, 'html.parser')

                # Prefer a known content container; otherwise fall back to the
                # text of the whole page.
                content_div = soap.find('div', class_='entry-content') or soap.find('div', class_='article-content')
                if content_div:
                    paras = content_div.find_all('p')
                    full_para = " ".join(p.get_text(strip=True) for p in paras)
                else:
                    full_para = soap.get_text(strip=True)

                articles.append({
                    'title': title,
                    'text': full_para,
                    'url': article_url,
                    'Published_Date': publishedtime,
                    'source': "DD News"
                })

                print(f"Fetched article {idx+1}: {title[:50]}...")

            except Exception as e:
                print(f"Error fetching article {idx+1}: {e}")
                continue

    except Exception as e:
        print(f"Error fetching archive: {e}")

    # The check mark was stored as mis-decoded UTF-8 bytes; restore the character.
    print(f"✅ Total Articles Fetched: {len(articles)}")
    return articles


In [None]:
# Fetch three stories with the first scraper and tabulate them.
x = fetch_dd_news_ugly(3)
dd_db = pd.DataFrame(x)
dd_db

In [None]:
# Same with the second scraper. `x` is rebound here, while `dd_db` above keeps
# the first scraper's rows.
x = fetch_dd_news_fixed(3)
dd_db_a = pd.DataFrame(x)
dd_db_a

In [None]:
def striping(example):
    """Return ``example`` with every non-breaking space (U+00A0) turned into a plain space."""
    return example.replace('\xa0', ' ')

In [None]:
dd_db['text'][0]

In [None]:
dd_db['title'] = dd_db['title'].apply(lambda x : striping(x) )
dd_db['text'] = dd_db['text'].apply(lambda x : striping(x) )

In [None]:
dd_db['title'][0]
dd_db['text'][0]

In [None]:
import json
# def fetch_google_facts(query,num_iter = 1, pages = 100):
# def fetch_dd_news_fixed(max_stories=10, delay=1):
# def fetch_news_org(query:str , page_size:int = 100,num_iter = 12,sort_by_index:int = 0):
# df_ret --> wiki dataset

class fetch_all:
    """Accumulates article records from the notebook's fetchers in one list.

    Each source method appends the records it obtains to ``self.articles``;
    ``to_pandas`` and ``to_json`` export whatever has been collected so far.
    """

    def __init__(self, num_pages = 100 , num_iter = 1):
        """Store the paging settings passed on to the API-based fetchers.

        Args:
            num_pages: Passed as page size to the Google Fact Check and NewsAPI fetchers.
            num_iter: Passed as the number of pages/iterations to those fetchers.
        """
        self.num_pages  =num_pages
        self.num_iter  = num_iter
        self.articles  = []

    def dd_news(self,max_articles):
        """Append up to ``max_articles`` DD News stories; failures are printed, not raised."""
        print("DD News fetching, Just Pray their Server Dont die")
        try:
            self.articles.extend(fetch_dd_news_fixed(max_articles))
        except Exception  as e:
            print("DD News failed | ", e)

    def google(self,query = 'india'):
        """Append Google Fact Check results for ``query``."""
        print("Google Fetching")
        self.articles.extend(fetch_google_facts(query , self.num_iter , self.num_pages))

    def news_org(self, query = 'india' ,sort_idx = 0):
        """Append NewsAPI results for ``query`` using sort option ``sort_idx``."""
        print("News API fetching")
        self.articles.extend(fetch_news_org(query=query , page_size=self.num_pages , num_iter=self.num_iter , sort_by_index=sort_idx))

    def wiki(self, wiki_df=None):
        """Append the rows of a DataFrame as records.

        Args:
            wiki_df: Frame to add. When omitted, the notebook-level ``df_ret``
                is used, as before; passing the frame explicitly removes the
                dependency on that global having been defined.
        """
        print("Wiki fetching")
        frame = df_ret if wiki_df is None else wiki_df
        self.articles.extend(frame.to_dict('records'))

    def to_pandas(self):
        """Return the collected records as a DataFrame."""
        return pd.DataFrame(self.articles)

    def ndtv(self):
        """Append the items of the NDTV RSS feeds."""
        self.articles.extend(get_ndtv_rss_feeds())

    def to_json(self):
        """Return the collected records as a JSON string."""
        # default=str: values the encoder cannot serialise natively (for example
        # date or timestamp objects) are stringified instead of raising TypeError.
        return json.dumps(self.articles, default=str)
    
    



In [None]:
# num_pages=100, num_iter=500: both values are forwarded to fetch_news_org.
db = fetch_all(100,500)
db.news_org('india',0)
db.wiki()
# db.google('indian')
db.ndtv()
# Snapshot before the DD News stories are added; both names are rebound below.
df = db.to_pandas()
djson = db.to_json()
db.dd_news(10000)
df = db.to_pandas()
djson = db.to_json()

In [None]:
# NOTE(review): dd_news extends db.articles each time it runs, so this second
# call can append the same stories again. `df` was built before this call and
# does not include them.
db.dd_news(10000)


In [None]:
df.shape

In [None]:
# Exploratory cells: request one Google News result page and probe its markup.
# NOTE(review): `headers`, `params`, `res` and `soup` rebind names used by
# earlier cells.
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/115.0.0.0 Safari/537.36"
                      }

# tbm=nws selects the news tab; start=10 is the result offset.
params = {
    "q":'india',
    "tbm":"nws",
    'start':10
    }

search_url = 'https://www.google.com/search'

res = requests.get(search_url,params=params,headers=headers)

res

In [None]:
soup = BeautifulSoup(res.text,'html.parser')

In [None]:
# First result container.
t= soup.find('div',class_='SoaBEf')

In [None]:
# First link inside a result container; the cells below read its parts.
m=soup.select_one("div.SoaBEf a")
# for ar in m:
#     p =ar.find('div',class_= "n0jPhd ynAwRc MBeuO nDgy9d").text
#     n = m.find('div',class_ = "GI74Re nDgy9d").text
#     title  =p+n
#     url = 
m


In [None]:
# fetch_google_search concatenates this div's text and the next one into the title.
m.find('div', class_ = "n0jPhd ynAwRc MBeuO nDgy9d").text

In [None]:
m.find('div',class_ = "GI74Re nDgy9d").text

In [None]:
# fetch_google_search stores this div's text as the published date.
m.find('div',class_="OSrXXb rbYSKb LfVVr").text

In [None]:
# fetch_google_search stores this div's text as the source.
m.find('div',class_ = "MgUUmf NUnG9d").text

In [None]:
arc_url = m['href']
arc_url

In [None]:
import trafilatura as tra

# Download the linked article and extract its text; each step falls back to a
# placeholder string when it yields nothing.
down = tra.fetch_url(arc_url)
content = tra.extract(down) if down else "none extracted"
content = content if content else "No content extracted"

content


In [None]:
# Fetch the same article directly with requests for comparison.
w = requests.get(arc_url,headers=headers)
w

In [None]:
soap = BeautifulSoup(w.text,'html.parser')


In [None]:
# Raises AttributeError if the page has no <h2>.
soap.find('h2').text

In [None]:
def fetch_google_search(query:str = 'india',num_pages:int = 1):
    """Scrape Google News result pages and extract each linked article with trafilatura.

    Args:
        query: Search text.
        num_pages: Number of result pages to request (``start`` advances by 10 per page).

    Returns:
        A list of dicts with the keys ``title`` (headline followed by the
        snippet), ``url``, ``text``, ``pblished_date`` and ``source``. A result
        part that is missing from the markup becomes an empty string instead of
        aborting the rest of the page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/115.0.0.0 Safari/537.36"
                      }
    search_url = 'https://www.google.com/search'

    def _div_text(node, css_class):
        """Return the text of the first div with ``css_class`` under ``node``, or '' if absent."""
        found = node.find('div', class_=css_class)
        return found.text if found else ''

    articles_gg= []
    for pages in range(num_pages):
        params = {
            "q":query,
            "tbm":"nws",
            'start':pages * 10
            }

        try:
            # The timeout prevents a hang; raise_for_status turns HTTP errors
            # (e.g. rate limiting) into the RequestException handled below.
            res = requests.get(search_url,params=params,headers=headers, timeout=15)
            res.raise_for_status()
            soup = BeautifulSoup(res.text,'html.parser')

            article_list=soup.select("div.SoaBEf a")
            if not article_list:
                print("None Articles found")
            for article in article_list:
                a_url = article.get('href')
                if not a_url:
                    continue  # nothing to fetch for a link without a target

                title = _div_text(article, "n0jPhd ynAwRc MBeuO nDgy9d") + _div_text(article, "GI74Re nDgy9d")
                # Named `published` rather than `time` to avoid shadowing the time module name.
                published = _div_text(article, "OSrXXb rbYSKb LfVVr")
                source  = _div_text(article, "MgUUmf NUnG9d")

                try:
                    down = tra.fetch_url(a_url)
                    content = tra.extract(down) if down else "none extracted"
                    content = content if content else "No content extracted"
                except Exception as e:
                    content = f"Error: {e}"

                articles_gg.append({
                    "title":title,
                    'url':a_url,
                    'text':content,
                    # NOTE(review): key is misspelled ('pblished_date'). Kept so
                    # existing consumers are not broken; other fetchers use
                    # 'Published_Date' / 'published_date'.
                    'pblished_date':published,
                    'source':source
                })

        except requests.exceptions.RequestException as e:
            print(f"Error Fething Google search | {e}")

        except Exception as e:
            print(f"Unforseen Error | {e}")

    return articles_gg

In [None]:
# Scrape 30 result pages for the default query and tabulate the records.
s = fetch_google_search(num_pages=30)
dc = pd.DataFrame(s)
dc

In [None]:
# Count rows whose text is the placeholder used when the download returned nothing.
dc[dc['text']=='none extracted'].shape

In [None]:
len(db.articles)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from urllib.parse import urlparse, quote_plus
from datetime import datetime

def fetch_google_search_better(query: str = 'india', num_pages: int = 1, delay: float = 2.0):
    """Scrape Google News result pages and fetch a text excerpt for each hit.

    Args:
        query: Search text.
        num_pages: Number of result pages to request (``start`` advances by 10 per page).
        delay: Base pause in seconds before each search request; up to one
            second of random jitter is added.

    Returns:
        A list of dicts with the keys ``title`` (headline followed by the result
        snippet), ``url``, ``text``, ``published_date`` and ``source``.
        ``text`` is the first five paragraphs of the first matching content
        container, otherwise the first 200 words of the page. It stays
        ``"No content extracted"`` when the article request does not return 200,
        and becomes the snippet (or an error message) when the request fails.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    ]

    # Selector lists do not change between pages/articles, so build them once.
    # Result containers are tried in order; the first selector that matches wins.
    article_selectors = [
        "div.SoaBEf",
        "div.dbsr",
        "div.g",
        "div.tNxQIb",
    ]
    content_selectors = [
        'article',
        'div.article-content',
        'div.story-content',
        'div.entry-content',
        'div.post-content',
        'div[class*="content"]',
        'div[class*="body"]',
        'main'
    ]

    articles_gg = []
    search_url = 'https://www.google.com/search'

    for page in range(num_pages):
        params = {
            "q": query,
            "tbm": "nws",
            'start': page * 10,
            "hl": "en"
        }

        headers = {
            "User-Agent": random.choice(user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            # 'br' is deliberately not advertised: Brotli bodies can only be
            # decoded when an optional Brotli library is installed, and an
            # undecoded body makes every selector below come up empty.
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }

        try:
            time.sleep(delay + random.uniform(0, 1))

            res = requests.get(search_url, params=params, headers=headers, timeout=15)
            res.raise_for_status()

            soup = BeautifulSoup(res.text, 'html.parser')

            article_list = []
            for selector in article_selectors:
                article_list = soup.select(selector)
                if article_list:
                    break

            if not article_list:
                print(f"No articles found on page {page + 1}. Google may have blocked the request or changed their HTML structure.")
                continue

            print(f"Found {len(article_list)} articles on page {page + 1}")

            for article in article_list:
                try:
                    title_elem = article.select_one(".n0jPhd, .ynAwRc, .MBeuO, .nDgy9d, [role='heading']")
                    title = title_elem.get_text().strip() if title_elem else "No title"

                    link_elem = article.find('a')
                    a_url = link_elem.get('href') if link_elem else None

                    if not a_url:
                        continue

                    # Unwrap Google's redirect links of the form /url?q=<target>&...
                    if a_url.startswith('/url?q='):
                        a_url = a_url.split('/url?q=')[1].split('&')[0]
                        a_url = requests.utils.unquote(a_url)

                    time_elem = article.select_one(".OSrXXb, .rbYSKb, .LfVVr, .ZE0LJd")
                    time_text = time_elem.get_text().strip() if time_elem else "Unknown date"

                    source_elem = article.select_one(".MgUUmf, .NUnG9d, .IH8C7b")
                    source = source_elem.get_text().strip() if source_elem else "Unknown source"

                    snippet_elem = article.select_one(".GI74Re, .Y3v8qd, .l3AOke")
                    snippet = snippet_elem.get_text().strip() if snippet_elem else ""

                    content = "No content extracted"
                    try:
                        article_res = requests.get(a_url, headers=headers, timeout=10)
                        if article_res.status_code == 200:
                            article_soup = BeautifulSoup(article_res.text, 'html.parser')

                            for content_selector in content_selectors:
                                content_elem = article_soup.select_one(content_selector)
                                if content_elem:
                                    paragraphs = content_elem.find_all('p')
                                    if paragraphs:
                                        content = " ".join([p.get_text().strip() for p in paragraphs[:5]])  # First 5 paragraphs
                                        break

                            if content == "No content extracted":
                                all_text = article_soup.get_text()
                                content = " ".join(all_text.split()[:200])  # First 200 words
                    except Exception as e:
                        # Prefer the search snippet over an error message when one exists.
                        content = snippet if snippet else f"Error fetching content: {str(e)}"

                    articles_gg.append({
                        "title": title + snippet,
                        'url': a_url,
                        'text': content,
                        'published_date': time_text,
                        'source': source,
                    })

                except Exception as e:
                    print(f"Error processing article: {e}")
                    continue

        except requests.exceptions.RequestException as e:
            print(f"Request error on page {page + 1}: {e}")
            # Stop when Google signals blocking. Check the actual status code
            # rather than searching the message text, which also contains the URL.
            blocked_response = getattr(e, "response", None)
            status = blocked_response.status_code if blocked_response is not None else None
            if status in (429, 430):
                print("Google is blocking requests. Stopping.")
                break
        except Exception as e:
            print(f"Unexpected error on page {page + 1}: {e}")

    print(f"Total articles fetched from Google Search: {len(articles_gg)}")
    return articles_gg



In [None]:
# Scrape 30 result pages for 'indian' and tabulate the records.
z = fetch_google_search_better('indian',30)
dz=pd.DataFrame(z)
dz

In [None]:
# Count rows whose text is still the initial placeholder string.
dz[dz['text']=="No content extracted"].shape

In [None]:

# def fetch_google_facts(query,num_iter = 1, pages = 100):
# def fetch_dd_news_fixed(max_stories=10, delay=1):
# def fetch_news_org(query:str , page_size:int = 100,num_iter = 12,sort_by_index:int = 0):
# df_ret --> wiki dataset
# def fetch_google_search_better(query: str = 'india', num_pages: int = 1, delay: float = 2.0):



In [None]:
import json
import pandas as pd
import hashlib

class DataAggregator:
    """Collect article dicts from several sources and split their text into chunks.

    Each source method appends the fetched records to ``self.articles`` and
    returns ``self`` so calls can be chained. A failing source prints its error
    instead of raising, so one bad source does not abort a chain.
    """

    def __init__(self, num_pages=100, num_iter=1):
        """Store the paging settings that are forwarded to the fetchers.

        Args:
            num_pages: Forwarded to ``fetch_google_facts``, ``fetch_news_org``
                and ``fetch_google_search_better``.
            num_iter: Forwarded to ``fetch_google_facts`` and ``fetch_news_org``.
        """
        self.num_pages = num_pages
        self.num_iter = num_iter
        self.articles = []
        self.chunks = []

    def _collect(self, name, fetch, added_name=None):
        """Run ``fetch()`` and append its records, printing progress and errors.

        Args:
            name: Label used in the "Fetching ..." and "... error" messages.
            fetch: Zero-argument callable returning a list of article dicts.
                It is called inside the ``try`` so that any failure, including
                a missing fetcher function, is reported rather than raised.
            added_name: Label for the "Added ..." message when it differs
                from ``name``.

        Returns:
            ``self``, for chaining.
        """
        print(f"Fetching {name}")
        try:
            articles = fetch()
            self.articles.extend(articles)
            print(f"Added {len(articles)} {added_name or name} articles")
        except Exception as e:
            print(f"{name} error: {e}")
        return self

    def dd_news(self, max_articles=20):
        """Append the records returned by ``fetch_dd_news_fixed(max_articles)``."""
        return self._collect("DD News", lambda: fetch_dd_news_fixed(max_articles))

    def google_facts(self, query="india"):
        """Append the records returned by ``fetch_google_facts`` for ``query``."""
        return self._collect(
            "Google Facts",
            lambda: fetch_google_facts(query, self.num_iter, self.num_pages),
            added_name="Google Fact Check",
        )

    def news_org(self, query="india", sort_idx=0):
        """Append the records returned by ``fetch_news_org`` for ``query``."""
        return self._collect(
            "News API",
            lambda: fetch_news_org(query, self.num_pages, self.num_iter, sort_idx),
        )

    def wiki(self, wiki_df):
        """Append every row of ``wiki_df`` (a DataFrame) as one article dict."""
        return self._collect("Wikipedia", lambda: wiki_df.to_dict('records'))

    def search(self, query="india"):
        """Append the records returned by ``fetch_google_search_better`` for ``query``."""
        return self._collect(
            "Google Search",
            lambda: fetch_google_search_better(query, self.num_pages),
        )

    def ndtv(self):
        """Append the records returned by ``get_ndtv_rss_feeds()``."""
        return self._collect("NDTV RSS", lambda: get_ndtv_rss_feeds())

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split ``text`` into overlapping windows of whitespace-separated words.

        Args:
            text: Text to split. ``None`` and NaN yield no chunks; any other
                non-string value is converted with ``str()``.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Words shared by consecutive chunks; must satisfy
                ``0 <= overlap < chunk_size``.

        Returns:
            List of chunk strings, empty when there are no words.

        Raises:
            ValueError: If the sizes would make the window stall or move
                backwards (previously an endless loop).
        """
        if chunk_size <= 0 or not 0 <= overlap < chunk_size:
            raise ValueError(
                "chunk_size must be positive and overlap must satisfy "
                "0 <= overlap < chunk_size"
            )
        # `text != text` is only true for NaN, which pandas uses for missing cells.
        if text is None or (isinstance(text, float) and text != text):
            return []

        words = str(text).split()
        step = chunk_size - overlap
        return [
            " ".join(words[start:start + chunk_size])
            for start in range(0, len(words), step)
        ]

    def create_chunks(self, chunk_size=500, overlap=50):
        """Rebuild ``self.chunks`` from the ``'text'`` of every article.

        Each chunk record is a copy of its article (so it keeps the full
        ``'text'`` too) plus ``chunk_text``, ``chunk_id`` (``"<url>_<n>"``),
        ``is_chunk``, ``chunk_number`` and ``total_chunks``. Articles with
        empty or missing text are skipped.

        Args:
            chunk_size: Forwarded to ``chunk_text``.
            overlap: Forwarded to ``chunk_text``.

        Returns:
            ``self``, for chaining.

        Raises:
            ValueError: Propagated from ``chunk_text`` for invalid sizes.
        """
        chunks = []
        for article in self.articles:
            text = article.get('text', "")
            if not text:
                continue
            text_chunks = self.chunk_text(text, chunk_size, overlap)

            for i, piece in enumerate(text_chunks):
                chunk_data = article.copy()
                chunk_data['chunk_text'] = piece
                chunk_data['chunk_id'] = f"{article.get('url', '')}_{i}"
                chunk_data['is_chunk'] = True
                chunk_data['chunk_number'] = i
                chunk_data['total_chunks'] = len(text_chunks)
                chunks.append(chunk_data)

        # Assign only after a full pass so a failure keeps the previous chunks.
        self.chunks = chunks
        print(f"Created {len(self.chunks)} chunks from {len(self.articles)} articles")
        return self

    def get_chunks_df(self):
        """Get chunks as DataFrame"""
        return pd.DataFrame(self.chunks)

    def get_chunks_json(self):
        """Get chunks as JSON"""
        return json.dumps(self.chunks)

    def save_chunks(self, filename, format="json"):
        """Write the chunks to ``<filename>_chunks.<format>``.

        Args:
            filename: Path prefix; ``_chunks.<format>`` is appended.
            format: ``"json"``, ``"csv"`` or ``"pkl"``.

        Returns:
            ``self``, for chaining. Errors, including an unsupported format,
            are printed rather than raised.
        """
        if not self.chunks:
            print("No chunks available. Run create_chunks() first.")
            return self

        try:
            if format == "json":
                with open(f"{filename}_chunks.json", "w") as f:
                    json.dump(self.chunks, f, indent=2)
            elif format == "csv":
                pd.DataFrame(self.chunks).to_csv(f"{filename}_chunks.csv", index=False)
            elif format == "pkl":
                pd.DataFrame(self.chunks).to_pickle(f"{filename}_chunks.pkl")
            else:
                # Without this branch an unknown format reported success
                # while writing nothing.
                raise ValueError("Format must be json, csv, or pkl")

            print(f"Saved {len(self.chunks)} chunks to {filename}_chunks.{format}")
        except Exception as e:
            print(f"Save chunks error: {e}")

        return self

    def to_pandas(self):
        """Return the collected articles as a DataFrame (one row per article)."""
        return pd.DataFrame(self.articles)

    def to_json(self):
        """Return the collected articles serialized as a JSON string."""
        return json.dumps(self.articles)

    def clear(self):
        """Drop all collected articles (``self.chunks`` is left untouched)."""
        self.articles = []
        print("Cleared all articles")
        return self

    def stats(self):
        """Return the article count and, when present, per-``source`` counts."""
        df = self.to_pandas()
        return {
            'total': len(self.articles),
            'sources': df['source'].value_counts().to_dict() if 'source' in df.columns else {}
        }

    def remove_duplicates(self):
        """Remove duplicate articles based on title and URL"""
        if not self.articles:
            return self

        seen = set()
        unique_articles = []

        for article in self.articles:
            identifier = f"{article.get('title', '')}_{article.get('url', '')}"
            identifier_hash = hashlib.md5(identifier.encode()).hexdigest()

            # The first occurrence wins; later records with the same key are dropped.
            if identifier_hash not in seen:
                seen.add(identifier_hash)
                unique_articles.append(article)

        removed = len(self.articles) - len(unique_articles)
        self.articles = unique_articles

        print(f"Removed {removed} duplicate articles")
        return self

    def save(self, filename, format="json"):
        """Write the articles to ``<filename>.<format>``.

        Args:
            filename: Path prefix; the extension is appended.
            format: ``"json"``, ``"csv"`` or ``"pkl"``.

        Returns:
            ``self``, for chaining. Errors, including an unsupported format,
            are printed rather than raised.
        """
        try:
            if format == "json":
                with open(f"{filename}.json", "w") as f:
                    json.dump(self.articles, f, indent=2)
            elif format == "csv":
                self.to_pandas().to_csv(f"{filename}.csv", index=False)
            elif format == "pkl":
                self.to_pandas().to_pickle(f"{filename}.pkl")
            else:
                raise ValueError("Format must be json, csv, or pkl")

            print(f"Saved {len(self.articles)} articles to {filename}.{format}")
        except Exception as e:
            print(f"Save error: {e}")

        return self

In [None]:
# aggregator = DataAggregator(num_pages=5, num_iter=2)

# # Collect data from multiple sources
# data = (aggregator
#     .dd_news(10)
#     .google_facts("covid")
#     .news_org("india", 2)
#     .search("current news")
#     .ndtv()
#     .wiki(df_ret)
#     .remove_duplicates()  # Remove duplicates
#     .save("my_articles", "json")  # Save as JSON
#     .save("my_articles", "csv")   # Save as CSV
#     .to_pandas()
# )

# print(f"Collected {len(data)} articles after deduplication")
# print(aggregator.stats())

In [None]:
# Aggregator for the full collection run: num_iter=50, num_pages left at its default of 100.
data  = DataAggregator(num_iter=50)

In [None]:
# Fetch Google Fact Check results for "india", then append every row of
# df_ret (the Wikipedia + FEVER frame concatenated earlier in the notebook).
data.google_facts("india").wiki(df_ret)

In [None]:
# data.save('first',"csv")

In [None]:
# Search strings sent to the News API and Google Search fetchers below,
# grouped by topic.
queries = [
    # Politics and judiciary
    "India politics OR government OR parliament",
    "Lok Sabha OR Rajya Sabha OR Indian election",
    "India Supreme Court OR High Court",

    # Economy and markets
    "India economy OR GDP OR inflation",
    "Reserve Bank of India OR RBI",
    "India stock market OR Sensex OR Nifty",
    "India startups OR unicorn OR funding",

    # Science, technology and energy
    "India artificial intelligence OR AI OR machine learning",
    "ISRO OR Chandrayaan OR Gaganyaan",
    "India 5G OR technology OR semiconductors",
    "India renewable energy OR solar OR nuclear",

    # Society and environment
    "India education OR schools OR universities",
    "India healthcare OR hospitals OR vaccines",
    "India poverty OR unemployment OR inequality",
    "India climate change OR pollution OR environment",

    # Foreign affairs and defense
    "India Pakistan OR border OR Kashmir",
    "India China OR LAC OR diplomacy",
    "India defense OR military OR DRDO",
    "India foreign policy OR G20 OR BRICS"
]


In [None]:
# One News API fetch per topic query (sort_idx=0). news_org prints any
# failure and returns, so a failing query does not stop the loop.
for query in queries:
    data.news_org(query=query,sort_idx=0)

In [None]:
# One Google Search scrape per topic query; search() forwards data.num_pages
# and prints any failure instead of raising.
for query in queries:
    data.search(query)

In [None]:
# Total article count and per-source counts after collection.
data.stats()

In [None]:
# from pathlib import Path

# path = Path.cwd()

# for files in path.glob('*.html'):
#     if files.is_file():
#         try:
#             files.unlink()
#             print(f"File '{files}' deleted successfully.")
#         except OSError as e:
#             print(f"Error deleting file '{files}': {e}")
#     else:
#         print(f"File '{files}' does not exist.")



In [None]:
# Persist the aggregated articles. save() appends the extension, so this
# writes second.csv and first.json. The class-based chunking calls stay
# disabled; chunking is done by the standalone chunk_text cells below.
# data.create_chunks()
# data.save_chunks('first','csv')
# data.save_chunks('first')
data.save('second','csv')
data.save('first')

In [None]:
# Re-check the article and per-source counts after saving.
data.stats()

In [None]:
def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into chunks of `chunk_size` tokens (approx by words here),
    with `overlap` words overlapping.

    Args:
        text: Text to split. ``None`` and NaN yield no chunks; any other
            non-string value is converted with ``str()``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings, empty when there are no words.

    Raises:
        ValueError: If the sizes would make the window stall or move
            backwards (previously an endless loop).
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )
    # `text != text` is only true for NaN. Without this guard str() would
    # turn a missing value into a literal "None"/"nan" chunk.
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = str(text).split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]


In [None]:
# Split every aggregated article into overlapping 500-word chunks and keep
# one flat record per chunk.
chunk_data = []

for article in data.articles:
    content = article["text"]
    chunks = chunk_text(content, chunk_size=500, overlap=50)
    # The Google Search scraper stores the date under 'published_date'
    # (lowercase), which a lookup of "Published_Date" alone never matches;
    # accept either spelling before falling back to the placeholder.
    published = article.get(
        "Published_Date", article.get("published_date", "Not Available")
    )

    for i, chunk in enumerate(chunks):
        chunk_data.append({
            "title": article["title"],
            "url": article["url"],
            "Published_Date": published,
            "source": article.get("source",'Not Available'),
            "chunk_id": i,  # position of the chunk within its article
            "text": chunk
        })


In [None]:
# Number of articles that were chunked.
len(data.articles)

In [None]:
# Number of chunk records produced from those articles.
len(chunk_data)

In [None]:

# Tabular view of the chunk records (one row per chunk).
df = pd.DataFrame(chunk_data)
df

In [None]:
# Reload the articles written by data.save('second','csv') so chunking can be
# redone without re-running the fetchers.
# NOTE(review): this rebinds `df`, which held the chunk DataFrame in the previous cell.
import pandas as pd
df = pd.read_csv('second.csv')

In [None]:
# Convert the reloaded frame back into a list of per-article dicts.
arc = df.to_dict(orient='records')
arc

In [None]:
# Number of article records loaded from the CSV.
len(arc)

In [None]:
# NOTE(review): this redefines the chunk_text function from an earlier cell;
# it is kept so that this part of the notebook can run after a reload from CSV.
def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into chunks of `chunk_size` tokens (approx by words here),
    with `overlap` words overlapping.

    Args:
        text: Text to split. ``None`` and NaN (how pandas loads blank CSV
            cells) yield no chunks; any other non-string value is converted
            with ``str()``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings, empty when there are no words.

    Raises:
        ValueError: If the sizes would make the window stall or move
            backwards (previously an endless loop).
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )
    # `text != text` is only true for NaN. Without this guard str() would
    # turn a blank CSV cell into a literal "nan" chunk.
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = str(text).split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]


In [None]:
# Split every article reloaded from the CSV into overlapping 500-word chunks
# and keep one flat record per chunk.
chunk_data = []

for article in arc:
    content = article["text"]
    chunks = chunk_text(content, chunk_size=500, overlap=50)

    # The Google Search scraper stores the date under 'published_date'
    # (lowercase), so accept either spelling.
    published = article.get("Published_Date", article.get("published_date"))
    source = article.get("source")
    # Records built from a DataFrame contain every column, with NaN for blank
    # cells, so a plain .get() default never applies; test for missing values.
    if pd.isna(published):
        published = "Not Available"
    if pd.isna(source):
        source = 'Not Available'

    for i, chunk in enumerate(chunks):
        chunk_data.append({
            "title": article["title"],
            "url": article["url"],
            "Published_Date": published,
            "source": source,
            "chunk_id": i,  # position of the chunk within its article
            "text": chunk
        })

chunk_data

In [None]:
# Number of chunk records built from the reloaded articles.
len(chunk_data)

In [None]:
# Tabular view of the chunk records.
# NOTE(review): `z` previously held the raw Google Search results; it is rebound here.
z = pd.DataFrame(chunk_data)
z

In [None]:
# Export the chunks to CSV. With pandas' default index=True the row index is
# written as an extra unnamed first column (unlike DataAggregator.save, which
# passes index=False).
z.to_csv('third_chunks.csv')

In [None]:
# Export the same chunk records as pretty-printed JSON.
import json
with open('final_chunks.json','w') as f:
    json.dump(chunk_data,f,indent=2)
