# Google Crawler

In [1]:
from urllib.parse import urlparse, parse_qs

def extract_website(url):
    """Return the first value of the ``url`` query parameter of *url*.

    Returns ``None`` when the query string carries no ``url`` parameter.
    """
    url_values = parse_qs(urlparse(url).query).get('url')
    if url_values is None:
        return None
    return url_values[0]

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import tldextract
import pdb

def google_crawler(query, num=10, domain_exclude=None, currentPage=0):
    """Collect up to *num* distinct result links from Google search pages.

    Result pages are requested ten results at a time, starting at offset
    *currentPage*, until *num* links have been collected, a page contains no
    recognised result container, or Google answers with a non-200 status.

    Args:
        query: Search terms. They are sent as the ``q`` request parameter,
            so spaces and reserved characters are URL-encoded by requests.
        num: Maximum number of links to return.
        domain_exclude: Iterable of domain names, compared against
            ``tldextract.extract(link).domain``; matching links are dropped.
            ``None`` (the default) excludes nothing.
        currentPage: Result offset of the first page to request.

    Returns:
        ``(links, next_offset)`` where ``links`` holds at most *num* URLs in
        the order they were first seen and ``next_offset`` is the offset of
        the next page that was not requested. When Google answers with HTTP
        429 the function returns ``([429], 0)``.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            the timeout.
    """
    request_timeout_seconds = 30  # never wait indefinitely on a stalled connection
    excluded_domains = set(domain_exclude) if domain_exclude else set()
    results = dict()  # dict used as an insertion-ordered set of links
    headers = {
        'User-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
    }
    while True:
        # Let requests build the query string so that characters such as
        # '&' or '#' inside the query cannot corrupt the URL.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": query, "start": currentPage},
            headers=headers,
            timeout=request_timeout_seconds,
        )

        if response.status_code == 429:
            print("Stopping due to 429 code")
            return [429], 0

        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        search_items = soup.find_all('div', class_=['egMi0 kCrYT', 'Gx5Zad fP1Qef xpd EtOod pkphOe'])
        if not search_items:
            break
        for item in search_items:
            anchor = item.find('a', href=True)
            if not anchor:
                continue
            # The href is a redirect; the target sits in its ``url`` parameter.
            link = extract_website(anchor['href'])
            if not link or '.pdf' in link:
                continue
            if tldextract.extract(link).domain not in excluded_domains:
                results[link] = ""
        currentPage += 10
        if len(results) >= num:
            break
        # Pause only when another page is actually going to be requested.
        time.sleep(1)

    return list(results)[:num], currentPage

In [3]:
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from requests.exceptions import RequestException, Timeout
from urllib3.exceptions import LocationParseError

# Timeout, in seconds, passed to requests.get() in extract_text_and_detect_language.
timeout_seconds = 30

def extract_text_and_detect_language(url):
    """Download *url*, extract its main text and detect the text's language.

    Returns:
        ``(text, language)``. Both are ``''`` when the download fails.
        ``text`` is ``''`` when no content container is found. ``language``
        is the value returned by ``langdetect.detect`` or the string
        ``"Language detection failed"`` when detection raises.
    """
    try:
        response = requests.get(url, timeout=timeout_seconds)
    except RequestException:
        # Also covers timeouts: requests' Timeout derives from RequestException.
        return '', ''
    except LocationParseError:
        print(f"Location parse error for {url}")
        return '', ''

    soup = BeautifulSoup(response.text, 'html.parser')

    # Drop page chrome and headings before looking for the article body.
    for tag in soup.find_all(['header', 'footer', 'nav', 'img', 'figcaption', 'h1', 'h2',]):
        tag.decompose()

    # First try the known content class names, then fall back to the
    # semantic <main>/<article> elements.
    content_classes = ["printableindent", "content", "articlecontent", "story-content", "post-content", "page", "abp-story-detail", "mw-body", "text-formatted", "entry-content", "khbr_rght_sec", "innner-page-main-about-us-content-right-part", "article-desc ul_styling"]
    container = soup.find(class_=content_classes) or soup.find(['main', 'article'])

    if container:
        text = container.get_text(separator=' ', strip=True)
    else:
        text = ''

    try:
        language = detect(text)
    except Exception:
        language = "Language detection failed"

    return text, language


In [4]:
def get_query(df, idx):
    """Build a search query from the entities stored in ``df[idx]["all_list"]``.

    Entities are taken in order; each distinct word is used once, limited to
    3 LOCATION, 3 PERSON, 1 ORGANIZATION and 1 TIMEX entities. Other entity
    groups are ignored.

    Returns:
        The selected words, each followed by a single space.
    """
    remaining = {'LOCATION': 3, 'PERSON': 3, 'ORGANIZATION': 1, 'TIMEX': 1}
    chosen = []
    for entity in df[idx]["all_list"]:
        group = entity['entity_group']
        if remaining.get(group, 0) <= 0:
            continue
        word = entity['word']
        if word in chosen:
            continue
        chosen.append(word)
        remaining[group] -= 1
    return ''.join(word + ' ' for word in chosen)

def get_query_2(df, idx):
    """Build a shorter search query from ``df[idx]["all_list"]``.

    Entities are taken in order; each distinct word is used once, limited to
    2 LOCATION, 2 PERSON, 1 ORGANIZATION and 1 TIMEX entities. Other entity
    groups are ignored.

    Returns:
        The selected words, each followed by a single space.
    """
    quota = {'LOCATION': 2, 'PERSON': 2, 'ORGANIZATION': 1, 'TIMEX': 1}
    picked = []
    for entity in df[idx]["all_list"]:
        group = entity['entity_group']
        if quota.get(group, 0) <= 0:
            continue
        word = entity['word']
        if word in picked:
            continue
        picked.append(word)
        quota[group] -= 1
    return ''.join(word + ' ' for word in picked)

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine

# Checkpoint name passed to AutoTokenizer/AutoModel.from_pretrained in get_score_and_valid_text.
model_name = "bert-base-multilingual-cased"

def get_embedding(model, tokenizer, text):
    """Embed *text* as the mean of the model's last hidden states.

    Padding positions are excluded from the mean by weighting each token with
    the tokenizer's attention mask, so a text's embedding does not depend on
    how much padding the rest of the batch forces onto it. When the tokenizer
    output has no ``attention_mask`` entry, a plain mean over the token axis
    is used.

    Args:
        model: Called as ``model(**inputs)``; the result must expose
            ``last_hidden_state``.
        tokenizer: Called with *text* and ``return_tensors="pt"``,
            ``padding=True``, ``truncation=True``, ``max_length=512``.
        text: A string or a list of strings.

    Returns:
        A tensor with one embedding row per input text.

    Raises:
        Exception: whatever the tokenizer raises; the offending text is
            printed first to help diagnose the input.
    """
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    except Exception:
        # Show the input that could not be tokenized, then fail loudly
        # instead of continuing with an undefined ``inputs``.
        print(text)
        raise
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        return torch.mean(last_hidden_states, dim=1)

    # Broadcast the mask over the hidden axis and average real tokens only.
    weights = attention_mask.unsqueeze(-1).to(last_hidden_states.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)  # avoid division by zero
    return (last_hidden_states * weights).sum(dim=1) / token_counts

def check_link_valid(link):
    """Fetch *link* and decide whether its text is usable.

    A page is accepted when its detected language is ``'hi'``, its text is
    non-empty and splitting the text on single spaces yields at least 10
    pieces.

    Returns:
        ``(True, text, link)`` for an accepted page, ``(False, '', '')``
        otherwise.
    """
    text, lang = extract_text_and_detect_language(link)
    is_usable = lang == 'hi' and text != '' and len(text.split(' ')) >= 10
    if is_usable:
        return True, text, link
    return False, '', ''

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once per session instead of once per scored article.
_ENCODER_CACHE = {}


def _load_encoder(name):
    """Return the ``(tokenizer, model)`` pair for *name*, loading it only once."""
    if name not in _ENCODER_CACHE:
        _ENCODER_CACHE[name] = (
            AutoTokenizer.from_pretrained(name),
            AutoModel.from_pretrained(name),
        )
    return _ENCODER_CACHE[name]


def get_score_and_valid_text(text_dict, text_orig, thresh):
    """Score candidate texts against *text_orig* and keep the similar ones.

    Args:
        text_dict: Mapping of link -> text for the candidate pages.
        text_orig: The reference text.
        thresh: Minimum cosine similarity a candidate needs to be kept.

    Returns:
        A list of ``{"link", "text", "score"}`` dicts, in the iteration order
        of *text_dict*, for every candidate whose cosine similarity to
        *text_orig* is at least *thresh*. Empty when *text_dict* is empty.
    """
    if not text_dict:
        # Nothing to score; skip loading the model and tokenizing an empty batch.
        return []

    tokenizer, model = _load_encoder(model_name)

    link_list = list(text_dict.keys())
    text_list = list(text_dict.values())

    candidate_embeddings = get_embedding(model, tokenizer, text_list)
    reference_embedding = get_embedding(model, tokenizer, text_orig)[0].numpy()

    final_list = []
    for link, text, embedding in zip(link_list, text_list, candidate_embeddings):
        # scipy's ``cosine`` is a distance; similarity is its complement.
        cos_sim = 1 - cosine(embedding.numpy(), reference_embedding)
        if cos_sim >= thresh:
            final_list.append({
                "link": link,
                "text": text,
                "score": cos_sim
            })

    return final_list
    

In [6]:
import pandas as pd

# NER output for the training split; get_query/get_query_2 read df_ner[idx]["all_list"].
df_ner = pd.read_json("../Datasets/Hindi_summarization/Long-short-news-dataset/NER/ner_hindi_train_1024.json")
# Cleaned training split; process_index and main read its 'Id' and 'article' columns.
df = pd.read_csv("../Datasets/Hindi_summarization/Long-short-news-dataset/clean_hindi_train.csv", lineterminator='\n')

In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_links(res):
    """Validate the links in *res* concurrently and keep the accepted ones.

    Every link is handed to ``check_link_valid`` on a pool of 10 worker
    threads.

    Returns:
        A dict mapping each accepted link to its text, filled in the order
        in which the checks complete.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(check_link_valid, candidate) for candidate in res]
        outcomes = [done.result() for done in as_completed(pending)]

    return {link: link_text for is_valid, link_text, link in outcomes if is_valid}

In [8]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from tqdm import tqdm
import os
import pdb
import numpy as np

def process_index(idx, df_ner, df, exclude, cos_sim_thresh):
    """Crawl and score related pages for the record at *idx*.

    A query is built with ``get_query``; when it yields no page that passes
    the similarity threshold, a second attempt is made with the shorter
    ``get_query_2`` query.

    Args:
        idx: Index of the record in *df* and *df_ner*.
        df_ner: Entity data handed to ``get_query`` / ``get_query_2``.
        df: Data whose ``'article'`` and ``'Id'`` columns are read at *idx*.
        exclude: Domains passed to ``google_crawler`` for exclusion.
        cos_sim_thresh: Minimum cosine similarity for a page to be kept.

    Returns:
        ``(df['Id'][idx], pages, query)`` where ``pages`` holds at most 10
        result dicts sorted by descending ``'score'`` and ``query`` is the
        last query that was tried. When ``google_crawler`` reports HTTP 429
        on either attempt, ``(idx, None, None)`` is returned; it is always a
        3-tuple so the caller can unpack it.
    """
    final_list = []
    query = ''
    for attempt, build_query in enumerate((get_query, get_query_2)):
        if attempt > 0:
            print("Trying with smaller query.")
        query = build_query(df_ner, idx)
        res, _next_page = google_crawler(query, 10, exclude, 0)
        # google_crawler signals rate limiting with the sentinel list [429].
        if len(res) == 1 and res[0] == 429:
            return idx, None, None
        text_dict = process_links(res)
        if len(text_dict) > 0:
            final_list = get_score_and_valid_text(text_dict, df['article'][idx], cos_sim_thresh)
            final_list = sorted(final_list, key=lambda item: item['score'], reverse=True)
        if len(final_list) > 0:
            break

    return df['Id'][idx], final_list[:10], query

class NpEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy scalars and arrays."""

    def default(self, obj):
        """Convert NumPy integers, floats and arrays to plain Python values.

        Any other object is passed to the base class, which raises
        ``TypeError``.
        """
        conversions = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in conversions:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

def save_partial_results(data, save_doc):
    """Append every entry of *data* to *save_doc*, one JSON object per line.

    Each line has the form ``{"key": <key>, "data": <value>}`` and is encoded
    with ``NpEncoder``. The file is created when it does not exist.
    """
    # Mode 'a' creates a missing file, so one open() serves both cases.
    with open(save_doc, 'a') as out_file:
        for key, value in data.items():
            out_file.write(json.dumps({'key': key, 'data': value}, cls=NpEncoder) + '\n')

def get_saved_keys(save_doc):
    """Return the set of ``'key'`` values stored in the JSON-lines file *save_doc*.

    Returns an empty set when the file does not exist.
    """
    if not os.path.exists(save_doc):
        return set()

    with open(save_doc, 'r') as saved_file:
        return {json.loads(raw_line)['key'] for raw_line in saved_file}

def main(df, df_ner, exclude, cos_sim_thresh, save_doc):
    """Crawl related pages for every row of *df* and append them to *save_doc*.

    Rows whose ``'Id'`` is already present in *save_doc* are skipped. Results
    are buffered and flushed to disk each time the buffer holds 20 entries,
    and once more at the end. The loop stops early when ``process_index``
    returns ``None`` as its result list.
    """
    already_saved = get_saved_keys(save_doc)
    pending = {}
    empty_count = 0
    for idx in tqdm(df.index):
        if df['Id'][idx] in already_saved:
            continue

        record_id, final_list, _query = process_index(idx, df_ner, df, exclude, cos_sim_thresh)
        if final_list is None:
            break

        print(f"{idx} has {len(final_list)}")
        if not final_list:
            empty_count += 1
        pending[record_id] = final_list

        # Flush the buffer once it reaches 20 entries.
        if len(pending) % 20 == 0:
            save_partial_results(pending, save_doc)
            pending = {}

    # Write whatever is still buffered.
    save_partial_results(pending, save_doc)
    print(f"{empty_count} empty out of {len(df.index)}")

In [9]:
# Layout of the results collected by main(): {Id: [{"link": ..., "text": ..., "score": ...}, ...]}

from tqdm import tqdm

import json

# Domains whose links are dropped by google_crawler.
exclude = ["youtube", "instagram", "facebook", "twitter"]
# NOTE(review): not referenced in the visible cells; process_index hard-codes 10.
num_docs = 10
# Minimum cosine similarity for a crawled page to be kept.
cos_sim_thresh = 0.8

# JSON-lines file the crawl results are appended to.
save_doc = "LSN-train-crawl.jsonl"

In [10]:
def get_extra_keys(save_doc, df):
    """Report unexpected and repeated keys in the JSON-lines file *save_doc*.

    Returns:
        ``(extra, duplicate)``: ``extra`` is the set of saved keys that do
        not occur in ``df['Id']``; ``duplicate`` is the set of keys that are
        saved more than once. Both are empty when the file does not exist.
    """
    seen_keys = set()
    repeated_keys = set()
    if os.path.exists(save_doc):
        with open(save_doc, 'r') as saved_file:
            for raw_line in saved_file:
                key = json.loads(raw_line)['key']
                if key in seen_keys:
                    repeated_keys.add(key)
                else:
                    seen_keys.add(key)

    # Whatever is left after removing the known Ids is unexpected.
    for idx in tqdm(df.index):
        seen_keys.discard(df['Id'][idx])

    return seen_keys, repeated_keys

# Sanity check on the saved crawl: keys in save_doc that are not Ids of df,
# and keys that were saved more than once.
extra_keys, duplicate = get_extra_keys(save_doc, df)
print(f"Extra: {extra_keys}")
print(f"Duplicate: {duplicate}")

100%|█████████████████████████████████| 58084/58084 [00:00<00:00, 102557.33it/s]

Extra: set()
Duplicate: set()





In [None]:
# Run the crawl. main() skips Ids already present in save_doc and stops early
# when process_index returns None as its result list.
main(df, df_ner, exclude, cos_sim_thresh, save_doc)

# Removing crawled articles that duplicate the source article

In [None]:
import pandas as pd
from indicnlp.tokenize import indic_tokenize
import json
import os

In [None]:
def save_partial_results(key, data, save_doc):
    """Append one ``{"key": key, "data": data}`` JSON line to *save_doc*.

    The file is created when it does not exist.
    """
    # Mode 'a' creates a missing file, so one open() serves both cases.
    with open(save_doc, 'a') as out_file:
        out_file.write(json.dumps({'key': key, 'data': data}) + '\n')

In [None]:
# Test split, indexed by 'Id' so an article can be looked up with df.loc[key, 'article'].
df = pd.read_csv("../Datasets/Hindi_summarization/Long-short-news-dataset/clean_hindi_test.csv", lineterminator='\n')
df.set_index('Id', inplace = True)
# JSON-lines file the filtered crawl results are appended to.
save_doc = "LSN-test-crawl-clear.jsonl"

In [None]:
from tqdm import tqdm
# For every crawled record, drop the articles whose tokens cover 90% or more
# of the source article's distinct tokens, then append the record to save_doc.
with open("../Datasets/Hindi_summarization/Long-short-news-dataset/LSN-test-crawl.jsonl", 'rb') as crawl_file:
    for raw_line in tqdm(crawl_file):
        record = json.loads(raw_line)
        key, data = record['key'], record['data']

        # Distinct tokens of the source article this record belongs to.
        # NOTE(review): an article with no tokens would make the overlap
        # ratio below divide by zero; confirm the dataset has none.
        source_tokens = set(indic_tokenize.trivial_tokenize(df.loc[key, 'article']))

        if len(data) == 0:
            # Nothing was crawled for this key; keep the empty record as is.
            save_partial_results(key, data, save_doc)
            continue

        kept_articles = [
            article
            for article in data
            if (len(set(indic_tokenize.trivial_tokenize(article['text'])).intersection(source_tokens)) / len(source_tokens)) * 100 < 90
        ]

        save_partial_results(key, kept_articles, save_doc)

In [None]:
from tqdm import tqdm

def get_extra_keys(save_doc, df):
    """Find saved keys that are unknown to *df* and keys saved more than once.

    Returns:
        ``(extra, duplicate)``: ``extra`` holds the keys of *save_doc* that
        are not values of ``df['Id']``; ``duplicate`` holds the keys that
        occur on more than one line. Both are empty when the file does not
        exist.
    """
    leftover_keys = set()
    duplicate_keys = set()
    if os.path.exists(save_doc):
        with open(save_doc, 'r') as saved_file:
            for raw_line in saved_file:
                key = json.loads(raw_line)['key']
                if key in leftover_keys:
                    duplicate_keys.add(key)
                else:
                    leftover_keys.add(key)

    # Strike out every key that belongs to a row of df.
    for idx in tqdm(df.index):
        leftover_keys.discard(df['Id'][idx])

    return leftover_keys, duplicate_keys

# Re-read the CSV: an earlier cell made 'Id' the index, while get_extra_keys
# reads it as a column.
df = pd.read_csv("../Datasets/Hindi_summarization/Long-short-news-dataset/clean_hindi_test.csv", lineterminator='\n')
# Keys in save_doc that are not Ids of df, and keys saved more than once.
extra_keys, duplicate = get_extra_keys(save_doc, df)
print(f"Extra: {extra_keys}")
print(f"Duplicate: {duplicate}")

In [None]:
import json

# Count the records in save_doc and how many of them have an empty 'data' list.
cnt = 0   # records whose 'data' is empty
cnt2 = 0  # all records
with open(save_doc, 'r') as input_file:
    for line in input_file:
        has_no_data = len(json.loads(line)['data']) == 0
        if has_no_data:
            cnt += 1
        cnt2 += 1

print(f"Empty: {cnt}")
print(f"Total: {cnt2}")