!pip install -r requirement.txt

In [1]:
from collections import deque
import json
import os
import requests
import urllib.parse
import validators
from bs4 import BeautifulSoup
import time as time_lib
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
import atexit
import signal


def load_data(file_path = "queue_list.pickle"):
    """Restore the crawler checkpoint stored in a pickle file.

    Args:
        file_path: Location of the pickle written by a previous run.

    Returns:
        A 5-tuple ``(queue, list, checked, batch_count, time_max)`` taken from
        the saved dictionary, with the queue converted back to a ``deque``.
        When the file does not exist, the empty defaults
        ``(deque(), [], [], 0, 0)`` are returned instead.

    Raises:
        KeyError: If the saved dictionary lacks one of the expected keys.
    """
    try:
        # NOTE: pickle executes arbitrary code on load; only point this at
        # checkpoint files produced by this program.
        with open(file_path, 'rb') as handle:
            saved = pickle.load(handle)
    except FileNotFoundError:
        print("No saved data found.")
        return deque(), [], [], 0, 0

    restored = (
        deque(saved['queue']),
        saved['list'],
        saved['checked'],
        saved['batch_count'],
        saved['time_max'],
    )
    print("Data loaded successfully.")
    return restored

def get_directory_size(directory):
    """Return the total size in bytes of all files below *directory*.

    Subdirectories are included at any depth. The tree is walked with an
    explicit stack of directories still to be scanned.
    """
    size_in_bytes = 0
    pending = [directory]
    while pending:
        current = pending.pop()
        for item in os.scandir(current):
            if item.is_file():
                size_in_bytes += item.stat().st_size
            elif item.is_dir():
                # Defer the subdirectory; it is scanned on a later pass.
                pending.append(item.path)
    return size_in_bytes



# Folder that receives the parquet batches; it is created when absent.
directory = r"D:\deep_data\wiki_technicalTermLabeledData"
if not os.path.exists(directory):
    os.makedirs(directory)


# Resume from the pickle checkpoint when one exists.
queue_file_path = "queue_list.pickle"
q, data, checked, batch_count, time_max = load_data(queue_file_path)
if not q:
    # Empty queue: seed the crawl with the start page at depth 0.
    parent_url = "https://en.wikipedia.org/wiki/Wikipedia:Contents/Technology_and_applied_sciences"
    q.append((parent_url, 0))

def save_data():
    """Write the current crawler state to ``queue_file_path`` as a pickle.

    The saved dictionary holds the pending queue (as a list), the pages
    collected for the current batch, the list of already-seen links, the
    batch counter and the deepest level reached. These are the keys that
    ``load_data`` reads back.

    The state is written to a temporary sibling file and then moved over the
    real checkpoint, so an interruption in the middle of the write cannot
    leave a truncated checkpoint behind.
    """
    # The local must not be called `data`: assigning that name inside the
    # function would make the module-level `data` read below raise
    # UnboundLocalError.
    state = {
        'queue': list(q),
        'list': data,
        'checked': checked,
        'batch_count': batch_count,
        'time_max': time_max
    }
    tmp_path = queue_file_path + ".tmp"
    with open(tmp_path, 'wb') as file:
        pickle.dump(state, file)
    os.replace(tmp_path, queue_file_path)
    print("Data saved successfully.")

# --- Limits -----------------------------------------------------------------
disk_space = 400          # output folder size cap, in GiB (multiplied by 1024**3 in the loop)
batch_size = 200          # number of collected pages written per parquet file
set_depth = 3             # the crawl stops once a queued link is deeper than this
cooling_time = 30         # seconds to sleep when the network looks unreachable
compression = "snappy"    # parquet compression codec

# --- Link handling ----------------------------------------------------------
base_url="https://en.wikipedia.org"
# parent_url = "https://geoltime.github.io/?Ma=470"
cite_note = "#"           # links containing this marker are not queued

# --- Bookkeeping ------------------------------------------------------------
empty_dict=0              # pages that produced no paragraph data
repeated=[]               # one entry per link that was not queued
excpt = []                # pages on which handling a link raised an exception
time=0                    # depth of the page currently being processed

count1 = 0                # pages fetched and parsed
count2 = 0                # paragraphs visited
count3 = 0                # anchors examined
count4 = 0                # links added to the queue
total_directory_size = 0

# Persist the crawl state whenever the interpreter exits.
atexit.register(save_data)

def interrupt_handler(signal, frame):
    """SIGINT handler: checkpoint the crawl state, then exit with status 0.

    Args:
        signal: Signal number passed in by the runtime (unused). The name
            shadows the ``signal`` module inside this function only.
        frame: Stack frame that was executing when the signal arrived (unused).

    Raises:
        SystemExit: Always, after the state has been saved.
    """
    # sys.exit is used instead of the bare `exit` helper: the latter is
    # injected by the `site` module for interactive sessions and is not
    # guaranteed to exist in every interpreter.
    import sys

    save_data()
    sys.exit(0)

signal.signal(signal.SIGINT, interrupt_handler)

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the crawl indefinitely.
request_timeout = 30

# Set mirror of `checked` for O(1) membership tests. The list itself is kept
# in sync because it is what save_data() persists and what later cells report.
checked_lookup = set(checked)

# Breadth-first crawl: runs until the queue is empty, the depth limit is
# exceeded, or the output folder reaches `disk_space` GiB.
while q and total_directory_size < disk_space * 1024 * 1024 * 1024:
    front = q.popleft()                     # Basic BFS settings
    url = front[0]
    time = front[1]
    if isinstance(url, tuple):
        # Checkpoints written before the re-queue fix below may contain a
        # nested ((url, depth), depth) entry; unwrap it.
        url, time = url
        front = (url, time)
    time_max = max(time_max, time)

    if time_max > set_depth:
        # Put the entry back unchanged so a resumed run sees a well-formed
        # (url, depth) tuple.
        q.appendleft(front)
        break

    try:
        response = requests.get(url, timeout=request_timeout)
    except Exception:
        # `except Exception` (not a bare except) keeps the best-effort
        # behaviour while letting KeyboardInterrupt and the SystemExit raised
        # by the SIGINT handler propagate.
        try:
            # Connectivity probe: if this succeeds the failure was specific to
            # `url`, which is then skipped for good.
            requests.get("https://www.google.com/", timeout=request_timeout)
        except Exception:
            # Network looks down: retry the same URL after a pause.
            q.appendleft((url, time))
            print("sleeping for :",cooling_time,"url:",url)
            time_lib.sleep(cooling_time)
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    paragraphs = soup.find_all("p")
    print(url,len(paragraphs),time)
    count1 += 1

    # Inserting new links (one level deeper) into the queue, BFS fashion.
    for paragraph in paragraphs:
        count2 += 1
        for anchor in paragraph.find_all("a"):
            count3 += 1
            href = anchor.get("href")
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            try:
                if cite_note in href:
                    target = None
                else:
                    # Resolve relative links against the page they were found
                    # on, so links on non-Wikipedia pages stay on their host.
                    absolute = urllib.parse.urljoin(url, href)
                    if absolute not in checked_lookup and validators.url(absolute):
                        target = absolute
                    else:
                        target = None
            except Exception:
                excpt.append([url])
                print("cought exception for ",url,anchor)
                continue

            if target is None:
                repeated.append(url)
            else:
                checked.append(target)
                checked_lookup.add(target)
                count4 += 1
                q.append((target, time + 1))

    # Extracting labeled data: paragraph text -> texts of the links inside it.
    texts = [paragraph.get_text() for paragraph in paragraphs]
    labels = [
        [link.get_text() for link in paragraph.find_all("a")]
        for paragraph in paragraphs
    ]
    dict1 = dict(zip(texts, labels))

    if not dict1:
        print(url, ": dictionary found empty")
        empty_dict += 1
        continue
    data.append({url: dict1})
    print(f"url:{len(data)}  batch:{batch_count+1}")

    if len(data) >= batch_size:
        # A full batch is written to its own parquet file and then dropped
        # from memory.
        df = pd.DataFrame(data)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, os.path.join(directory, f"{batch_count}.parquet"),compression=compression)
        data = []
        batch_count += 1

    total_directory_size = get_directory_size(directory)

# Flush the pages left over from an incomplete final batch.
if data:
    df = pd.DataFrame(data)
    table = pa.Table.from_pandas(df)
    pq.write_table(
        table,
        os.path.join(directory, f"{batch_count}.parquet"),
        compression=compression,
    )


# Dump the still-pending queue as JSON next to the parquet files.
deque_list = list(q)
deque_json_path = os.path.join(directory, "deque_data.json")
if not os.path.isfile(deque_json_path):
    # NOTE(review): this fallback writes to "<depth>.json" rather than
    # "deque_data.json", so the append branch below only runs when that file
    # was created by other means - confirm this is intended.
    with open(os.path.join(directory, str(time) + ".json"), "w") as file:
        json.dump(deque_list, file)
else:
    # Append this run's queue snapshot to the existing JSON list.
    with open(deque_json_path, "r") as file:
        dataD = json.load(file)
    dataD.append(deque_list)
    with open(deque_json_path, "w") as file:
        json.dump(dataD, file)

No saved data found.
https://en.wikipedia.org/wiki/Wikipedia:Contents/Technology_and_applied_sciences 28 0
url:1  batch:1
https://en.wikipedia.org/wiki/Human 95 1
url:2  batch:1
https://en.wikipedia.org/wiki/Prehistory 35 1
url:3  batch:1
https://en.wikipedia.org/wiki/Fire 34 1
url:4  batch:1
https://en.wikipedia.org/wiki/Wheel 43 1
url:5  batch:1
https://en.wikipedia.org/wiki/Printing_press 57 1
url:6  batch:1
https://en.wikipedia.org/wiki/Internet 101 1
url:7  batch:1
https://en.wikipedia.org/wiki/Communication 75 1
url:8  batch:1
https://en.wikipedia.org/wiki/Weapon 34 1
url:9  batch:1
https://en.wikipedia.org/wiki/Club_(weapon) 16 1
url:10  batch:1
https://en.wikipedia.org/wiki/Nuclear_bomb 94 1
url:11  batch:1
https://en.wikipedia.org/wiki/Applied_science 15 1
url:12  batch:1
https://en.wikipedia.org/wiki/Natural_science 58 1
url:13  batch:1
https://en.wikipedia.org/wiki/Engineering 91 1
url:14  batch:1
https://en.wikipedia.org/wiki/Research_and_development 21 1
url:15  batch:1
ht

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


http://www.fao.org/3/CA8753EN/CA8753EN.pdf 0 2
http://www.fao.org/3/CA8753EN/CA8753EN.pdf : dictionary found empty
https://commons.wikimedia.org/wiki/File:The_State_of_the_World%E2%80%99s_Forests_2020._In_brief.pdf 9 2
url:140  batch:43
https://doi.org/10.4060/ca8985en 0 2
https://doi.org/10.4060/ca8985en : dictionary found empty
https://en.wikipedia.org/wiki/Open-source_license 26 2
url:141  batch:43
https://en.wikipedia.org/wiki/Liberty 36 2
url:142  batch:43
https://en.wikipedia.org/wiki/Free_Software_Foundation 52 2
url:143  batch:43
https://en.wikipedia.org/wiki/Berkeley_Software_Distribution 28 2
url:144  batch:43
https://en.wikipedia.org/wiki/Richard_Stallman 63 2
url:145  batch:43
https://en.wikipedia.org/wiki/GNU_Project 28 2
url:146  batch:43
https://en.wikipedia.org/wiki/Operating_system 134 2
url:147  batch:43
https://en.wikipedia.org/wiki/Software_license 22 2
url:148  batch:43
https://en.wikipedia.org/wiki/Public_domain 43 2
url:149  batch:43
https://en.wikipedia.org/wiki

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\deep_data\\wiki_technicalTermLabeledData'

In [None]:
# Summary of the crawl counters, one line per counter.
print(len(checked),len(repeated))
for total, caption in (
    (count1, "level 1 urls + 1(for level0) + 1(for level2)"),
    (count2, "sum(len(paragraphs of level0))"),
    (count3, "sum(all links at level 1)"),
    (count4, "sum of unchecked + checked links"),
):
    print(total, caption)

15891 11960
173 level 1 urls + 1(for level0) + 1(for level2)
6349 sum(len(paragraphs of level0))
27851 sum(all links at level 1)
15891 sum of unchecked + checked links
