In [2]:
import requests
import pickle
from timeit import default_timer as timer
import time
import pathlib
import pandas as pd

In [11]:
pd.options.display.max_colwidth = 500

In [2]:
main_url = "https://en.wikipedia.org/wiki"
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"
categories_processed = []
ignore_list = ["Strategic_management", "Business_intelligence_organizations",
               "Analytics_companies",
               "Financial_analysts", "Market_trends", "Forecasting_competitions", 
               "Economic_forecasting", "Indicators", "Medical_monitoring", "Forecasting_organizations",
               "Technology_forecasting", "Weather_prediction", "Rankings", 
               "Analysis_of_collective_decision-making", "Industrial_robotics", 
               "Industrial_engineering", "Operations_research_awards", "Operations_research_societies", 
               "Management_systems", "Multiple-criteria_decision_analysis", "Networks", 
               "Network_scientists", "Operations_researchers", "Survey_methodology", "Data_scientists",
               "Computational_statistics_journals", "Statistical_databases", "Statistical_software",
               "Actuarial_science", "Choice_modelling", "Coding_theory", "Information_theorists",
               "Height", "Lists_by_length", "Longest_things", "Vertical_extent", "Vertical_position",
               "Population_models", "Deep_learning_software", "Neural_network_software", 
               "Artificial_intelligence_conferences", "Signal_processing_conferences", 
               "Data_mining_and_machine_learning_software", "Social_network_analysis_software", 
               "Machine_learning_researchers", "Natural_language_processing_researchers"
              ]

In [3]:
def get_subcategories(category, depth=1):
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'cmlimit': 'max',
        'cmtype': 'subcat'
    }

    subcategories = []
    for _ in range(depth):
        response = requests.get(WIKIPEDIA_API_URL, params=params)
        data = response.json()
        subcategories.extend([item['title'][9:] for item in data['query']['categorymembers']])  # Remove "Category:" prefix
        if 'continue' not in data:
            break
        params['cmcontinue'] = data['continue']['cmcontinue']

    return subcategories

def get_page_urls(category, main_url):
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'cmlimit': 'max',
        'cmtype': 'page'
    }

    response = requests.get(WIKIPEDIA_API_URL, params=params)
    data = response.json()

    return [main_url + "/" + item['title'].replace(" ","_") for item in data['query']['categorymembers']]

def build_category_tree(category, depth=1, debug=False):
    if (category not in ignore_list) and (category not in categories_processed):
        category_tree = {'name': category, 'subcategories': [], 'page_urls': []}
        if debug:
            print(f"Categories Processed: {category}")
        categories_processed.append(category)
        if depth > 1:
            subcategories = get_subcategories(category, depth)
            for subcategory in subcategories:
                sub_category_tree = build_category_tree(subcategory.replace(" ","_"), depth - 1, debug)
                if sub_category_tree:
                    category_tree['subcategories'].append(sub_category_tree)

        page_urls = get_page_urls(category, main_url)
        category_tree['page_urls'].extend(page_urls)

        return category_tree

def create_index(category_tree, index=None):
    if index is None:
        index = {}
    index[category_tree['name']] = category_tree

    for subcategory_tree in category_tree['subcategories']:
        create_index(subcategory_tree, index)

    return index

def find_category_in_tree(category_tree, target_category):
    if category_tree['name'] == target_category:
        return category_tree
    for subcategory in category_tree['subcategories']:
        result = find_category_in_tree(subcategory, target_category)
        if result:
            return result
    return None

def print_category_tree(category_tree, depth=0):
    indent = '  ' * depth
    print(f"{indent}{category_tree['name']}")

    for subcategory in category_tree['subcategories']:
        print_category_tree(subcategory, depth + 1)

    for page_url in category_tree['page_urls']:
        print(f"{indent}  - {page_url}")
        
def debug_category(category):
    depth = 10
    category_tree = build_category_tree(category, depth, debug = True)
    
def retrieve_all_urls(category_tree):
    all_urls = category_tree['page_urls']
    for subcategory_tree in category_tree['subcategories']:
        all_urls.extend(retrieve_all_urls(subcategory_tree))
    return all_urls    

In [4]:
categories = [
'Categorical_data',
'Decision_theory',
'Machine_learning',
'Statistical_methods',
'Statistical_theory'
]

depth = 10

files = [str(f) for f in pathlib.Path().glob("./data/*.pkl")]

categories_processed = []

for category in categories:
    try:
        dump_file_name = f"{category}.pkl"
        if (not files) or (dump_file_name not in files):
            start_time = timer()
              # Specify the depth of subcategories to explore
            category_tree = build_category_tree(category, depth)
            with open("./data/" + dump_file_name,"wb") as file:
                pickle.dump(category_tree, file)
            end_time = timer()
            print(f"Scraping time for Category:{category} = {(end_time - start_time):0.3f} seconds")
            #category_index = create_index(category_tree)
            #print_category_tree(category_tree)
            time.sleep(5)
    except Exception as e:
        print(f"Exception in Retrieving details for Category: {category}, Error: {str(e)}")

Scraping time for Category:Categorical_data = 3.686 seconds
Scraping time for Category:Decision_theory = 30.094 seconds
Scraping time for Category:Machine_learning = 7.571 seconds
Scraping time for Category:Statistical_methods = 33.840 seconds
Scraping time for Category:Statistical_theory = 39.475 seconds


In [17]:
# Read the category pkl files and extract urls from each
files = [str(f) for f in pathlib.Path().glob("./data/*.pkl")]
urls = []
for cat_file in files:
    with open(cat_file,"rb") as file:
        cat_tree = pickle.load(file)
    urls.extend(list(set(retrieve_all_urls(cat_tree))))

In [22]:
# Retrieve unique urls from the extracted list
final_urls = []
for url in urls:
    if url not in final_urls:
        final_urls.append(url)

final_urls_df = pd.DataFrame(final_urls, columns= ["url"])
final_urls_df.to_csv("./data/final_urls.csv", index=False)

In [12]:
df = pd.read_csv("./data/final_urls.csv")
print(df)

                                                                            url
0                                         https://en.wikipedia.org/wiki/Neuroph
1                                         https://en.wikipedia.org/wiki/Craiyon
2      https://en.wikipedia.org/wiki/Chi-square_automatic_interaction_detection
3                        https://en.wikipedia.org/wiki/Recursive_neural_network
4                          https://en.wikipedia.org/wiki/Support_vector_machine
...                                                                         ...
16251                         https://en.wikipedia.org/wiki/Gap_(chart_pattern)
16252                                 https://en.wikipedia.org/wiki/Forest_plot
16253                                      https://en.wikipedia.org/wiki/Sweave
16254                                        https://en.wikipedia.org/wiki/Doji
16255                              https://en.wikipedia.org/wiki/Thomas_Kailath

[16256 rows x 1 columns]
