In [27]:
import pandas as pd
import wikipedia

In [28]:
def get_wiki_page(title):
    """
    Get the wikipedia page given a title
    """
    try:
        return wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        return wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError as e:
        return None

In [29]:
def recursively_find_all_pages(titles, titles_so_far=set(), max_pages=2):
    """
    Recursively find up to a specified number of pages linked to the Wikipedia titles in the list
    """
    all_pages = []
    
    titles = list(set(titles) - titles_so_far)

    titles_so_far.update(titles)

    for title in titles:
        page = get_wiki_page(title)
        if page is None:
            continue
        all_pages.append(page)

        if len(all_pages) >= max_pages:
            break

        new_pages = recursively_find_all_pages(page.links, titles_so_far, max_pages=max_pages)
        for pg in new_pages:
            if pg.title not in [p.title for p in all_pages]:
                all_pages.append(pg)
                if len(all_pages) >= max_pages:
                    break

        titles_so_far.update(page.links)
        
        if len(all_pages) >= max_pages:
            break
        
    return all_pages

In [30]:
topics = ['Constitution of India', 'Code of Civil Procedure (India)', 'Code of Criminal Procedure (India)', 'Indian Evidence Act', 'Indian Penal Code']
# topics = ['Code of Civil Procedure (India)']

In [31]:
pages1 = recursively_find_all_pages(topics[0])
print(pages1)



  lis = BeautifulSoup(html).find_all('li')


[<WikipediaPage 'O'>, <WikipediaPage 'Ø'>]


In [32]:
pages2 = recursively_find_all_pages(topics[1])
print(pages2)

[<WikipediaPage 'E'>, <WikipediaPage 'Close-mid central unrounded vowel'>]


In [33]:
pages3 = recursively_find_all_pages(topics[2])
print(pages3)

DisambiguationError: "Matthew Perry (disambiguation)" may refer to: 
Matthew C. Perry
Matthew Perry Monument (Newport, Rhode Island)
USNS Matthew Perry (T-AKE-9)
Matthew J. Perry
Matt Perry (rugby union)
Matt Parry
Matthew Parry (cricketer)

In [None]:
pages4 = recursively_find_all_pages(topics[3])
print(pages4)

In [None]:
pages5 = recursively_find_all_pages(topics[4])
print(pages5)

In [None]:
pages = pages1 + pages2 + pages3 + pages4 + pages5
print(pages)
len(pages)

In [None]:
import re
from typing import Set
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize

In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [None]:

def count_tokens(text: str) -> int:
    """count the number of tokens in a string"""
    return len(tokenizer.encode(text))

In [None]:
import nltk
nltk.download('punkt')

In [None]:
def reduce_long(
    long_text: str, long_text_tokens: bool = False, max_len: int = 120
) -> str:
    """
    Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens > max_len:
        sentences = sent_tokenize(long_text.replace("\n", " "))
        ntokens = 0
        for i, sentence in enumerate(sentences):
            ntokens += 1 + count_tokens(sentence)
            if ntokens > max_len:
                return ". ".join(sentences[:i]) + "."

    return long_text

In [None]:
discard_categories = ['See also', 'References', 'External links', 'Further reading', "Footnotes",
    "Bibliography", "Sources", "Citations", "Literature", "Footnotes", "Notes and references",
    "Photo gallery", "Works cited", "Photos", "Gallery", "Notes", "References and sources",
    "References and notes",]

In [None]:
def extract_sections(
    wiki_text: str,
    title: str,
    max_len: int = 1500,
    discard_categories: Set[str] = discard_categories,
) -> str:
    """
    Extract the sections of a Wikipedia page, discarding the references and other low information sections
    """
    if len(wiki_text) == 0:
        return []

    # find all headings and the coresponding contents
    headings = re.findall("==+ .* ==+", wiki_text)

    for heading in headings:
        wiki_text = wiki_text.replace(heading, "==+ !! ==+")

    contents = wiki_text.split("==+ !! ==+")

    contents = [c.strip() for c in contents]
    assert len(headings) == len(contents) - 1

    cont = contents.pop(0).strip()
    outputs = [(title, "Summary", cont, count_tokens(cont)+4)]

    # discard the discard categories, accounting for a tree structure
    max_level = 100
    keep_group_level = max_level
    remove_group_level = max_level
    nheadings, ncontents = [], []
    for heading, content in zip(headings, contents):
        plain_heading = " ".join(heading.split(" ")[1:-1])
        num_equals = len(heading.split(" ")[0])
        if num_equals <= keep_group_level:
            keep_group_level = max_level

        if num_equals > remove_group_level:
            if (
                num_equals <= keep_group_level
            ):
                continue
        keep_group_level = max_level
        if plain_heading in discard_categories:
            remove_group_level = num_equals
            keep_group_level = max_level
            continue
        nheadings.append(heading.replace("=", "").strip())
        ncontents.append(content)
        remove_group_level = max_level

    # count the tokens of each section
    ncontent_ntokens = [
        count_tokens(c)
        + 3
        + count_tokens(" ".join(h.split(" ")[1:-1]))
        - (1 if len(c) == 0 else 0)
        for h, c in zip(nheadings, ncontents)
    ]

    # Create a tuple of (title, section_name, content, number of tokens)
    outputs += [(title, h, c, t) if t<max_len 
                else (title, h, reduce_long(c, max_len), count_tokens(reduce_long(c,max_len))) 
                    for h, c, t in zip(nheadings, ncontents, ncontent_ntokens)]
    
    return outputs

In [None]:
res = []
for page in pages:
    res += extract_sections(page.content, page.title)
df = pd.DataFrame(res, columns=["title", "heading", "content", "tokens"])
df = df[df.tokens>40]
df = df.drop_duplicates(['title','heading'])
df = df.reset_index().drop('index',axis=1) # reset index
df = df.iloc[:5]
df

In [None]:
df.to_csv('processed_data/processed_data.csv', index=False)