In [1]:
import json
import wikipediaapi
import pandas as pd

from collections import Counter

In [2]:
# Shared Wikipedia client (language 'en'); the page lookups in this notebook go through it.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='PawfectMate App',
)

### Get all pages under the category

In [3]:
def print_categorymembers(categorymembers, level=0, max_level=1):
    """Print a category's members as a starred outline, recursing into sub-categories.

    Each member is printed on its own line as ``<stars>: <title> (ns: <ns>)``,
    where the number of stars is ``level + 1``.

    Args:
        categorymembers: Mapping whose values expose ``title``, ``ns`` and
            ``categorymembers`` attributes (only the values are used).
        level: Current nesting depth; 0 for the top-level category.
        max_level: Members in the category namespace are expanded only while
            ``level < max_level``.

    Returns:
        None. The outline is written to standard output.
    """
    bullet = "*" * (level + 1)
    for member in categorymembers.values():
        print(f"{bullet}: {member.title} (ns: {member.ns:d})")
        is_category = member.ns == wikipediaapi.Namespace.CATEGORY
        if not (is_category and level < max_level):
            continue
        # Sub-category within the depth budget: list its members one level deeper.
        print_categorymembers(member.categorymembers, level=level + 1, max_level=max_level)

In [4]:
# Look up the FCI breeds category and print its members, expanding
# sub-categories one level deep (the helper's default depth, spelled out here).
cat = wiki_wiki.page("Category:FCI breeds")
print("Category members: Category:FCI breeds")
print_categorymembers(
    categorymembers=cat.categorymembers,
    level=0,
    max_level=1,
)

Category members: Category:FCI breeds
*: Affenpinscher (ns: 0)
*: Afghan Hound (ns: 0)
*: Aidi (ns: 0)
*: Airedale Terrier (ns: 0)
*: Akita (dog breed) (ns: 0)
*: Alaskan Malamute (ns: 0)
*: Alpine Dachsbracke (ns: 0)
*: American Cocker Spaniel (ns: 0)
*: American Foxhound (ns: 0)
*: American Staffordshire Terrier (ns: 0)
*: American Water Spaniel (ns: 0)
*: Andalusian Terrier (ns: 0)
*: Anglo-Français de Petite Vénerie (ns: 0)
*: Appenzeller Sennenhund (ns: 0)
*: Ariège Pointer (ns: 0)
*: Ariégeois (ns: 0)
*: Artois Hound (ns: 0)
*: Australian Cattle Dog (ns: 0)
*: Australian Kelpie (ns: 0)
*: Australian Shepherd (ns: 0)
*: Australian Silky Terrier (ns: 0)
*: Australian Stumpy Tail Cattle Dog (ns: 0)
*: Australian Terrier (ns: 0)
*: Austrian Black and Tan Hound (ns: 0)
*: Azawakh (ns: 0)
*: Barak hound (ns: 0)
*: Barbet (dog breed) (ns: 0)
*: Basenji (ns: 0)
*: Basset Artésien Normand (ns: 0)
*: Basset Bleu de Gascogne (ns: 0)
*: Basset Fauve de Bretagne (ns: 0)
*: Basset Hound (ns: 0

In [5]:
# Keys of the category-member mapping. They are used below as page titles
# (filtered on the "Category" prefix and passed to wiki_wiki.page).
all_breeds = cat.categorymembers.keys()

### Get all sections for each wiki-page

In [6]:
# Keep only the member titles that do not start with "Category"
# (this drops the sub-category entries).
all_breeds_list = [title for title in all_breeds if not title.startswith("Category")]

# Map each breed title to the titles of the sections returned by its page's
# `sections` attribute, in the order they are returned.
all_breeds_sections = {
    breed: [section.title for section in wiki_wiki.page(breed).sections]
    for breed in all_breeds_list
}

In [7]:
# Show the breed title -> list of section titles mapping as the cell output.
all_breeds_sections

{'Affenpinscher': ['History', 'Description', 'Health', 'Notes'],
 'Afghan Hound': ['History',
  'Description',
  'Variants',
  'Health',
  'In popular culture',
  'See also',
  'References',
  'Further reading',
  'External links'],
 'Aidi': ['History',
  'Appearance',
  'See also',
  'References',
  'Further reading'],
 'Airedale Terrier': ['Description',
  'Health',
  'History',
  'Notable Airedales',
  'See also',
  'References',
  'Further reading',
  'External links'],
 'Akita (dog breed)': ['Breed name',
  'History',
  'Gallery',
  'Temperament',
  'Health',
  'Working life',
  'See also',
  'References',
  'External links'],
 'Alaskan Malamute': ['Lineage',
  'Appearance',
  'Health',
  'History',
  'See also',
  'Footnotes'],
 'Alpine Dachsbracke': ['Description', 'History', 'See also'],
 'American Cocker Spaniel': ['History',
  'Appearance',
  'Temperament',
  'Health',
  'In popular culture',
  'References',
  'External links'],
 'American Foxhound': ['History',
  'Descriptio

In [8]:
# Flatten every breed's section titles into one list, then tally how many
# pages use each title.
all_values = [
    title
    for titles in all_breeds_sections.values()
    for title in titles
]
counter = Counter(all_values)

print("Unique sections:", list(counter))
print("Count for each section:", dict(counter))

Unique sections: ['History', 'Description', 'Health', 'Notes', 'Variants', 'In popular culture', 'See also', 'References', 'Further reading', 'External links', 'Appearance', 'Notable Airedales', 'Breed name', 'Gallery', 'Temperament', 'Working life', 'Lineage', 'Footnotes', 'Behavior', 'Trigg Hound', 'Breed-specific legislation and restrictions', 'Popularity', 'Similar breeds', 'History and use', 'Health and temperament', 'Characteristics', 'Use', 'Overview', 'As pets', 'Health and lifespan', 'Citations', 'General and cited references', 'Notable Kelpies', 'Show coat colours', 'Explanatory notes', 'Activities', 'Breed recognition', 'Uses', 'Etymology', 'Breed history', 'Notable Barbets', 'Name', 'Recognition', 'Recognition and categorisation', 'Hunting with Bassets', 'Sense of smell', 'Variations', 'Hunting', 'Detection', 'Use in animal testing', 'Other roles', 'Notable Beagles', 'Bibliography', 'In art', 'Notable examples', 'Care', 'Notable dogs', 'Rescues', 'Resources', 'Colour types'

In [9]:
# One row per section title: `key` is the title, `value` is its count.
df = pd.DataFrame(
    list(counter.items()),
    columns=["key", "value"],
)

In [10]:
# The ten section titles with the highest counts.
df.sort_values(by='value', ascending=False).head(n=10)

Unnamed: 0,key,value
0,History,310
6,See also,256
7,References,240
2,Health,176
1,Description,160
9,External links,136
10,Appearance,105
14,Temperament,89
25,Characteristics,62
3,Notes,42


### Prepare dataset

In [11]:
# Sections to take: History, Health, Description/Characteristics, Appearance, Temperament

In [12]:
# Section titles extracted for every breed; each becomes one dataset column.
sections_to_take = [
    'History',
    'Health',
    'Description',
    'Characteristics',
    'Appearance',
    'Temperament',
]

In [13]:
# Build {breed title: {section title: section text}} for the selected sections.
# A section that a page does not have is stored as an empty string, so every
# breed ends up with the same set of keys.
rag_dataset = {}

for breed in all_breeds_list:
    page_py = wiki_wiki.page(breed)
    content = {}
    for s in sections_to_take:
        # section_by_title returns None when the page has no such section.
        # Check for that explicitly instead of using a bare `except:`, which
        # would also hide network failures and KeyboardInterrupt and silently
        # record empty text for them.
        section = page_py.section_by_title(s)
        content[s] = section.text if section is not None else ''
    rag_dataset[breed] = content

In [14]:
# Show the breed title -> {section title: text} mapping as the cell output.
# NOTE(review): this renders the whole dataset; consider showing a single entry instead.
rag_dataset

{'Affenpinscher': {'History': "The word 'Affenpinscher' derives from Affe, German for 'ape' or 'monkey'; it is sometimes translated as 'Monkey Terrier', although the dog is a pinscher and not a terrier.\nThe origins of the Pinscher group of dogs are unknown. Dogs of this type, both rough-haired and smooth-haired, were traditionally kept as carriage dogs or as stable dogs, and so were sometimes known as Stallpinscher; they were capable ratters. Until the late nineteenth century, both rough-haired and smooth-haired types were known as Deutscher Pinscher, and came from the same lineage; puppies of both types could occur in the same litter.\nIn 1880 the Pinscher was recorded in the Deutschen Hundestammbuch of the Verein zur Veredelung der Hunderassen. In 1895 Ludwig Beckmann described five varieties of Pinscher – the rough- and smooth-haired Pinscher, the rough- and smooth-haired Miniature Pinscher, and the Affenpinscher. In 1895 a breed society, the Pinscher-Schnauzer-Klub, was establishe

In [15]:
# One row per breed and one column per section; the breed titles (the dict keys)
# move from the index into a regular `breed_name` column.
df = (
    pd.DataFrame
    .from_dict(rag_dataset, orient='index')
    .reset_index(names=['breed_name'])
)

In [16]:
# Preview the first rows of the dataset frame.
df.head()

Unnamed: 0,breed_name,History,Health,Description,Characteristics,Appearance,Temperament
0,Affenpinscher,"The word 'Affenpinscher' derives from Affe, Ge...",A UK study found a life expectancy of 9.3 year...,The Affenpinscher generally weighs 4–6 kg (9–1...,,,
1,Afghan Hound,The Afghan Hound has been identified as a basa...,,The dogs in this breed occur in many different...,,,
2,Aidi,The Aidi is a breed native to the Atlas Mounta...,,,,Standing 52–62 cm (20–24 in) in height and wei...,
3,Airedale Terrier,"Airedale, a valley (dale) in the West Riding o...",A UK study found a life expectancy of 12 years...,,,The Airedale is the largest of the British ter...,The Airedale can be used as a working dog and ...
4,Akita (dog breed),,,,,"As a spitz breed, the appearance of the Akita ...",The Akita is generally seen as territorial abo...


In [17]:
# Normalise the column labels to lower case (e.g. 'History' -> 'history').
df.columns = df.columns.map(str.lower)

In [18]:
# Add an `id` column, copied from the current index, as the first column.
# Guarded so that re-running this cell is a no-op: without the check, a second
# run raises ValueError because the `id` column already exists.
if 'id' not in df.columns:
    df.insert(0, 'id', df.index)

In [19]:
# Write the dataset to CSV without the DataFrame index.
df.to_csv(
    path_or_buf='../data/rag_dataset.csv',
    index=False,
)