In [23]:
import requests
from pprint import pprint
import re
import lxml.html
from bs4 import BeautifulSoup


In [3]:
# endpoint of the English Wikipedia API that the request cells below query
URL = "https://en.wikipedia.org/w/api.php"

# session object shared by the API request cells of this notebook
S = requests.Session()

In [32]:
# id of the Wikipedia page to read; sent as the `pageid` parameter of the parse requests
page_id = "353810"
# label of this data set: used as the title key of every scraped record
# and as the prefix of the JSON output file name
topic = 'english_town'

In [5]:
# ask the parse API for the section outline of the page
PARAMS = dict(
    action="parse",
    pageid=page_id,
    format="json",
    prop="sections",
)

sectionsR = S.get(URL, params=PARAMS)
# decoded JSON body; the section records live under ["parse"]["sections"]
sectionDATA = sectionsR.json()



In [6]:
# anchor name of every section returned by the sections request, in page order
sections = [
    entry['anchor']
    for entry in sectionDATA["parse"]["sections"]
]
sections

['Chartered_towns_and_town_councils',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'Designated_new_towns',
 'See_also',
 'Notes',
 'References']

In [7]:
# Anchors of the non-data sections that must be left out of the section list.
# Stored in a set (membership test only) and given its own name so the
# builtin `filter` is not shadowed for the rest of the notebook.
NON_DATA_SECTIONS = {
    'Chartered_towns_and_town_councils',
    'See_also',
    'External_links',
    'Designated_new_towns',
    'Notes',
    'References',
}

# keep the full section records (not just the anchors) of the data sections
sec_list = [
    sec
    for sec in sectionDATA["parse"]["sections"]
    if sec['anchor'] not in NON_DATA_SECTIONS
]

# number of sections that remain after filtering
len(sec_list)

24

In [15]:
# flat layout (no nested sections): one fresh, empty entry list per section anchor
section_dict = dict(
    (entry['anchor'], [])
    for entry in sec_list
)


{'A': [],
 'B': [],
 'C': [],
 'D': [],
 'E': [],
 'F': [],
 'G': [],
 'H': [],
 'I': [],
 'J': [],
 'K': [],
 'L': [],
 'M': [],
 'N': [],
 'O': [],
 'P': [],
 'Q': [],
 'R': [],
 'S': [],
 'T': [],
 'U': [],
 'V': [],
 'W': [],
 'Y': []}

In [17]:
# ask the parse API for every link found on the page
PARAMS = dict(
    action="parse",
    pageid=page_id,
    format="json",
    prop="links",
)

linkR = S.get(URL, params=PARAMS)
# decoded JSON body; the link records live under ["parse"]["links"]
linkDATA = linkR.json()

pprint(linkDATA["parse"]["links"])

[{'*': 'List of towns in Europe', 'exists': '', 'ns': 0},
 {'*': 'List of towns in Vatican City', 'ns': 0},
 {'*': 'List of towns in Kosovo', 'exists': '', 'ns': 0},
 {'*': 'List of towns in the European Union', 'ns': 0},
 {'*': 'List of towns in the European Economic Area', 'ns': 0},
 {'*': 'List of towns in the Sovereign Military Order of Malta', 'ns': 0},
 {'*': 'England', 'exists': '', 'ns': 0},
 {'*': 'Index of England-related articles', 'ns': 0},
 {'*': 'History of England', 'exists': '', 'ns': 0},
 {'*': 'Geography of England', 'exists': '', 'ns': 0},
 {'*': 'Administrative divisions of England', 'exists': '', 'ns': 0},
 {'*': 'Politics of England', 'exists': '', 'ns': 0},
 {'*': 'Governance of England', 'exists': '', 'ns': 0},
 {'*': 'Government of England', 'exists': '', 'ns': 0},
 {'*': 'Military of England', 'exists': '', 'ns': 0},
 {'*': 'Economy of England', 'exists': '', 'ns': 0},
 {'*': 'Culture of England', 'exists': '', 'ns': 0},
 {'*': 'Demographics of England', 'exis

In [20]:
# the '*' field of each link record holds the title of the linked page
links = [record['*'] for record in linkDATA["parse"]["links"]]

# print every title next to its list position so slice bounds can be read off the output
for i, link in enumerate(links):
    print(f"{i} {link}")

0 List of towns in Europe
1 List of towns in Vatican City
2 List of towns in Kosovo
3 List of towns in the European Union
4 List of towns in the European Economic Area
5 List of towns in the Sovereign Military Order of Malta
6 England
7 Index of England-related articles
8 History of England
9 Geography of England
10 Administrative divisions of England
11 Politics of England
12 Governance of England
13 Government of England
14 Military of England
15 Economy of England
16 Culture of England
17 Demographics of England
18 Religion in England
19 National symbols of England
20 Outline of England
21 Bibliography of England
22 Abingdon-on-Thames
23 Accrington
24 Acle
25 Acton, London
26 Adlington, Lancashire
27 Alcester
28 Aldeburgh
29 Aldershot
30 Alford, Lincolnshire
31 Alfreton
32 Alnwick
33 Alsager
34 Alston, Cumbria
35 Alton, Hampshire
36 Altrincham
37 Amble
38 Ambleside
39 Amersham
40 Amesbury
41 Ampthill
42 Ancient borough
43 Andover, Hampshire
44 Appleby-in-Westmorland
45 Arlesey
46 Ar

985 Winterton, Lincolnshire
986 Wirksworth
987 Wisbech
988 Witham
989 Withernsea
990 Witney
991 Wiveliscombe
992 Wivenhoe
993 Woburn, Bedfordshire
994 Woburn Sands
995 Woking
996 Wokingham
997 Wolsingham
998 Wolverton and Greenleys
999 Wood Green
1000 Woodbridge, Suffolk
1001 Woodley, Berkshire
1002 Woodstock, Oxfordshire
1003 Wooler
1004 Workington
1005 Worksop
1006 Worthing
1007 Wotton-under-Edge
1008 Wragby
1009 Wymondham
1010 Yarm
1011 Yarmouth, Isle of Wight
1012 Yate
1013 Yateley
1014 Yeovil
1015 Acts of Union 1707
1016 Afternoon Tea
1017 Angevin Empire
1018 Anglo-Saxon England
1019 Anglophile
1020 Association Football in England
1021 British Regency
1022 British country clothing
1023 Church of England
1024 Counties of England
1025 Cricket in England
1026 Demography of England
1027 Districts of England
1028 Economy of England in the Middle Ages
1029 Education in England
1030 Edwardian period
1031 Elizabethan era
1032 Elizabethan government
1033 England cricket team
1034 England n

In [30]:
# look at the raw page-info answer of the query API for a one-element sample of the links
for link in links[22:23]:
    # page info, including the display title, for this one title
    PARAMS = dict(
        action="query",
        titles=link,
        format="json",
        prop="info",
        inprop="displaytitle",
    )

    infoR = S.get(URL, params=PARAMS)
    infoDATA = infoR.json()
    pprint(infoDATA)

{'batchcomplete': '',
 'query': {'pages': {'155482': {'contentmodel': 'wikitext',
                                'displaytitle': 'Abingdon-on-Thames',
                                'lastrevid': 931443368,
                                'length': 47515,
                                'ns': 0,
                                'pageid': 155482,
                                'pagelanguage': 'en',
                                'pagelanguagedir': 'ltr',
                                'pagelanguagehtmlcode': 'en',
                                'title': 'Abingdon-on-Thames',
                                'touched': '2019-12-23T21:39:04Z'}}}}


In [34]:
# base address of the "action=info" page that is scraped for every title
INFO_PAGE_URL = 'https://en.wikipedia.org/w/index.php'


def _scrape_field(extract, field, index, title, url, error_log):
    """Return the value produced by ``extract()``, or "#error#" if it fails.

    extract   -- zero-argument callable that pulls one value out of the parsed page
    field     -- name of the statistic, used in the error message
    index     -- position of the item in ``items`` (for the messages)
    title     -- page title of the item (for the messages)
    url       -- address that was requested (for the messages)
    error_log -- open, writable text file that receives one line per failure
    """
    try:
        return extract()
    except Exception:
        # the expected element is missing or laid out differently on this page
        topic_error_line = f"{field} parse error"
        item_line = f"{str(index)}, {title}, {url}"
        # NOTE(review): log_error is not defined in any visible cell -- it has
        # to exist in the kernel before this cell runs; confirm where it lives.
        print(log_error(topic_error_line, item_line))
        error_log.write(str(index) + ' : ' + title + ' : ' + url + '\n')
        return "#error#"


item_infos = []
# The log is opened once for the whole run: opening it in a write mode inside
# the loop truncates it for every item, so only the errors of the last item
# would remain in the file.
with open('info_error_log.txt', 'w', encoding="utf-8") as error_log:
    # NOTE(review): `items` is not defined in any visible cell -- presumably the
    # page titles selected from `links`; confirm before a fresh run.
    for index, title in enumerate(items):
        # let requests build the query string so that titles containing
        # characters such as '&', '+' or '#' are escaped correctly
        response = requests.get(INFO_PAGE_URL, params={'title': title, 'action': 'info'})
        url = response.url
        doc = lxml.html.fromstring(response.content)

        # second child of the element with id "mw-pageinfo-watchers"
        watchers = _scrape_field(
            lambda: doc.xpath('//*[@id="mw-pageinfo-watchers"]')[0][1].text,
            'watchers', index, title, url, error_log)

        # second cell of the table row whose label text contains "redirects"
        redirects = _scrape_field(
            lambda: doc.xpath('//*[contains(text(),"redirects")]/../../td')[1].text,
            'redirects', index, title, url, error_log)

        # div inside the cell of the element with id "mw-pvi-month-count"
        views = _scrape_field(
            lambda: doc.xpath('//*[@id="mw-pvi-month-count"]/td/div')[0].text,
            'views', index, title, url, error_log)

        # second cell of the element with id "mw-pageinfo-edits"
        edits = _scrape_field(
            lambda: doc.xpath('//*[@id="mw-pageinfo-edits"]/td')[1].text,
            'edits', index, title, url, error_log)

        # the title is stored under the current value of `topic`
        item_infos.append({
            topic: title,
            'watchers': watchers,
            'redirects': redirects,
            'views': views,
            'edits': edits
        })
    

In [28]:
# preview the first records only instead of dumping the whole list into the cell output
item_infos[:10]

[{'item': 'Abingdon-on-Thames',
  'watchers': '74',
  'redirects': '8',
  'views': '9,867',
  'edits': '1,162'},
 {'item': 'Accrington',
  'watchers': '43',
  'redirects': '3',
  'views': '4,159',
  'edits': '1,272'},
 {'item': 'Acle',
  'watchers': 'Fewer than 30 watchers',
  'redirects': '2',
  'views': '541',
  'edits': '154'},
 {'item': 'Acton, London',
  'watchers': '34',
  'redirects': '6',
  'views': '4,949',
  'edits': '815'},
 {'item': 'Adlington, Lancashire',
  'watchers': 'Fewer than 30 watchers',
  'redirects': '1',
  'views': '2,395',
  'edits': '314'},
 {'item': 'Alcester',
  'watchers': 'Fewer than 30 watchers',
  'redirects': '5',
  'views': '1,565',
  'edits': '394'},
 {'item': 'Aldeburgh',
  'watchers': '38',
  'redirects': '3',
  'views': '2,875',
  'edits': '572'},
 {'item': 'Aldershot',
  'watchers': '63',
  'redirects': '7',
  'views': '7,901',
  'edits': '1,499'},
 {'item': 'Alford, Lincolnshire',
  'watchers': 'Fewer than 30 watchers',
  'redirects': '0',
  'vie

In [33]:
# write the scraped records to "<topic>_infos.json"
import json

with open(f"{topic}_infos.json", 'w') as out_file:
    out_file.write(json.dumps(item_infos))

In [None]:
# ask the parse API for the raw wikitext of the page
PARAMS = dict(
    action="parse",
    pageid=page_id,
    format="json",
    prop="wikitext",
)

textR = S.get(URL, params=PARAMS)
# decoded JSON body; the wikitext is read from ["parse"]["wikitext"]["*"]
textDATA = textR.json()



In [None]:
# parse data

def _report_parse_error(field, index, line, error_log):
    """Print a banner for a field that could not be parsed and log the raw line.

    field     -- name of the field ('poet', 'date' or 'description')
    index     -- position of the line in ``lines``
    line      -- the wikitext line that failed to parse
    error_log -- open, writable text file collecting the offending lines
    """
    header = f"#### {field} parse error ####"
    rule = "#" * len(header)
    print(rule)
    print(header)
    print(index, line)
    error_log.write(line + '\n')
    print(rule)


def _parse_entry(line, index, error_log):
    """Split one ``*[[...]]`` bullet line into a record dict.

    Returns a dict with the keys 'poet', 'date' and 'description'. A field
    that cannot be found is reported through ``_report_parse_error`` and
    stored as an empty string, so a failed line never inherits the value of
    the line parsed before it.
    """
    # text of the first [[...]] link on the line
    poet_m = re.search(r'\[\[(.*?)\]\]', line)
    if poet_m is None:
        _report_parse_error('poet', index, line, error_log)
    poet = poet_m.group(1) if poet_m else ""

    # widest "(...)" span on the line; group(0) keeps the parentheses
    date_m = re.search(r'\((.*)\)', line)
    if date_m is None:
        _report_parse_error('date', index, line, error_log)
    date = date_m.group(0) if date_m else ""

    # everything that follows the first "]]"
    description_m = re.search(r'\]\](\)*,*.*)', line)
    if description_m is None:
        _report_parse_error('description', index, line, error_log)
    description = description_m.group(1) if description_m else ""

    return {
        'poet': poet,
        'date': date,
        'description': description
    }


# NOTE(review): the slice bounds look hand-tuned to skip leading and trailing
# lines of the page -- confirm that they fit this page's wikitext.
lines = textDATA["parse"]["wikitext"]["*"].split('\n')[3:-10]
this_sec = {}   # sub-section name -> list of records, for the section being read
sec = ''        # name of the current level-2 section
sub_sec = ''    # name of the current level-3 sub-section
with open('error_lines.txt', 'w', encoding="utf-8") as error_log:
    for index, line in enumerate(lines):
        # level-2 header: "==" directly followed by a word character
        if re.match(r'==\w', line[0:3]):
            if this_sec:
                # `sec` still names the header that preceded this one, so the
                # finished nested section is stored under its own name
                section_dict[sec] = this_sec
            sec = line.strip().replace("=", "")
            this_sec = {}
            # a sub-section never continues into the next section
            sub_sec = ''

        # lines before the first header, or below a header that is not a key
        # of section_dict, carry no data for any known section
        if sec not in section_dict:
            continue

        is_entry = re.match(r'\*\[\[', line[0:3]) is not None

        if isinstance(section_dict[sec], list):
            # flat section: records are appended to the section's list
            if is_entry:
                section_dict[sec].append(_parse_entry(line, index, error_log))
        else:
            # nested section: records are grouped by sub-section in this_sec
            if re.match(r'===\w', line[0:4]):
                sub_sec = line.strip().replace("=", "")
            if line.strip().replace("=", "") in section_dict[sec]:
                this_sec[sub_sec] = []
            if sub_sec != '' and is_entry:
                this_sec[sub_sec].append(_parse_entry(line, index, error_log))

    # store the last nested section, which no following header flushes
    if this_sec:
        section_dict[sec] = this_sec
