Hello fellow Kagglers,

This notebook demonstrates the method used to create the [Simple/Normal Wikipedia Abstracts V1](https://www.kaggle.com/markwijkhuizen/simplenormal-wikipedia-abstracts-v1) dataset.

An example of using this dataset to pretrain for the CommonLit competition dataset can be found [here](https://www.kaggle.com/markwijkhuizen/simple-normal-wikipedia-abstracts-pretraining).

In short, all titles of simple Wikipedia pages are extracted, and for each title both the simple and normal abstracts are acquired using the public Wikipedia API.

Documentation for the Wikipedia API can be found [here](https://www.mediawiki.org/wiki/API:Main_page).

I will keep on working to improve the dataset by adding page content next to abstracts.

***Before rerunning or editing this notebook, read the [Wikipedia API Etiquette](https://www.mediawiki.org/wiki/API:Etiquette)***

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from multiprocessing import cpu_count
from pympler.asizeof import asizeof

import os
import sys
import nltk
import string
import math
import spacy
import gc
import re
import requests
import time
import joblib
import sys

# Download Stopword Corpus
nltk.download('stopwords')

# spaCy pipeline used to compute the per-token linguistic features
nlp = spacy.load('en_core_web_sm')

# Stop Words
# The abstracts are English, so only the English list is used. Calling
# stopwords.words() without a language returns the lists of every language in
# the corpus, which also removes English content words that happen to be stop
# words elsewhere (e.g. German "die").
STOP_WORDS = set(stopwords.words('english'))

# Register the progress_apply helpers on pandas objects
tqdm.pandas()

print(f'python version: {sys.version}')

# Simple English Titles

Extract all page titles from the simple Wikipedia XML dump. This dump and many other dumps can be found [here](https://meta.wikimedia.org/wiki/Data_dump_torrents#Simple_English_Wikipedia).

In [None]:
# Use the pre-extracted title list when it is available, otherwise parse the XML dump
cached_titles_path = '/kaggle/input/wikipedia-abstracts/simple_titles.npy'

if not os.path.exists(cached_titles_path):
    ns = { 'ns': 'http://www.mediawiki.org/xml/export-0.10/' }
    page_tag = '{http://www.mediawiki.org/xml/export-0.10/}page'
    root = ET.parse('../input/wikipedia-abstracts/simplewiki-20170820-pages-meta-current.xml').getroot()
    simple_titles = []

    # Walk over the direct children of the dump root
    for idx, child in enumerate(tqdm(root)):
        # Skip everything that is not a <page> element
        if child.tag != page_tag:
            continue
        # Only keep pages whose <ns> value is "0" (articles)
        if child.find('ns:ns', ns).text != '0':
            continue
        title = child.find('ns:title', ns).text
        simple_titles.append(title)
else:
    simple_titles = np.load(cached_titles_path)

print(f'Found {len(simple_titles)} valid simple English Wikipedia pages')

In [None]:
# Titles are requested in groups of at most 20, the per-call limit of the API
TITLES_PER_REQUEST = 20
n_title_chunks = math.ceil(len(simple_titles) / TITLES_PER_REQUEST)
simple_titles_chunks = np.array_split(simple_titles, n_title_chunks)
print(f'Created {len(simple_titles_chunks)} title chunks')

# Crawl Abstract

In [None]:
# Crawl URLs: both endpoints carry the same flag-style query arguments
_EXTRACT_FLAGS = 'exintro&explaintext'
simple_wiki_url = 'https://simple.wikipedia.org/w/api.php?' + _EXTRACT_FLAGS
wiki_url = 'https://en.wikipedia.org/w/api.php?' + _EXTRACT_FLAGS

In [None]:
# Function to clean the abstracts
def clean_abstract(s):
    # replace all combination of line breaks and spaces with a single space
    for m in re.findall('[\s|\\n]+', s):
        s = s.replace(m, ' ')
        
    # replace missing spaces after sentence ending
    for (a, b, c) in re.findall('([a-zA-Z])([\.|!|?])([a-zA-Z])', s):
        s = s.replace(f'{a}{b}{c}', f'{a}{b} {c}')
    
    return s

# Phrases that, when found in the lower-cased first sentence, mark an abstract
# as styled ('displaystyle') or as a referral page to other pages
_FILTERED_FIRST_SENTENCE_PHRASES = (
    'displaystyle', 'may refer to', 'could mean', 'might mean', 'may mean', 'can mean', 'may be', 'can refer to',
    'might refer to', 'has several meanings', 'can mean different things', 'refers to', 'may mean the following',
    'can mean several things', 'may stand for',
)

# Lower-case fragments that mark a title as a list or disambiguation page
_FILTERED_TITLE_FRAGMENTS = ('list of', 'lists of', 'disambiguation')


def process_page(page_data, target_dict):
    """Store the abstract of one API page in ``target_dict`` unless it is filtered out.

    Parameters
    ----------
    page_data : dict
        Page entry of the API response; must contain the keys 'title' and 'extract'.
    target_dict : dict
        Dictionary that receives ``{'original': ..., 'clean': ...}`` under the page title.

    Returns
    -------
    None
        Accepted pages are written to ``target_dict`` and their title is added
        to the module-level ``GLOBAL_TITLES`` set; filtered pages leave both untouched.
    """
    title = page_data['title']
    abstract_original = page_data['extract']

    # Abstract is missing
    if len(abstract_original) == 0:
        return

    # Tokenize once instead of once per phrase
    sentences = sent_tokenize(abstract_original)
    # Guard the [0] lookup below in case the tokenizer yields no sentence
    # (presumably possible for whitespace-only text); treat it as a missing abstract
    if not sentences:
        return

    # Abstract contains styling or is a referral page to other pages
    first_sentence = sentences[0].lower()
    if any(phrase in first_sentence for phrase in _FILTERED_FIRST_SENTENCE_PHRASES):
        return

    # Title marks a list or disambiguation page. The fragments must be lower
    # case because they are compared against the lower-cased title.
    title_lower = title.lower()
    if any(fragment in title_lower for fragment in _FILTERED_TITLE_FRAGMENTS):
        return

    target_dict[f'{title}'] = {
        'original': abstract_original,
        'clean': clean_abstract(abstract_original),
    }
    # Add title to valid titles
    GLOBAL_TITLES.add(title)

In [None]:
# Containers filled by the crawl: title -> abstracts for normal and simple
# Wikipedia, plus the set of every accepted title
wiki_dict = {}
simple_wiki_dict = {}
GLOBAL_TITLES = set()

In [None]:
# Reference timestamp (seconds since the epoch) the crawl loop measures elapsed time from
T_START = time.time()
# Crawl budget in seconds; the crawl loop stops once this much time has elapsed
CRAWL_TIMEOUT = 60

This loop retrieves the abstracts from simple and normal Wikipedia for each title chunk. Note that each iteration can take between 1 and 20 seconds; the loop is generally quite fast for the first few minutes, after which it slows down significantly. The full crawl took ~15 hours when run locally.

In [None]:
# Seconds to wait for a single API response. requests applies no timeout by
# default, so a stalled connection would otherwise block the crawl forever.
REQUEST_TIMEOUT = 60

# A Session reuses the underlying connections across the sequential requests
with requests.Session() as session:
    for idx, title_chunk in enumerate(tqdm(simple_titles_chunks)):
        # Titles separated with "|"
        titles = '|'.join(title_chunk)

        params = {
            'format': 'json',
            'action': 'query',
            'prop': 'extracts',
            'exlimit': 'max',
            'redirects': 1,
            'titles': titles,
        }

        try:
            # Make Requests
            resp_wiki = session.get(url=wiki_url, params=params, timeout=REQUEST_TIMEOUT)
            resp_simple_wiki = session.get(url=simple_wiki_url, params=params, timeout=REQUEST_TIMEOUT)
            # Surface HTTP errors instead of trying to parse an error page
            resp_wiki.raise_for_status()
            resp_simple_wiki.raise_for_status()
            # Convert response to JSON
            data_wiki = resp_wiki.json()
            data_simple_wiki = resp_simple_wiki.json()
        except (requests.RequestException, ValueError) as err:
            # A single failed chunk should not abort a crawl that runs for
            # hours; report it and move on (ValueError covers undecodable JSON)
            print(f'Skipping title chunk {idx}: {err}')
        else:
            # Simple Wikipedia first, then normal Wikipedia
            for data, target_dict in (
                (data_simple_wiki, simple_wiki_dict),
                (data_wiki, wiki_dict),
            ):
                # A response without 'query'/'pages' simply contributes no pages
                pages = data.get('query', {}).get('pages', {})
                for page_data in pages.values():
                    if 'title' in page_data and 'extract' in page_data:
                        process_page(page_data, target_dict)

        # Automatically stop after predefined time
        if time.time() - T_START > CRAWL_TIMEOUT:
            break

In [None]:
# Some statistics of the retrieved abstracts
# dict key views support set operations directly, no set() copies needed
intersection_len = len(simple_wiki_dict.keys() & wiki_dict.keys())
simple_len = len(simple_wiki_dict)
# Avoid a ZeroDivisionError when no simple abstract was retrieved
intersection_pct = intersection_len / simple_len * 100 if simple_len else 0.0

print(f'N Simple Wikipedia abstract: {len(simple_wiki_dict)}, N Normal Wikipedia Articles: {len(wiki_dict)}')
print(f'N Unique Titles: {len(GLOBAL_TITLES)}')
print(f'Intersection percentage: {intersection_pct:.2f}%')

# Linguistic Features

In [None]:
def get_linguistic_features(s):
    """Run the spaCy pipeline ``nlp`` on ``s`` and collect per-token attributes.

    Parameters
    ----------
    s : str
        Text to analyse (the cleaned abstract).

    Returns
    -------
    dict
        Maps 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha' and 'is_stop'
        to a list holding that attribute for every token, in token order.
    """
    # (result key, token attribute) pairs, in the order the keys appear in the result
    key_to_attribute = (
        ('lemma', 'lemma_'),
        ('pos', 'pos_'),
        ('tag', 'tag_'),
        ('dep', 'dep_'),
        ('shape', 'shape_'),
        ('is_alpha', 'is_alpha'),
        ('is_stop', 'is_stop'),
    )

    tokens = list(nlp(s))

    return {
        key: [getattr(token, attribute) for token in tokens]
        for key, attribute in key_to_attribute
    }

def remove_stopwords(words):
    """Return the words that are not stop words, preserving their order.

    Parameters
    ----------
    words : list of str
        Tokens to filter.

    Returns
    -------
    list of str
        Tokens whose lower-cased form is not in ``STOP_WORDS``.
    """
    # Compare case-insensitively: NLTK's English stop word list is lower case,
    # so capitalised stop words ("The", "And") would otherwise be kept
    return [w for w in words if w.lower() not in STOP_WORDS]

In [None]:
def process_wiki_abstracts(title, abstracts, label):
    """Build one DataFrame row with the text, counts and linguistic features of an abstract.

    Parameters
    ----------
    title : str
        Page title.
    abstracts : dict
        Holds the abstract under the keys 'original' and 'clean'.
    label : str
        Source label; 'Simple Wikipedia' maps to ``label_int`` 0, anything else to 1.

    Returns
    -------
    dict
        Row dictionary. Word and string-valued feature lists are joined with
        the NULL character; 'sentences', 'is_alpha' and 'is_stop' stay lists.
    """
    # NULL character used to join the token lists into single strings
    sep = chr(0)

    text_original = abstracts['original']
    text_clean = abstracts['clean']

    # Tokens, with and without stop words
    tokens = word_tokenize(text_clean)
    content_tokens = remove_stopwords(tokens)

    # Sentences of the cleaned abstract
    sentence_list = sent_tokenize(text_clean)

    # Linguistic features are only computed for the cleaned abstract
    features = get_linguistic_features(text_clean)

    row = {
        'title': title,
        'abstract_original': text_original,
        'abstract_clean': text_clean,
        # Words
        'clean_words': sep.join(tokens),
        'words_wo_stopwords': sep.join(content_tokens),
        # Word Count
        'n_words': len(tokens),
        'n_words_wo_stopwords': len(content_tokens),
        # Sentences
        'sentences': sentence_list,
        'n_sentences': len(sentence_list),
    }

    # String-valued features are stored NULL-joined
    for key in ('lemma', 'pos', 'tag', 'dep', 'shape'):
        row[key] = sep.join(features[key])

    # Boolean features are stored as plain lists
    for key in ('is_alpha', 'is_stop'):
        row[key] = features[key]

    # Label
    row['label'] = label
    row['label_int'] = 1 if label != 'Simple Wikipedia' else 0

    return row

In [None]:
# Settings shared by both parallel runs
parallel_options = dict(
    n_jobs=cpu_count(),
    verbose=1,
    batch_size=8,
    require='sharedmem',
)
delayed_process = joblib.delayed(process_wiki_abstracts)

# SIMPLE WIKIPEDIA
jobs = [
    delayed_process(title, abstracts, 'Simple Wikipedia')
    for title, abstracts in simple_wiki_dict.items()
]
simple_wiki_rows = joblib.Parallel(**parallel_options)(jobs)

# NORMAL WIKIPEDIA
jobs = [
    delayed_process(title, abstracts, 'Normal Wikipedia')
    for title, abstracts in wiki_dict.items()
]
normal_wiki_rows = joblib.Parallel(**parallel_options)(jobs)

# Create DataFrame

In [None]:
# One row per abstract: simple Wikipedia rows first, normal Wikipedia rows after
all_rows = simple_wiki_rows + normal_wiki_rows
df = pd.DataFrame(all_rows)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only appended a stray "None" to the output
df.info()

In [None]:
# Preview the first rows of the dataset
df_preview = df.head()
display(df_preview)

In [None]:
# Word lists are saved as NULL separated strings to save memory;
# splitting on chr(0) recovers the original list.
first_clean_words = df.loc[0, 'clean_words']
print(first_clean_words.split(chr(0)))

# Statistics

Show the shortest abstracts to check whether the filtering process was successful.

In [None]:
# Truncate long text columns so the table stays readable
pd.set_option('display.max_colwidth', 64)

# The 25 abstracts with the fewest words
shortest_abstracts = df.sort_values(by='n_words').head(25)
display(shortest_abstracts)

In [None]:
# Keyword arguments shared by every error bar drawn in the plots below
errorbar_config = dict(
    capsize=10,
    ecolor='black',
    capthick=2,
    elinewidth=2,
    markersize=10,
    fmt='o',
)

# Words

In [None]:
# Summary statistics of the word count per abstract
n_words_summary = df['n_words'].describe()
display(n_words_summary)

In [None]:
# Histogram of the word counts with the mean +/- one standard deviation marked
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Distribution of number of words per abstract', size=18)
df['n_words'].plot(kind='hist', bins=64, ax=ax)

# Place the error bar at a quarter of the histogram height
marker_height = ax.get_ylim()[1] * 0.25
ax.errorbar(
    df['n_words'].mean(),
    marker_height,
    xerr=df['n_words'].std(),
    color='red',
    **errorbar_config,
)
plt.show()

# Sentences

In [None]:
# Summary statistics of the sentence count per abstract
n_sentences_summary = df['n_sentences'].describe()
display(n_sentences_summary)

In [None]:
# Histogram of the sentence counts with the mean +/- one standard deviation marked
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Distribution of number of sentences per abstract', size=18)
df['n_sentences'].plot(kind='hist', bins=32, ax=ax)

# Place the error bar at a quarter of the histogram height
marker_height = ax.get_ylim()[1] * 0.25
ax.errorbar(
    df['n_sentences'].mean(),
    marker_height,
    xerr=df['n_sentences'].std(),
    color='red',
    **errorbar_config,
)
plt.show()

# Class distribution

In [None]:
# Number of abstracts per label
label_counts = df['label'].value_counts().to_frame()
display(label_counts)

In [None]:
# Pie chart of the share of each label
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Distribution of class occurances', size=18)
label_counts_series = df['label'].value_counts()
label_counts_series.plot(kind='pie', legend=True, autopct='%1.1f%%', ax=ax)
plt.show()

# Save DataFrame

In [None]:
# Persist the full DataFrame as a pickle file in the working directory
output_path = 'wikipedia_abstracts.pkl'
df.to_pickle(output_path)