In [1]:
import re
import requests
import nltk
from bs4 import BeautifulSoup

# Fetch the NLTK resources this notebook relies on (a no-op when already present):
#   punkt     - tokenizer models used by nltk.word_tokenize
#   stopwords - word lists read through nltk.corpus.stopwords
#   wordnet   - lexical database behind WordNetLemmatizer
#   omw-1.4   - presumably required alongside wordnet by recent NLTK releases; TODO confirm
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Shared HTTP session and API endpoint; the helper functions defined below
# read these two module-level names for every request they send.
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

In [49]:
def get_categories(title, session=None, url=None):
    """Get all categories an article belongs to.

    The query is repeated with the continuation parameters returned by the
    API until no further batch is announced, so articles with many
    categories are reported completely.

    Parameters
    ----------
    title : str
        The title of the article
    session : object, optional
        Anything with a ``get(url=..., params=...)`` method whose result has
        a ``json()`` method (e.g. a ``requests.Session``). Defaults to the
        module-level session ``S``.
    url : str, optional
        The API endpoint to query. Defaults to the module-level ``URL``.

    Returns
    -------
    result : list
        A list of categories of the article (strings). The list is empty
        when the page does not exist or has no categories.
    """
    if session is None:
        session = S
    if url is None:
        url = URL

    base_params = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        # Without this the API hands out only a small default batch per request.
        "cllimit": "max",
        "titles": title
    }

    result = []
    continuation = {}
    while True:
        request_params = dict(base_params)
        request_params.update(continuation)
        data = session.get(url=url, params=request_params).json()

        # Error responses carry no "query" key, and missing or uncategorised
        # pages carry no "categories" key; treat both as "nothing found".
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            for cat in page.get("categories", []):
                result.append(cat["title"])

        # The API announces further batches through a top-level "continue"
        # object whose entries must be echoed back in the next request.
        continuation = data.get("continue")
        if not continuation:
            return result


def is_geographical(title):
    """Decide whether an article is geographical by scanning its
    category names for geographical keywords.

    Parameters
    ----------
    title : str
        The title of the article

    Returns
    -------
    boolean
        True if any keyword occurs (case-insensitively) in the article's
        category names, False otherwise
    """
    keywords = ("geography", "geographical")
    # All category names are joined into one lower-cased string, so a
    # substring test is enough to find a keyword anywhere in them.
    haystack = " ".join(get_categories(title)).lower()
    return any(keyword in haystack for keyword in keywords)


def get_random_articles():
    """Fetch one batch of random articles (namespace 0) from the API and
    split the titles by the verdict of is_geographical().

    Returns
    -------
    geographical : list
        Titles of the articles classified as geographical
    non_geographical : list
        Titles of all other articles in the batch
    """
    request_params = {
        "action": "query",
        "format": "json",
        "list": "random",
        "rnlimit": "max",
        "rnnamespace": "0"
    }
    payload = S.get(url=URL, params=request_params).json()

    geographical, non_geographical = [], []
    for entry in payload["query"]["random"]:
        name = entry["title"]
        # Each title triggers its own category lookup via is_geographical().
        bucket = geographical if is_geographical(name) else non_geographical
        bucket.append(name)
    return geographical, non_geographical


def get_clean_text(title):
    """Download an article's rendered HTML and reduce it to plain letters.

    Parameters
    ----------
    title : str
        The title of the article

    Returns
    -------
    result : str
        The article text with HTML tags stripped and every run of
        characters other than A-Z / a-z replaced by a single space
    """
    request_params = {
        "action": "parse",
        "format": "json",
        "page": title,
        "prop": "text"
    }
    payload = S.get(url=URL, params=request_params).json()

    html = payload["parse"]["text"]["*"]
    plain = BeautifulSoup(html, 'html.parser').get_text()
    return re.sub('[^A-Za-z]+', ' ', plain)


def get_clean_tokens(title):
    """Get preprocessed word tokens of an article.

    The cleaned article text is tokenized, English stop words are dropped
    (compared case-insensitively), and every remaining token is stemmed
    with the Porter stemmer and then passed through the WordNet lemmatizer.

    Parameters
    ----------
    title : str
        The title of the article

    Returns
    -------
    result : list
        A list of stemmed and lemmatized word tokens (strings) with stop
        words removed, in document order
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    tokenized = nltk.word_tokenize(get_clean_text(title))

    # Load the stop word list once and keep it as a set; evaluating
    # stopwords.words() inside the comprehension would rebuild the list
    # and scan it linearly for every single token.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    stopwords_removed = [word for word in tokenized if word.lower() not in stop_words]

    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in stopwords_removed]

In [32]:
def fetch_data(min_geographical_articles, min_non_geographical_articles):
    """Get random geographical articles and random non-geographical articles.

    Batches of random articles are requested until both minimums are met.
    Whole batches are merged, so either set may end up larger than its
    minimum. If both minimums are already satisfied by empty sets
    (i.e. both are <= 0), no request is made.

    Parameters
    ----------
    min_geographical_articles : int
        Minimum number of geographical articles to be fetched
    min_non_geographical_articles : int
        Minimum number of non-geographical articles to be fetched

    Returns
    -------
    result_geographical : set
        A set of geographical article titles
    result_non_geographical : set
        A set of non-geographical article titles
    """
    result_geographical = set()
    result_non_geographical = set()

    while (len(result_geographical) < min_geographical_articles
           or len(result_non_geographical) < min_non_geographical_articles):
        # Sets de-duplicate titles that come back in more than one batch.
        geographical_articles, non_geographical_articles = get_random_articles()
        result_geographical.update(geographical_articles)
        result_non_geographical.update(non_geographical_articles)
    return result_geographical, result_non_geographical

def prepare_bows(vocabulary, bows, data, tag, limit):
    """Update the current vocabulary and Bag of Words model with new text data.

    Both ``vocabulary`` and ``bows`` are modified in place; nothing is
    returned. Articles are processed in iteration order until ``data`` is
    exhausted or ``bows`` holds ``limit`` entries.

    Parameters
    ----------
    vocabulary : set
        A set of the complete vocabulary to be updated
    bows : list
        A list of Bag of Words data to be updated; each appended entry is a
        ``(word_counts, tag)`` tuple where ``word_counts`` maps token -> count
    data : iterable
        Titles of new articles to be added to the current vocabulary and bows
    tag : string
        Class of the articles in data (geographical/non-geographical)
    limit: int
        Maximum total length of ``bows``; once it is reached no further
        article is fetched or added
    """
    for article in data:
        # Check before doing any work so that a list which is already full
        # (or a limit <= 0) does not fetch and append one extra article.
        if len(bows) >= limit:
            return

        clean_tokens = get_clean_tokens(article)
        vocabulary.update(clean_tokens)

        word_counts = {}
        for word in clean_tokens:
            word_counts[word] = word_counts.get(word, 0) + 1

        bows.append((word_counts, tag))

### Fetch Wikipedia Data and create BoWs

In [37]:
# Collect at least 50 titles per class, then build one bag of words per
# article. Both prepare_bows() calls fill the same shared vocabulary set,
# and the last argument caps each bows list at 50 entries.
data_geographical, data_non_geographical = fetch_data(50, 50)
vocabulary = set()
bows_geographical = []
bows_non_geographical = []
prepare_bows(vocabulary, bows_geographical, data_geographical, "geographical", 50)
prepare_bows(vocabulary, bows_non_geographical, data_non_geographical, "non_geographical", 50)

### Prepare Training and Test Data Sets

In [47]:
import random
# Pool up to 50 labelled bags of words per class, shuffle them in place and
# split the pool: the first 50 entries train, everything after them tests.
# NOTE(review): no random seed is set in this cell, so the split differs
# between runs.
bows = bows_geographical[:50]+bows_non_geographical[:50]
random.shuffle(bows)
train_set = bows[:50]
test_set = bows[50:]

### Classification

In [48]:
# Train a Naive Bayes classifier on the (word_counts, tag) pairs in
# train_set, report its accuracy on test_set and list the features it
# considers most informative.
# NOTE(review): the recorded output of this cell shows the labels
# "medica" / "non_me", which do not match the tags "geographical" /
# "non_geographical" used above; it looks stale. Re-run the notebook
# top to bottom to confirm.
classifier = nltk.classify.NaiveBayesClassifier.train(train_set)
print("Classifier accuracy:", nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features()

Classifier accuracy: 0.6
Most Informative Features
                 control = 1              medica : non_me =      3.3 : 1.0
                 databas = 1              medica : non_me =      3.3 : 1.0
                    link = 1              medica : non_me =      3.2 : 1.0
                       p = 1              medica : non_me =      3.2 : 1.0
                   place = None           medica : non_me =      3.2 : 1.0
                    time = None           medica : non_me =      3.2 : 1.0
                    sinc = None           non_me : medica =      3.2 : 1.0
                wikidata = None           non_me : medica =      3.2 : 1.0
                    turn = None           non_me : medica =      3.1 : 1.0
                     doi = None           non_me : medica =      2.8 : 1.0
