In [100]:
# !pip install lxml
# !pip install numpy
# !pip install --upgrade pip

In [101]:
from lxml import html
import requests
import numpy as np
import re
# -*- coding: utf-8 -*-

In [102]:
# Components of the Crosaire blog listing URL. `crosaire_url` ends with the
# pagination query key so that a page index can be appended directly to it.
base_url = "https://www.irishtimes.com"
blog_url = "/crosaire-blog"
paginated_url = "?sectionTeaserPage-7.4322291="
crosaire_url = "".join((base_url, blog_url, paginated_url))

In [103]:
def pull_clues(page_count, timeout=30):
    """Collect paragraph text from the articles linked on the blog listing pages.

    For every index in ``range(page_count)`` the listing page at
    ``crosaire_url + str(index)`` is fetched, each teaser link found on it is
    followed, and the text content of every ``<p>`` inside the article body
    is appended to the result.

    Args:
        page_count: Number of listing pages to walk, starting at index 0.
        timeout: Seconds to wait on each HTTP request. Without a timeout a
            single stalled connection would block the whole scrape.

    Returns:
        A list of paragraph strings in page / link / paragraph order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    teaser_link_xpath = '''
            //body/div[@id="wrapper"]
            /div[@id="torso"]/div[@class="container"]/div[@class="bt-container"]
            /div[@class="row"]/div[@id="content_left"]/div[@class="row sectionteaser"]
            /div/a'''
    paragraph_xpath = '''//body/div[@id="wrapper"]
                /div[@id="torso"]/div[@class="container"]/div[@class="bt-container"]
                /div[@class="row"]/div[@id="content_left"]/article[@class="article row"]
                /div[@class="article_holder span8 genre-advice"]/section[@class="article_body"]
                /div[@class="article_bodycopy"]/p'''

    clues = []

    # One session for the whole scrape so connections to the host are reused.
    with requests.Session() as session:
        for i in range(page_count):
            page = session.get(crosaire_url + str(i), timeout=timeout)
            tree = html.fromstring(page.content)

            for elt in tree.xpath(teaser_link_xpath):
                href = elt.get('href')
                if href is None:
                    # An anchor without a target cannot be followed; skip it
                    # instead of aborting the scrape with a KeyError.
                    continue

                clue_page = session.get(base_url + href, timeout=timeout)
                clue_tree = html.fromstring(clue_page.content)

                for clue in clue_tree.xpath(paragraph_xpath):
                    # NOTE(review): the "â" replacement presumably papers over
                    # mis-decoded curly apostrophes -- confirm against the
                    # page's real encoding.
                    clues.append(clue.text_content().replace("â", "'"))

    return clues

In [173]:
# Scrape listing pages 0-99; pull_clues issues one request per listing page
# plus one per article linked from it.
clues = pull_clues(100)

In [182]:
def getClue(explanation):
    """Return ``explanation`` with every parenthesised group removed.

    Each shortest ``(...)`` span, brackets included, is deleted. Whitespace
    around the removed spans is left untouched, so the result may contain
    doubled or trailing spaces.

    Args:
        explanation: Clue text with parenthesised annotations.

    Returns:
        The text with the ``(...)`` spans cut out.
    """
    # Raw string: "\(" in a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    return re.sub(r"[\(].*?[\)]", "", explanation)

def getParts(explanation):
    """Return the text inside each parenthesised group of ``explanation``.

    Groups are matched non-greedily, so each one ends at the first ``)``
    after its ``(``. Non-ASCII characters are dropped from every part.

    Args:
        explanation: Clue text with parenthesised annotations.

    Returns:
        A list of ``str``, one per ``(...)`` group, without the brackets.
    """
    parts = []
    # The capture group yields the inner text directly. Decoding the ASCII
    # bytes back to str avoids going through the bytes repr, which would leak
    # escape backslashes for parts containing quotes or backslashes.
    for inner in re.findall(r'\((.*?)\)', explanation):
        parts.append(inner.encode("ascii", "ignore").decode("ascii"))
    return parts

def getAnswerBits(parts):
    """Extract runs of capital letters from each string in ``parts``.

    A run is two or more capital letters, optionally separated by single
    whitespace characters; a capital that starts a capitalised word (a
    capital followed by a lowercase letter) is not absorbed into a run.

    Args:
        parts: Iterable of ``str`` to scan.

    Returns:
        A list of ``bytes`` (each run ASCII-encoded, non-ASCII characters
        dropped), in order of appearance.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    pattern = r'([A-Z]+(?:(?!\s?[A-Z][a-z])\s?[A-Z])+)'
    words = []
    for part in parts:
        for bit in re.findall(pattern, part):
            words.append(bit.encode("ascii", "ignore"))
    return words

def getAnswer(answerBits):
    """Pick the longest entry of ``answerBits``.

    When several entries share the maximum length, the one that appears
    last wins. An empty input yields the empty string.
    """
    if len(answerBits) == 0:
        return ""

    longest = None
    for candidate in answerBits:
        # ">=" so that a later entry of equal length replaces an earlier one.
        if longest is None or len(candidate) >= len(longest):
            longest = candidate
    return longest

class ClueSegment:
    """A piece of clue text together with the bracketed note that follows it."""

    def __init__(self, cluePart, answerPart):
        # Both values are stored exactly as given.
        self.cluePart, self.answerPart = cluePart, answerPart

    def __repr__(self):
        fields = (self.cluePart, self.answerPart)
        return "Clue Part: %s; Answer Part: %s" % fields

def getClueSegments(explanation):
    """Split ``explanation`` into ClueSegment pairs.

    The text is cut at every ")". For each resulting chunk that contains a
    "(", the text before the first "(" becomes the clue part and the text
    between the first and second "(" (or the chunk end) becomes the answer
    part. Chunks without a "(" are ignored.
    """
    segments = []
    for chunk in explanation.split(")"):
        pieces = chunk.split("(")
        if len(pieces) < 2:
            # No opening bracket in this chunk, so there is nothing to pair.
            continue
        segments.append(ClueSegment(pieces[0], pieces[1]))
    return segments

class Clue:
    """Parsed view of one explanation paragraph.

    The constructor trims the raw text and derives the remaining attributes
    from it through the module's helper functions.
    """

    def __init__(self, explanation):
        # Drop leading digits, dots, hyphens and spaces, then trailing
        # spaces and commas.
        without_prefix = explanation.lstrip('0123456789.- ')
        self.explanation = without_prefix.rstrip(' ,')

        text = self.explanation
        self.clue = getClue(text)
        self.parts = getParts(text)
        self.answerBits = getAnswerBits(self.parts)
        self.answer = getAnswer(self.answerBits)
        self.clueSegments = getClueSegments(text)

In [183]:
# One independent, initially empty set per indicator category.
(
    anagramIndicators,
    reversalIndicators,
    homophoneIndicators,
    positionIndicators,
    otherIndicators,
) = (set() for _ in range(5))

In [184]:
# Build a Clue for every scraped paragraph except the bare
# 'Across:' / 'Down:' entries.
clueObjects = []
for exp in clues:
    if exp == 'Across:' or exp == 'Down:':
        continue
    clue = Clue(exp)
    clueObjects.append(clue)

In [185]:
# Ordered (keyword, target set) pairs. The first keyword found in a segment's
# answer part decides where its clue part is filed, so the order matters.
indicatorBuckets = (
    ("anagram", anagramIndicators),
    ("revers", reversalIndicators),
    ("homophone", homophoneIndicators),
    ("position", positionIndicators),
)

for clueObj in clueObjects:
    for clueSeg in clueObj.clueSegments:
        ans = clueSeg.answerPart
        if "indicator" not in ans:
            continue

        # Only spaces are trimmed; other whitespace and punctuation stay.
        indicator = clueSeg.cluePart.strip(" ")
        for keyword, bucket in indicatorBuckets:
            if keyword in ans:
                bucket.add(indicator)
                break
        else:
            # No known keyword: record the answer part itself.
            otherIndicators.add(ans)

In [186]:
# Print each collected indicator set under its heading.
for heading, indicators in (
    ("Anagram Indicators:\n", anagramIndicators),
    ("\nReversal Indicators:\n", reversalIndicators),
    ("\nHomophone Indicators:\n", homophoneIndicators),
    ("\nPosition Indicators:\n", positionIndicators),
):
    print(heading, indicators)

Anagram Indicators:
 {'travelling', 'damaging', 'handicap', 'consumed by baked', 'by awful', 'drawn', 'collapse', 'shot', 'to train', 'question', 'for damaged', 'Batty', 'mix-up', 'Broke', 'pie', 'and odd', 'action?', 'unsettles', 'stew', 'rotten', 'order', 'medley', 'produced by awful', 'play?', 'criminal', 'dish', 'Collected', 'Salad', '-fixing', 'engineer', 'tipsy', 'novel?', 'set', 'plays', 'manufactured', 'fabricating', 'salad', 'finds criminal', 'Wild', 'of new', 'trouble', 'all over the place', 'Angry', 'Fresh', 'cook', ', soup', 'Rocky', 'bizarre', 'Criminal', 'playing', 'of spoil', 'of trouble', 'settlement', 'rebel', 'Damaged', 'badly', 'in damaged', 'distributing', 'riot', 'from wild', 'Dizzy', 'switch', 'for new', 'running amok', 'blows up', 'of wild', 'working?', 'Building', 'worried', 'in mischievous', 'confused', 'falling into rocking', 'in unstable', 'fudge', 'swimming', 'in terrible', 'Lost', 'Translate', 'welcomes jazz', 'false', 'rewrites', 'rig', 'cocktail', 'jockey