In [163]:
import time
import json
import requests

from random import randint as ri
from bs4 import BeautifulSoup

def parse_avherald(initial_link='http://avherald.com/?list=&opt=1', 
                   max_requests=2, base_url='http://avherald.com', delay=0):
        
    data = {}
    requests_count = 0
    next_link = initial_link
    while requests_count < max_requests:
        """
        Fetching next page..
        """
        try:
            print('Fetching', next_link, '\n', ('-'*40))
            response = requests.get(next_link)
            if response.status_code == 200:
                page = response.content
            if page:
                soup = BeautifulSoup(page, 'html.parser')
                
                # get next page link
                for img in soup.select('a > img'): 
                    if img.get('alt', None) == 'Next':
                        next_link = img.parent['href'].replace('h', '')
                        next_link = base_url + next_link
                        
                # get current page data
                data_container = \
                    soup.findAll('span', {'class': 'headline_avherald'})
                for item in data_container:
                    event_class = \
                        item.parent.parent.parent.select('img')[0]['title']
                    event_description = item.get_text()
                    if event_class in data:
                        data[event_class].append(event_description)
                    else:
                        data[event_class] = [event_description, ]
                data['next_link'] = next_link
        except Exception as e:
            print(e)
        finally:
            requests_count += 1
            if requests_count < max_requests:
                time.sleep(delay)  # delay between requests
    return data
    
# fetching
data = parse_avherald(max_requests=1000, delay=ri(1,5))

json = json.dumps(data)
with open('avherald_data.json', 'w+') as f:
    f.write(json)
    print('Data saved to avherald_data.json')

Fetching http://avherald.com/?list=&opt=1 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20161005000000%2B49f292af 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160926000000%2B49e85e5c 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160918000000%2B49eb3f54 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160908000000%2B49efb31c 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160829000000%2B49d50d48 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160822000000%2B49cfe1b9 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160814000000%2B49cc5991 
 ----------------------------------------
Fetching http://avherald.com/?list=&opt=1&offset=20160806000000%2B49c516c7 
 ------------

KeyboardInterrupt: 

In [162]:
MONTHS = [  # need extra space to excelude matching with (for ex.: Novosibirsk)
    'Jan ', 'Feb ', 'Mar ', 'Apr ',  
    'May ', 'Jun ', 'Jul ', 'Aug ', 
    'Sep ', 'Oct ', 'Nov ', 'Dec ' 
]

def parse_info(article, fail_silently=False):
    """Processes article text to get data separated by 
    airline, aircraft, date, place, description
    """
    parts = None
    for m in MONTHS:
        if m in article:
            month_index = article.index(m)
            on_index = month_index - 3
            parts = [ article[:on_index], article[month_index:] ]
    
    if parts is None:
        print(article)
        if not fail_silently:
            raise ValueError('Couldn\'t get parts.')
        return tuple(i for i in '-'*5)
    
    date_and_case = parts[1]
    airline_aircraft_place = parts[0].split(' ')
    date_and_case_words = date_and_case.split(',')
    date = date_and_case_words[0]
    case = ''.join( date_and_case_words[1:] )
    
    place_flag_index = None
    for idx, word in enumerate(airline_aircraft_place):
        if word in ['at', 'over', 'near', 'between']:
            place_flag_index = idx
    if place_flag_index:
        place = ' '.join(airline_aircraft_place[place_flag_index + 1:])
    elif 'enroute' in airline_aircraft_place:
        place = 'enroute'
        place_flag_index = airline_aircraft_place.index('enroute')
    else:
        print('Failed to parse article:', article)
        raise ValueError('Parser error, place_flag_index could not be found.')
        
    airline_aircraft = airline_aircraft_place[:place_flag_index]
    aircraft = airline_aircraft[-1]
    airline = ' '.join(airline_aircraft[:-1])
    
    return airline.strip(), aircraft.strip(), date.strip(), place.strip(), case.strip().capitalize()


def build_dataset(data):
    """Builds structured dataset dictionary
    """
    dataset = {}
    for event_type, events in data.items():
        if event_type not in ['News', 'next_link']:
            for event in events:
                airline, aircraft, date, place, case = parse_info(event, fail_silently=True)
                print(' | '.join([airline, aircraft, date, place, case]))
                
    raise NotImplementedError('TODO')
    return dataset

dataset = build_dataset(data)

KLM | A332 | Oct 14th 2016 | Mediterranean | Turbulence causes injuries
Sriwijaya | B733 | Oct 11th 2016 | Surabaya | Debris of runway surface punctures fuselage on landing
Polar | AN26 | Oct 11th 2016 | Belaya Gora | Touched down off runway
Swiss | RJ1H | Oct 10th 2016 | Geneva | Rejected takeoff due to oil fumes in cockpit
Sunstate | DH8D | Oct 10th 2016 | Canberra | Turbulence injures 3
Nordstar | B738 | Oct 9th 2016 | Moscow | Unsafe gear crack found in gear leg
ANZ | B763 | Oct 9th 2016 | South China Sea | Turbulence causes 4 injuries
BoraJet | E195 | Oct 19th 2016 | Budapest | Cracked windshield
British Airways | A319 | Oct 19th 2016 | Manchester | Nose wheel steering fault
Algerie | AT72 | Oct 19th 2016 | Algiers | Nose wheel departed aircraft on landing
Aeroflot | A320 | Oct 18th 2016 | Munich | Lightning strike
Westjet | B763 | Oct 18th 2016 | Sondrestrom | Avionics cooling failure
Lufthansa | B744 | Oct 18th 2016 | Atlantic | Smoke in cockpit
Fedex | MD11 | Oct 18th 2016 | Me

NotImplementedError: TODO