## Webpage Class

Define the core class that represents a single web page.

In [80]:
import html2text
from goose import Goose
from warnings import warn


In [81]:
class Webpage(object):
    """The core Webpage class.

    Every page which we analyze is an instance of this class.

    Attributes:
        url   : the url for this webpage
        text  : the cleaned text of the main article of the webpage; this is
                goose's cleaned_text, or the html2text rendering of the raw
                HTML when the goose text fails the _valid_text_() heuristic
        title : title taken from the goose article
        tags  : tags taken from the goose article
        meta_description: meta description taken from the goose article
        meta_keywords   : meta keywords taken from the goose article
        html : complete HTML of the webpage (the goose article's raw_html)
        __goose_article__: the underlying goose article which was used to
                populate these fields (None until load() has been called)

    Class attributes:
        MIN_TEXT_LENGTH: goose text must be longer than this many characters
                to be trusted; otherwise load() falls back to html2text

    Methods:
        set_url(url): set the url for this webpage
        load(): load the url, and parse it to populate the data members

    Not a lot of time has been spent in designing this class.
    """

    # Threshold used by _valid_text_(); kept overridable per subclass/instance.
    MIN_TEXT_LENGTH = 100

    def __init__(self, url=''):
        self.url = url
        self.text = ''
        self.title = ''
        self.tags = []
        self.meta_description = ''
        self.meta_keywords = []
        self.html = ''
        self.__goose_article__ = None

    def set_url(self, url):
        """Set the url for this webpage."""
        # if needed check to make sure you are not over writing og url
        self.url = url

    def load(self):
        """Load and parse the URL, storing the results in instance variables.

        If goose is unable to extract the article correctly (as judged by
        _valid_text_()), we switch to html2text to extract the text instead.

        Raises:
            ValueError: if no url has been set on this object.

        Warns:
            If the final text is empty or whitespace only. All attributes are
            populated before the warning is issued, so the caller can still
            decide to continue with whatever was extracted.
        """
        # Validate first so that a missing URL fails before any extractor is built.
        if not self.url:
            raise ValueError('URL not set please use webpage.set_url(url) to set it')

        article = Goose().extract(url=self.url)
        self.__goose_article__ = article

        # Setting the dependent attributes
        self.html = article.raw_html
        self.title = article.title
        self.tags = article.tags
        self.meta_description = article.meta_description
        self.meta_keywords = article.meta_keywords

        # Now check if goose was able to extract the main content correctly
        if self._valid_text_():
            self.text = article.cleaned_text
        else:
            # Fall back to html2text for extracting the main text
            converter = html2text.HTML2Text()
            converter.ignore_links = True
            self.text = converter.handle(self._decoded_html())

        # Only warn (rather than raise): we are not sure about the accuracy of
        # the parsing, but it is the caller's call whether to proceed or not.
        if not self.text.strip():
            warn('Unable to correctly parse given URL:%s' %self.url)

    def _decoded_html(self):
        """Return self.html as text that is safe to hand to html2text.

        Byte strings are decoded as UTF-8 with undecodable bytes replaced,
        already-decoded text is returned unchanged, and a missing/empty
        value becomes an empty string.
        """
        html = self.html
        if not html:
            return u''
        if isinstance(html, bytes):
            # 'replace' keeps the fallback usable on pages that are not valid UTF-8
            return html.decode('utf8', 'replace')
        return html

    def _valid_text_(self):
        """Check whether the text extracted by goose is valid, using a heuristic.

        Multiple definitions of "goose worked" are possible here. The most
        trivial (and quick to compute) one is used: the extracted text must
        be longer than MIN_TEXT_LENGTH characters.

        Return:
            - boolean

        Raises:
            ValueError: if load() has not populated the goose article yet.
        """
        if self.__goose_article__ is None:
            raise ValueError("goose article not initialized, use webpage.load() to initialize it")

        # Treat a missing cleaned_text the same as an empty one
        cleaned = self.__goose_article__.cleaned_text or ''
        return len(cleaned) > self.MIN_TEXT_LENGTH

In [82]:
# Build a Webpage for the ESPN sample URL (the class is named Webpage, not webpage)
w = Webpage(test_url['espn'])

In [72]:
# Fetch and parse the page; this populates w.html, w.title, w.tags, w.text, etc.
w.load()

In [73]:
# Show the extracted text with leading/trailing whitespace removed
print w.text.strip()

ESPN FC

US Edition

Scores

Transfers

Teams

Leagues

Cups

Highlights

More

ESPN.com

Search

Site Terms

Featured Matches

Featured Matches

Previous

Mallorca

Huesca

0

2

FT

Game Details

Live football odds with bet365. Bet Now »

Flamengo

Cruzeiro

2

0

FT

Game Details

Live football odds with bet365. Bet Now »

Levante

Sevilla FC

6:30 PM UTC

Game Details

Home: 4/1  Draw: 13/5  Away: 3/4

Odds from bet365

bet365

Borussia Monchengladbach

Hamburg SV

6:30 PM UTC

Game Details

Home: 4/6  Draw: 3/1  Away: 4/1

Odds from bet365

bet365

Paris Saint-Germain

Bordeaux

6:30 PM UTC

Game Details

Home: 1/4  Draw: 19/4  Away: 12/1

Odds from bet365

bet365

Reading

Ipswich Town

7:00 PM UTC

Game Details

Home: 5/4  Draw: 23/10  Away: 13/5

Odds from bet365

bet365

New York Red Bulls

Chicago Fire

11:00 PM UTC

Game Details

Home: 8/15  Draw: 11/4  Away: 6/1

Odds from bet365

bet365

Queretaro

Morelia

12:30 AM UTC Sep 12, 2015

Game Details

Live football odds with b

In [74]:
# Compare goose's own cleaned_text with the text that load() finally stored on the page
w.__goose_article__.cleaned_text,w.text

(u'',
 u"ESPN FC\n\nUS Edition\n\nScores\n\nTransfers\n\nTeams\n\nLeagues\n\nCups\n\nHighlights\n\nMore\n\nESPN.com\n\nSearch\n\nSite Terms\n\nFeatured Matches\n\nFeatured Matches\n\nPrevious\n\nMallorca\n\nHuesca\n\n0\n\n2\n\nFT\n\nGame Details\n\nLive football odds with bet365. Bet Now \xbb\n\nFlamengo\n\nCruzeiro\n\n2\n\n0\n\nFT\n\nGame Details\n\nLive football odds with bet365. Bet Now \xbb\n\nLevante\n\nSevilla FC\n\n6:30 PM UTC\n\nGame Details\n\nHome: 4/1  Draw: 13/5  Away: 3/4\n\nOdds from bet365\n\nbet365\n\nBorussia Monchengladbach\n\nHamburg SV\n\n6:30 PM UTC\n\nGame Details\n\nHome: 4/6  Draw: 3/1  Away: 4/1\n\nOdds from bet365\n\nbet365\n\nParis Saint-Germain\n\nBordeaux\n\n6:30 PM UTC\n\nGame Details\n\nHome: 1/4  Draw: 19/4  Away: 12/1\n\nOdds from bet365\n\nbet365\n\nReading\n\nIpswich Town\n\n7:00 PM UTC\n\nGame Details\n\nHome: 5/4  Draw: 23/10  Away: 13/5\n\nOdds from bet365\n\nbet365\n\nNew York Red Bulls\n\nChicago Fire\n\n11:00 PM UTC\n\nGame Details\n\nHome: 8/15

In [31]:
# Ask the page whether goose's extracted text passes the length heuristic
w._valid_text_()

False

In [83]:
for site in test_url:
    w = Webpage(test_url[site])
    w.load()
    print site,w.tags
    



blog [u'Life Stages', u'Hike', u'Jennifer Davis-Flynn', u'camp', u'Tips']
wiki []
amazon



 []
goal []
espn



 []
cnn []
reddit



 []
nyt []


In [2]:
# Standalone Goose extractor instance for ad-hoc experiments in the notebook
g = Goose()

In [78]:
# Sample URLs, keyed by a short site name, used to try out Webpage on different kinds of pages
test_url = {
    'amazon': 'http://www.amazon.com/Cuisinart-CPT-122-Compact-2-Slice-Toaster/dp/B009GQ034C/ref=sr_1_1?s=kitchen&ie=UTF8&qid=1431620315&sr=1-1&keywords=toaster',
    'blog': 'http://blog.rei.com/camp/how-to-introduce-your-indoorsy-friend-to-the-outdoors/',
    'cnn': 'http://www.cnn.com/2013/06/10/politics/edward-snowden-profile/',
    'goal': 'http://www.goal.com/en/news/9/england/2015/09/11/15240922/im-scared-of-van-gaals-bulldog-face-admits-rojo?ICID=HP_HN_1',
    'wiki': 'https://en.wikipedia.org/wiki/Georgia_Institute_of_Technology_College_of_Computing',
    'reddit': 'https://www.reddit.com/r/Barca/',
    'espn': 'http://www.espnfc.us/club/atletico-madrid/1068/video/2605489/barca-look-to-continue-success-against-atleti',
    'nyt': 'http://www.nytimes.com/2015/09/11/world/netanyahu-makes-quick-pivot-from-loss-on-iran-deal.html',
}

<span style="color:red"> Nothing seems to work for nytimes </span>

Looking at the source of a page, we can see that the main story lives in a `<p>` tag with `class="story-body-text story-content"`.

If the work needs to focus on sites like this, where the parsers fail, we can create new parsers using Beautiful Soup, but I am skipping that for now in the interest of time.