In [None]:
# default_exp scraper

# The Guardian Scraper

> Scraping Premier League Previews from the Guardian.

<div style="font-size: 200px">
    
|            Issues                 |          Solutions          |
|------------------------------     |-------------------|
|   4 possible formats for previews(old format, new format,Cup's format and a particular format) |Select the appropriate html tags|
|   Preview titles are not the same ( we can find Squad Sheets or match preview)|Pick only the names of the teams and eliminate the rest|
|   The date of the match is not always available |Pick the preview date|
|   The order of the elements and labels are not the same |Using regex patterns to get information|
|   Missing values for betting odds |We treat the general case separately and we set up specific regex patterns for these particular cases|
|   Odds format is different|We treat the general case separately and we set up specific regex patterns for these particular cases|
|   We can find non-numeric values for Odds like (Evens,evens,Eve)|Replace evens by 1-1|
|   There are some previews that don't have author and text|For previews that have no text, we put 'n/a' (not available)|
|   The existence of previews for the FA CUP,Carabao Cup,Champions league,World Cup|Filter previews by title,link,topic,aside html section and preview text and allow only Premier League previews|
|   We are not sure if the names of the teams are the same as the ones in Opta|Set up a dictionary or check manually to map teams to their IDs|
|When we send many requests, the Guardian server interprets them as a DDoS attack and blocks our IP address|Sleep for a random number of seconds between requests, or change the IP address by working with rotating proxies|
</div >


### Import Libraries and Modules

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import re
import dateparser
import pandas as pd
import numpy as np
import pymongo
import mongoengine
import json
import random 
import logging
from mongoengine import * 
from typing import *
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
from os import listdir
from os.path import isfile, join

### Parser Class

##### This class is used to parse pages and has 3 functions:

1- <b> parse_page </b> function:retrieves the html format of a given web page link.

2- <b> store_page_locally </b> function: save a preview page in a specified local folder.

3- <b> get_next_page </b> function: retrieves the link to the next page and determines if it is the last page of previews in order to stop scraping. 


In [None]:
# export
class Parser:
    """
    A class to represent previews pages parser.

    ...

    Methods
    -------
    parse_page(page_url, session)
        returns the html format of the page.
    store_page_locally(page, page_url)
        save a given page in a local folder.
    get_next_page(page)
        returns the link of the following page and if it's the last page.
    """

    @staticmethod
    def parse_page(page_url: str, session: HTMLSession) -> BeautifulSoup:
        """
            Download a web page and parse it into a BeautifulSoup document.

        Parameters
        ----------
        page_url: str
            the url of the page to download
        session: requests_html.HTMLSession
            the scraper session used to send the GET request

        Returns
        -------
        bs4.BeautifulSoup
            the response body parsed with the "html.parser" backend

        """
        # Send the GET request, then hand the response text to the parser
        response = session.get(page_url)
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def store_page_locally(page: BeautifulSoup, page_url: str) -> None:
        """
            Save a given page in a local folder.

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page
        page_url: str
            the url of the page

        Returns
        -------
        None

        """
        # Get the preview url
        # Delete the "https://www." part
        # Replace "/" by "_"

        page_url = page_url.replace("https://www.", "").replace("/", "_")
        directory_path = ".//previews//"
        file_path = directory_path + page_url + ".html"
        # Create a file and save the html content of the page
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(str(page))
        # Close the file
        file.close()

    @staticmethod
    def get_next_page(page: BeautifulSoup) -> Tuple[str, bool]:
        """
            returns the link of the following page and if it's the last page.

        Parameters
        ----------
        page : bs4.BeautifulSoup
            the html format of the page

        Returns
        -------
        url: str
          the url of the next page
        last_page: bool
          True if it's the last page, False otherwise.

        """
        # If we are at the last page , last_page = True else last_page = False
        last_page = False
        # Pick up the pagination HTML part
        pagination_section = page.find("div", {"class": "pagination__list"})
        # If we don't find the "next" button (it's the last page)
        # We are in the last page
        if not page.find("a", {"rel": "next"}):
            # We pick up the number of the page and we return the link
            html_location = dict({"aria-label": "Current page"})
            page_number = page.find("span", html_location).text
            url = (
                "https://www.theguardian.com/football/series/match-previews?page="
                + page_number
            )
            last_page = True
            return url, last_page
        # If it's not the last page, we pick up the link of the following page
        else:
            url = page.find("a", {"rel": "next"})["href"]
            return url, last_page

### PageExtractor Class

##### This class has five functions for extracting data from a given football preview:

1- <b> get_values_matching_regex </b> returns values that match a regex expression.

&emsp;Because the "Guardian" website has two possible formats, we defined two possible classifiers for the p tags <br>&emsp;containing the information to be extracted.<br>
&emsp;We go through each p section, and if we find the result, we return it; otherwise, a None is returned.<br>
&emsp;<code>re.findall</code> returns a list of tuples: one tuple per match, with one entry per capture group.<br> &emsp;For regexes that contain <b>OR</b> alternatives, the groups of the alternatives that did not match are empty strings, which is why they are filtered out of the result.

2- <b> extract_teams_names </b> returns the names of the two teams in a football preview.

&emsp;The preview includes team names at the title level.
 <br>&emsp;example:
          &emsp;&emsp;{{Squad Sheets: Team A v Team B}} 
         or &emsp;&emsp;{{Team A v Team B: match preview}} 
         or &emsp;&emsp;{{Team A v Team B: Squad Sheets}}
<br>&emsp;As a result, our strategy is to delete the text preceding or following the names and recover each name <br>&emsp;individually.
<br>&emsp;If we were successful in obtaining the names, they will be returned in a Python dictionary; <br>&emsp;a name that cannot be found is set to <code>None</code> (not available).

3- <b> extract_text_authors </b>returns the text and author of a football preview.

&emsp;It's difficult to determine the position of the text, but it's almost certainly the block with the most <br>&emsp;characters.
<br>&emsp;To proceed, we store each paragraph and its size in a Python dictionary, and then we take the <br>&emsp;block with the largest size.
<br>&emsp;To be sure, we double-check by only accepting texts with a size greater than 160 because there <br>&emsp;are football previews with no text or author.
<br>&emsp;Furthermore, the author information is always under the text section, more specifically in a <br>&emsp;strong tag, so if the text does not exist, the author is missing as well.
If we were successful in <br>&emsp;obtaining the text and the author, they will be returned in a Python dictionary. Otherwise, the <br>&emsp;values will be <code>None</code> (not available).

4- <b> extract_preview_date </b> returns the date of publication of a football preview.

&emsp;We have distinguished two dates for the date of publication: the first is the date of publication, <br>&emsp;and the second is the date of the most recent modification.
In this sense, we go through the <br>&emsp;section where the two dates are located and take only the first and use 'dateparser' to convert <br>&emsp;the string into a date in "yyyy-mm-dd" format.
If we were successful in obtaining the date, it will <br>&emsp; be returned. Otherwise, the value will be <code>None</code> (not available).

5- <b> extract_match_infos </b> returns a football match information (venue, referee, odds).

&emsp;Here, we'll call the first function <b>get_values_matching_regex</b>, which will allow us to retrieve this<br>&emsp;information by specifying a regex expression for each.<br>&emsp;If this data is not available, the value will be <code>None</code>.

In [None]:
# export
class PageExtractor:
    """
    A class to represent an information extractor from a football preview.

    ...

    Methods
    -------
    get_values_matching_regex(page, regex)
        return all matched patterns from a preview page.
    extract_teams_names(title)
        returns team names from the preview title.
    extract_text_authors(page)
        returns the text and author of the preview.
    extract_preview_date(page)
        returns the publication date of the preview.
    extract_match_infos(page, venue_regex, referee_regex, odds_regex)
        returns a football match information (venue,referee,odds).
    """

    @staticmethod
    def get_values_matching_regex(
        page: BeautifulSoup, regex: str
    ) -> Union[List[str], None]:
        """
        returns all matched patterns from a preview page.

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page
        regex: str
            the regex expression

        Returns
        -------
        result: list of str
          matched values of the regex expression, None otherwise

        """
        # All Information are located in the "p tag" of html
        # We pick up all the p tags
        # some previews in 2009 have a different html tags and classes
        all_p_tags_new_formats = page.find_all("p", {"class": "dcr-bixwrd"})
        all_p_tags_old_format = page.select("div > p")
        # if exist
        if all_p_tags_new_formats:
            paragraphs = all_p_tags_new_formats
        else:
            paragraphs = all_p_tags_old_format

        for paragraph in paragraphs:
            # We pick up the string values located in the paragraph
            # For "odds" information, "Evens" or "Evs" are replaced by 1-1
            pattern_odds = re.compile("Evens|Evs", re.IGNORECASE)
            section = pattern_odds.sub("1-1", paragraph.text)
            # To extract our information regex pattern
            # To ignore case sensitivity we use re.I
            pattern_returned_values = re.compile(regex, re.IGNORECASE)
            # If a regex match is found, we return the list of values.
            # otherwise, an empty array is returned.
            if pattern_returned_values.findall(section):
                matching_result = pattern_returned_values.findall(section)
                # remove empty tuples from the list
                # example of a matching_result value
                # [('12-5', '11-10', '23-10', '', '')]
                result = [element for element in matching_result[0] if element]
                return result
        return None

    @staticmethod
    def extract_teams_names(title: str) -> Dict[str, object]:
        """
        returns team names from the preview title.

        Parameters
        ----------
        title: str
            the title of the preview

        Returns
        -------
        names: dict of object

        """
        # 3 possible formats for previews title
        # For example:
        # {Squad Sheets: Team A v Team B} or
        # {{Team A v Team B : match preview}} or
        # {{Team A v Team B : Squad sheets}}
        # We remove text before or after team names
        pattern = re.compile(
            "Squad Sheets:|: Squad[\s]sheets|Squad sheets|Squad sheet:|: match preview",
            re.IGNORECASE,
        )
        preview_title = pattern.sub("", title).strip()
        # Names are located in the title of the preview
        # Home team
        try:
            home_team = preview_title.split(" v ")[0]
        except Exception as e:
            home_team = None
        # Away team
        try:
            away_team = preview_title.split(" v ")[1].split("\t")[
                0
            ]  # for some preview we find team A v Team B \t date
        except Exception as e:
            away_team = None
        # we return names
        names = dict({"home": home_team, "away": away_team})
        return names

    @staticmethod
    def extract_text_authors(page: BeautifulSoup) -> Dict[str, str]:
        """
        returns the text and author of the preview.

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page

        Returns
        -------
        preview_text_author: dict of str

        """
        # Preview may not have text and author,
        # We initialize author and text to 'n/a' (not available),
        author = None
        text = None
        # all items are stored in a p tag
        # Some previews in 2009 have different html tags and classes
        all_p_tags_new_formats = page.find_all("p", {"class": "dcr-bixwrd"})
        all_p_tags_old_format = page.select("div > p")
        # if exist
        if all_p_tags_new_formats:
            all_p_tags = all_p_tags_new_formats
        else:
            all_p_tags = all_p_tags_old_format

        # it's quite difficult to determine which section is the text
        # the length of the text is usually the longest
        # dictionnary to store each p and its length
        length_texts = {}
        for p in all_p_tags:
            section = p.text
            length_texts[p] = len(section)

        # we pick the section with the largest size
        possible_text_section = max(length_texts, key=length_texts.get)
        # We double-check and only select texts with a size greater than 160
        if len(possible_text_section.text) > 160:
            text_section = possible_text_section
            text = text_section.text
            # the author name is located inside the text section
            # it is located in the strong tag
            possible_author_section = text_section.find("strong")
            # for some previews the author information is not found
            # if it's available we take it , else it will be 'n/a'
            if str(possible_author_section) != "None":
                author = possible_author_section.text

        preview_text_author = dict({"text": text, "author": author})
        return preview_text_author

    @staticmethod
    def extract_preview_date(page: BeautifulSoup) -> Union[datetime, None]:
        """
          returns the publication date of the preview.

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page

        Returns
        -------
        preview_date: datetime.date
          if not found None

        """
        # there are 2 dates for the preview
        # the first is the date of publication
        # the second is the date of the last modification which is hidden
        # we pick only the first one
        try:
            # Some preview in 2009 have different html tags and classes
            html_new_location = dict({"class": "dcr-km9fgb"})
            html_old_location = dict({"itemprop": "datePublished"})
            dates_section_new_format = page.find("div", html_new_location)
            dates_section_old_format = page.find("time", html_old_location)
            if dates_section_new_format:
                dates_section = dates_section_new_format.strings
            else:
                dates_section = dates_section_old_format.strings

            for date in dates_section:
                preview_date = dateparser.parse(date).date()
                break
        except Exception as e:
            logging.error('error: Preview date is not available')
            preview_date = None

        return preview_date

    @staticmethod
    def extract_match_infos(
        page: BeautifulSoup, venue_regex: str, referee_regex: str, odds_regex: str
    ) -> Dict[str, str]:
        """
          returns a football match information (venue,referee,odds).

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page
        venue_regex: str
            venue regex expression
        referee_regex: str
            referee regex expression
        odds_regex: str
            odds regex expression

        Returns
        -------
        match_infos: dict of str

        """
        # Extract venue, referee and odds values
        try:
            venue = PageExtractor.get_values_matching_regex(page, venue_regex)[
                0
            ].strip()
        except Exception as e:
            logging.error('error: Venue information is not available')
            venue = None
        try:
            referee = PageExtractor.get_values_matching_regex(page, referee_regex)[
                0
            ].strip()
        except Exception as e:
            logging.error('error: Referee information is not available')              
            referee = None

        odds = PageExtractor.get_values_matching_regex(page, odds_regex)

        match_infos = dict({"venue": venue, "referee": referee, "odds": odds})
        return match_infos

### ScrapingTheGuardian Class

##### This class represents a scraper from the "Guardian" website and has 4 functions:

1- <b> calculate_betting_odds </b> returns decimal odds.

&emsp;In this section, we will calculate the odds derived from the football preview.
<br>&emsp;Considering the following example:
<br>&emsp;&emsp; ["9-20","29-5","6-5"] 
<br>&emsp;&emsp;We convert each fractional odd into a decimal odd separately, using the following formula:
<br>&emsp;&emsp;&emsp; home = (9/20) + 1 
<br>&emsp;&emsp;&emsp; away = (29/5) + 1
<br>&emsp;&emsp;&emsp; draw = (6/5) + 1
<br>&emsp;If we were successful in obtaining decimal odds, they will be returned in a Python dictionary.<br>&emsp;Otherwise, the values will be <code>None</code> (not available).


2- <b> extract_preview_items </b>returns the entire contents of a football preview.

&emsp;In this section, we will call the functions defined in the PageExtractor class and return a Python dictionary containing all of this information.
<br>&emsp;But first, we use the <b>calculate_betting_odds</b> function to calculate the sports odds for the home team's victory, the away team's victory, and a draw.

"home team","away team","text","author","venue","referee","odds","odds home team","odds away team","odds draw", "preview date","preview_link" are the returned values.

3- <b> save_previews_locally </b> save all browsed previews in a local folder.

&emsp;we verify if we have reached the last extracted preview date,
if yes, we will stop the scraper
<br>&emsp;For a given page, we retrieve all the previews and go through them one by one, taking the link, title, subject, and aside section.
<br>&emsp;if the words "cup" or "champions league" do not belong in these sections, we save the preview in a local folder
<br>&emsp;Otherwise, we move on to the next preview.
<br>&emsp;The <b>store_page_locally</b> function, will be called here to save the preview page.

4- <b> extract_previews_information </b> returns the information of all local previews.

&emsp;In a local folder, we extract the information for each file (preview in html format) by calling the <b> extract_preview_items</b> function <br>&emsp;and then we save the information for each preview in a list.

In [None]:
# export
class ScrapingTheGuardian:
    """
    A class to represent a scraper from the "Guardian" website.

    ...

    Attributes
    ----------
    session : requests_html.HTMLSession
        a web session
    VENUE_REGEX : str
        venue regex expression
    REFEREE_REGEX : str
        referee regex expression
    ODDS_REGEX : str
        odds regex expression

    Methods
    -------
    calculate_betting_odds(odds)
        returns decimal odds.
    extract_preview_items(page,title)
        returns all information of a football preview.
    save_previews_locally(self,page,last_date_stop,last_preview)
        save all browsed previews in a local folder.
    extract_previews_information(self,folder_path)
        returns all the information of all local previews.

    """

    # Regex patterns used to locate the venue, the referee and the odds
    # in the paragraphs of a preview.
    # In some previews, all of the information is on the same line, so each
    # pattern lists several alternatives: the value is first looked for in
    # front of the label that may follow it (e.g. "Tickets", "Odds"), and
    # the last alternatives capture the rest of the line.
    VENUE_REGEX = "Venue(.*)Tickets|Venue(.*),|Venue(.*)"
    REFEREE_REGEX = "Referee(.*)This season's|Referee(.*)Last season's|Referee(.*)Odds|Referee(.*)|Ref(.*)Odds"
    # Odds layouts the pattern is written for:
    # {Odds H 11-8 A 11-8 D 11-8}
    # {Odds Liverpool 11-8 Aston Villa 11-8 Draw 11-8}
    # missing label {Odds H 11-8 11-8 D 11-8}
    # missing value {Odds H 11-8 A 11-8}
    # The first alternative captures three "n-n" values, the second one two.
    # NOTE(review): "\s" and "\d" are written inside a non-raw string
    # literal; a raw string (r"...") would avoid invalid escape sequence
    # warnings - confirm before changing.
    ODDS_REGEX = "Odds[\s]*.*[\s]+(\d{1,3}-\d{1,3})[\s]*.*[\s]+(\d{1,3}-\d{1,3})[\s]*.*[\s]+(\d{1,3}-\d{1,3})|Odds[\s]*.*[\s]+(\d{1,3}-\d{1,3})[\s]*.*[\s]+(\d{1,3}-\d{1,3})"

    def __init__(self):
        """Create the scraper and open the web session used for requests."""
        # Initialize session to start scraping
        self.session = HTMLSession()

    @staticmethod
    def calculate_betting_odds(odds: list) -> Dict[str, float]:
        """
          returns decimal odds.

        Parameters
        ----------
        odds: list of str
            odds values

        Returns
        -------
        betting_odds: dict of float

        """
        # Initialize betting odds to n/a (not available)
        # Some previews may not include odds
        odds_home = None
        odds_away = None
        odds_draw = None

        if odds is not None:  # If odds exist
            # example of odds:
            # {H 4-6 A 43-10 D 3-1}
            # {liverpool 4-6 Tottenham 43-10 Draw 3-1}
            # {H 4-6 43-10 D 3-1}
            # {H 4-6 A 43-10}
            # The formula will be (4/6)+1 , (43/10)+1 , (3/1)+1
            # Home team odds
            betting_odds_home = odds[0]
            try:
                odds_home = (
                    int(betting_odds_home.split("-")[0])
                    / int(betting_odds_home.split("-")[1])
                ) + 1
            except ZeroDivisionError:
                logging.error('error: Home team odds are wrong')
                pass
            # Away team odds
            betting_odds_away = odds[1]
            try:
                odds_away = (
                    int(betting_odds_away.split("-")[0])
                    / int(betting_odds_away.split("-")[1])
                ) + 1
            except ZeroDivisionError:
                logging.error('error: Away team odds are wrong')
                pass
            # if we have the normal format of odds
            # we will have 3 parts(odds_home,odds_away,odds_draw)
            if len(odds) == 3:
                # Draw odds
                betting_odds_draw = odds[2]
                try:
                    odds_draw = (
                        int(betting_odds_draw.split("-")[0])
                        / int(betting_odds_draw.split("-")[1])
                    ) + 1
                except ZeroDivisionError:
                    logging.error('error: Draw odds are wrong')
                    pass

        betting_odds = dict(
            {"odds_home": odds_home, "odds_away": odds_away, "odds_draw": odds_draw}
        )
        return betting_odds

    @staticmethod
    def extract_preview_items(
        page: BeautifulSoup, title: str, link: str
    ) -> Dict[str, object]:
        """
          returns all information of a football preview

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page
        title: str
            the title of the preview
        link: str
            the link of the preview

        Returns
        -------
        preview_items: dict of object
            team names, text, author, venue, referee, raw odds, the three
            decimal odds, the preview date and the preview link

        """
        # 1) team names come from the title
        teams = PageExtractor.extract_teams_names(title)
        # 2) venue, referee and raw odds come from the page paragraphs
        infos = PageExtractor.extract_match_infos(
            page,
            ScrapingTheGuardian.VENUE_REGEX,
            ScrapingTheGuardian.REFEREE_REGEX,
            ScrapingTheGuardian.ODDS_REGEX,
        )
        # 3) text and author of the preview
        content = PageExtractor.extract_text_authors(page)
        # 4) publication date
        published_on = PageExtractor.extract_preview_date(page)
        # 5) decimal odds computed from the raw odds
        raw_odds = infos["odds"]
        decimal_odds = ScrapingTheGuardian.calculate_betting_odds(raw_odds)

        return {
            "home_team": teams["home"],
            "away_team": teams["away"],
            "text": content["text"],
            "author": content["author"],
            "venue": infos["venue"],
            "referee": infos["referee"],
            "odds": raw_odds,
            "odds_home_team": decimal_odds["odds_home"],
            "odds_away_team": decimal_odds["odds_away"],
            "odds_draw": decimal_odds["odds_draw"],
            "preview_date": published_on,
            "preview_link": link,
        }

    def save_previews_locally(
        self, page: BeautifulSoup, last_date_stop: datetime, last_preview: bool
    ) -> bool:
        """
          save all browsed Premier League previews in a local folder

        Every preview listed on the page is downloaded. It is stored locally
        unless "Champions League", "champions-league" or "cup" appears in
        its title, link, topic section or aside section, or unless its text
        contains "FA Cup – Kick-off". The loop stops at the first preview
        whose date is not later than last_date_stop.

        Parameters
        ----------
        page: bs4.BeautifulSoup
            the html format of the page listing the previews
        last_date_stop : datetime
            the date of the last extracted preview in the database
            (a falsy value disables the stop condition)
        last_preview: bool
            an indicator to know when we should stop the scraper

        Returns
        -------
        bool
            True when the stop date has been reached on this page,
            otherwise the value of last_preview that was passed in

        """

        def first_text(document, candidates):
            # Text of the first element found among the (tag, attributes)
            # candidates; "" when none exists, so that a page matching
            # neither layout does not stop the scraper
            for tag_name, attributes in candidates:
                element = document.find(tag_name, attributes)
                if element is not None:
                    return element.text
            return ""

        # Champions league and Cups are not allowed
        eliminated_matches = ["Champions League", "champions-league", "cup"]

        # We pick all of the match previews on the webpage.
        for preview in page.find_all("div", {"class": "fc-item__content"}):
            # we pick the preview date and we parse it in a date format
            preview_date = dateparser.parse(preview.find("time")["datetime"]).date()
            # if the date selected from the previews database exists
            # and has been reached by the preview date, we stop the loop
            # and mark last_preview as True.
            if last_date_stop and preview_date <= last_date_stop.date():
                logging.info(
                    "Finish: The Scraper is stopped. The last preview date is {}".format(
                        preview_date
                    )
                )
                last_preview = True
                break
            # Pick up the preview link
            preview_link = preview.find("a")["href"]
            logging.info("Preview link: {}".format(preview_link))
            # Pick up the football match preview page
            preview_page = Parser.parse_page(preview_link, self.session)
            # To filter previews we need to find the title of the preview
            title_section = preview_page.find("h1")
            if title_section is None:
                # Without a title the page cannot be processed later on
                logging.warning(
                    "Preview skipped, no title found: {}".format(preview_link)
                )
                continue
            preview_title = title_section.text
            # some previews in 2009 have different html tags:
            # the second candidate of each list is the old layout
            preview_topic = first_text(
                preview_page,
                [("div", {"class": "dcr-lwa3gj"}), ("div", {"class": "submeta"})],
            )
            preview_aside = first_text(
                preview_page,
                [
                    ("aside", {"data-gu-name": "title"}),
                    ("div", {"class": "content__labels"}),
                ],
            )
            # Check if "cup" or "Champions league" exists in:
            # title, link, preview topic section, preview aside section
            searched_sections = (
                preview_title,
                preview_link,
                preview_topic,
                preview_aside,
            )
            not_premier_league_found = any(
                re.search(word, section, re.IGNORECASE)
                for word in eliminated_matches
                for section in searched_sections
            )
            if not_premier_league_found:
                continue
            # some previews include the type of competition in the text
            # we find FA Cup – Kick-off
            # so we want to eliminate these previews
            cup_in_text = PageExtractor.get_values_matching_regex(
                preview_page, "FA Cup – Kick-off"
            )
            if not cup_in_text:
                # save preview in a local folder
                Parser.store_page_locally(preview_page, preview_link)

        return last_preview

    def extract_previews_information(self, folder_path: str) -> List[Dict[str, object]]:
        """
          returns all the information of all previews saved in a local folder

        Parameters
        ----------
        folder_path: str
            the local folder where previews are saved

        Returns
        -------
        List of(dict of object)

        """
        # get all previews files
        files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
        # a list to store previews information
        all_previews_information = []
        # We will proceed with the information extraction for each file (preview).
        for f in files:
            # get the preview path
            preview_path = join(folder_path, f)
            # open the file
            preview = open(preview_path)
            # get the preview link from the preview name
            # The preview's name is its link without the "https://www." and the extension html
            preview_link = (
                preview_path.replace(folder_path, "https://www.")
                .replace("_", "/")
                .replace(".html", "")
            )
            # get the html format of the preview using beautifulSoup
            preview_page = BeautifulSoup(preview, "html.parser")
            preview_title = preview_page.find("h1").text
            # get all information
            preview_infos = ScrapingTheGuardian.extract_preview_items(
                preview_page, preview_title, preview_link
            )
            # store information in the "all_previews_information" list
            all_previews_information.append(preview_infos)
            # just for testing
            logging.info("Preview data: {}".format(preview_infos))
            logging.info("-----------------------------------------------------")

        return all_previews_information

### PreviewsMapping Class

##### This class represents a mapper from the "Opta" MongoDb database and has 3 functions:

1- <b> get_team_id </b> returns the "opta" ID of a given team.

&emsp;The use of a predefined dictionary containing the teams and their various names facilitates us in matching the team names extracted from the guardian and <br>&emsp;their IDs in the opta database. The use of a dictionary was required because the team names in the previews differ and sometimes use abbreviations or<br>&emsp;nicknames.

2- <b> get_game_id_date </b>returns the id and the date of a given game.

&emsp;After completing the scraping task and extracting the information, the previews must be matched with their ids in the opta.fixture database.
We will query this <br>&emsp;database by specifying the home team, the away team, the closest gamedate to the preview publication date, and the Premier League competitionId, which  <br>&emsp;is equal to 8 in the database.

3- <b> save_mapped_data </b> save all the mapped previews in a MongoDb collection.

&emsp;For each preview in the data extracted from the Guardian, we will look for the id of the home team and the away team and match it with the <b>opta.fixture</b><br>&emsp;database to get the gameID and gameDate and finally we save it in a MongoDb collection.
<br>&emsp;To complete our mission, we will call the last two functions <b> get_team_id </b> and <b>get_game_id_date</b>.

In [None]:
# export
class PreviewsMapping:
    """
    A class to represent a data mapper from a mongo database.

    ...

    Methods
    -------
    get_team_id(team_name, df_teams)
        returns the "opta" ID of a given team.
    get_game_id_date(home_team_id, away_team_id, preview_date)
        returns the id and the date of a given game.
    save_mapped_data(data, df_teams)
        save all the mapped previews in a MongoDb collection.
    """

    # Columns of the teams dataframe that may hold a spelling of a team name.
    # A team matches when ANY of these columns equals the searched name
    # (case-insensitive).
    _TEAM_NAME_COLUMNS = (
        "name",
        "shortClubName",
        "optaName",
        "whoScoredName",
        "sofifaName",
        "statsName",
        "inStatName",
        "transfermarktName",
        "fotmobName",
        "oddsportalName",
        "fminsideName",
        "nickName1",
        "nickName2",
        "nickName3",
    )

    @staticmethod
    def get_team_id(team_name: str, df_teams: pd.DataFrame) -> int:
        """
          returns the "opta" ID of a given team.

        Parameters
        ----------
        team_name: str
            the name of a given team (surrounding whitespace and letter
            case are ignored). A missing name (None / NaN) is treated as
            an unknown team.
        df_teams: pd.DataFrame
            a dataframe that contains teams and their different names;
            it must provide "optaId" and every name column listed in
            `_TEAM_NAME_COLUMNS`.

        Returns
        -------
        int
            the "optaId" of the first matching row, or -1 when no row matches.

        """
        # save_mapped_data replaces NaN by None, so a missing team name can
        # reach this function: report it as "not found" instead of crashing
        # on None.strip().
        if not isinstance(team_name, str):
            return -1

        team_name = team_name.strip().lower()
        # Build one boolean mask: True where any name column equals the name.
        matches = pd.Series(False, index=df_teams.index)
        for column in PreviewsMapping._TEAM_NAME_COLUMNS:
            matches = matches | (df_teams[column].str.lower() == team_name)
        df_filtred = df_teams[matches]

        if len(df_filtred) > 0:
            # positional access: always the first matching row, whatever
            # the index labels are
            return df_filtred["optaId"].iloc[0]

        return -1

    @staticmethod
    def get_game_id_date(
        home_team_id: str, away_team_id: str, preview_date: datetime
    ) -> Dict[str, object]:
        """
          returns the id and the date of a given game.

        The game is the first Premier League fixture (competitionId 8)
        between the two teams that is played strictly after the preview date.

        Parameters
        ----------
        home_team_id: str
            the opta id of a home team.
        away_team_id: str
            the opta id of an away team.
        preview_date: datetime
            the publication date of the preview.

        Returns
        -------
        dict of object
            {"gameId": ..., "gameDate": ...}; both values are None when no
            fixture matches, when a team is unknown (id -1) or when the
            preview date is missing.

        """
        home_id = int(home_team_id)
        away_id = int(away_team_id)
        # -1 is the "team not found" value returned by get_team_id, and a
        # missing date cannot be compared: no fixture can match, so skip the
        # database round trip.
        if preview_date is None or home_id == -1 or away_id == -1:
            return {"gameId": None, "gameDate": None}

        # Initialize a MongoDb instance with mongoengine
        mongoengine_client = MongoClient.connect("0")
        # Filter data by gameDate, competitionId, homeTeamId, awayTeamId
        game_filter = {
            "gameDate": {"$gt": preview_date},
            "competitionId": 8,
            "homeTeamId": home_id,
            "awayTeamId": away_id,
        }
        # Get only gameId and gameDate fields
        projection = {"gameId": 1, "gameDate": 1, "_id": 0}
        # Sort by ascending game date before limiting to one document so the
        # fixture CLOSEST to the preview date is returned. Without the sort,
        # any later fixture between the two teams (e.g. from a following
        # season) could be picked.
        result = (
            MongoClient.find(
                mongoengine_client, "opta", "Fixture", game_filter, projection
            )
            .sort("gameDate", 1)
            .limit(1)
        )
        game_id = None
        game_date = None
        # If there is a match we pick the game ID and date
        query = list(result)

        if len(query) > 0:
            game_id = query[0]["gameId"]
            game_date = query[0]["gameDate"]

        return {"gameId": game_id, "gameDate": game_date}

    @staticmethod
    def save_mapped_data(data: pd.DataFrame, df_teams: pd.DataFrame) -> None:
        """
          save all the mapped previews in a MongoDb collection.

        Parameters
        ----------
        data: pd.DataFrame
            all previews information extracted from the Guardian.
        df_teams: pd.DataFrame
            a dataframe that contains teams and their different names.

        Returns
        -------
        None

        """
        # Work on a copy so the caller's dataframe is left untouched,
        # and turn NaN into None
        X = data.copy()
        X = X.replace({np.nan: None})
        # For each preview
        # We search home team and away team opta ID's
        # We pick the game ID and date from the opta.Fixture MongoDb collection
        for _, row in X.iterrows():
            # pick the team names from the preview
            home_team = row["home_team"]
            away_team = row["away_team"]
            # get their opta ID's (-1 when the team is unknown)
            home_team_id = PreviewsMapping.get_team_id(home_team, df_teams)
            away_team_id = PreviewsMapping.get_team_id(away_team, df_teams)
            # pick the preview date; a missing value stays None instead of
            # being handed to dateparser, which only accepts strings
            raw_preview_date = row["preview_date"]
            preview_date = (
                dateparser.parse(raw_preview_date)
                if raw_preview_date is not None
                else None
            )
            # get the id and the date of the game
            game = PreviewsMapping.get_game_id_date(
                home_team_id, away_team_id, preview_date
            )
            logging.info('Game {} in {}: {} Vs {} '.format(game["gameId"],preview_date,home_team, away_team))
            # connect to our mongoDb cluster; this is done for every row
            # because get_game_id_date may have switched the connection to
            # the "0" cluster
            mongoengine_client = MongoClient.connect("1")
            # preview class
            preview = Previews(
                gameId=game["gameId"],
                homeTeam=row["home_team"],
                awayTeam=row["away_team"],
                text=row["text"],
                author=row["author"],
                venue=row["venue"],
                referee=row["referee"],
                odds=row["odds"],
                oddsHomeTeam=row["odds_home_team"],
                oddsAwayTeam=row["odds_away_team"],
                oddsDraw=row["odds_draw"],
                gameDate=game["gameDate"],
                previewDate=row["preview_date"],
                previewLink=row["preview_link"],
            )
            # Validate and save input raw data
            MongoClient.save(preview)

### Previews Class

##### This class represents the extracted previews from the guardian:

We created a class that contains the various attributes to store in order to save every preview information extracted from the Guardian website.
<br>This class ensures that data is stored in a convenient format and type.

In [None]:
# export
class Previews(Document):
    """
    A class to represent the extracted previews from the guardian.

    One instance holds one match preview together with the game it was
    mapped to (see PreviewsMapping.save_mapped_data, which fills every
    field below).

    ...

    Attributes
    ----------
    gameId : int
        the opta game id
    homeTeam : str
        home team name
    awayTeam : str
        away_team name
    text : str
        preview text
    author : str
        preview author
    venue : str
        match venue
    referee : str
        match referee
    odds : str
        betting odds
    oddsHomeTeam : float
        decimal betting odds for home team
    oddsAwayTeam : float
        decimal betting odds for away team
    oddsDraw : float
        decimal betting odds for draw
    gameDate : datetime
        the date of the match
    previewDate : datetime
        the date of the preview
    previewLink : str
        the Guardian preview link

    """

    # --- game identification ---
    gameId = IntField()
    homeTeam = StringField()
    awayTeam = StringField()
    # --- preview content ---
    text = StringField()
    author = StringField()
    venue = StringField()
    referee = StringField()
    # --- betting odds: raw text plus the three decimal values ---
    odds = StringField()
    oddsHomeTeam = FloatField()
    oddsAwayTeam = FloatField()
    oddsDraw = FloatField()
    # --- dates and source link ---
    gameDate = DateTimeField()
    previewDate = DateTimeField()
    previewLink = StringField()

### MongoClient Class

##### This class represents a MongoDb Client and has 4 functions:

1- <b> find_credentials </b> returns the MongoDb credentials stored in a local file.


2- <b> connect </b>returns the MongoDb instance to connect to a given cluster.

There are two database URIs in the credentials file, the first of which is for the "OPTA" Database.
<br>The second is for another database that was created to store extracted previews.

3- <b> save </b> validates a document and saves it in its MongoDb collection.

4- <b> find </b> runs a find query on a MongoDb collection.


In [None]:
# export
class MongoClient:
    """
    A class to represent a MongoDb client.

    Every method is static: the class only groups the helpers used to read
    the credentials, open a connection, save a document and run a query.

    ...

    Attributes
    ----------
    CREDENTIALS_PATH : str
        the file path of the MongoDb credentials

    Methods
    -------
    find_credentials()
        returns MongoDb credentials stored in a local file.
    connect(index)
        returns the mongoDb instance.
    save(collection)
        save the MongoDb collection.
    find(mongoengine_client,db,collection,game_filter,projection)
        find a MongoDb query.

    """

    # the file path of the MongoDb credentials
    CREDENTIALS_PATH = "//home//meherkh//secrets//credentials.json"

    @staticmethod
    def find_credentials() -> dict:
        """
          returns MongoDb credentials stored in a local file.

        Returns
        -------
        dict
            the parsed content of the JSON file at CREDENTIALS_PATH

        """
        # the file is closed as soon as its JSON content has been parsed
        with open(MongoClient.CREDENTIALS_PATH) as credentials_file:
            return json.load(credentials_file)

    @staticmethod
    def connect(index: str) -> pymongo.mongo_client.MongoClient:
        """
          returns the mongoDb instance.

        The current connection is dropped first, then a new one is opened
        on the URI stored under credentials["DB_URI"][index].

        Parameters
        ----------
        index: str
            the index of the cluster

        Returns
        -------
        pymongo.mongo_client.MongoClient

        """
        # drop the current connection before switching cluster
        disconnect()
        # look up the URI of the requested cluster and connect to it
        credentials = MongoClient.find_credentials()
        return connect(host=credentials["DB_URI"][index])

    @staticmethod
    def save(
        collection: mongoengine.base.metaclasses.TopLevelDocumentMetaclass,
    ) -> None:
        """
          save the MongoDb collection.

        Any error raised while validating or saving is logged and
        swallowed, so a bad document does not interrupt the caller.

        Parameters
        ----------
        collection: mongoengine.base.metaclasses.TopLevelDocumentMetaclass
            the MongoDb document to validate and save

        Returns
        -------
        None

        """
        try:
            # validate first so an invalid document is never written
            collection.validate()
            collection.save()
        except Exception as error:
            logging.error('error: {}'.format(error))

    @staticmethod
    def find(
        mongoengine_client: pymongo.mongo_client.MongoClient,
        db: str,
        collection: str,
        game_filter: dict,
        projection: dict,
    ) -> pymongo.cursor.Cursor:
        """
          find a MongoDb query.

        Parameters
        ----------
        mongoengine_client: pymongo.mongo_client.MongoClient
            the client returned by connect()
        db: str
            the Database name
        collection: str
            the collection name
        game_filter: dict
            the query filter
        projection: dict
            the query projection

        Returns
        -------
        pymongo.cursor.Cursor
            the cursor over the matching documents

        """
        # select the collection, then run the query on it
        target_collection = mongoengine_client[db][collection]
        return target_collection.find(filter=game_filter, projection=projection)


## USE CASE


### Scrape all pages and save the previews locally

In [None]:
# starting url
url = "https://www.theguardian.com/football/series/match-previews"
# initialize the scraper instance.
scraper = ScrapingTheGuardian()
# initially we are not at the last page.
last_page = False
# we'll extract the previews that haven't already been extracted.
last_preview = False
# we specify the last preview date in the previews collection on which the scraper will be turned off.
mongoengine_client = MongoClient.connect("1")
projection = {"previewDate": 1, "_id": 0}
# A single compound sort is required here: when sort() is called several
# times on a pymongo cursor only the LAST call is applied, so chaining
# .sort("previewDate", -1).sort("gameDate", -1) ordered by gameDate alone
# and did not return the most recent preview date.
result = (
    MongoClient.find(mongoengine_client, "opta", "previews", {}, projection)
    .sort([("previewDate", -1), ("gameDate", -1)])
    .limit(1)
)
# if the database is empty we will scrap all pages
query = list(result)
# .get() keeps the "scrap everything" behaviour (None) when the returned
# document has no previewDate field
last_date_stop = query[0].get("previewDate") if query else None

# if we are not at the last page
# and we haven't reached an extracted preview
# we launch the scraper
while not last_page and not last_preview:
    # a random waiting time between two requests, so the Guardian server
    # does not block our IP address (named wait_seconds so it does not
    # shadow the `time` module name)
    wait_seconds = random.randint(2, 60)
    logging.info('Waiting for {} seconds ...'.format(wait_seconds))
    sleep(wait_seconds)
    logging.info('The current page URL: {}'.format(url))
    # get the html format of the page containing previews
    page = Parser.parse_page(url, scraper.session)
    # launch the scraper, save previews in a local folder
    # and learn whether we reached an already extracted preview
    last_preview = scraper.save_previews_locally(page, last_date_stop, last_preview)
    # get the url of the following page and verify if we are at the last page
    url, last_page = Parser.get_next_page(page)

### Extract information from local previews

In [None]:
# folder passed to the extractor as the location of the local previews
local_folder = ".//previews//"
# a fresh scraper instance, used here only for the extraction step
scraper = ScrapingTheGuardian()
# extract the information of all the local previews
all_previews_information = scraper.extract_previews_information(local_folder)

  date_obj = stz.localize(date_obj)


In [None]:
# gather the extracted previews in a single dataframe
data = pd.DataFrame(all_previews_information)

In [None]:
# display the extracted previews
data

Unnamed: 0,home_team,away_team,text,author,venue,referee,odds,odds_home_team,odds_away_team,odds_draw,preview_date,preview_link
0,Bolton Wanderers,Tottenham Hotspur,"Old Trafford, where they have not won since 19...",Tim Rich,Reebok Stadium,M Jones,"[13-5, 10-11, 23-10]",3.600000,1.909091,3.30,2009-10-02,https://www.theguardian.com/football/2009/oct/...
1,Manchester City,Hull City,Hull's previous visit to Eastlands ended in a ...,Jamie Jackson,City of Manchester Stadium,L Probert,"[2-9, 10-1, 21-5]",1.222222,11.000000,5.20,2009-11-27,https://www.theguardian.com/football/2009/nov/...
2,Manchester City,Fulham,Mark Hughes will have half an eye on events at...,Chris Bell,City of Manchester Stadium,K Friend,"[2-5, 13-2, 3-1]",1.400000,7.500000,4.00,2009-10-23,https://www.theguardian.com/football/2009/oct/...
3,Sunderland,Everton,Confidence could well be in short supply with ...,Louise Taylor,Stadium of Light,M Atkinson,"[13-10, 9-5, 11-5]",2.300000,2.800000,3.20,2009-12-24,https://www.theguardian.com/football/2009/dec/...
4,Fulham,Tottenham Hotspur,Teams are increasingly viewing trips to Craven...,Dominic Fifield,Craven Cottage,S Bennett,"[17-10, 27-20, 9-4]",2.700000,2.350000,3.25,2009-12-24,https://www.theguardian.com/football/2009/dec/...
...,...,...,...,...,...,...,...,...,...,...,...,...
80,Portsmouth,Burnley,Avram Grant is growing tired of the hypothetic...,James Callow,Fratton Park,P Dowd,"[4-5, 3-1, 12-5]",1.800000,4.000000,3.40,2009-12-04,https://www.theguardian.com/football/2009/dec/...
81,Wigan Athletic,Bolton Wanderers,Bolton arrive at the DW Stadium outside the re...,Marcus Christenson,DW Stadium,A Wiley,"[1-1, 12-5, 9-4]",2.000000,3.400000,3.25,2009-12-18,https://www.theguardian.com/football/2009/dec/...
82,Blackburn Rovers,Stoke City,Surviving for three years is the key to Premie...,Richard Rae,Ewood Park,H Webb,"[9-10, 13-5, 12-5]",1.900000,3.600000,3.40,2009-11-27,https://www.theguardian.com/football/2009/nov/...
83,Arsenal,Tottenham Hotspur,Robbie Keane's belief that Tottenham have a sq...,Kevin McCarra,Emirates Stadium,M Clattenburg,"[11-20, 9-2, 13-5]",1.550000,5.500000,3.60,2009-10-30,https://www.theguardian.com/football/2009/oct/...


In [None]:
# save the extracted previews as CSV (without the index column); this file is read again in Part 2
data.to_csv(".//datasets//previews.csv",index=False)

## Part 2 : Mapping

In [None]:
# a dictionary that contains the teams and their different names.
df_dict = pd.read_csv(".//datasets//final_data.csv")
# extracted previews
df_guardian = pd.read_csv(".//datasets//previews.csv")

In [None]:
# first rows of the teams dataframe
df_dict.head()

Unnamed: 0,ID,optaId,name,symid,shortClubName,optaName,whoScoredName,sofifaName,statsName,inStatName,transfermarktName,fotmobName,oddsportalName,fminsideName,nickName1,nickName2,nickName3
0,9e78bbc137fd00c66162080bc9e987e67297643dc50616...,3,Arsenal,ARS,Arsenal,Arsenal,Arsenal,Arsenal,Arsenal,Arsenal FC,Arsenal FC,Arsenal,Arsenal,Arsenal,Arsenal FC,Arsenal Football Club,The Gunners
1,0ef9883721814dd09038659130c61c76f18976cb7b8e86...,47,Portsmouth,POR,Portsmouth,Portsmouth,Portsmouth,Portsmouth,Portsmouth,Portsmouth,Portsmouth FC,,,,Portsmouth FC,Portsmouth Football Club,Pompey
2,eb89c068ca204a72408360450847a990c97c5b5ff0ec9f...,110,Stoke City,STK,Stoke,Stoke City,Stoke,Stoke City,Stoke City,Stoke,Stoke City,Stoke,Stoke,Stoke,Stoke City FC,Stoke City Football Club,The Potters
3,c1a486f8ca465e58b6301f038e754058986187d454110c...,56,Sunderland,SUN,Sunderland,Sunderland,Sunderland,Sunderland,Sunderland,Sunderland AFC,Sunderland AFC,Sunderland,,,Sunderland FC,Sunderland Association Football Club,Sunderland A.F.C.
4,0db353094ccf93e0005cf378ea862b56e77cacc57b7c5e...,111,Wigan Athletic,WIG,Wigan,Wigan Athletic,Wigan,Wigan Athletic,Wigan Athletic,Wigan Athletic,Wigan Athletic,Wigan,Wigan,,Wigan Athletic FC,Wigan Athletic Football Club,The Latics The Tics


In [None]:
# first rows of the extracted previews
df_guardian.head()

Unnamed: 0,home_team,away_team,text,author,venue,referee,odds,odds_home_team,odds_away_team,odds_draw,preview_date,preview_link
0,Bolton Wanderers,Tottenham Hotspur,"Old Trafford, where they have not won since 19...",Tim Rich,Reebok Stadium,M Jones,"['13-5', '10-11', '23-10']",3.6,1.909091,3.3,2009-10-02,https://www.theguardian.com/football/2009/oct/...
1,Manchester City,Hull City,Hull's previous visit to Eastlands ended in a ...,Jamie Jackson,City of Manchester Stadium,L Probert,"['2-9', '10-1', '21-5']",1.222222,11.0,5.2,2009-11-27,https://www.theguardian.com/football/2009/nov/...
2,Manchester City,Fulham,Mark Hughes will have half an eye on events at...,Chris Bell,City of Manchester Stadium,K Friend,"['2-5', '13-2', '3-1']",1.4,7.5,4.0,2009-10-23,https://www.theguardian.com/football/2009/oct/...
3,Sunderland,Everton,Confidence could well be in short supply with ...,Louise Taylor,Stadium of Light,M Atkinson,"['13-10', '9-5', '11-5']",2.3,2.8,3.2,2009-12-24,https://www.theguardian.com/football/2009/dec/...
4,Fulham,Tottenham Hotspur,Teams are increasingly viewing trips to Craven...,Dominic Fifield,Craven Cottage,S Bennett,"['17-10', '27-20', '9-4']",2.7,2.35,3.25,2009-12-24,https://www.theguardian.com/football/2009/dec/...


In [None]:
# work on a copy of the extracted previews
# NOTE(review): save_mapped_data copies its input as well, so this copy looks redundant
X = df_guardian.copy()
# map every preview to its opta game and save it in the MongoDb collection
PreviewsMapping.save_mapped_data(X,df_dict)


  date_obj = stz.localize(date_obj)
