In [10]:
import scrapy
import matplotlib.pyplot as plt
from scrapy.crawler import CrawlerProcess

In [2]:
class PremierLeagueScraper(scrapy.Spider):
    """Spider that collects the per-side score text from a Sky Sports results page.

    Notes:
        Despite the class name, the only URL requested is the "league-2-results"
        page. NOTE(review): confirm which competition is actually intended.
    """
    # Identifier Scrapy uses for this spider.
    name = "PremierLeagueSpider"
    # Class-level store for the scraped score strings; replaced wholesale each
    # time _parse_stats handles a response.
    scorelines = []

    def start_requests(self):
        """Generate the initial request(s) for the crawl.

        Yields:
            scrapy.Request: One request per results-page URL, with
                ``_parse_stats`` registered as its callback.
        """
        results_pages = ("https://www.skysports.com/league-2-results",)
        # Hand each page to Scrapy lazily, exactly one request per URL.
        yield from (
            scrapy.Request(url=page_url, callback=self._parse_stats)
            for page_url in results_pages
        )

    def _parse_stats(self, response):
        """Extract every per-side score string from the page and store it on the class.

        Params:
            response: Scrapy response for the requested page; only its
                ``xpath`` method is used here.

        Notes:
            Nothing is returned or yielded, so no follow-up pages are crawled.
            The extracted strings overwrite ``PremierLeagueScraper.scorelines``.
            Presumably the strings alternate home side / away side in page
            order -- the notebook's later cells rely on that; verify against
            the page markup.
        """
        # Text nodes of every <span class="matches__teamscores-side"> element.
        side_score_xpath = '//span[@class="matches__teamscores-side"]/text()'

        PremierLeagueScraper.scorelines = response.xpath(side_score_xpath).extract()

In [3]:
# Run the spider inside this process; once it finishes, the scraped strings are
# available on PremierLeagueScraper.scorelines.
# NOTE(review): Twisted's reactor generally cannot be restarted within the same
# process, so re-running this cell without restarting the kernel is likely to
# fail -- confirm.
process = CrawlerProcess()
# Schedule the spider class; nothing is fetched until start() is called.
process.crawl(PremierLeagueScraper)
# Start the crawl (the log output below is produced by this call).
process.start()

2020-12-22 11:04:28 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: scrapybot)
2020-12-22 11:04:28 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:01:53) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-19.6.0-x86_64-i386-64bit
2020-12-22 11:04:28 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-12-22 11:04:28 [scrapy.crawler] INFO: Overridden settings:
{}
2020-12-22 11:04:28 [scrapy.extensions.telnet] INFO: Telnet Password: 3b792604e6867996
2020-12-22 11:04:28 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-12-22 11:04:28 [scrapy.middleware] INFO

In [6]:
# The scraped strings are treated as alternating per fixture: even positions
# (0, 2, 4, ...) are taken as the home side's goals and odd positions as the
# away side's. Each string is converted to an int.
home_scores = [int(goals) for goals in PremierLeagueScraper.scorelines[0::2]]
away_scores = [int(goals) for goals in PremierLeagueScraper.scorelines[1::2]]

In [26]:
# Candidate predictions to evaluate, each formatted "<home goals>-<away goals>".
# The list previously contained "3-3" twice; because a prediction's total was
# only initialised when its key was missing, the second pass re-added every
# point and doubled the "3-3" total. Each prediction now appears exactly once.
# NOTE(review): "1-3" and "2-3" are absent from the 0-3 grid; the duplicate may
# have been a typo for one of them -- confirm before adding.
possible_results = ["0-0", "1-0", "2-0", "3-0", "0-1", "0-2", "0-3", "1-1",
                    "2-2", "3-3", "2-1", "1-2", "3-1", "3-2"]

# Points awarded per real fixture for a given prediction.
EXACT_SCORE_POINTS = 5      # prediction matches the scoreline exactly
CORRECT_OUTCOME_POINTS = 2  # prediction only gets home win / draw / away win right

# Maps each prediction string to the total points it would have earned across
# all scraped fixtures.
scores = {}

for result in possible_results:
    # Score every prediction once, even if the list is edited to contain repeats.
    if result in scores:
        continue

    # Parse with split() rather than character indexing so multi-digit goal
    # counts (e.g. "10-0") are read correctly.
    predicted_home, predicted_away = (int(goals) for goals in result.split("-"))
    # Outcome as a sign: 1 = home win, 0 = draw, -1 = away win.
    predicted_outcome = (predicted_home > predicted_away) - (predicted_home < predicted_away)

    total = 0
    for actual_home, actual_away in zip(home_scores, away_scores):
        if (actual_home, actual_away) == (predicted_home, predicted_away):
            # Perfect match
            total += EXACT_SCORE_POINTS
        elif (actual_home > actual_away) - (actual_home < actual_away) == predicted_outcome:
            # Right outcome (home win, away win or draw), wrong scoreline
            total += CORRECT_OUTCOME_POINTS

    scores[result] = total


2-1
3-0
1-1
0-2
6-2
0-3
0-7
0-1
2-1
1-1
0-0
2-3
1-1
5-2
0-2
0-0
2-1
1-1
2-1
1-1
3-0
1-1
1-1
0-1
3-0
0-1
2-1
0-0
1-0
1-2
1-2
1-5
1-2
2-0
4-0
1-1
2-0
1-3
3-1
1-2
2-1
2-3
0-0
1-2
1-1
5-0
0-1
1-0
0-2
1-0
1-1
2-3
0-1
0-0
3-0
0-2
1-2
2-0
1-0
0-1
1-0
1-1
0-3
1-3
4-1
4-1
1-0
0-0
2-0
2-0
1-4
3-4
2-1
0-1
2-1
0-1
0-3
2-1
2-0
1-1
0-1
2-0
1-1
0-1
1-1
1-2
0-0
2-1
0-3
0-0
0-1
1-1
1-1
3-3
0-1
2-2
3-3
1-0
1-4
0-3
2-0
2-1
1-0
1-6
7-2
4-0
4-2
1-1
3-1
0-3
3-1
0-1
1-1
2-5
4-0
2-3
1-2
3-3
0-1
1-0
1-3
2-5
0-3
0-2
4-2
5-2
4-3
1-3
2-1
0-2
1-3
0-3
0-1
0-3
1-0
4-3
0-2


In [24]:
# Plot results

# Order the predictions from fewest to most points so the best-performing
# prediction is the rightmost bar. This relies on dict preserving insertion order.
D = dict(sorted(scores.items(), key=lambda item: item[1]))

# Pass labels and heights explicitly: the previous plt.bar(*zip(*D.items()))
# called bar() with no arguments (a confusing TypeError) whenever D was empty.
fig, ax = plt.subplots()
ax.bar(list(D.keys()), list(D.values()))
ax.set_title("Prediction Points by Result")
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Predicted scoreline (home-away)")
ax.set_ylabel("Total prediction points")
plt.show()


TypeError: string indices must be integers