In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import csv
import re
import logging
import json
import time
from datetime import datetime
from collections import deque

# URL template for a monthly NPD article: the first %s is filled with the
# month name and the second %s with the year (see Scraper.get_software_usd).
URL = 'https://venturebeat.com/%s-%s-npd'


class Scraper:
    """Fetch monthly NPD pages and pull the software sales figures out of them."""

    # Substrings that identify the table cell labelling the software row.
    # 'Software' already subsumes the first, longer label; the list is kept
    # in full to record the label variants that were being targeted.
    # NOTE(review): 'Software' is a very broad match and may hit an earlier,
    # unrelated cell on some pages -- confirm against the scraped output.
    _ROW_LABELS = (
        'Video Game Software (Physical and Full',
        'Video Game Content',
        'Software',
    )

    def __init__(self, request_delay):
        """
        request_delay - delay in seconds before another request can be made
        if a 429 Too many requests error is encountered
        """
        self.cache = deque()
        self.request_delay = request_delay

    def simple_get(self, url):
        """
        Makes an HTTP GET request to `url` and returns the raw response body
        (bytes), whatever its content type.

        If the server answers with 429 Too Many Requests, sleeps for
        `self.request_delay` seconds and retries until a different response
        arrives.

        Raises requests.exceptions.RequestException (after logging it) if the
        request itself fails.
        """
        logging.info('Scraper: Requested "%s"', url)

        try:
            while True:
                with closing(get(url, stream=True)) as resp:
                    # Detect throttling by status code first; the body
                    # comparison is kept for servers that only signal it
                    # through the response text.
                    throttled = (
                        resp.status_code == 429
                        or resp.content == b'Too Many Requests\n'
                    )
                    if not throttled:
                        return resp.content

                # The response is closed at this point, so no connection is
                # held open while waiting to retry.
                logging.info('429 Too Many Requests; sleeping for %s seconds...', self.request_delay)
                time.sleep(self.request_delay)
        except RequestException as e:
            logging.critical('Scraper: Error during requests to %s : %s', url, e)
            raise

    @staticmethod
    def _find_label_index(cells):
        """Return the index of the first cell whose markup contains one of
        the software row labels, or -1 if no cell matches."""
        for index, cell in enumerate(cells):
            markup = str(cell)
            if any(label in markup for label in Scraper._ROW_LABELS):
                return index
        return -1

    @staticmethod
    def _extract_amounts(cells, start):
        """Return (first amount, second amount) read from the cells that
        follow the label cell at `start`, or None if the expected cells are
        missing or empty.

        The leading character of each amount (the "$" of "$xx") is dropped.
        """
        try:
            if '%' in cells[start + 2].contents[0] or '%' in cells[start + 3].contents[0]:
                # A percentage shows up within the next three cells, so the
                # amounts are taken from the two cells right after the label.
                first, second = cells[start + 1], cells[start + 2]
            else:
                # cells[start + 3].contents[0] = "$xx"
                first, second = cells[start + 2], cells[start + 3]
            return (first.contents[0][1:], second.contents[0][1:])
        except IndexError:
            # Table shorter than expected, or a cell with no contents.
            return None

    def get_software_usd(self, month, year):
        """
        month - string
        year - string

        Returns a tuple: (amount for year-1, amount for year)
        where amount is in USD, millions, as a string.

        Returns (-1, -1), after logging a critical message, when the software
        row cannot be located or its amount cells are missing.
        Errors raised while fetching or parsing the page propagate unchanged.
        """
        raw_html = self.simple_get(URL % (month, year))
        html = BeautifulSoup(raw_html, 'html.parser')
        all_td = html.find_all('td')

        start = self._find_label_index(all_td)
        amounts = None if start == -1 else self._extract_amounts(all_td, start)
        if amounts is None:
            logging.critical('For %s%s: did not scrape correctly', month, year)
            return (-1, -1)
        return amounts

# Scrape every month of the listed years and write the collected amounts to
# a CSV file. The argument is the delay in seconds before retrying after a
# 429 response.
scraper = Scraper(20)
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
years = [2021, 2020]

# Maps (month name, year) -> amount. Each page reports both its own year and
# the previous one, so consecutive years overlap on one key.
d = {}
for m in months:
    for y in years:
        amounts = scraper.get_software_usd(m, str(y))
        for key, amount in (((m, y), amounts[1]), ((m, y - 1), amounts[0])):
            # -1 marks a failed scrape; never let it replace a figure that
            # another page already supplied for the same month and year.
            if amount == -1 and key in d:
                continue
            d[key] = amount

csv_file = './Datasets/gamesales.csv'
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' get an extra blank line after every row.
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Month', 'Year', 'Amount (USD millions)'])
    writer.writeheader()
    for k, v in d.items():
        print(k, v)
        writer.writerow({'Month': k[0], 'Year': k[1], 'Amount (USD millions)': v})

CRITICAL:root:For April2021: did not scrape correctly
CRITICAL:root:For May2021: did not scrape correctly
CRITICAL:root:For June2021: did not scrape correctly
CRITICAL:root:For July2021: did not scrape correctly
CRITICAL:root:For August2021: did not scrape correctly
CRITICAL:root:For September2021: did not scrape correctly
CRITICAL:root:For October2021: did not scrape correctly
CRITICAL:root:For November2021: did not scrape correctly
CRITICAL:root:For December2021: did not scrape correctly


('January', 2021) 4,172
('January', 2020) 311m
('January', 2019) 451m
('February', 2021) 3,998
('February', 2020) 307m
('February', 2019) 477m
('March', 2021) 4,628
('March', 2020) 739m
('March', 2019) 550m
('April', 2021) -1
('April', 2020) 662
('April', 2019) 428
('May', 2021) -1
('May', 2020) 438
('May', 2019) 262
('June', 2021) -1
('June', 2020) 570
('June', 2019) 382
('July', 2021) -1
('July', 2020) 3,251
('July', 2019) 2,426
('August', 2021) -1
('August', 2020) 2,935
('August', 2019) 2,147
('September', 2021) -1
('September', 2020) 3,841
('September', 2019) 3,541
('October', 2021) -1
('October', 2020) 3,429
('October', 2019) 3,054
('November', 2021) -1
('November', 2020) 5,244
('November', 2019) 3,974
('December', 2021) -1
('December', 2020) 5,806
('December', 2019) 4,732
