# Scrape tv-by-the-numbers and rotten-tomatoes

Scrape [TV by the numbers](https://tvbythenumbers.zap2it.com/category/daily-ratings/) to get daily ratings for TV shows.

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from tv_by_the_numbers import *
import tv_by_the_numbers as debug #Access to hidden modules for debugging convenience

In [None]:
! conda list '(beautifulsoup4|mysql-connector|pandas|python$)'

```python
beautifulsoup4            4.8.0                    py37_0    conda-forge
ipython                   7.8.0            py37h5ca1d4c_0    conda-forge
msgpack-python            0.6.2            py37h770b8ee_0    conda-forge
mysql-connector-c         6.1.11            h42c63d9_1003    conda-forge
mysql-connector-python    8.0.18           py37he91358f_0    conda-forge
pandas                    0.25.1           py37h86efe34_0    conda-forge
python                    3.7.3                h359304d_0  
```

Definitions:
 - A 'page' contains a number of links (usually 12) to 'articles'
 - An 'article' contains ratings
 
Notes:
 - There are 351 pages, with numbering starting at 1

## Part 1 – Scrape tv-by-the-numbers

### Scrape landing pages to find links to articles

In [None]:
# Searches for any pages not currently in the database
#  and puts them in the database. (~idempotent)
# 
# Safe to re-run
# Verbose
# On failure:
#  - returns latest web call data fetched or processed in a dictionary (for debugging purposes)
#  - may raise an error if there is no data to protect
error_data = update_pages_database()

### Scrape articles to find ratings

In [None]:
# NOTE(review): sleep_time is presumably the pause (in seconds) between web requests and
#  on_fail = 'continue' presumably moves on to the next article after a failure — confirm in tv_by_the_numbers
error_data = update_articles_database(sleep_time = 0.05, on_fail = 'continue')

In [None]:
# Some articles (30) were not fetched.
#  On examination: some are due to parsing errors,
#                  many are articles that contain no data at all.
# The result is kept in to_do for manual inspection.
to_do = get_missing_articles()

Add a primary key to the tv_by_the_numbers articles:

In [None]:
# NOTE(review): presumably this creates the table_id column that the post-processing
#  and batch-delete cells below rely on — confirm in tv_by_the_numbers
add_primary_key('tv_by_the_numbers_articles')

Post-process records from the broadcast networks: their show names embed the network in parentheses (e.g. `Show name (CBS)`), so the network column was left empty.

In [None]:
# Looks for records without a network. Attempts to parse those properly, then writes a new record to the db.
#  returns a list of old ids to be deleted
def process_broadcast_networks(do_anything = False):
    """Re-parse records whose network is missing and re-insert the repaired rows.

    Selects every row of ``tv_by_the_numbers_articles`` with a NULL network and
    prints how many were found. When ``do_anything`` is true, each show name of
    the form ``<show>(<network>)<anything>`` is split into its show and network
    parts, the repaired rows (without ``table_id``) are appended to the table,
    and the ``table_id`` values of the superseded rows are returned so that the
    caller can delete them.

    Args:
        do_anything: Safety switch. With the default ``False`` the function only
            reports the number of bad records and changes nothing.

    Returns:
        list: ``table_id`` values of the original rows that were re-inserted in
        repaired form; empty when nothing was written.
    """
    to_alter = query('''
        SELECT
            *
        FROM
            tv_by_the_numbers_articles
        WHERE
            network IS NULL
    ''')

    print(f'{len(to_alter)} bad records found')

    # Dry run, or nothing to repair (an empty frame cannot be parsed below).
    if not do_anything or to_alter.empty:
        return []

    broadcast_networks = ['CBS', 'FOX', 'ABC', 'The CW', 'NBC', 'CW', 'NCBS']
    # Raw strings keep '\(' a regex escape instead of an invalid string escape.
    pattern = re.compile(r'(.*)\((' + '|'.join(broadcast_networks) + r')\).*')

    def parse_show(show):
        """Return (show, network); network is None when no known network is embedded."""
        maybe_match = pattern.match(show) if isinstance(show, str) else None
        if maybe_match is None:
            return show, None
        # NOTE(review): group(1) keeps any whitespace that preceded '(' —
        #  confirm whether the show name should be stripped.
        name, net = maybe_match.groups()
        if net == 'CW':
            net = 'The CW'  # normalise the short spelling
        return name, net

    parsed = [parse_show(show) for show in to_alter['show']]

    # Rows that still have no network could not be repaired and are left alone.
    fixed = (
        to_alter
        .assign(
            show = [name for name, _ in parsed],
            network = [net for _, net in parsed],
        )
        .dropna(subset = ['network'])
    )
    if fixed.empty:
        return []

    ids_to_drop = fixed.table_id.tolist()

    # table_id is dropped so the appended rows do not reuse the old ids.
    fixed.drop('table_id', axis = 'columns').to_sql(
        'tv_by_the_numbers_articles', db, schema = 'tvshows', if_exists = 'append', index = False
    )

    return ids_to_drop


In [None]:
# Dry run: with the default do_anything = False the function only prints the number of
#  bad records and returns an empty list. Pass do_anything = True to rewrite the records.
ids_to_drop = process_broadcast_networks()
print(f'{len(ids_to_drop)} records in need of dropping')

In [None]:
# Batch drop old ids
#  Deletes the superseded rows BATCH_SIZE ids at a time. A failing batch is retried;
#  the loop gives up once more than MAX_FAILURES attempts have failed.
#  ids_to_drop is consumed in place: ids still in it afterwards were not deleted.
BATCH_SIZE = 100
MAX_FAILURES = 10

fail_count = 0
while ids_to_drop:
    to_delete = ids_to_drop[:BATCH_SIZE]
    # Render the id list by hand: str(tuple(...)) of a single id is "(5,)", which is invalid SQL.
    id_list = ', '.join(map(repr, to_delete))
    try:
        db.execute(f'DELETE FROM tv_by_the_numbers_articles WHERE table_id IN ({id_list})')
    except Exception as error:  # not a bare except, so Ctrl-C still interrupts the loop
        print(f'failure on ids {to_delete}: {error}')
        fail_count = fail_count + 1
        if fail_count > MAX_FAILURES:
            print('aborting')
            break
        continue
    # The batch is the head of the list, so drop it in one step.
    del ids_to_drop[:len(to_delete)]

## Part 2 – Scrape Rotten Tomatoes

In [1]:
from rotten_tomatoes import *

### Part A: See which shows can actually be found on Rotten Tomatoes

In [None]:
# NOTE(review): query_list presumably returns the values of s.show from
#  tv_by_the_numbers_articles as a list — confirm against its definition
list_of_tv_shows = query_list('s.show', 'tv_by_the_numbers_articles s')

In [None]:
# The result is passed to pd.DataFrame.from_records in the next cell,
#  so it is presumably a list of per-show records — confirm in rotten_tomatoes
list_of_urls = find_on_rotten_tomatoes(list_of_tv_shows)

In [None]:
# One row per record returned by find_on_rotten_tomatoes
df = pd.DataFrame.from_records(list_of_urls)

In [None]:
# if_exists = 'append' adds these rows to any existing rt_urls table, so re-running
#  this cell inserts the same rows again (unless the table enforces uniqueness)
df.to_sql('rt_urls', db, index = False, if_exists = 'append')


### Part B: Scrape Rotten Tomatoes

In [None]:
# NOTE(review): presumably follows the same convention as update_pages_database
#  (debugging data is returned on failure) — confirm in rotten_tomatoes
error_data = update_rt_data()

## Part 3 – Get data on Amazon and Netflix

In [21]:
# Rebinds list_of_tv_shows (the tv-by-the-numbers shows in Part 2) to what
#  query_list returns for column title of netflix_amazon_shows
list_of_tv_shows = query_list('title', 'netflix_amazon_shows')

In [None]:
# Same lookup as in Part 2, now for the Netflix/Amazon titles; rebinds list_of_urls
list_of_urls = find_on_rotten_tomatoes(list_of_tv_shows)

In [None]:
# Compare how many shows were looked up with how many results came back
print(len(list_of_tv_shows), len(list_of_urls))

In [26]:
# Rebinds df: one row per record returned for the Netflix/Amazon titles
df = pd.DataFrame.from_records(list_of_urls)

In [None]:
# NOTE(review): presumably lists the titles that are absent from df.by_the_numbers_name,
#  i.e. the shows that were not found — confirm list_subtract semantics in datacleaning
from datacleaning import list_subtract
list_subtract(list_of_tv_shows, df.by_the_numbers_name.tolist())

In [28]:
# Appends to the same rt_urls table as Part 2; if_exists = 'append' means re-running
#  this cell inserts the same rows again (unless the table enforces uniqueness)
df.to_sql('rt_urls', db, index = False, if_exists = 'append')


In [None]:
# Second run of update_rt_data, after the Netflix/Amazon urls were appended to rt_urls
# NOTE(review): presumably only fetches urls that have no data yet — confirm in rotten_tomatoes
error_data = update_rt_data()