Scrape [TV by the Numbers](https://tvbythenumbers.zap2it.com/category/daily-ratings/) to get daily ratings for TV shows.

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
! conda list '(beautifulsoup4|mysql-connector|pandas|python$)'

```python
beautifulsoup4            4.8.0                    py37_0    conda-forge
ipython                   7.8.0            py37h5ca1d4c_0    conda-forge
msgpack-python            0.6.2            py37h770b8ee_0    conda-forge
mysql-connector-c         6.1.11            h42c63d9_1003    conda-forge
mysql-connector-python    8.0.18           py37he91358f_0    conda-forge
pandas                    0.25.1           py37h86efe34_0    conda-forge
python                    3.7.3                h359304d_0  
```

In [1]:
from tvbythenumbers import *
import tvbythenumbers as debug #Access to hidden modules for debugging convenience

Definitions:
 - A 'page' contains a number of links (usually 12) to 'articles'
 - An 'article' contains ratings
 
Notes:
 - There are 351 pages, with numbering starting at 1

In [2]:
data = query('''SELECT * FROM tv_by_the_numbers_articles''')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108341 entries, 0 to 108340
Data columns (total 10 columns):
time             108341 non-null object
show             108341 non-null object
demo_rating      107238 non-null object
viewers          108341 non-null object
tags             108341 non-null object
article_title    108341 non-null object
url              108341 non-null object
notes            108341 non-null object
network          71738 non-null object
table_id         108341 non-null int64
dtypes: int64(1), object(9)
memory usage: 8.3+ MB


In [None]:
# Snapshot the articles table before modifying it. `schema` is passed by
# keyword (as in the other to_sql call in this notebook) because newer pandas
# releases no longer accept positional arguments after `con`.
data.to_sql('articles_backup_2', db, schema = 'tvshows', index = False, if_exists = 'replace')

False

## Part 1 – Scrape the landing pages to find links to articles

In [None]:
# Searches for any pages not currently in the database
#  and puts them in the database. (~idempotent)
#
# Safe to re-run
# Verbose
# On failure:
#  - returns latest web call data fetched or processed in a dictionary (for debugging purposes)
#  - may raise an error if there is no data to protect
error_data = update_pages_database()

In [None]:
error_data = update_articles_database(sleep_time = 0.05, on_fail = 'continue')

In [5]:
# Some pages (30) were not fetched
#  On examination: some are due to parsing errors
#                  many are articles not containing data at all
to_do = get_missing_articles()

4176 pages already scraped detected
30 pages needing scraping detected


In [2]:
add_primary_key('tv_by_the_numbers_articles')

In [2]:
def process_broadccast_networks():
    """Repair rows whose network is embedded in the show title.

    Selects every row of ``tv_by_the_numbers_articles`` whose ``network`` is
    NULL and looks for a broadcast network in parentheses inside ``show``
    (e.g. ``"Some Show (CBS)"``). For each row that matches:

    * ``show`` becomes the text before the parenthesised network
      (surrounding whitespace is kept as-is),
    * ``network`` becomes the matched name, with ``'CW'`` normalised to
      ``'The CW'``.

    The repaired rows are appended to the same table (schema ``tvshows``)
    without their ``table_id`` column. The stale originals are NOT deleted
    here; their ids are returned so the caller can delete them.

    Returns:
        list: ``table_id`` values of the original rows that were re-inserted
        in repaired form. Empty when there is nothing to repair.
    """
    to_alter = query('''
        SELECT
            *
        FROM
            tv_by_the_numbers_articles
        WHERE
            network IS NULL
    ''')

    print(f'{len(to_alter)} bad records found')
    if to_alter.empty:
        # Nothing to parse; also avoids attribute errors on an empty result.
        return []

    # NOTE(review): 'NCBS' looks like a typo present in the scraped data -- confirm.
    broadcast_networks = ['CBS', 'FOX', 'ABC', 'The CW', 'NBC', 'CW', 'NCBS']
    # Raw strings so that `\(` is a regex escape, not an (invalid) string escape.
    pattern = re.compile(r'(.*)\((' + '|'.join(broadcast_networks) + r')\).*')

    def parse_show(show):
        """Return (show, network); network is None when no network is found."""
        maybe_match = pattern.match(show)
        if not maybe_match:
            return show, None
        net = maybe_match.group(2)
        if net == 'CW':
            net = 'The CW'
        return maybe_match.group(1), net

    parsed = [parse_show(show) for show in to_alter['show']]

    # Build a new frame instead of mutating the query result in place, and
    # keep only the rows for which a network could be extracted.
    fixed = to_alter.assign(
        show = [show for show, _ in parsed],
        network = [net for _, net in parsed],
    ).dropna(subset = ['network'])

    ids_to_drop = fixed['table_id'].tolist()

    if ids_to_drop:
        # table_id is dropped so the repaired rows do not reuse the old ids.
        fixed.drop(columns = 'table_id').to_sql(
            'tv_by_the_numbers_articles',
            db,
            schema = 'tvshows',
            if_exists = 'append',
            index = False,
        )

    return ids_to_drop


In [None]:
# Ids of the stale rows that were re-inserted with a parsed network.
a = process_broadccast_networks()

In [None]:
b = copy.copy(a)

In [17]:
# Delete the stale rows in batches of 100 ids. `a` is consumed as batches
# succeed, so the loop can be re-run after a failure to resume.
# The original note called this "miserably, miserably slow"; the batch is now
# removed from `a` with one slice delete instead of one list.remove() per id.
fail_count = 0
while len(a) > 0:
    to_delete = a[:100]
    # Render the id list explicitly: str(tuple(...)) of a one-element batch
    # yields "(5,)", which is not valid SQL.
    id_list = ', '.join(str(int(table_id)) for table_id in to_delete)
    try:
        db.execute(f'DELETE FROM tv_by_the_numbers_articles WHERE table_id IN ({id_list})')
    except Exception as err:
        # Catch Exception rather than everything so the loop can still be
        # interrupted from the keyboard, and show what actually went wrong.
        print(f'failure on id {a[0]}')
        print(f'    {err!r}')
        fail_count = fail_count + 1
        if fail_count > 10:
            print('aborting')
            break
        continue
    # Only drop the ids from the work list once the DELETE has succeeded.
    del a[:len(to_delete)]

In [15]:
len(a)

31340

In [16]:
len(b)

36579