In [None]:
import sys
sys.path.append('..')
from db import *
import re
import numpy as np
import dateparser

## Load the RT data

In [None]:
# Load every row of the `rt_data` table through the project's `query` helper.
# NOTE(review): `query` comes from `from db import *`; it is assumed to return a
# pandas DataFrame because `.head()` and `.merge()` are called on the result below.
rt_data = query('SELECT * FROM rt_data')

In [None]:
# Preview the first rows to sanity-check the load.
rt_data.head()

## Create the Streaming Table

### Key conversion table

In [None]:
# Load the key-conversion table `amazon_netflix_rt_urls`; it is joined to the
# RT data on `url` further down.
streaming_urls = query('SELECT * FROM amazon_netflix_rt_urls')

In [None]:
# Rename the show-name column to `title`. The renamed frame is rebound to the
# same name instead of mutating in place, so the cell stays a plain
# "input -> output" step and remains safe to re-run.
streaming_urls = streaming_urls.rename(columns={'by_the_numbers_name': 'title'})

In [None]:
# Preview the key-conversion table after the column rename.
streaming_urls.head()

### List of Streaming Shows

In [None]:
# Load every row of the `netflix_amazon_shows` table (the list of streaming shows).
streaming_shows = query('SELECT * FROM netflix_amazon_shows')

In [None]:
# Preview the first rows of the streaming-shows table.
streaming_shows.head()

### Perform the Merge

In [None]:
# Step 1: attach the key-conversion rows to the RT data by URL.
# `validate='1:1'` makes pandas raise if `url` is duplicated on either side.
temp_df = rt_data.merge(
    streaming_urls,
    on='url',
    suffixes=('_rt', '_streaming'),
    validate='1:1',
)

# Step 2: bring in the streaming-show details, matching the conversion table's
# title (suffixed `_streaming` by step 1) against the shows table's `title`.
streaming_df = temp_df.merge(
    streaming_shows,
    left_on='title_streaming',
    right_on='title',
    suffixes=('_rt', '_streaming'),
    validate='1:1',
)

In [None]:
# Preview the merged streaming frame.
streaming_df.head()

In [None]:
# Inspect column names, dtypes and non-null counts of the merged frame; the
# suffixed column names shown here are the ones renamed in the next step.
streaming_df.info()

In [None]:
# Keep the streaming-side version of the overlapping columns under their plain
# names. The result is rebound rather than mutated in place so the cell is a
# simple, re-runnable transformation.
streaming_df = streaming_df.rename(
    columns={
        'premiere_date_streaming': 'premiere_date',
        'genre_streaming': 'genre',
        'network_streaming': 'network',
    }
)

In [None]:
# Convert the premiere date column to pandas datetimes.
streaming_df['premiere_date'] = pd.to_datetime(streaming_df['premiere_date'])

In [None]:
# Cast both rating columns to float in a single pass.
streaming_df = streaming_df.astype({
    'critic_rating': float,
    'audience_rating': float,
})

In [None]:
# Keep only the columns that go into the `streaming` table.
streaming_df_clean = streaming_df[[
    'title',
    'critic_rating',
    'audience_rating',
    'network',
    'premiere_date',
    'genre',
    'executive_producers',
]]

In [None]:
# Final check of columns, dtypes and non-null counts before writing to the database.
streaming_df_clean.info()

In [None]:
# Persist the cleaned frame as table `streaming` (row index not written).
# `if_exists='fail'` makes pandas raise if the table already exists, so this
# cell errors on a second run until the table is dropped.
# NOTE(review): `db` comes from `from db import *` - presumably a DB connection/engine.
streaming_df_clean.to_sql(
    name='streaming',
    con=db,
    if_exists='fail',
    index=False,
)

## Create the Nielsen Table

### Load Key Conversion Table

In [None]:
# Load the key-conversion table `rt_urls`; it is joined to the RT data on `url`
# further down.
nielsen_urls = query('SELECT * FROM rt_urls')

In [None]:
# Expose the `rt_name` column as `rt_id`. The renamed frame is rebound rather
# than mutated in place, keeping the cell a plain re-runnable transformation.
nielsen_urls = nielsen_urls.rename(columns={'rt_name': 'rt_id'})

In [None]:
# Preview the key-conversion table after the column rename.
nielsen_urls.head()

### Load the Nielsen Data

In [None]:
# Load every row of the `tv_by_the_numbers_articles` table (the Nielsen data).
nielsen_data = query('SELECT * FROM tv_by_the_numbers_articles')

In [None]:
# Preview the first rows of the Nielsen data.
nielsen_data.head()

### Perform the Merge

In [None]:
# Step 1: attach the key-conversion rows to the RT data by URL.
# `validate='1:m'` makes pandas raise if `url` is duplicated on the RT side.
temp_df = rt_data.merge(
    nielsen_urls,
    on='url',
    validate='1:m',
)

# Step 2: join the Nielsen rows by show name; overlapping column names are
# suffixed `_rt` (left side) and `_tvbtn` (right side).
nielsen_df = temp_df.merge(
    nielsen_data,
    left_on='by_the_numbers_name',
    right_on='show',
    suffixes=('_rt', '_tvbtn'),
    validate='1:m',
)

### Clean the Nielsen Data and collect the appropriate columns

In [None]:
def clean_demo_rating(r):
    """Convert a raw demo-rating value to a float.

    Values written as ``"<number>/<something>"`` (e.g. ``"1.2/4"``) are reduced
    to the part before the first ``/``; anything else is passed to ``float``
    directly.

    Parameters
    ----------
    r : str
        Raw rating text. Non-string values (``None``, NaN, numbers, ...) are
        treated as unparseable.

    Returns
    -------
    float
        The parsed rating, or ``np.nan`` when the value cannot be converted.
    """
    try:
        if '/' in r:
            # Keep only the text before the first slash.
            return float(r.split('/', 1)[0])
        return float(r)
    except (TypeError, ValueError, AttributeError):
        # TypeError: `in`/float() on a non-string (None, NaN, numbers, ...)
        # ValueError: text that is not a number
        # AttributeError: a container that supports `in` but is not a string
        # Only conversion failures are caught, so KeyboardInterrupt and
        # SystemExit still propagate.
        return np.nan

In [None]:
# Normalise every raw demo rating to a float (NaN where unparseable).
nielsen_df['demo_rating'] = nielsen_df['demo_rating'].map(clean_demo_rating)

In [None]:
# List the merged frame's columns, including the `_rt` / `_tvbtn` suffixed ones.
nielsen_df.columns

In [None]:
# Month names in both abbreviated and full spellings.
months = [
    'Jan', 'January', 'Feb', 'February', 'Mar', 'March', 'Apr', 'April',
    'May', 'Jun', 'June', 'Jul', 'July', 'Aug', 'August',
    'Sep', 'Sept', 'September', 'Oct', 'October', 'Nov', 'November',
    'Dec', 'December',
]

# One capture group per date component.
month_re = '({})'.format('|'.join(months))
date_re = '([0-3]?[0-9])'   # one- or two-digit day
year_re = '(2?0?1[0-9])'    # '1x' with optional leading '2' and '0': 15, 015, 215, 2015

# <month><sep><day><sep><year>, where <sep> is one of '-', '/' or '_'.
total_re = f'{month_re}[-/_]{date_re}[-/_]{year_re}'
total_re

In [None]:
def parse_date(url):
    """Extract the date embedded in an article URL.

    Searches ``url`` case-insensitively with the notebook-level ``total_re``
    pattern (``<month><sep><day><sep><year>``) and passes the three captured
    pieces, joined by spaces, to ``dateparser.parse``.

    Parameters
    ----------
    url : str
        Text expected to contain a date fragment. Non-string values (e.g.
        ``None`` or NaN from missing data) are treated as "no date".

    Returns
    -------
    datetime.date or None
        The parsed date. ``None`` when ``url`` is not a string, when no date
        fragment is found, or when ``dateparser`` cannot interpret the
        fragment (a warning is printed in that last case).
    """
    # Missing URLs arrive as None/NaN when this is mapped over a column;
    # re.search would raise TypeError on them, aborting the whole map.
    if not isinstance(url, str):
        return None

    look = re.search(total_re, url, re.IGNORECASE)
    if look is None:
        return None

    month, day, year = look.groups()
    # '215' is presumably a typo for '2015' in some URLs - normalise it.
    if year == '215':
        year = '2015'

    out_date = dateparser.parse(' '.join((month, day, year)))
    if not out_date:
        print(f'Warning, failed to parse the date for {url}')
        return None

    return out_date.date()

In [None]:
# Pull the date out of each `url_tvbtn` value (None where none is found).
dates = nielsen_df['url_tvbtn'].map(parse_date)

In [None]:
# Store the parsed dates as a datetime column named `date`.
nielsen_df['date'] = pd.to_datetime(dates)

In [None]:
# Use the RT-side network column under the plain name `network`. The renamed
# frame is rebound rather than mutated in place, keeping the cell re-runnable.
nielsen_df = nielsen_df.rename(columns={'network_rt': 'network'})

In [None]:
# Cast all three rating columns to float in a single pass.
nielsen_df = nielsen_df.astype({
    'critic_rating': float,
    'audience_rating': float,
    'demo_rating': float,
})

In [None]:
# Keep only the columns that go into the `nielsen` table.
nielsen_df_clean = nielsen_df[[
    'rt_id',
    'title',
    'critic_rating',
    'audience_rating',
    'network',
    'genre',
    'executive_producers',
    'demo_rating',
    'date',
]]

In [None]:
# Final check of columns, dtypes and non-null counts before writing to the database.
nielsen_df_clean.info()

In [None]:
# Persist the cleaned frame as table `nielsen` (row index not written).
# `if_exists='fail'` makes pandas raise if the table already exists, so this
# cell errors on a second run until the table is dropped.
# NOTE(review): `db` comes from `from db import *` - presumably a DB connection/engine.
nielsen_df_clean.to_sql(
    name='nielsen',
    con=db,
    if_exists='fail',
    index=False,
)