In [1]:
from pathlib import Path
import sys

# Put the parent of the current working directory on the import path so that
# modules located one level above this notebook can be imported.
parent_dir = Path().absolute().parent
sys.path.append(str(parent_dir))

# Ingestion

Includes:
- reading from `playlist.csv`
- making an API call to `youtube.videos().list()`
- logging and storing the API calls

## Reading from the `playlist.csv`

Only read, do not poll.

In [4]:
import pandas as pd

def read_playlist_csv(source="../playlist/db/playlist.csv"):
    """Load the playlist CSV found at `source` and return it as a DataFrame."""
    return pd.read_csv(source)

In [6]:
# Load the playlist table from its default location and preview the first rows.
df = read_playlist_csv()
df.head()

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.channelTitle,snippet.playlistId,snippet.position,...,snippet.thumbnails.maxres.height,snippet.videoOwnerChannelTitle,snippet.videoOwnerChannelId,contentDetails.videoPublishedAt,meta.kind,meta.etag,meta.nextPageToken,meta.pageInfo.totalResults,meta.pageInfo.resultsPerPage,meta.requestNumber
0,youtube#playlistItem,jr_adfcvhrU9UD2kp5b8Xh2644U,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2016-04-30T00:30:14Z,UCKy1dAqELo0zrOtPkf0eTMw,Alienation Review,Housemarque's follow-up to Dead Nation is fast...,IGN,PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES,1600,...,720.0,IGN,UCKy1dAqELo0zrOtPkf0eTMw,2016-04-30T00:30:01Z,youtube#playlistItemListResponse,OR2Yc_htGAvUDyCLBQaF_SyzLHc,EAAaMVBUOkNQSU1JaEEyTnpneVFrRkZORVV5T1RaRE1rRT...,2182,50,123
1,youtube#playlistItem,ePHYBy6OhjiL1IClvuazdjoB79Q,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2016-04-27T02:00:23Z,UCKy1dAqELo0zrOtPkf0eTMw,The Walking Dead: Michonne - Episode 3 Review,The third and final episode of Michonne's jour...,IGN,PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES,1601,...,720.0,IGN,UCKy1dAqELo0zrOtPkf0eTMw,2016-04-27T02:00:01Z,youtube#playlistItemListResponse,OR2Yc_htGAvUDyCLBQaF_SyzLHc,EAAaMVBUOkNQSU1JaEEyTnpneVFrRkZORVV5T1RaRE1rRT...,2182,50,123
2,youtube#playlistItem,kAGScijE-ZDpBZGHMQgwF1nDjFo,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2016-04-26T15:30:09Z,UCKy1dAqELo0zrOtPkf0eTMw,Severed Review,"Severed starts as a fun take on touch combat, ...",IGN,PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES,1602,...,720.0,IGN,UCKy1dAqELo0zrOtPkf0eTMw,2016-04-26T15:30:00Z,youtube#playlistItemListResponse,OR2Yc_htGAvUDyCLBQaF_SyzLHc,EAAaMVBUOkNQSU1JaEEyTnpneVFrRkZORVV5T1RaRE1rRT...,2182,50,123
3,youtube#playlistItem,QVZITUKMom6idlvcdtWyYzcg1Vk,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2016-04-22T02:30:15Z,UCKy1dAqELo0zrOtPkf0eTMw,Battlefleet Gothic: Armada Review,"It's not always clear what's going on, but thi...",IGN,PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES,1603,...,720.0,IGN,UCKy1dAqELo0zrOtPkf0eTMw,2016-04-22T02:30:01Z,youtube#playlistItemListResponse,OR2Yc_htGAvUDyCLBQaF_SyzLHc,EAAaMVBUOkNQSU1JaEEyTnpneVFrRkZORVV5T1RaRE1rRT...,2182,50,123
4,youtube#playlistItem,DwbGeylu7C63IzykIrHvrFtDymA,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2016-04-22T01:00:11Z,UCKy1dAqELo0zrOtPkf0eTMw,1979 Revolution: Black Friday Review,1979 Revolution: Black Friday turns an overloo...,IGN,PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES,1604,...,720.0,IGN,UCKy1dAqELo0zrOtPkf0eTMw,2016-04-22T01:00:00Z,youtube#playlistItemListResponse,OR2Yc_htGAvUDyCLBQaF_SyzLHc,EAAaMVBUOkNQSU1JaEEyTnpneVFrRkZORVV5T1RaRE1rRT...,2182,50,123


## Downloading data on each video

Call `youtube.videos().list()`.

This section also defines some helper functions for logging each request.

In [14]:
# add time requested to the received response
from datetime import datetime, timezone

# Timestamps are taken in UTC by default so they all share one reference zone.
def get_current_time(timezone=timezone.utc):
    """Return the current time as a timezone-aware datetime.

    `timezone` is the tzinfo the result is expressed in (UTC by default).
    Inside this function the parameter shadows the imported `timezone` class.
    """
    return datetime.now(timezone)

def serialize_time(dt):
    """Return the ISO 8601 text for `dt`, as produced by its `isoformat()` method."""
    iso_text = dt.isoformat()
    return iso_text

def deserialize_datetime_string(dt_string):
    """Parse an ISO 8601 string into a datetime (the inverse of serialize_time).

    Parameters:
        dt_string: text in the format emitted by `datetime.isoformat()`,
            e.g. "2021-01-02T03:04:05+00:00".

    Returns:
        The corresponding `datetime`; it is timezone-aware when the string
        carries a UTC offset.

    Raises:
        ValueError: if `dt_string` is not a valid ISO format string.
    """
    # fromisoformat is a classmethod of datetime; str objects have no such
    # method, so it must be called on the class with the string as argument.
    return datetime.fromisoformat(dt_string)

# Initialise (or wipe) response.csv. Destructive, so use with care.
def reset_response_csv(target="./db/response.csv"):
    '''
    Overwrite the CSV at `target` with a header row only (no records) and
    return the empty DataFrame that was written.

    The log holds one record per request issued by request_video(); the
    `params.*` columns mirror the keys of the parameter dict built there.
    '''
    # Key names copied from the params dict in request_video(); only the
    # names matter for building the header.
    param_keys = ("part", "id", "maxResults")
    column_names = ['requestNumber', 'timeSent', 'timeReceived']
    column_names.extend('params.' + key for key in param_keys)

    empty_log = pd.DataFrame(columns=column_names)
    empty_log.to_csv(target, index=False)
    return empty_log

# Append one record to response.csv.
def append_response_csv(data_dict, target="./db/response.csv"):
    """Flatten `data_dict` and append it to the CSV at `target`.

    Nested dicts are flattened into dotted column names by
    `pd.json_normalize`. Rows are written without a header and without any
    alignment to existing columns, so the flattened column order has to agree
    with the header already present in the file.

    Returns the flattened DataFrame that was appended.
    """
    flattened = pd.json_normalize(data_dict)
    flattened.to_csv(target, mode='a', header=False, index=False)
    return flattened

# Next request number for the request log.
def generate_request_number(source="./db/response.csv"):
    """Return the next unused request number as a plain Python ``int``.

    Reads the `requestNumber` column of the CSV at `source` and returns its
    maximum plus one, or 0 when the log has no numbered rows yet.

    If the file is missing or completely empty, it is initialised at `source`
    via reset_response_csv() and 0 is returned. Any other failure (for
    example a file without a `requestNumber` column) is raised instead of
    silently wiping the log.

    The value is converted to a built-in int because pandas returns a numpy
    integer, which `json.dump` cannot serialise.
    """
    try:
        request_numbers = pd.read_csv(source)['requestNumber']
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable log yet: create it at the path that was asked for (not the
        # default one); a fresh log always starts at 0.
        reset_response_csv(source)
        return 0

    highest = request_numbers.max()
    if pd.isna(highest):
        # Header-only log, or no valid numbers in the column.
        return 0
    return int(highest) + 1

In [43]:
import json

def request_video(youtube, video_id_list):
    '''
    Request video details for every ID in `video_id_list`, 50 IDs per API call.

    For each batch this executes `youtube.videos().list(...)`, appends a row
    (request number, send/receive times, parameters) to ./db/response.csv via
    append_response_csv(), tags the response with its `requestNumber`, and
    dumps it to ./raw/<requestNumber>.json. Converting the JSON to CSV is left
    to the transformation stage.

    `video_id_list` may be any iterable of ID strings (list, pandas Series,
    generator, ...); it is materialised once before batching.

    Returns the list of response dicts, one per batch (empty for no IDs).

    Order might matter. Instead of writing to file in the request loop, the
    metadata and responses could be kept in memory and written after the loop.
    For now, everything happens inside the request loop.

    Note that some videos are private, so YouTube won't hand over the data for
    those videos. If you ask for it, `items` will be empty.
    '''
    batch_size = 50
    # Convert once up front rather than on every iteration; this also makes
    # len() and slicing work for inputs such as generators.
    all_ids = list(video_id_list)
    total_videos = len(all_ids)

    response_list = []
    videos_requested = 0
    for start_index in range(0, total_videos, batch_size):
        id_chunk = all_ids[start_index:(start_index + batch_size)]
        params = {
            "part": "contentDetails,id,liveStreamingDetails,localizations,player,recordingDetails,snippet,statistics,status,topicDetails",
            "id": ','.join(id_chunk),
            # NOTE(review): the videos.list reference appears to say maxResults
            # is not supported together with `id` -- confirm before relying on it.
            "maxResults": batch_size
        }
        request = youtube.videos().list(**params)
        request_number = generate_request_number()

        # Timestamps bracket only the network call itself.
        time_sent = get_current_time()
        response = request.execute()
        time_received = get_current_time()

        # log to ./db/response.csv
        md = {}
        md["requestNumber"] = request_number
        md["timeSent"] = time_sent
        md["timeReceived"] = time_received
        md["params"] = {**params}
        append_response_csv(md)

        # log to raw, a directory of jsons. save the conversion from json to csv in the transformation stage.
        response["requestNumber"] = request_number
        with open(f"./raw/{response['requestNumber']}.json", "w") as fp:
            json.dump(response, fp)

        response_list.append(response)

        videos_requested += len(id_chunk)
        # '\r' keeps the progress counter on a single, self-overwriting line.
        print(f"{videos_requested}/{total_videos} ({round(videos_requested / total_videos * 100, 2)}%)", end='\r')

    return response_list

In [50]:
import api_key

# Build the API client from the project-local `api_key` module
# (presumably an authenticated YouTube Data API client -- confirm there).
youtube = api_key.get_youtube()
# Re-read the playlist and pull out the video ID column.
df = read_playlist_csv()
video_ids = df['snippet.resourceId.videoId']
# One response dict per batch of up to 50 IDs. Side effects: each request is
# logged to ./db/response.csv and its response saved under ./raw.
r_list = request_video(youtube, video_ids)

2182/2182 (100.0%)

In [36]:
# Display the collected responses (one dict per request batch).
# NOTE(review): this cell's execution count (36) is lower than that of the
# request cell above (50), so the saved output may come from an earlier run
# -- re-run to confirm.
r_list

[{'kind': 'youtube#videoListResponse',
  'etag': 'YIUPVpqNjppyCWOZfL-19bLb7uk',
  'items': [],
  'pageInfo': {'totalResults': 0, 'resultsPerPage': 0},
  'requestNumber': 3}]

# Transformation

In [71]:
def video_response_to_df(video_dict):
    """Flatten API response(s) into a DataFrame with one row per entry in `items`.

    Nested item fields become dotted column names. The response-level fields
    listed below are repeated on every row under a `meta.` prefix;
    `errors="ignore"` tells pandas not to raise when one of them is missing.
    """
    response_level_fields = [
        "kind",
        "etag",
        ["pageInfo", "totalResults"],
        ["pageInfo", "resultsPerPage"],
        "requestNumber",
    ]
    return pd.json_normalize(
        video_dict,
        record_path='items',
        meta=response_level_fields,
        meta_prefix="meta.",
        errors="ignore",
    )

# Flatten the in-memory responses into one row per video and preview two rows.
# Note: this rebinds `df`, which until now held the playlist table.
df = video_response_to_df(r_list)
df.head(2)

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,contentDetails.contentRating.ytRating,liveStreamingDetails.actualStartTime,liveStreamingDetails.actualEndTime,liveStreamingDetails.scheduledStartTime,liveStreamingDetails.scheduledEndTime,meta.kind,meta.etag,meta.pageInfo.totalResults,meta.pageInfo.resultsPerPage,meta.requestNumber
0,youtube#video,q07ah-xiaLF9_bENqSDbu-Ud3Fw,-UE2gzTtHxo,2016-04-30T00:30:01Z,UCKy1dAqELo0zrOtPkf0eTMw,Alienation Review,Housemarque's follow-up to Dead Nation is fast...,https://i.ytimg.com/vi/-UE2gzTtHxo/default.jpg,120,90,...,,,,,,youtube#videoListResponse,ZhOVBxwEbaOQHAdL-UyWLjX7sgY,50,50,0
1,youtube#video,36FNaLbUX-iC-hFgAzv7AEpRTlg,yb6vAQ2Tir4,2016-04-27T02:00:01Z,UCKy1dAqELo0zrOtPkf0eTMw,The Walking Dead: Michonne - Episode 3 Review,The third and final episode of Michonne's jour...,https://i.ytimg.com/vi/yb6vAQ2Tir4/default.jpg,120,90,...,,,,,,youtube#videoListResponse,ZhOVBxwEbaOQHAdL-UyWLjX7sgY,50,50,0


In [81]:
def reset_video_csv(target="./db/video.csv"):
    """Overwrite the CSV at `target` with a header row only.

    The header is the fixed list of flattened video columns built below.
    Returns that list of column names.
    """
    # Every thumbnail size carries the same three attributes, in this order.
    thumbnail_cols = [
        f'snippet.thumbnails.{size}.{attr}'
        for size in ('default', 'medium', 'high', 'standard', 'maxres')
        for attr in ('url', 'width', 'height')
    ]
    cols = [
        'kind', 'etag', 'id',
        'snippet.publishedAt', 'snippet.channelId',
        'snippet.title', 'snippet.description',
        *thumbnail_cols,
        'snippet.channelTitle', 'snippet.tags', 'snippet.categoryId',
        'snippet.liveBroadcastContent',
        'snippet.localized.title', 'snippet.localized.description',
        *[f'contentDetails.{name}' for name in (
            'duration', 'dimension', 'definition',
            'caption', 'licensedContent', 'projection')],
        *[f'status.{name}' for name in (
            'uploadStatus', 'privacyStatus', 'license',
            'embeddable', 'publicStatsViewable', 'madeForKids')],
        *[f'statistics.{name}' for name in (
            'viewCount', 'likeCount', 'favoriteCount', 'commentCount')],
        'player.embedHtml',
        'topicDetails.topicCategories',
        'snippet.defaultAudioLanguage',
        'contentDetails.contentRating.ytRating',
        *[f'liveStreamingDetails.{name}' for name in (
            'actualStartTime', 'actualEndTime',
            'scheduledStartTime', 'scheduledEndTime')],
        'meta.kind', 'meta.etag',
        'meta.pageInfo.totalResults', 'meta.pageInfo.resultsPerPage',
        'meta.requestNumber',
    ]
    header_only = pd.DataFrame(columns=cols)
    header_only.to_csv(target, index=False)
    return cols

def gather_jsons_to_df(directory="./raw"):
    """Load every `*.json` file in `directory` and stack them into one DataFrame.

    Each file is converted with json_to_df() and the results are concatenated
    row-wise; the per-file row indices are kept as they are.

    Files are processed in sorted path order, so the row order is reproducible
    from run to run (a plain glob yields files in arbitrary order).

    Returns an empty DataFrame when the directory holds no JSON files.
    """
    json_paths = sorted(Path(directory).glob("*.json"))
    if not json_paths:
        # pd.concat() refuses an empty list, so handle "nothing to gather" here.
        return pd.DataFrame()

    frames = [json_to_df(json_path) for json_path in json_paths]
    return pd.concat(frames, axis=0)

def json_to_df(path):
    """Read one saved response JSON from `path` and flatten it with video_response_to_df()."""
    with open(path, 'r') as json_file:
        response_dict = json.load(json_file)

    # The file is only needed for parsing; flatten after it has been closed.
    return video_response_to_df(response_dict)

In [79]:
import shutil

def consume_raw(directory='./raw', video_csv="./db/video.csv", consumed_dir="./consumed"):
    """Merge the raw response JSONs into the video CSV, then archive them.

    Steps:
      1. Flatten every `*.json` file in `directory` (gather_jsons_to_df).
      2. Read `video_csv`, initialising it with reset_video_csv() when it is
         missing or empty.
      3. Concatenate old and new rows, keep the first row per `id` (existing
         rows win over newly gathered ones) and rewrite `video_csv`.
      4. Move the JSON files into `consumed_dir`, creating it if necessary.

    Parameters:
        directory: folder holding the raw response JSON files.
        video_csv: path of the accumulated video table.
        consumed_dir: folder that processed JSON files are moved into.

    Returns:
        The list of JSON paths (their original locations, as `Path` objects)
        that were moved.

    Raises:
        shutil.Error: if a file with the same name already exists in
            `consumed_dir`.
    """
    # Snapshot the file list before anything is read or moved. A glob object
    # can only be iterated once, and only files that existed before gathering
    # should be archived (a later arrival is simply picked up on the next run).
    raw_files = sorted(Path(directory).glob("*.json"))

    json_df = gather_jsons_to_df(directory)
    try:
        old_df = pd.read_csv(video_csv)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No table yet: create the header-only file, then read it back so the
        # expected columns are present.
        reset_video_csv(video_csv)
        old_df = pd.read_csv(video_csv)

    combined_df = pd.concat([old_df, json_df], axis=0)
    combined_df = combined_df.drop_duplicates('id')
    combined_df.to_csv(video_csv, index=False, header=True)

    # Without an existing directory, shutil.move() would rename the first file
    # to `consumed_dir` itself instead of moving it inside.
    Path(consumed_dir).mkdir(parents=True, exist_ok=True)
    for raw_file in raw_files:
        # str() keeps this working on Python versions where shutil.move()
        # does not accept Path sources.
        shutil.move(str(raw_file), str(consumed_dir))

    return raw_files

In [83]:
# Re-initialise ./db/video.csv to a header row only. Destructive: the file is
# overwritten, so any rows already stored there are discarded.
cols = reset_video_csv()

In [84]:
# Fold the raw JSON responses in ./raw into ./db/video.csv and move the
# processed files to ./consumed.
json_files = consume_raw()

In [85]:
# Read the accumulated video table back from disk and preview it.
video_df = pd.read_csv("./db/video.csv")
video_df.head(5)

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,contentDetails.contentRating.ytRating,liveStreamingDetails.actualStartTime,liveStreamingDetails.actualEndTime,liveStreamingDetails.scheduledStartTime,liveStreamingDetails.scheduledEndTime,meta.kind,meta.etag,meta.pageInfo.totalResults,meta.pageInfo.resultsPerPage,meta.requestNumber
0,youtube#video,6MOCAIN4LYZZvDIVus9QTrl9HTk,h663A9inOgw,2017-03-02T09:00:05Z,UCKy1dAqELo0zrOtPkf0eTMw,Super Bomberman R (Nintendo Switch) Review Imp...,Here's what we think of Bomberman's Nintendo S...,https://i.ytimg.com/vi/h663A9inOgw/default.jpg,120,90,...,,,,,,youtube#videoListResponse,Q4q8qHbBwBSWTe5p2I6ucBsuqZs,50,50,28
1,youtube#video,Yrv4eQfwEXrNPGPUzk4n5eDcQw4,_BMPpDZu7dk,2017-03-01T14:08:45Z,UCKy1dAqELo0zrOtPkf0eTMw,Nintendo Switch Review In Progress,Here's what we think so far about Nintendo's a...,https://i.ytimg.com/vi/_BMPpDZu7dk/default.jpg,120,90,...,,,,,,youtube#videoListResponse,Q4q8qHbBwBSWTe5p2I6ucBsuqZs,50,50,28
2,youtube#video,43V5iVRxKoECu6btkD6NJiVywQA,KJxi30ZAXeM,2017-02-28T21:24:07Z,UCKy1dAqELo0zrOtPkf0eTMw,Torment: Tides of Numenera Review,Torment: Tides of Numenera reviewed by Leif Jo...,https://i.ytimg.com/vi/KJxi30ZAXeM/default.jpg,120,90,...,,,,,,youtube#videoListResponse,Q4q8qHbBwBSWTe5p2I6ucBsuqZs,50,50,28
3,youtube#video,vy6TZqRhO6eVcN0Lu-sRe1dAcds,dk12WzLRD2I,2017-02-22T02:07:48Z,UCKy1dAqELo0zrOtPkf0eTMw,Halo Wars 2 Review,Halo Wars 2 reviewed on Xbox One and PC by Dan...,https://i.ytimg.com/vi/dk12WzLRD2I/default.jpg,120,90,...,,,,,,youtube#videoListResponse,Q4q8qHbBwBSWTe5p2I6ucBsuqZs,50,50,28
4,youtube#video,kHy-4ctOLBAj9L6XH76pZh7X4qk,maGhZ9kY12M,2017-02-22T01:00:03Z,UCKy1dAqELo0zrOtPkf0eTMw,Berserk and the Band of the Hawk Review,Berserk and the Band of the Hawk reviewed by C...,https://i.ytimg.com/vi/maGhZ9kY12M/default.jpg,120,90,...,,,,,,youtube#videoListResponse,Q4q8qHbBwBSWTe5p2I6ucBsuqZs,50,50,28


In [86]:
# (rows, columns) of the stored video table.
video_df.shape

(2159, 57)

#### Testing Grounds

##### Test:

What happens if we send the video ID of a private video?

In [45]:
# Look up the playlist row of a video known to be private.
# The commented-out filter below selects such rows by title instead of by ID:
# df[df['snippet.title'].str.lower() == 'private video']
private_video_id = 'B6BCKY_KTv0' # example of private video
# Reload the playlist table; `df` was rebound to the video table earlier.
df = read_playlist_csv()
df[df['snippet.resourceId.videoId'] == private_video_id]

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.channelTitle,snippet.playlistId,snippet.position,...,snippet.thumbnails.maxres.height,snippet.videoOwnerChannelTitle,snippet.videoOwnerChannelId,contentDetails.videoPublishedAt,meta.kind,meta.etag,meta.nextPageToken,meta.pageInfo.totalResults,meta.pageInfo.resultsPerPage,meta.requestNumber
73,youtube#playlistItem,NSxb1i0FeP3_eTmkK0YFjHdX8nE,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2016-01-20T18:00:11Z,UCKy1dAqELo0zrOtPkf0eTMw,Private video,This video is private.,IGN,PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES,1673,...,,,,,youtube#playlistItemListResponse,FVDW-bzKQHoVgTXjeSJrb_3pLl4,EAAaMVBUOkNLUU5JaEJFTjBORk5Ua3pSamczTmtZeFJqY3...,2182,50,79


In [48]:
# Request one ID taken from the playlist together with the private video's ID
# in a single call, then count how many items come back.
# Side effects: request_video() also logs this call to ./db/response.csv and
# writes the response to ./raw.
r_list = request_video(youtube, [video_ids[0], 'B6BCKY_KTv0'])
len(r_list[0]['items'])

2/2 (100.0%)

1

Private videos are left out of `items` in the response: two IDs were requested, but only one item came back.