In [1]:
import io
import os
import sqlite3
import pandas as pd
import pickle

from google.cloud import vision
from google.cloud.vision import types

from time import sleep
from random import shuffle

In [2]:
# Tell the Google client libraries which service-account key file to use.
# setdefault keeps a path that is already configured in the environment, so the
# notebook can run on other machines; the literal below is only the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\ritvik\\Desktop\\YouTubeScraping\\YouTubeScraping-c52225391da2.json',
)

In [3]:
# Create the Vision API client once at module level; get_image_info reuses
# this single instance for every detection request it makes.
# NOTE(review): presumably authenticates with the key file named by
# GOOGLE_APPLICATION_CREDENTIALS (set in the previous cell) -- confirm.
client = vision.ImageAnnotatorClient()

In [4]:
def display_as_table(data, headers):
    """Build a DataFrame from query rows, naming columns from cursor metadata.

    data: sequence of row tuples (e.g. the result of cursor.fetchall()).
    headers: sequence whose items carry the column name at index 0, such as
        sqlite3's cursor.description; may be None, in which case pandas
        assigns its default column labels.

    Returns the resulting pandas DataFrame.
    """
    # cursor.description is None for statements that produce no result
    # columns; iterating over it would raise TypeError.
    column_names = None if headers is None else [column[0] for column in headers]
    return pd.DataFrame(data=data, columns=column_names)

In [5]:
def execute_query(query, retval=False, many=False, data=None, verbose=False, params=None):
    """Run one SQL statement against videos.db and commit it.

    query: the SQL text to execute.
    retval: when True, return the fetched rows; otherwise return None.
    many: when True, run the statement once per item of `data` via executemany.
    data: sequence of parameter sequences, used only when `many` is True.
    verbose: when True, print the fetched rows as a table.
    params: optional values bound to the placeholders in `query`, used only
        when `many` is False.

    A new connection is opened for each call and is always closed, even when
    the statement raises (in which case nothing is committed).
    """
    conn = sqlite3.connect('videos.db')

    try:
        # create a cursor
        cur = conn.cursor()

        if many:
            cur.executemany(query, data)
        else:
            cur.execute(query, () if params is None else params)

        result = cur.fetchall()

        # description is None for statements without result columns (INSERT,
        # CREATE, ...); skip the table then so the commit below still happens.
        if verbose and cur.description is not None:
            print(display_as_table(result, cur.description))

        conn.commit()
    finally:
        # close connection, whether or not the statement succeeded
        conn.close()

    if retval:
        return result

In [6]:
def _sql_text_literal(value):
    """Render value as a single-quoted SQL string literal with quotes escaped."""
    return "'%s'" % str(value).replace("'", "''")


def _detect_labels(image):
    """Return label annotations for image, keeping only scores above 0.5."""
    response = client.label_detection(image=image)
    return [
        {'label': label.description, 'score': label.score}
        for label in response.label_annotations
        if label.score > 0.5
    ]


def _detect_faces(image):
    """Return confidence and emotion likelihoods for each face annotation."""
    response = client.face_detection(image=image)
    return [
        {
            'confidence': face.detection_confidence,
            'joy': face.joy_likelihood,
            'sorrow': face.sorrow_likelihood,
            'surprise': face.surprise_likelihood,
            'anger': face.anger_likelihood
        }
        for face in response.face_annotations
    ]


def _detect_dominant_colors(image):
    """Return the first three dominant-color entries reported for image."""
    response = client.image_properties(image=image)
    colors = response.image_properties_annotation.dominant_colors.colors
    return [
        {
            'color': entry.color,
            'score': entry.score,
            'pixel_fraction': entry.pixel_fraction
        }
        for entry in colors[:3]
    ]


def _detect_text(image):
    """Return the description of the first text annotation, if there is one."""
    response = client.text_detection(image=image)
    # Only the first annotation is kept.
    # NOTE(review): presumably that entry holds the full detected text -- confirm.
    return [{'text': annotation.description} for annotation in response.text_annotations[:1]]


# Field name -> detector, in the order the API calls are issued.
_THUMBNAIL_DETECTORS = (
    ('label', _detect_labels),
    ('face', _detect_faces),
    ('image', _detect_dominant_colors),
    ('text', _detect_text),
)


def get_image_info(video_id, fields):
    """
    Call the GoogleVision API to get characteristics of a video thumbnail
    video_id: the id of the video we care about
    fields: subset of four possible thumbnail entities: ['label', 'face', 'image', 'text']

    Entity types already stored in ThumbnailEntities for this video are
    skipped, so no API call is made for them.

    Returns None when every requested field is already stored or the video
    is not in the Videos table; otherwise a dict mapping each processed field
    name to a list of dicts describing the detected entities.
    """
    # Single-quoted, escaped literal: SQLite reads a double-quoted token as an
    # identifier first, and an unescaped quote in the id would break the query.
    id_literal = _sql_text_literal(video_id)

    result = execute_query('SELECT EntityType From ThumbnailEntities WHERE VideoId = %s' % id_literal, True)
    existing_entities = [item[0] for item in result]
    fields = [f for f in fields if f not in existing_entities]

    #if no fields to process, return
    if len(fields) == 0:
        return

    #try to get the thumbnail url for this video
    result = execute_query('SELECT ThumbnailURL From Videos WHERE VideoId = %s' % id_literal, True)

    #if this video not found, return
    if len(result) == 0:
        return

    #point the request at the thumbnail url rather than uploading image bytes
    image = types.Image()
    image.source.image_uri = result[0][0]

    data_dict = dict()
    for field, detect in _THUMBNAIL_DETECTORS:
        if field in fields:
            data_dict[field] = detect(image)

    return data_dict

In [7]:
def insert_into_thumbnail_entities(video_ids, batch):
    """Write one batch of detection results to the ThumbnailEntities table.

    video_ids: video ids, index-aligned with `batch`.
    batch: list of dicts mapping an entity type to a list of entities.

    Each entity becomes one row of (video id, entity type, str(entity)).
    A failing insert is printed rather than raised.
    """
    rows = [
        [video_ids[position], entity_type, str(entity)]
        for position, entities_by_type in enumerate(batch)
        for entity_type, entities in entities_by_type.items()
        for entity in entities
    ]

    insert_sql = 'INSERT INTO ThumbnailEntities (VideoId, EntityType, Data) VALUES (?,?,?)'
    try:
        execute_query(insert_sql, retval=False, many=True, data=rows)
    except Exception as error:
        # Best effort: report the problem and let the caller continue.
        print(error)

In [8]:
def insert_thumbnail_stats(vid_ids, fields, batch_size=20):
    """Fetch thumbnail entities for each video and store them in batches.

    vid_ids: sequence of video ids to process.
    fields: entity types to request for each video (passed to get_image_info).
    batch_size: number of videos handled between database writes; must be at
        least 1. Defaults to 20.

    Raises ValueError when batch_size is smaller than 1.
    Prints a progress line after every write.
    """
    if batch_size < 1:
        # A zero batch size would otherwise fail with ZeroDivisionError below.
        raise ValueError('batch_size must be at least 1')

    curr_batch = []
    curr_vid_ids = []

    for idx, vid_id in enumerate(vid_ids):

        data_dict = get_image_info(vid_id, fields)

        # get_image_info returns None when there is nothing to store for this video
        if data_dict is not None:
            curr_batch.append(data_dict)
            curr_vid_ids.append(vid_id)

        # flush on every full batch and once more after the final video
        if (idx % batch_size == batch_size - 1) or (idx + 1 == len(vid_ids)):
            insert_into_thumbnail_entities(curr_vid_ids, curr_batch)
            print('Processed %s videos'%(idx+1))
            curr_batch = []
            curr_vid_ids = []

# Main Code

In [9]:
# Load the video spreadsheet and collect the ids to process, in file order.
df = pd.read_csv('sigmoid_data.csv')
vid_ids = [vid for vid in df['video_id'].values]

In [10]:
# Request all four entity types for every video id loaded from the CSV.
requested_fields = ['label', 'face', 'image', 'text']
insert_thumbnail_stats(vid_ids, requested_fields)

Processed 20 videos
Processed 40 videos
Processed 60 videos
Processed 80 videos
Processed 100 videos
Processed 120 videos
Processed 140 videos
Processed 160 videos
Processed 180 videos
Processed 200 videos
Processed 220 videos
Processed 240 videos
Processed 260 videos
Processed 280 videos
Processed 300 videos
Processed 320 videos
Processed 340 videos
Processed 360 videos
Processed 380 videos
Processed 400 videos
Processed 420 videos
Processed 440 videos
Processed 460 videos
Processed 480 videos
Processed 500 videos
Processed 520 videos
Processed 540 videos
Processed 560 videos
Processed 580 videos
Processed 600 videos
Processed 620 videos
Processed 640 videos
Processed 660 videos
Processed 680 videos
Processed 700 videos
Processed 720 videos
Processed 740 videos
Processed 760 videos
Processed 780 videos
Processed 800 videos
Processed 820 videos
Processed 840 videos
Processed 860 videos
Processed 880 videos
Processed 900 videos
Processed 920 videos
Processed 940 videos
Processed 960 vid