<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/2_Scrape_youtube_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Imports ###
###############

import numpy as np
import pandas as pd
import os
import shutil
import requests
import json
from tqdm import tqdm

In [None]:
### Set data directory ###
##########################

# folders on Google Drive: input video urls and output statistics
data_dir = '/content/drive/MyDrive/0_Downloaded_Data/0_Video_Ad_Urls'
save_dir = '/content/drive/MyDrive/0_Downloaded_Data/3_Engagement_Statistics'

# mount Google Drive so the folders above become reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### Select which brand types to scrape ###
##########################################

# which set of brand channels to scrape: 'sports' or 'cars'
brand_type = 'cars'

In [None]:
### Class to scrape data ###
############################

class YTVideostats:
  """Fetch metadata and statistics for YouTube videos via the Data API v3.

  Args:
    api_key: YouTube Data API key sent with every request.
    video_ids: iterable of YouTube video ids to look up.
  """

  # endpoint of the `videos` list method
  API_URL = "https://www.googleapis.com/youtube/v3/videos"
  # resource parts requested for every video
  PARTS = ("snippet", "statistics", "contentDetails")
  # seconds to wait for a response before giving up on a request
  TIMEOUT = 30

  def __init__(self, api_key, video_ids):
    self.api_key = api_key
    self.video_ids = video_ids

  def get_video_data(self):
    """Return a dict mapping each video id to its merged API fields.

    For every id, the fields of all parts in `PARTS` are merged into one
    flat dict. A part that could not be retrieved contributes nothing, so
    the dict of a video may be partially filled or empty.
    """
    # materialise once so one-shot iterables (e.g. generators) survive both
    # the dict initialisation and the request loop
    video_ids = list(self.video_ids)
    video_dict = {video_id: {} for video_id in video_ids}

    for video_id in tqdm(video_ids):
      for part in self.PARTS:
        video_dict[video_id].update(self._get_single_video_data(video_id, part))

    return video_dict

  def _get_single_video_data(self, video_id, part):
    """Request one `part` of one video and return it as a dict.

    Returns an empty dict (after printing a short message) when the
    request fails, the response is not JSON, or the response does not
    contain the requested part.
    """
    try:
      response = requests.get(
          self.API_URL,
          # let requests build and encode the query string
          params={"part": part, "id": video_id, "key": self.api_key},
          timeout=self.TIMEOUT,
      )
      payload = response.json()
    except (requests.RequestException, ValueError) as err:
      # only the exception type is printed: the full message can contain
      # the request URL and therefore the API key
      print(f"error: request for '{part}' of video {video_id} failed "
            f"({type(err).__name__})")
      return {}

    data = self._extract_part(payload, part)
    if data is None:
      print(f"error: no '{part}' data returned for video {video_id}")
      return {}

    return data

  @staticmethod
  def _extract_part(payload, part):
    """Return `payload['items'][0][part]`, or None if it is not present."""
    try:
      return payload["items"][0][part]
    except (KeyError, IndexError, TypeError):
      return None

In [None]:
### Brand channels to scrape from ###
#####################################

# sports / lifestyle brand channels
files_sports = [
    'nike',
    'adidas',
    'puma',
    'underarmour',
    'asics',
    'converse',
    'timberland',
    'salomon',
    'gymshark',
    'gopro',
    'redbull',
    'monsterenergy',
]

# car brand channels
files_cars = [
    'hyundai',
    'porsche',
    'audi',
    'bmw',
    'vw',
    'mercedes',
    'honda',
    'ford',
    'skoda',
    'ferrari',
]

In [None]:
def extract_yt_stats(file, API_KEY):
  """Scrape view, like and comment counts for one brand's videos.

  Reads `videos_{file}.csv` from the notebook-level `data_dir`, takes the
  last 11 characters of each `video_url` as the video id, queries the
  YouTube API through `YTVideostats` and derives an engagement score.

  Args:
    file: brand name; selects the input CSV and fills the `brand` column.
    API_KEY: YouTube Data API key handed on to `YTVideostats`.

  Returns:
    DataFrame with the columns video_id, brand, views, likes, comments and
    eng_score, where eng_score = (likes + comments) / views. The counts
    are stored as returned by the API; a count that was not returned is
    None. eng_score is NaN when any count is missing or views is 0.
  """
  # load dataframe
  df = pd.read_csv(os.path.join(data_dir, f'videos_{file}.csv'))

  # extract video id (the last 11 characters of the url)
  df['video_id'] = df['video_url'].str[-11:]

  # query the youtube api for every video id
  json_ds_info = YTVideostats(API_KEY, df['video_id']).get_video_data()

  # collect one row per video; .get() keeps the row when the API left out a
  # count (presumably hidden likes / disabled comments, or a failed lookup)
  # instead of raising KeyError
  stat_keys = ('viewCount', 'likeCount', 'commentCount')
  info_stats = [
      [video_id, file, *(stats.get(key) for key in stat_keys)]
      for video_id, stats in json_ds_info.items()
  ]

  # create df of stats
  df_scraped_stats = pd.DataFrame(
      info_stats, columns = ["video_id", "brand", "views", "likes", "comments"])

  # add engagement score; counts are converted only for the calculation so
  # the stored columns keep the values delivered by the API
  views = pd.to_numeric(df_scraped_stats['views'], errors = 'coerce')
  likes = pd.to_numeric(df_scraped_stats['likes'], errors = 'coerce')
  comments = pd.to_numeric(df_scraped_stats['comments'], errors = 'coerce')

  # zero views become NaN so the score is NaN instead of a division by zero
  df_scraped_stats['eng_score'] = (likes + comments) / views.where(views > 0)

  return df_scraped_stats

In [None]:
### Extract info from all videos and concat ###
###############################################

# my api key: taken from the YOUTUBE_API_KEY environment variable, otherwise
# asked for interactively, so the secret never has to be stored in the notebook
from getpass import getpass
API_KEY = os.environ.get('YOUTUBE_API_KEY') or getpass('YouTube Data API key: ')

# assign files according to brand type selected before;
# an unknown brand type is reported instead of silently scraping the car brands
files_by_brand_type = {'sports': files_sports, 'cars': files_cars}
if brand_type not in files_by_brand_type:
  raise ValueError(
      f"unknown brand_type {brand_type!r}; "
      f"expected one of {sorted(files_by_brand_type)}")
files = files_by_brand_type[brand_type]


# apply function to scrape data for every brand and stack the results once
df_final = pd.concat(
    [extract_yt_stats(file = file, API_KEY = API_KEY) for file in files],
    ignore_index = True, axis = 0)

100%|██████████| 106/106 [00:42<00:00,  2.48it/s]
100%|██████████| 48/48 [00:19<00:00,  2.50it/s]
100%|██████████| 86/86 [00:33<00:00,  2.60it/s]
100%|██████████| 24/24 [00:09<00:00,  2.46it/s]
100%|██████████| 90/90 [00:34<00:00,  2.63it/s]
100%|██████████| 90/90 [00:37<00:00,  2.43it/s]
100%|██████████| 51/51 [00:20<00:00,  2.51it/s]
100%|██████████| 91/91 [00:36<00:00,  2.48it/s]
100%|██████████| 26/26 [00:10<00:00,  2.44it/s]
100%|██████████| 182/182 [01:12<00:00,  2.52it/s]


In [None]:
### Save ###
############

# write the combined statistics of the selected brand type to the working directory
df_final.to_csv(path_or_buf = f'./df_{brand_type}_stats.csv')

In [None]:
# make sure the target folder exists; otherwise shutil.copy would create a
# file named like the folder instead of copying the csv into it
os.makedirs(save_dir, exist_ok = True)
shutil.copy(f'./df_{brand_type}_stats.csv', save_dir)

'/content/drive/MyDrive/0_Masterarbeit/2_Pipelines/Data/df_cars_stats.csv'