## 1. Setup


### 1.1 Importing Dependencies


In [113]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import pandas as pd
import os
import webbrowser
import pytube
import matplotlib.pyplot as plt
import urllib.request
import json
import requests

## 2. Creating Dataset


### 2.1 Creating directories


In [102]:
# Make sure the dataset folder and its image sub-folder exist; already
# existing folders are left untouched.
for folder in ('data', 'data/images'):
    os.makedirs(folder, exist_ok=True)

### 2.2 Creating Word List


In [48]:
# Location of the CSV that stores the search terms.
words_path = os.path.join('data', 'words.csv')
# Search terms to scrape for (left empty in this file).
word_list = []
# Persist the terms as a single-column table ("word") without an index column.
df = pd.DataFrame(dict(word=word_list))
df.to_csv(words_path, index=False)

### 2.3 YouTube Scraping


In [49]:
def scrape_youtube_videos(query):
    """Search YouTube for ``query`` and save every hit to ``data/<query>.csv``.

    Pages through ``search.list`` (50 results per request, videos only) until
    the response no longer carries a ``nextPageToken``. The CSV has the two
    columns ``link`` and ``id`` and no index column.

    Whatever has been collected is written even when a request fails
    part-way, so pages that were already fetched are not lost; the error is
    then propagated to the caller.

    Args:
        query: Search term; also used verbatim as the CSV file name.

    Raises:
        RuntimeError: If the ``YOUTUBE_API_KEY`` environment variable is unset.
        HttpError: If the API rejects a search request.
    """
    api_key = os.environ.get("YOUTUBE_API_KEY")
    if not api_key:
        # Fail with a clear message instead of sending requests without a key.
        raise RuntimeError("YOUTUBE_API_KEY environment variable is not set")
    youtube = build('youtube', 'v3', developerKey=api_key)

    video_ids = []
    next_page_token = None

    try:
        while True:
            search_response = youtube.search().list(
                q=query,
                type='video',
                part='id',
                maxResults=50,
                pageToken=next_page_token
            ).execute()

            video_ids.extend(
                item['id']['videoId']
                for item in search_response.get('items', [])
            )

            next_page_token = search_response.get('nextPageToken')
            if not next_page_token:
                break
    finally:
        # Runs on success and on failure alike, so partial results survive
        # an error raised by a later page.
        video_links = [
            'https://www.youtube.com/watch?v=' + video_id
            for video_id in video_ids
        ]
        df = pd.DataFrame({'link': video_links, 'id': video_ids})
        df.to_csv(os.path.join('data', query + '.csv'), index=False)

In [50]:
# Reload the saved search terms from disk as a plain Python list.
wd = pd.read_csv(words_path)['word'].tolist()

In [41]:
# Run one YouTube search per saved word; results go to one CSV per word.
for word in wd:
    scrape_youtube_videos(word)

### 2.4 Creating Unique Dataset


In [52]:
# Append every per-word result file that exists to the combined links file.
for word in wd:
    p = os.path.join('data', word + '.csv')
    if not os.path.exists(p):
        continue  # no result file for this word
    p2 = os.path.join('data', 'links.csv')
    df = pd.read_csv(p)
    # Rows are appended as-is: no header line and no index column.
    df.to_csv(p2, mode='a', header=False, index=False)

In [55]:
# Normalise the combined file: name its columns, drop duplicate videos and
# store an explicit ``index`` column.
#
# Rows are appended to links.csv without a header line, so the file is read
# with header=None; a default read_csv would take the first video as the
# column names and silently lose it. dtype=str keeps ids exactly as written.
df = pd.read_csv('data/links.csv', header=None, dtype=str)
if df.shape[1] == 2:
    # Rows coming from per-word files written without an index: (link, id).
    df.columns = ['link', 'id']
else:
    # Rows that already carry a leading index column: (index, link, id).
    df.columns = ['index', 'link', 'id']
    df = df.drop(columns='index')
# A previous run of this cell leaves an "index,link,id" header line in the
# file; with header=None it is read as an ordinary row, so filter it out.
df = df[df['id'] != 'id']
# The same video can be returned for several search words; keep it once.
df = df.drop_duplicates(subset='id').reset_index(drop=True)
df.insert(0, 'index', df.index)
df.to_csv('data/links.csv', index=False)

In [66]:
# Load the combined link table and discard its stored ``index`` column.
df = pd.read_csv('data/links.csv').drop(columns='index')

In [89]:
# Pick up to five video ids at random for the cells below.
# Series.sample raises ValueError when asked for more rows than exist, so the
# sample size is capped at the number of available ids.
video_id = df['id'].sample(min(5, len(df)))

## 3. Visualizing Dataset


### 3.1 Playing Video in Browser


In [90]:
def play_youtube_video(id):
    """Open the watch page of the YouTube video ``id`` via ``webbrowser.open``."""
    webbrowser.open("https://www.youtube.com/watch?v={}".format(id))

In [91]:
# Open each sampled video in the browser.
# The loop variable is deliberately not named ``id``: at module level that
# would shadow the built-in ``id()`` for every later cell.
for vid in video_id:
    play_youtube_video(vid)

### 3.2 Viewing Thumbnails


In [98]:
def plot_thumbnail(id):
    """Download the thumbnail of video ``id`` and display it with matplotlib.

    The image is saved to ``data/images/thumbnail.jpg`` (the same path on
    every call), read back from disk and shown with the axes hidden.
    """
    target = os.path.join('data', 'images', 'thumbnail.jpg')
    video = pytube.YouTube(f"https://www.youtube.com/watch?v={id}")
    urllib.request.urlretrieve(video.thumbnail_url, target)
    plt.imshow(plt.imread(target))
    plt.axis('off')
    plt.show()

In [100]:
# Show the thumbnail of each sampled video.
# The loop variable is deliberately not named ``id``: at module level that
# would shadow the built-in ``id()`` for every later cell.
for vid in video_id:
    plot_thumbnail(vid)

## 4. Reporting Videos


In [116]:
# CSV file to which the reporting functions append one (id, response) row
# per reported video.
res_path = os.path.join('data', 'res.csv')

In [117]:
# Value passed as ``reasonId`` in every abuse report sent by the cells below.
# NOTE(review): this looks like a human-readable label; confirm whether the
# API accepts it or expects an id from videoAbuseReportReasons.list.
reason = "Sexual content"

In [110]:
def report_video(id, reason):
    """Report video ``id`` through the YouTube Data API and log the outcome.

    Appends one ``(id, response)`` row to ``res_path``; ``response`` is
    ``"success"`` when the request goes through and ``"failed"`` when the API
    answers with an ``HttpError``.

    Args:
        id: YouTube video id to report.
        reason: Value sent as ``reasonId`` in the report body.

    NOTE(review): ``videos.reportAbuse`` is documented as requiring OAuth 2.0
    user authorization; a client built with only a developer key is expected
    to be rejected - confirm and switch to OAuth credentials if so.
    """
    developer_key = os.environ.get("YOUTUBE_API_KEY")
    youtube = build("youtube", "v3", developerKey=developer_key)
    try:
        # Generated API methods take keyword arguments only; the report
        # payload has to be passed as ``body``.
        youtube.videos().reportAbuse(body={
            "videoId": id,
            "reasonId": reason,
        }).execute()
    except HttpError:
        outcome = "failed"
    else:
        outcome = "success"
    # Single write path for both outcomes; the row layout is unchanged.
    df = pd.DataFrame({'id': [id], 'response': outcome})
    df.to_csv(res_path, mode='a', header=False, index=False)

In [112]:
# Report each sampled video through the API client.
# The loop variable is deliberately not named ``id``: at module level that
# would shadow the built-in ``id()`` for every later cell.
for vid in video_id:
    report_video(vid, reason)

In [118]:
def yt_report_livestream(id, reason):
    """POST an abuse report for ``id`` to the reportAbuse REST endpoint.

    Appends one ``(id, response)`` row to ``res_path``; ``response`` is
    ``"success"`` for any 2xx answer and ``"failed"`` for every other status
    or when the request itself cannot be completed.

    Args:
        id: YouTube video id to report.
        reason: Value sent as ``reasonId`` in the JSON body.

    NOTE(review): no API key or OAuth token is attached to the request, so
    the endpoint is expected to reject it - confirm how credentials should
    be supplied.
    """
    URL = "https://youtube.googleapis.com/youtube/v3/videos/reportAbuse?alt=json"
    payload = {
        "videoId": id,
        "reasonId": reason,
    }
    try:
        # TLS verification is left at its default (enabled): verify=False
        # would accept any certificate. ``json=`` serialises the payload and
        # sets the JSON content type; the timeout keeps a stalled connection
        # from hanging the loop.
        response = requests.post(
            url=URL,
            json=payload,
            timeout=30,
        )
    except requests.RequestException:
        outcome = "failed"
    else:
        # reportAbuse is documented to answer 204 No Content on success, so
        # accept the whole 2xx range instead of only 200.
        outcome = "success" if 200 <= response.status_code < 300 else "failed"
    df = pd.DataFrame({'id': [id], 'response': outcome})
    df.to_csv(res_path, mode='a', header=False, index=False)

In [119]:
# Report each sampled video through the raw REST endpoint.
# The loop variable is deliberately not named ``id``: at module level that
# would shadow the built-in ``id()`` for every later cell.
for vid in video_id:
    yt_report_livestream(vid, reason)

