## 1. Setup


### 1.1 Importing Dependencies


In [67]:
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd
import os
import random
import webbrowser
import pytube
import matplotlib.pyplot as plt
import urllib.request

## 2. Creating Dataset


### 2.1 Creating directories


In [38]:
os.makedirs('data', exist_ok=True)
os.makedirs('data/images', exist_ok=True)
words_path = os.path.join('data', 'words.csv')

### 2.2 Creating Word List


In [39]:
# enter the search keywords in this list
word_list = ['bubble butt naked']

if word_list:
    df = pd.DataFrame({'word': word_list})
    df.to_csv(words_path, mode='a', index=False)

In [40]:
wd = pd.read_csv(words_path)
wd = wd['word'].tolist()

### 2.3 Youtube Scraping


In [41]:
def scrape_youtube_videos(query):
    api_key = os.environ.get("YOUTUBE_API_KEY")
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=api_key)

    video_links = []
    video_ids = []
    next_page_token = None

    while True:
        search_response = youtube.search().list(
            q=query,
            type='video',
            part='id',
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        for item in search_response['items']:
            video_links.append(
                'https://www.youtube.com/watch?v=' + item['id']['videoId'])
            video_ids.append(item['id']['videoId'])

        next_page_token = search_response.get('nextPageToken')

        if not next_page_token:
            break

    df = pd.DataFrame({'link': video_links, 'id': video_ids})
    p = os.path.join('data', query + '.csv')
    df.to_csv(p, index=False)

In [54]:
# function to get youtube video links
for word in wd:
    if os.path.exists(os.path.join('data', word + '.csv')):
        continue
    else:
        scrape_youtube_videos(word)

### 2.4 Creating Unique Dataset


In [44]:
for word in wd:
    p = os.path.join('data', word + '.csv')
    if os.path.exists(p):
        df = pd.read_csv(p)
        p2 = os.path.join('data', 'links.csv')
        df.to_csv(p2, mode='a', header=False, index=False)

In [45]:
df = pd.read_csv('data/links.csv')
df.columns = ['link', 'id']
df.to_csv('data/links.csv', index=False)

In [55]:
df = pd.read_csv('data/links.csv')
# df.head()

In [76]:
df.shape

(4479, 2)

In [75]:
video_id = df['id']
video_id = video_id.sample(20)
# video_id

## 3. Visualizing Dataset


### 3.1 Playing Video in Browser


In [58]:
def play_youtube_video(id):
    url = f"https://www.youtube.com/watch?v={id}"
    webbrowser.open(url)

In [19]:
# function to play youtube videos
for id in video_id:
    play_youtube_video(id)

### 3.2 Viewing Thumbnails


In [71]:
def plot_thumbnail(id):
    url = f"https://www.youtube.com/watch?v={id}"
    yt = pytube.YouTube(url)
    thumbnail_url = yt.thumbnail_url
    video_title = yt.title
    p = os.path.join('data', 'images', 'thumbnail.jpg')
    urllib.request.urlretrieve(thumbnail_url, p)
    thumbnail_image = plt.imread(p)
    plt.imshow(thumbnail_image)
    plt.axis('off')
    plt.show()
    print("📽️ ",video_title)
    print("🔗 ",url)
    print("------------------------------------------------------")

In [78]:
# function to plot thumbnail of youtube videos
for id in video_id:
    plot_thumbnail(id)

## 4. Reporting Videos


### 4.1 Setting up path for Responses

In [33]:
res_path = os.path.join('data', 'response.csv')

In [34]:
scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]

In [35]:
def report_video(id):
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "client_secret.json"
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(client_secrets_file, scopes)
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(api_service_name, api_version,credentials=credentials)
    try:
        request = youtube.videos().reportAbuse(
            body={
                "videoId": id,
                "reasonId": "N", 
            }
        )
        request.execute()
        df = pd.DataFrame({'id': [id], 'response': "success"})
        df.to_csv(res_path, mode='a', header=False, index=False)
    except googleapiclient.errors.HttpError as e:
        df = pd.DataFrame({'id': [id], 'response': "failed"})
        df.to_csv(res_path, mode='a', header=False, index=False)

In [36]:
# function to report youtube videos
for id in video_id:
    report_video(id)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=157489452148-7cctdqns3srb8octgpeq6muda1n6al40.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=W5OOHbLc4BohOD5TG3Wr4qtWb2izIR&prompt=consent&access_type=offline


In [63]:
if os.path.exists(res_path):
    df = pd.read_csv(res_path)
    df.columns = ['id', 'response']
    df.to_csv(res_path, index=False)