In [2]:
import requests, sys, time, os, argparse
import json
import numpy as np
# List of simple to collect features
snippet_features = ["title",
                    "publishedAt",
                    "channelId",
                    "channelTitle",
                    "categoryId"]

# Any characters to exclude, generally these are things that become problematic in CSV files
unsafe_characters = ['\n', '"']

# Used to identify columns, currently hardcoded order
header = ["video_id"] + snippet_features + ["trending_date", "tags", "view_count", "likes", "dislikes",
                                            "comment_count", "thumbnail_link", "comments_disabled",
                                            "ratings_disabled", "description"]


def setup(api_path, code_path):
    with open(api_path, 'r') as file:
        api_key = file.readline()
    
    with open(code_path) as file:
        country_codes = [x.rstrip() for x in file]
    
    return api_key, country_codes


def prepare_feature(feature):
    # Removes any character from the unsafe characters list and surrounds the whole item in quotes
    for ch in unsafe_characters:
        feature = str(feature).replace(ch, "")
    return f'"{feature}"'


def api_request(page_token, country_code):
    # Builds the URL and requests the JSON from it
    print(page_token)
    print(country_code)
    request_url = f"https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet{page_token}chart=mostPopular&regionCode={country_code}&maxResults=50&key={api_key}"
    print(request_url)
    request = requests.get(request_url)
    print(request.status_code)
    if request.status_code == 429:
        print("Temp-Banned due to excess requests, please wait and continue later")
        sys.exit()
    return request.json()
def api_request_comments(page_token, video_id):
    
    request_comments_url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet{page_token}&videoId={video_id.strip()}&key=AIzaSyADlxGSg_VGSPtB6-rRkBw9g4eGSkdIUM4"
    #request_comments_url = "https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet&chart=mostPopular&regionCode=US&maxResults=50&key=AIzaSyAT0_U4OzScfwyJheOesL9GZRzH5yA4JUY"
    #print(request_comments_url)
    request = requests.get(request_comments_url)
    print(request.status_code)
  
    if request.status_code == 429:
        print("Temp-Banned due to excess requests, please wait and continue later")
        sys.exit()
    return request.json()

def get_tags(tags_list):
    # Takes a list of tags, prepares each tag and joins them into a string by the pipe character
    return prepare_feature("|".join(tags_list))


def get_videos(items):
    lines = []
    for video in items:
        comments_disabled = False
        ratings_disabled = False

        # We can assume something is wrong with the video if it has no statistics, often this means it has been deleted
        # so we can just skip it
        if "statistics" not in video:
            continue

        # A full explanation of all of these features can be found on the GitHub page for this project
        video_id = prepare_feature(video['id'])

        # Snippet and statistics are sub-dicts of video, containing the most useful info
        snippet = video['snippet']
        statistics = video['statistics']

        # This list contains all of the features in snippet that are 1 deep and require no special processing
        features = [prepare_feature(snippet.get(feature, "")) for feature in snippet_features]

        # The following are special case features which require unique processing, or are not within the snippet dict
        description = snippet.get("description", "")
        thumbnail_link = snippet.get("thumbnails", dict()).get("default", dict()).get("url", "")
        trending_date = time.strftime("%y.%d.%m")
        tags = get_tags(snippet.get("tags", ["[none]"]))
        view_count = statistics.get("viewCount", 0)

        # This may be unclear, essentially the way the API works is that if a video has comments or ratings disabled
        # then it has no feature for it, thus if they don't exist in the statistics dict we know they are disabled
        if 'likeCount' in statistics and 'dislikeCount' in statistics:
            likes = statistics['likeCount']
            dislikes = statistics['dislikeCount']
        else:
            ratings_disabled = True
            likes = 0
            dislikes = 0

        if 'commentCount' in statistics:
            comment_count = statistics['commentCount']
        else:
            comments_disabled = True
            comment_count = 0

        # Compiles all of the various bits of info into one consistently formatted line
        line = [video_id] + features + [prepare_feature(x) for x in [trending_date, tags, view_count, likes, dislikes,
                                                                       comment_count, thumbnail_link, comments_disabled,
                                                                       ratings_disabled, description]]
        lines.append(",".join(line))
    return lines

def get_comments(video_id):
    video_comments = []
    next_page_token = "&"
    i =0;
    while next_page_token is not None:
        # A page of data i.e. a list of videos and all needed data
        comments_data_page = api_request_comments(next_page_token, video_id)
        i += 1
        if(i == 10):
            break;
        
        # Get the next page token and build a string which can be injected into the request with it, unless it's None,
        # then let the whole thing be None so that the loop ends after this cycle
        next_page_token = comments_data_page.get("nextPageToken", None)
        next_page_token = f"&pageToken={next_page_token}&" if next_page_token is not None else next_page_token

        # Get all of the items as a list and let get_videos return the needed features
        video_comments += comments_data_page.get('items', [])
        
    name = "Comments_"+video_id+".json";
    with open(output_dir+'/'+name, 'w') as outfile:
        json.dump(video_comments, outfile)
    
def get_pages(country_code, next_page_token="&"):
    country_data = []
    i = 0
    # Because the API uses page tokens (which are literally just the same function of numbers everywhere) it is much
    # more inconvenient to iterate over pages, but that is what is done here.
    while next_page_token is not None:
        # A page of data i.e. a list of videos and all needed data
        video_data_page = api_request(next_page_token, country_code)
        
        
        # Get the next page token and build a string which can be injected into the request with it, unless it's None,
        # then let the whole thing be None so that the loop ends after this cycle
        next_page_token = video_data_page.get("nextPageToken", None)
        next_page_token = f"&pageToken={next_page_token}&" if next_page_token is not None else next_page_token

        # Get all of the items as a list and let get_videos return the needed features
        items = video_data_page.get('items', [])
        country_data += get_videos(items)
    #print(np.size(country_data))
   # print("kod fora sam")
    for video in country_data:
       video_id =  video.split(",")[0]
       video_id = video_id.replace('"', '')
       get_comments(video_id)
       # print("usao")
    #print(next_page_token)
    return country_data


def write_to_file(country_code, country_data):

    print(f"Writing {country_code} data to file...")
   
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    with open(f"{output_dir}/{time.strftime('%y.%d.%m')}_{country_code}_videos.csv", "w+", encoding='utf-8') as file:
        for row in country_data:
            file.write(f"{row}\n")


def get_data():
    for country_code in country_codes:
     
        country_data = [",".join(header)] + get_pages(country_code)
        #print(country_data)
        write_to_file(country_code, country_data)


if __name__ == "__main__":

    output_dir = "output"
    api_key = "api_key.txt"
    country_codes = "country_codes.txt"
    api_key, country_codes= setup(api_key,country_codes)
    get_data()
    print('END')

&
US
https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet&chart=mostPopular&regionCode=US&maxResults=50&key=AIzaSyC4sS4U66SGjXFIQTsyXwj_Nf-LpODTxiU
200
&pageToken=CDIQAA&
US
https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet&pageToken=CDIQAA&chart=mostPopular&regionCode=US&maxResults=50&key=AIzaSyC4sS4U66SGjXFIQTsyXwj_Nf-LpODTxiU
200
&pageToken=CGQQAA&
US
https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet&pageToken=CGQQAA&chart=mostPopular&regionCode=US&maxResults=50&key=AIzaSyC4sS4U66SGjXFIQTsyXwj_Nf-LpODTxiU
200
&pageToken=CJYBEAA&
US
https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet&pageToken=CJYBEAA&chart=mostPopular&regionCode=US&maxResults=50&key=AIzaSyC4sS4U66SGjXFIQTsyXwj_Nf-LpODTxiU
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


SSLError: HTTPSConnectionPool(host='www.googleapis.com', port=443): Max retries exceeded with url: /youtube/v3/commentThreads?part=snippet&pageToken=QURTSl9pMkt4dFVWaWNyclowaEpreHpRYUtKX21oZ0hicEgtay1taUxCdFRpdGZCT0Z2bmRNTGtqNDhvY25UcUttcUJRZVR6YnI1dUpoWFVhN2ptUXFKWlZPYk9XTk9HeVowNV9wN21NXzk1Z0tOOFJ1clNtOUVIZkV0Qjc3c0tYcS0tdVFUNHA4TXVXUnQ5NzNRY1l1Z1BqVTVaMHFr&&videoId=I_jiQivSYEc&key=AIzaSyADlxGSg_VGSPtB6-rRkBw9g4eGSkdIUM4 (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')")))