# Imports and options

In [1]:
import numpy as np
import pandas as pd
from googleapiclient.discovery import build
from datetime import timedelta

# Display options:
pd.set_option("display.width", 1200)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

# Youtube Data API

In [2]:
yt = build(
    serviceName = "youtube",
    version = "v3",
    developerKey = "" # my key
)

# Channels sample

In [5]:
# * the API only returns 20.000 videos by channel
# Ids from (content = "https://www.youtube.com/channel/...) and titles:
channels = {
    "UCYO_jab_esuFRV4b17AJtAw": "3Blue1Brown",
    "UCNKcMBYP_-18FLgk4BYGtfw": "AWE me",
    "UCpEczh-MV7I8WgPaVmMYhhg": "Abandoned Miniatures",
    "UC4K10PNjqgGLKA3lo5V8KdQ": "Above The Noise",
    "UCjgZbW6Tvx93iKUkNw5lyew": "Academia de Libras",
    "UCiDJtJKMICpb9B1qf7qjEOA": "Adam Savage’s Tested",
    "UC32z4mtyiq02Ge-XWy78ibw": "Aero Por Trás da Aviação",
    "UCWizIdwZdmr43zfxlCktmNw": "Alec Steele",
    "UCvFGf8HZGZWFzpcDCqb3Lhw": "All Things Secured",
    "UCaRMivfyupj3ucUyJbZbCNg": "Anglo-Link",
    "UC1Nj4gkoi_n5eCcrKCVOXKA": "Animal Wonders Montana",
    "UCwg6_F2hDHYrqbNSGjmar4w": "Animalogic",
    "UCONd1SNf3_QqjzjCVsURNuA": "AntsCanada",
    "UCC552Sd-3nyi_tk2BudLUzA": "AsapSCIENCE",
    "UCi5iiEyLwSLvlqnMi02u5gQ": "Ask a Mortician",
    "UCGYBY4KaFYmkEKAGLL07BXw": "AstroTubers",
    "UC-9b7aDP6ZN0coj9-xFnrtw": "Astrum",
    "UCSTlOTcyUmzvhQi6F8lFi5w": "Atila Iamarino",
    "UCLXl1V6n82Dyg1VhVgSL0nw": "Aviões e Músicas com Lito Sousa",
    "UCGlmclFVnJCFQ_VO4kTgTig": "Azusa Barbie",
    "UCCj956IF62FbT7Gouszaj9w": "BBC",
    "UCvZe6ZCbF9xgbbbdkiodPKQ": "Baumgartner Restoration",
    "UCDSzwZqgtJEnUzacq3ddoOQ": "Ben G Thomas",
    "UCngMw0G2y2YFLPVbyEsfVog": "Black Nerd Comedy",
    "UCxcnsr1R5Ge_fbTu5ajt8DQ": "Bob Ross",
    "UCQRPDZMSwXFEDS67uc7kIdg": "Buenas Ideias",
    "UCcyq283he07B7_KUX07mmtA": "Business Insider",
    "UC2C_jShtL725hvbm1arSV9w": "CGP Grey",
    "UCvJJ_dzjViJCoLf5uKUTwoA": "CNBC",
    "UCWq1xltHB2fDe6YkYoOrryg": "Canal do Schwarza",
    "UCEOXxzW2vU0P-0THehuIIeg": "Captain Disillusion",
    "UCwnKziETDbHJtx78nIkfYug": "CaspianReport",
    "UCR1-7g_y2YcYlh1W9y_1LUg": "Chaves Estranho",
    "UC04KsGq3npibMCE9Td3mVDg": "Cheddar",
    "UC6HDXr-sNPnWLF_Q-y3KduA": "Chess Talk",
    "UCCYX4s1DCn51Hpf1peHS30Q": "Cinema Therapy",
    "UCEwIUtFBhaI2L2PuKv0KL2g": "Classic Mr Bean",
    "UCvmijL-eepDVHYSJHDY3d6w": "Cole and Marmalade",
    "UC9-y-6csu5WGm29I7JiwpnA": "Computerphile",
    "UCNvsIonJdJ5E4EXMa65VYpA": "ContraPoints",
    "UCyAMV63OY0DRbKk6gGv6wJg": "Cradle Of Filth",
    "UCX6b17PVsYBQ0ip5gyeme-Q": "CrashCourse",
    "UCe_vXdMrHHseZ_esYUskSBw": "CrazyRussianHacker",
    "UC726J5A0LLFRxQ0SZqr2mYQ": "Curious Droid",
    "UCFuIUoyHB12qpYa8Jpxoxow": "Código Fonte TV",
    "UCW39zufHfsuGgpLviKh297Q": "DW Documentary",
    "UCd-swDW3HCs4LTZhq8Cf7Fg": "DW Euromaxx",
    "UCQIcXQ2n0sa-7CD0NtqnrrA": "Darko Audio",
    "UCa37IMrH8BGS_pO6CKbTL-A": "Dinosaurs",
    "UCEf5U1dB5a2e2S-XUlnhxSA": "Diolinux",
    "UC_5niPa-d35gg88HaS7RrIw": "Disney",
    "UC1xwwLwm6WSMbUn_Tp597hQ": "Disney Parks",
    "UCKUNiU0D2u3yxNZZxBqa6bQ": "DutchPilotGirl",
    "UCVRrGAcUc7cblUzOhI1KfFg": "Earthling Ed",
    "UCxiKhn4yzWO1mR67iRrVk_A": "Elis Valeriano",
    "UCZq_CYXRoRjKqidapMPujaQ": "EntrePlanos",
    "UCIBaDdAbGlFDeS33shmlD0A": "European Space Agency, ESA",
    "UCCPHeV_9kyViBufLwBl9b5g": "Evelyn From The Internets",
    "UC6uKrU_WqJ1R2HMTY3LIx5Q": "Everyday Astronaut",
    "UCZ6JHFBaDUJ9wfo41HSII_w": "Fabio Chaves",
    "UC6zbvGt_jJVgw7-MWkVUC5A": "Fala Vegan",
    "UCxGiUjep8KiihvHldRousPA": "Fancy Fairy Wings & Things",
    "UCyNtlmLB73-7gtlBz00XOQQ": "Folding Ideas",
    "UCweCc7bSMX5J4jEH7HFImng": "GMHikaru",
    "UCuTaETsuCOkJ0H_GAztWt0Q": "Global Cycling Network",
    "UCeSRjhfeeqIgr--AcP9qhyg": "Guinness World Records",
    "UCKh3kvpo6_Xt2MDEUXjsqxw": "Hallease",
    "UCyyp7X9QBvJh6mIoQYJmUuA": "Hanabira工房",
    "UCLAcytNR3gdw44yzoSiKluA": "Hello Korea",
    "UCggHoXaj8BQHIiPmOxezeWA": "History Buffs",
    "UC26YLK0OEbLB3TCYxGh8xVQ": "Huygens Optics",
    "UCcMDMoNu66_1Hwi5-MeiQgw": "Hydraulic Press Channel",
    "UC3uAjWoLZ4bSi6qI9SjALxA": "Imperial War Museums",
    "UCHJuQZuzapBh-CuhRYxIZrg": "Insider",
    "UCkAbeyJZqp6JFYfKy-98NjA": "Integrando Conhecimento",
    "UCJ8bjM5yQSkOP-_99n1zKkw": "Intensivo Pedagógico",
    "UCwO-UgquohXwoe7f0e6lMnw": "Invicta",
    "UC8p-P1qJ312lZD30GoVCH7Q": "It's Black Friday",
    "UCH4BNI0-FOK2dMXoFtViWHw": "It's Okay To Be Smart",
    "UC-B9ND1Xz-b3DVbKIlACftA": "J Lisle Writes",
    "UCZ9l_6_f0PWRYXN5Y7Lcl2A": "Jacques Slade",
    "UCbbQalJ4OaC0oQ0AqRaOJ9g": "Jay Foreman",
    "UCkP2CvRubyU0MTZv_Qo712g": "Jessica in the Kitchen - Vegan Recipes",
    "UC-2YHgc363EdcusLIBbgxzg": "Joe Scott",
    "UCBbnbBWJtwsf0jLGUwX5Q3g": "Journey to the Microcosmos",
    "UCxFWzKZa74SyAqpJyVlG5Ew": "Kat Blaque",
    "UCLQWhXC5aOZ_tQNloBg40BA": "Kirsten & Joerg",
    "UCuCuEKq1xuRA0dFQj1qg9-Q": "Knowledgia",
    "UCsXVk37bltHxD1rDPwtNM8Q": "Kurzgesagt – In a Nutshell",
    "UCNqNkZ7kKfqimqHkgbWMNYA": "Launch Pad Astronomy",
    "UCG1h-Wqjtwz7uUANw6gazRw": "Lindsay Ellis",
    "UCoNTMWgGuXtGPLv9UeJZwBw": "Living Big In A Tiny House",
    "UCm9K6rby98W8JigLoZOh6FQ": "LockPickingLawyer",
    "UCAcZNVPEQ8K7JhMDwOcf8Pg": "Loira na Estrada",
    "UCFzODWkN1gXm4ryOgLK8Deg": "Ludoviajante",
    "UCt2WVZXVrHaFgZ432s33fLw": "MAKE UP FOR EVER",
    "UCbK5Us4E-HsXw6fQ1PYUuog": "MW Informática",
    "UCXgxNzAgZ1GExhTW4X1mUrg": "Marc Rebillet",
    "UCBlXovStrlQkVA2xJEROUNg": "Marcel Vos",
    "UCBJycsmduvYEL83R_U4JriQ": "Marques Brownlee",
    "UCr22xikWUK2yUW4YxOKXclQ": "MathTheBeautiful",
    "UC6nZXhOz_gHr4mlb5UNl9ug": "Matt Jordan",
    "UCjZfdrnSOrh4iFL2GCNvWVw": "Maven of the Eventide",
    "UCy8MVV7aDkSD6UghRhBtXOw": "Maya Table",
    "UCQwFuQLnLocj5F7ZcmcuWYQ": "MetaBallStudios",
    "UCk5BcU1rOy6hepflk7_q_Pw": "Meteoro Brasil",
    "UCqONzeACDBaF6FfKjh7ndAQ": "Microsoft Flight Simulator",
    "UChHS4NI6U4XgCuYgsrygVCA": "Mintfaery",
    "UCeiYXex_fwgYDonaTcSIk6w": "MinuteEarth",
    "UC1ZBQ-F-yktYD4m5AzM6pww": "Mustard",
    "UCLA_DiR1FfKNvjuUpBHmylQ": "NASA",
    "UCSuHzQ3GrHSzoBbwrIq3LLA": "Naomi Brockwell",
    "UCPZvqkyXoYDlzBClxpxVIzg": "Naomi Farr",
    "UCggHsHce2n3vvbJf_8YKrMA": "Nerdforge",
    "UCWOA1ZGywLbqmigxE4Qlvuw": "Netflix",
    "UCD57tGnYPW1twCohgIVkJpw": "Nox et Lux",
    "UCoxcjq-8xIDTYp3uz647V5A": "Numberphile",
    "UCroDJPcFCf6DBmHns6Xeb8g": "Nyma Tang",
    "UCjivwB8MrrGCMlIuoSdkrQg": "Nátaly Neri",
    "UCQ3JxE-NOyZaJ3m3qIkZbhA": "O Pimentinha",
    "UCK1XzxcKXB_v_dcw_tflC1A": "OBF",
    "UCtb2f3j_sB4Z4Ik2YJtbzsg": "OneikaTraveller",
    "UCwK-svlNEASA7g_smu1_d8g": "OnlyConnect Fan",
    "UCiB8h9jD2Mlxx96ZFnGDSJw": "Origin Of Everything",
    "UCpLQXR116cLVUa1LRY8KS4w": "OwlKitty",
    "UCzR-rom72PHN9Zg7RML9EbA": "PBS Eons",
    "UC7_gcs09iThXybpVgjHZ_7g": "PBS Space Time",
    "UCjY_LBR_7-Ile0MAsbeJWBg": "PH Santos",
    "UCddYq41_tZ1FnLlguLT6-Ow": "Parafernalha",
    "UCOVkkVaUP0Xop6IIjyk86QA": "Perception",
    "UCtESv1e7ntJaLJYKIO1FoYw": "Periodic Videos",
    "UC2PA-AKmVpU6NKCGtZq_rKQ": "Philosophy Tube",
    "UC7DdEm33SyaTDtWYGO2CwdA": "Physics Girl",
    "UCQwHYU3ZbSLG6LOVwtvuozg": "Planeta Aves",
    "UCEWHPFNilsT0IfQfutVzsag": "Porta dos Fundos",
    "UCNJe8uQhM2G4jJFRWiM89Wg": "Potato Jet",
    "UClo-U5gvXPRy-4VqWEkkRDg": "Primitive Building",
    "UCKKJpBveT8vWVNfLQ-MvZMg": "Prof. André Azevedo da Fonseca",
    "UCXOzDw_X92bNqSCqw-NojiA": "Professor Julio Borbo",
    "UCwSxSJqGpSRpEsq5-YUbM8g": "Professor Noslen",
    "UCNzul4dnciIlDg8BAcn5-cQ": "Prowalk Tours",
    "UCvn_XCl_mgQmt3sD753zdJA": "Rachel's English",
    "UCR1IuLEqb6UEA_zQ81kwXfg": "Real Engineering",
    "UCP5tjEmvPItGyLhmjdwP7Ww": "RealLifeLore",
    "UCFJxE0l3cVYU4kHzi4qVEkw": "Rebecca Watson",
    "UCliwOcLaEfqkGJ7xq2ipq4w": "Reverse Engineering",
    "UC9Ep0Y4T5rvUuIfjKN1wqTw": "Riddle",
    "UCovtFObhY9NypXcyHxAS7-Q": "Roberto Blake",
    "UCk85otLk-ASsqPu0aXFGjzA": "Say It Loud",
    "UC5UYMeKfZbFYnLHzoTJB1xA": "Schaffrillas Productions",
    "UCZYTClx2T1of7BRZ86-8fow": "SciShow",
    "UCxzC4EngIsMrPmbm6Nxvb-A": "Scott Manley",
    "UCi7l9chXMljpUft67vw78qw": "Sideways",
    "UC3KEoMzNz8eYnwBC34RaKCQ": "Simone Giertz",
    "UCT3wwJBh8fwePiNC63NlzCA": "Simple Flying",
    "UC6107grRI4m0o2-emgoDnAA": "SmarterEveryDay",
    "UC_Fk7hHbl7vv_7K8tYqJd5A": "SpaceToday",
    "UCtI0Hodo5o5dUb67FeUjDeA": "SpaceX",
    "UC00uG71I6iPyx15EX6i_GDA": "Star Wars Comics",
    "UCZ9jWH_8tJ-Nmaj8dSQdEYA": "Stefan Milo",
    "UCpQLC-evmUAon9BBpcW4kYg": "Steve Cutts",
    "UCO6nDCimkF79NZRRb8YiDcA": "Storied",
    "UC6u6uY4VbvuNtU0BU7F9olw": "Studson Studio",
    "UCRXRbi80k0_vcIfgpOSerTg": "Subject Zero Science",
    "UCAuUUnT6oDeKwE6v1NGQxug": "TED",
    "UCOuWeOkMrq84u5LY6apWQ8Q": "TREY the Explainer",
    "UC7U72Ze-y8s4hvAmRvhXPgQ": "TecLab",
    "UCZdJE8KpuFm6NRafHTEIC-g": "Tempero Drag",
    "UCnUXq8mGmoHt0e6ItuTs10w": "That Pedal Show",
    "UC8uT9cgJorJPWu7ITLGo9Ww": "The 8-Bit Guy",
    "UCmQThz1OLYt8mb2PU540LOA": "The Art Assignment",
    "UC6n8I1UDTKP1IWjQMg6_TwA": "The B1M",
    "UCvpQ-l09fCVxJd3urZbxzHg": "The British Museum",
    "UCINb0wqPz-A0dV9nARjJlOQ": "The Dodo",
    "UCsQXQMAZTygXddj_I4Ud9xA": "The Green Witch",
    "UC42VsoDtra5hMiXZSsD6eGg": "The Modern Rogue",
    "UCUK0HBIBWgM2c4vsPhkYY4w": "The Slow Mo Guys",
    "UC2LVhJH_9cT2XKp0VAfsKOQ": "The Tim Traveller",
    "UC4qGmRZ7aLOLfVsSdj5Se2A": "TheThings",
    "UCFGTzZL-yxzlD2ZF90XKFkg": "Tiny Cakes",
    "UCaTSjmqzOO-P8HmtVW3t7sA": "Todd in the Shadows",
    "UCBa659QWEk1AI4Tg--mrJ2A": "Tom Scott",
    "UCN3aYbtQ7yCqk9DM56B0kEw": "Tomorrow's Build",
    "UClVbhSLxwws-KSsPKz135bw": "TopMovieClips",
    "UC29ju8bIPH5as8OGnQzwJyA": "Traversy Media",
    "UCgtq_tKnFFQyJ-7j0yr9utg": "Tyta Montrase",
    "UCSxFcMDGmwgDlSP9qmYVx1w": "TÁ NA MESA VEGG",
    "UCVEVuanoMK9tGclfWLghaKw": "Tá Querida",
    "UCv3fc4d_IhCK2g5Y_kjpvaw": "UM BOTÂNICO NO APARTAMENTO",
    "UCTlLFaGX4q2otO0ZVxLW0VQ": "Uma história a mais",
    "UC9fSZHEh6XsRpX-xJc6lT3A": "UrAvgConsumer",
    "UCbYFhcKSE2mWYB0yD_Qr_8A": "Urban Gardening",
    "UCc9M8_pF78tYblW8nMHXUqg": "VIEWGANAS",
    "UCp3iXxis9n_E_GfbE-_ksFw": "VeganBlackMetalChef",
    "UCHnyfMqiRRG1u-2MsSQLbXA": "Veritasium",
    "UCLXo7UDZvByw2ixzpQCufnA": "Vox",
    "UCYulrOk4EbqL833oETw0bWQ": "Wanderlust Travel Videos",
    "UCs9MJ-TmN0-CLY2eNBHAltw": "Wanna Walk",
    "UC-He-TI37gYyrrSy7BWvh1Q": "YCImaging",
    "UCmVa-cbCpkd5Cd9Fr_4tCWg": "Your Dinosaurs Are Wrong",
    "UCSc16oMxxlcJSb9SXkjwMjA": "YourMovieSucksDOTorg",
    "UC9Ntx-EF3LzKY1nQ5rTUP2g": "cyriak",
    "UCpkZ7Z_J8RJvRtYc3JQdh0g": "mikannn",
    "UCUHW94eEFW7hkUMVaZz4eDg": "minutephysics",
    "UCtYKe7-XbaDjpUwcU5x0bLg": "neo",
    "UCkyfHZ6bY2TjqbJhiH8Y2QQ": "thebrainscoop",
    "UCGaVdbSav8xWuFWTadK6loA": "vlogbrothers",
    "UCVpankR4HtoAVtYnFDUieYA": "zefrank1"
 }

In [7]:
len(channels)

208

# Videos data

In [8]:
# Loop in the channels:
df_videos = pd.DataFrame(
    {
        "channel_title": pd.Series([], dtype = "str"),
        "channel_id": pd.Series([], dtype = "str"),
        "video_title": pd.Series([], dtype = "str"),
        "video_id": pd.Series([], dtype = "str"),
        "video_upload_date": pd.Series([], dtype = "str"),
        "views": pd.Series([], dtype = "int"),
        "likes": pd.Series([], dtype = "int"),
        "dislikes": pd.Series([], dtype = "int"),
        "comments": pd.Series([], dtype = "int")
    }
)
for chnl_ind, chnl_key in enumerate(channels):

    # Choose the channel:
    channel_id = chnl_key
    channel_title = channels[channel_id]

    print(channel_title)

    ### Extract the data from all the videos of the channel

    # Content details:
    content = yt.channels().list(
        id = channel_id,
        part = "contentDetails"
    ).execute()

    # Upload Id:
    upload_id = content["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    # All the channel's videos:
    all_videos = []
    next_pg_token = None
    res = yt.playlistItems().list(
        playlistId = upload_id,
        maxResults = 50,
        part = "snippet",
        pageToken = next_pg_token
    ).execute()
    while True:
        res = yt.playlistItems().list(
            playlistId = upload_id,
            maxResults = 50,
            part = "snippet",
            pageToken = next_pg_token
        ).execute()
        all_videos += res["items"]
        next_pg_token = res.get("nextPageToken")
        if next_pg_token is None:
            break

    # Videos statistics:
    videos_ids = list(map(lambda x: x["snippet"]["resourceId"]["videoId"], all_videos))
    stats = []
    for i in range(0, len(videos_ids), 40):
        res = (yt).videos().list(
            id = ",".join(videos_ids[i:i + 40]),
            part = "statistics"
        ).execute()
        stats += res["items"]

    # Put it in a dataframe:
    video_id, views, likes, dislikes, comments = [], [], [], [], []
    for i in range(len(stats)):
        video_id += [stats[i]["id"]]
        stats_i = stats[i]["statistics"]
        views += [stats_i[k] if "viewCount" in stats_i.keys() else np.nan for k in ["viewCount"]]
        likes += [stats_i[k] if "likeCount" in stats_i.keys() else np.nan for k in ["likeCount"]]
        dislikes += [stats_i[k] if "dislikeCount" in stats_i.keys() else np.nan for k in ["dislikeCount"]]
        comments += [stats_i[k] if "commentCount" in stats_i.keys() else np.nan for k in ["commentCount"]]
    df_channel_videos = pd.DataFrame(
        {
            "channel_title": channel_title,
            "channel_id": channel_id,
            "video_title": list(map(lambda x: x["snippet"]["title"], all_videos)),
            "video_id": video_id,
            "video_upload_date": list(map(lambda x: x["snippet"]["publishedAt"], all_videos)),
            "views": views,
            "likes": likes,
            "dislikes": dislikes,
            "comments": comments        
        }
    )
    df_videos = pd.concat([df_videos, df_channel_videos])

3Blue1Brown
AWE me
Abandoned Miniatures
Above The Noise
Academia de Libras
Adam Savage’s Tested
Aero Por Trás da Aviação
Alec Steele
All Things Secured
Anglo-Link
Animal Wonders Montana
Animalogic
AntsCanada
AsapSCIENCE
Ask a Mortician
AstroTubers
Astrum
Atila Iamarino
Aviões e Músicas com Lito Sousa
Azusa Barbie
BBC
Baumgartner Restoration
Ben G Thomas
Black Nerd Comedy
Bob Ross
Buenas Ideias
Business Insider
CGP Grey
CNBC
Canal do Schwarza
Captain Disillusion
CaspianReport
Chaves Estranho
Cheddar
Chess Talk
Cinema Therapy
Classic Mr Bean
Cole and Marmalade
Computerphile
ContraPoints
Cradle Of Filth
CrashCourse
CrazyRussianHacker
Curious Droid
Código Fonte TV
DW Documentary
DW Euromaxx
Darko Audio
Dinosaurs
Diolinux
Disney
Disney Parks
DutchPilotGirl
Earthling Ed
Elis Valeriano
EntrePlanos
European Space Agency, ESA
Evelyn From The Internets
Everyday Astronaut
Fabio Chaves
Fala Vegan
Fancy Fairy Wings & Things
Folding Ideas
GMHikaru
Global Cycling Network
Guinness World Records
Hallea

In [9]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23T18:11:23Z,537453,30720,113,785
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15T16:41:50Z,621248,27411,156,1248
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07T02:19:39Z,1267428,63787,304,2864
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16T15:37:16Z,616067,29415,216,1721
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07T19:01:16Z,432555,17245,145,1150
...,...,...,...,...,...,...,...,...,...
192,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06T22:16:56Z,90973,1804,23,75
193,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30T00:13:46Z,103287,1575,47,18
194,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Optical Illusion,8mS5RK0Yo6w,2009-05-23T22:36:58Z,1072499,9865,1201,
195,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17T00:50:31Z,78321,1628,20,87


# Data cleaning

## Change the data types

In [10]:
def toint64(x):
    return x.str.extract("(\d+)", expand = False).astype("float").astype("Int64")

df_videos["views"] = toint64(df_videos["views"])
df_videos["likes"] = toint64(df_videos["likes"])
df_videos["dislikes"] = toint64(df_videos["dislikes"])
df_videos["comments"] = toint64(df_videos["comments"])
df_videos["video_upload_date"] = pd.to_datetime(df_videos["video_upload_date"])

In [11]:
df_videos.dtypes

channel_title                     object
channel_id                        object
video_title                       object
video_id                          object
video_upload_date    datetime64[ns, UTC]
views                              Int64
likes                              Int64
dislikes                           Int64
comments                           Int64
dtype: object

## Remove videos with NaN or 0 in any variable

In [12]:
df_videos = df_videos.dropna(axis = 0)
df_videos = df_videos.loc[(df_videos["views"] > 0) & (df_videos["likes"] > 0) & (df_videos["dislikes"] > 0) & (df_videos["comments"] > 0)]
df_videos = df_videos.reset_index(drop = True)

In [13]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23 18:11:23+00:00,537453,30720,113,785
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15 16:41:50+00:00,621248,27411,156,1248
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07 02:19:39+00:00,1267428,63787,304,2864
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16 15:37:16+00:00,616067,29415,216,1721
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07 19:01:16+00:00,432555,17245,145,1150
...,...,...,...,...,...,...,...,...,...
150987,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Art Hour,C6hpMftAIs0,2009-06-13 18:32:12+00:00,172867,2165,31,137
150988,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06 22:16:56+00:00,90973,1804,23,75
150989,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30 00:13:46+00:00,103287,1575,47,18
150990,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17 00:50:31+00:00,78321,1628,20,87


# New features

## Age in days (until the last upload date)

In [14]:
# Filter the wrong upload dates (coming from the API as being the request date) considering a margin of 2 days:
last_date = max(df_videos["video_upload_date"])
age = []
for i in range(df_videos.shape[0]):
    date_i = df_videos["video_upload_date"].iloc[i]
    diff_days = last_date - date_i
    if diff_days < timedelta(days = 2):
        age += [np.nan]
    else:
        age += [diff_days.days]
df_videos["age_days"] = age

df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [15]:
df_videos["age_days"].describe()

count    150899.000000
mean       1640.070365
std        1132.317065
min           2.000000
25%         762.000000
50%        1490.000000
75%        2217.000000
max        5749.000000
Name: age_days, dtype: float64

In [16]:
df_videos.sort_values("age_days").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days
66894,Guinness World Records,UCeSRjhfeeqIgr--AcP9qhyg,Fastest Time to Eat A Banana (NO HANDS) - Guin...,H2l0r_20ntQ,2021-11-08 17:34:21+00:00,87101,2225,251,307,2.0
79983,Jacques Slade,UCZ9l_6_f0PWRYXN5Y7Lcl2A,Get Your Hands Up,FXLXBfmlaOk,2006-02-12 20:31:24+00:00,28254,207,9,108,5749.0


## Likes/dislikes

In [17]:
df_videos["likes_dislikes_ratio"] = df_videos["likes"]/df_videos["dislikes"]
df_videos["likes_dislikes_ratio"] = df_videos["likes_dislikes_ratio"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [18]:
df_videos["likes_dislikes_ratio"].describe()

count    150899.000000
mean         67.316134
std          83.937714
min           0.021739
25%          19.156954
50%          42.819421
75%          84.904409
max        4030.000000
Name: likes_dislikes_ratio, dtype: float64

In [19]:
df_videos.sort_values("likes_dislikes_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio
36504,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Fired Google Employee Behind Anti-Diversity Me...,WyIW7hgmryw,2017-08-08 17:16:32+00:00,1385,1,46,3,1555.0,0.021739
59904,Evelyn From The Internets,UCCPHeV_9kyViBufLwBl9b5g,This Is What It Takes To Vlog Every Day In Apr...,hgwrjWvow8U,2016-04-15 22:03:49+00:00,33731,4030,1,616,2034.0,4030.0


## Likes/views

In [20]:
df_videos["likes_views_ratio"] = df_videos["likes"]/df_videos["views"]
df_videos["likes_views_ratio"] = df_videos["likes_views_ratio"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [21]:
df_videos["likes_views_ratio"].describe()

count    150899.000000
mean          0.111443
std          28.213952
min           0.000058
25%           0.010699
50%           0.021290
75%           0.042788
max       10959.936877
Name: likes_views_ratio, dtype: float64

In [22]:
df_videos.sort_values("likes_views_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio
98832,Netflix,UCWOA1ZGywLbqmigxE4Qlvuw,"Discover, Relive and Watch TV from the Beginni...",iwrK1Mqao34,2013-09-05 00:38:00+00:00,790360,46,4,11,2988.0,11.5,5.8e-05
99697,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3298941,46560,291188,3428.0,70.853544,10959.936877


## Dislikes/views

In [23]:
df_videos["dislikes_views_ratio"] = df_videos["dislikes"]/df_videos["views"]
df_videos["dislikes_views_ratio"] = df_videos["dislikes_views_ratio"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [24]:
df_videos["dislikes_views_ratio"].describe()

count    1.508990e+05
mean     2.184485e-03
std      3.982119e-01
min      7.518938e-07
25%      2.955819e-04
50%      5.607657e-04
75%      1.076395e-03
max      1.546844e+02
Name: dislikes_views_ratio, dtype: float64

In [25]:
df_videos.sort_values("dislikes_views_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio
55105,Disney Parks,UC1xwwLwm6WSMbUn_Tp597hQ,Europe Family Vacations | Adventures by Disney,BAk5kgg70uo,2019-01-25 14:48:12+00:00,1329975,103,1,1,1020.0,103.0,7.7e-05,1e-06
99697,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3298941,46560,291188,3428.0,70.853544,10959.936877,154.684385


## Comments/views

In [26]:
df_videos["comments_views_ratio"] = df_videos["comments"]/df_videos["views"]
df_videos["comments_views_ratio"] = df_videos["comments_views_ratio"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [27]:
df_videos["comments_views_ratio"].describe()

count    1.508990e+05
mean     1.002503e-02
std      2.490369e+00
min      6.929110e-07
25%      8.927642e-04
50%      1.958028e-03
75%      4.140772e-03
max      9.674020e+02
Name: comments_views_ratio, dtype: float64

In [28]:
df_videos.sort_values("comments_views_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio
66929,Guinness World Records,UCeSRjhfeeqIgr--AcP9qhyg,Cutting the world's longest fingernails #Shorts,GQjeGkuTEg4,2021-08-19 14:32:05+00:00,23090989,560824,48702,16,83.0,11.51542,0.024288,0.002109,1e-06
99697,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3298941,46560,291188,3428.0,70.853544,10959.936877,154.684385,967.401993


## Comments/likes

In [29]:
df_videos["comments_likes_ratio"] = df_videos["comments"]/df_videos["likes"]
df_videos["comments_likes_ratio"] = df_videos["comments_likes_ratio"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [30]:
df_videos["comments_likes_ratio"].describe()

count    150899.000000
mean          0.134334
std           0.213136
min           0.000017
25%           0.049850
50%           0.081968
75%           0.145684
max          23.500000
Name: comments_likes_ratio, dtype: float64

In [31]:
df_videos.sort_values("comments_likes_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio
26740,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,468951,57902,5446,1,147.0,10.632024,0.123471,0.011613,2e-06,1.7e-05
37538,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Rep. Steny Hoyer: President Trump's Budget Is ...,tvdHfHf-MKo,2017-03-16 12:57:23+00:00,1040,2,1,47,1700.0,2.0,0.001923,0.000962,0.045192,23.5


## Comments/dislikes

In [32]:
df_videos["comments_dislikes_ratio"] = df_videos["comments"]/df_videos["dislikes"]
df_videos["comments_dislikes_ratio"] = df_videos["comments_dislikes_ratio"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [33]:
df_videos["comments_dislikes_ratio"].describe()

count    150899.000000
mean          5.863109
std          11.767920
min           0.000184
25%           1.616405
50%           3.333333
75%           6.600000
max        1455.000000
Name: comments_dislikes_ratio, dtype: float64

In [34]:
df_videos.sort_values("comments_dislikes_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio
26740,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,468951,57902,5446,1,147.0,10.632024,0.123471,0.011613,2e-06,1.7e-05,0.000184
146721,YCImaging,UC-He-TI37gYyrrSy7BWvh1Q,50K Subscriber Giveaway TONIGHT!! | 9PM EST,6tvcXuWllPw,2017-04-23 19:27:06+00:00,5001,1275,1,1455,1661.0,1275.0,0.254949,0.0002,0.290942,1.141176,1455.0


## Mean views per day

In [35]:
df_videos["mean_views_day"] = df_videos["views"]/df_videos["age_days"]
df_videos["mean_views_day"] = df_videos["mean_views_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [36]:
df_videos["mean_views_day"].describe()

count    1.508990e+05
mean     1.063808e+03
std      8.713970e+03
min      7.908816e-03
25%      1.518840e+01
50%      7.681737e+01
75%      4.149035e+02
max      1.437207e+06
Name: mean_views_day, dtype: float64

In [37]:
df_videos.sort_values("mean_views_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day
31799,Business Insider,UCcyq283he07B7_KUX07mmtA,From Neutral To Buy- Oil Sector Upgrades,oi3w93LmW3w,2010-02-01 19:13:42+00:00,34,1,1,1,4299.0,1.0,0.029412,0.029412,0.029412,1.0,1.0,0.007909
100747,OwlKitty,UCpLQXR116cLVUa1LRY8KS4w,Jurassic Park but with a Cat,W85oD8FEF78,2021-10-31 12:17:19+00:00,14372069,790536,5124,23511,10.0,154.28103,0.055005,0.000357,0.001636,0.029741,4.588407,1437206.9


## Mean likes per day

In [38]:
df_videos["mean_likes_day"] = df_videos["likes"]/df_videos["age_days"]
df_videos["mean_likes_day"] = df_videos["mean_likes_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [39]:
df_videos["mean_likes_day"].describe()

count    150899.000000
mean         35.516953
std         382.157568
min           0.000226
25%           0.259012
50%           1.934253
75%          12.265823
max       79053.600000
Name: mean_likes_day, dtype: float64

In [40]:
df_videos.sort_values("mean_likes_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day
31822,Business Insider,UCcyq283he07B7_KUX07mmtA,Thomas Gensemer On E-Mail During The Obama Cam...,NsrimXL-Pq0,2009-10-02 21:42:20+00:00,124,1,2,2,4421.0,0.5,0.008065,0.016129,0.016129,2.0,1.0,0.028048,0.000226
100747,OwlKitty,UCpLQXR116cLVUa1LRY8KS4w,Jurassic Park but with a Cat,W85oD8FEF78,2021-10-31 12:17:19+00:00,14372069,790536,5124,23511,10.0,154.28103,0.055005,0.000357,0.001636,0.029741,4.588407,1437206.9,79053.6


## Mean dislikes per day

In [41]:
df_videos["mean_dislikes_day"] = df_videos["dislikes"]/df_videos["age_days"]
df_videos["mean_dislikes_day"] = df_videos["mean_dislikes_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [42]:
df_videos["mean_dislikes_day"].describe()

count    150899.000000
mean          0.892459
std          15.704292
min           0.000191
25%           0.007187
50%           0.043736
75%           0.265232
max        5248.151648
Name: mean_dislikes_day, dtype: float64

In [43]:
df_videos.sort_values("mean_dislikes_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day
24157,BBC,UCCj956IF62FbT7Gouszaj9w,Zulu Stick Fighting - Last Man Standing - BBC ...,zCAgEHqZGpI,2007-06-30 02:42:37+00:00,27955,53,1,17,5247.0,53.0,0.001896,3.6e-05,0.000608,0.320755,17.0,5.327806,0.010101,0.000191
95263,Netflix,UCWOA1ZGywLbqmigxE4Qlvuw,Cuties | Official Trailer | Netflix,M0O7lLe4SmA,2020-08-12 18:38:39+00:00,17883116,94126,2387909,205184,455.0,0.039418,0.005263,0.133529,0.011474,2.179887,0.085926,39303.551648,206.87033,5248.151648


## Mean comments per day

In [44]:
df_videos["mean_comments_day"] = df_videos["comments"]/df_videos["age_days"]
df_videos["mean_comments_day"] = df_videos["mean_comments_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [45]:
df_videos["mean_comments_day"].describe()

count    150899.000000
mean          2.286351
std          21.051067
min           0.000192
25%           0.026692
50%           0.168589
75%           0.873976
max        3044.800000
Name: mean_comments_day, dtype: float64

In [46]:
df_videos.sort_values("mean_comments_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day
24128,BBC,UCCj956IF62FbT7Gouszaj9w,Adil Ray with Heera - London Mela 2007 - BBC A...,aR8v1FrPBE0,2007-08-12 15:56:57+00:00,6305,7,1,1,5204.0,7.0,0.00111,0.000159,0.000159,0.142857,1.0,1.211568,0.001345,0.000192,0.000192
85790,Marques Brownlee,UCBJycsmduvYEL83R_U4JriQ,M1 Max MacBook Pro Review: Truly Next Level!,rr2XfL_df3o,2021-11-05 01:40:24+00:00,4062772,163818,2700,15224,5.0,60.673333,0.040322,0.000665,0.003747,0.092932,5.638519,812554.4,32763.6,540.0,3044.8


## Mean likes/dislikes per day

In [47]:
df_videos["mean_likes_dislikes_ratio_day"] = df_videos["likes_dislikes_ratio"]/df_videos["age_days"]
df_videos["mean_likes_dislikes_ratio_day"] = df_videos["mean_likes_dislikes_ratio_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [48]:
df_videos["mean_likes_dislikes_ratio_day"].describe()

count    150899.000000
mean          0.212357
std           1.894871
min           0.000014
25%           0.010965
50%           0.030873
75%           0.090780
max         369.500000
Name: mean_likes_dislikes_ratio_day, dtype: float64

In [49]:
df_videos.sort_values("mean_likes_dislikes_ratio_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day
36504,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Fired Google Employee Behind Anti-Diversity Me...,WyIW7hgmryw,2017-08-08 17:16:32+00:00,1385,1,46,3,1555.0,0.021739,0.000722,0.033213,0.002166,3.0,0.065217,0.890675,0.000643,0.029582,0.001929,1.4e-05
100354,O Pimentinha,UCQ3JxE-NOyZaJ3m3qIkZbhA,O Homi garantiu o almoço do domingo!,7WWivs_pcTk,2021-11-08 12:23:27+00:00,9244,1478,2,155,2.0,739.0,0.159887,0.000216,0.016768,0.104871,77.5,4622.0,739.0,1.0,77.5,369.5


## Mean likes/views per day

In [50]:
df_videos["mean_likes_views_ratio_day"] = df_videos["likes_views_ratio"]/df_videos["age_days"]
df_videos["mean_likes_views_ratio_day"] = df_videos["mean_likes_views_ratio_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [51]:
df_videos["mean_likes_views_ratio_day"].describe()

count    1.508990e+05
mean     1.773917e-04
std      8.322635e-03
min      1.947836e-08
25%      5.403867e-06
50%      1.552439e-05
75%      6.049856e-05
max      3.197181e+00
Name: mean_likes_views_ratio_day, dtype: float64

In [52]:
df_videos.sort_values("mean_likes_views_ratio_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day
98832,Netflix,UCWOA1ZGywLbqmigxE4Qlvuw,"Discover, Relive and Watch TV from the Beginni...",iwrK1Mqao34,2013-09-05 00:38:00+00:00,790360,46,4,11,2988.0,11.5,5.8e-05,5e-06,1.4e-05,0.23913,2.75,264.511379,0.015395,0.001339,0.003681,0.003849,0.0
99697,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3298941,46560,291188,3428.0,70.853544,10959.936877,154.684385,967.401993,0.088267,6.254038,0.087806,962.351517,13.582264,84.943991,0.020669,3.197181


## Mean dislikes/views per day

In [53]:
df_videos["mean_dislikes_views_ratio_day"] = df_videos["dislikes_views_ratio"]/df_videos["age_days"]
df_videos["mean_dislikes_views_ratio_day"] = df_videos["mean_dislikes_views_ratio_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [54]:
df_videos["mean_dislikes_views_ratio_day"].describe()

count    1.508990e+05
mean     3.651126e-06
std      1.280021e-04
min      7.371508e-10
25%      1.605642e-07
50%      4.761955e-07
75%      1.421126e-06
max      4.512380e-02
Name: mean_dislikes_views_ratio_day, dtype: float64

In [55]:
df_videos.sort_values("mean_dislikes_views_ratio_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day
55105,Disney Parks,UC1xwwLwm6WSMbUn_Tp597hQ,Europe Family Vacations | Adventures by Disney,BAk5kgg70uo,2019-01-25 14:48:12+00:00,1329975,103,1,1,1020.0,103.0,7.7e-05,1e-06,1e-06,0.009709,1.0,1303.897059,0.10098,0.00098,0.00098,0.10098,0.0,0.0
99697,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3298941,46560,291188,3428.0,70.853544,10959.936877,154.684385,967.401993,0.088267,6.254038,0.087806,962.351517,13.582264,84.943991,0.020669,3.197181,0.045124


## Mean comments/views per day

In [56]:
df_videos["mean_comments_views_ratio_day"] = df_videos["comments_views_ratio"]/df_videos["age_days"]
df_videos["mean_comments_views_ratio_day"] = df_videos["mean_comments_views_ratio_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [57]:
df_videos["mean_comments_views_ratio_day"].describe()

count    1.508990e+05
mean     1.438528e-05
std      7.353772e-04
min      3.889877e-10
25%      4.976829e-07
50%      1.505283e-06
75%      4.839349e-06
max      2.822059e-01
Name: mean_comments_views_ratio_day, dtype: float64

In [58]:
df_videos.sort_values("mean_comments_views_ratio_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day
56012,Disney Parks,UC1xwwLwm6WSMbUn_Tp597hQ,"Tubestone Curl | Aulani, A Disney Resort & Spa",ZZUZ26lJQ6A,2014-07-31 14:43:45+00:00,2900461,206,18,3,2659.0,11.444444,7.1e-05,6e-06,1e-06,0.014563,0.166667,1090.808951,0.077473,0.006769,0.001128,0.004304,0.0,0.0,0.0
99697,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3298941,46560,291188,3428.0,70.853544,10959.936877,154.684385,967.401993,0.088267,6.254038,0.087806,962.351517,13.582264,84.943991,0.020669,3.197181,0.045124,0.282206


## Mean comments/likes per day

In [59]:
df_videos["mean_comments_likes_ratio_day"] = df_videos["comments_likes_ratio"]/df_videos["age_days"]
df_videos["mean_comments_likes_ratio_day"] = df_videos["mean_comments_likes_ratio_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [60]:
df_videos["mean_comments_likes_ratio_day"].describe()

count    1.508990e+05
mean     2.527357e-04
std      2.314799e-03
min      1.174868e-07
25%      3.330829e-05
50%      6.531864e-05
75%      1.456903e-04
max      3.529412e-01
Name: mean_comments_likes_ratio_day, dtype: float64

In [61]:
df_videos.sort_values("mean_comments_likes_ratio_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day,mean_comments_likes_ratio_day
26740,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,468951,57902,5446,1,147.0,10.632024,0.123471,0.011613,2e-06,1.7e-05,0.000184,3190.142857,393.891156,37.047619,0.006803,0.072327,0.00084,7.9e-05,0.0,0.0
11108,BBC,UCCj956IF62FbT7Gouszaj9w,Another Yorkshire cricketer alleges racism @BB...,plId600n_SY,2021-11-06 06:37:34+00:00,55967,442,435,624,4.0,1.016092,0.007898,0.007772,0.011149,1.411765,1.434483,13991.75,110.5,108.75,156.0,0.254023,0.001974,0.001943,0.002787,0.352941


## Mean comments/dislikes per day

In [62]:
df_videos["mean_comments_dislikes_ratio_day"] = df_videos["comments_dislikes_ratio"]/df_videos["age_days"]
df_videos["mean_comments_dislikes_ratio_day"] = df_videos["mean_comments_dislikes_ratio_day"].replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [63]:
df_videos["mean_comments_dislikes_ratio_day"].describe()

count    150899.000000
mean          0.015528
std           0.164234
min           0.000001
25%           0.000991
50%           0.002527
75%           0.006848
max          38.750000
Name: mean_comments_dislikes_ratio_day, dtype: float64

In [64]:
df_videos.sort_values("mean_comments_dislikes_ratio_day").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day,mean_comments_likes_ratio_day,mean_comments_dislikes_ratio_day
26740,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,468951,57902,5446,1,147.0,10.632024,0.123471,0.011613,2e-06,1.7e-05,0.000184,3190.142857,393.891156,37.047619,0.006803,0.072327,0.00084,7.9e-05,0.0,0.0,1e-06
100354,O Pimentinha,UCQ3JxE-NOyZaJ3m3qIkZbhA,O Homi garantiu o almoço do domingo!,7WWivs_pcTk,2021-11-08 12:23:27+00:00,9244,1478,2,155,2.0,739.0,0.159887,0.000216,0.016768,0.104871,77.5,4622.0,739.0,1.0,77.5,369.5,0.079944,0.000108,0.008384,0.052436,38.75


In [65]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day,mean_comments_likes_ratio_day,mean_comments_dislikes_ratio_day
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23 18:11:23+00:00,537453,30720,113,785,18.0,271.858407,0.057158,0.00021,0.001461,0.025553,6.946903,29858.5,1706.666667,6.277778,43.611111,15.103245,0.003175,0.000012,0.000081,0.00142,0.385939
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15 16:41:50+00:00,621248,27411,156,1248,26.0,175.711538,0.044122,0.000251,0.002009,0.045529,8.0,23894.153846,1054.269231,6.0,48.0,6.758136,0.001697,0.00001,0.000077,0.001751,0.307692
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07 02:19:39+00:00,1267428,63787,304,2864,34.0,209.825658,0.050328,0.00024,0.00226,0.044899,9.421053,37277.294118,1876.088235,8.941176,84.235294,6.171343,0.00148,0.000007,0.000066,0.001321,0.27709
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16 15:37:16+00:00,616067,29415,216,1721,117.0,136.180556,0.047746,0.000351,0.002794,0.058508,7.967593,5265.529915,251.410256,1.846154,14.709402,1.163936,0.000408,0.000003,0.000024,0.0005,0.068099
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07 19:01:16+00:00,432555,17245,145,1150,186.0,118.931034,0.039868,0.000335,0.002659,0.066686,7.931034,2325.564516,92.715054,0.77957,6.182796,0.639414,0.000214,0.000002,0.000014,0.000359,0.04264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150894,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Art Hour,C6hpMftAIs0,2009-06-13 18:32:12+00:00,172867,2165,31,137,4533.0,69.83871,0.012524,0.000179,0.000793,0.063279,4.419355,38.135231,0.477609,0.006839,0.030223,0.015407,0.000003,0.0,0.0,0.000014,0.000975
150895,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06 22:16:56+00:00,90973,1804,23,75,4539.0,78.434783,0.01983,0.000253,0.000824,0.041574,3.26087,20.04252,0.397444,0.005067,0.016523,0.01728,0.000004,0.0,0.0,0.000009,0.000718
150896,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30 00:13:46+00:00,103287,1575,47,18,4547.0,33.510638,0.015249,0.000455,0.000174,0.011429,0.382979,22.715417,0.346382,0.010336,0.003959,0.00737,0.000003,0.0,0.0,0.000003,0.000084
150897,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17 00:50:31+00:00,78321,1628,20,87,4560.0,81.4,0.020786,0.000255,0.001111,0.05344,4.35,17.175658,0.357018,0.004386,0.019079,0.017851,0.000005,0.0,0.0,0.000012,0.000954


# Save the dataset

In [66]:
df_videos.to_csv("data/videos_data.csv",
                 sep = ";",
                 index = False)