# Imports and options

In [1]:
import os
from datetime import timedelta

import numpy as np
import pandas as pd
from googleapiclient.discovery import build

# Display options: widen the printed output and raise the row/column limits
# used when a DataFrame is displayed.
for option_name, option_value in {
    "display.width": 1200,
    "display.max_columns": 300,
    "display.max_rows": 300,
}.items():
    pd.set_option(option_name, option_value)

# YouTube Data API

In [2]:
# Client for the "youtube" v3 service. The API key is read from the
# YOUTUBE_API_KEY environment variable so that it never has to be written
# (and accidentally shared) in the notebook; an unset variable falls back to
# an empty key.
yt = build(
    serviceName = "youtube",
    version = "v3",
    developerKey = os.environ.get("YOUTUBE_API_KEY", "")
)

# Channels sample

In [4]:
# Note: the API only returns up to 20,000 videos per channel.
# Channel ids (the last part of content = "https://www.youtube.com/channel/...") mapped to the channel titles:
channels = {
    "UCYO_jab_esuFRV4b17AJtAw": "3Blue1Brown",
    "UCNKcMBYP_-18FLgk4BYGtfw": "AWE me",
    "UCiDJtJKMICpb9B1qf7qjEOA": "Adam Savage’s Tested",
    "UC32z4mtyiq02Ge-XWy78ibw": "Aero Por Trás da Aviação",
    "UCWizIdwZdmr43zfxlCktmNw": "Alec Steele",
    "UCvFGf8HZGZWFzpcDCqb3Lhw": "All Things Secured",
    "UCaRMivfyupj3ucUyJbZbCNg": "Anglo-Link",
    "UCwg6_F2hDHYrqbNSGjmar4w": "Animalogic",
    "UCi5iiEyLwSLvlqnMi02u5gQ": "Ask a Mortician",
    "UCGYBY4KaFYmkEKAGLL07BXw": "AstroTubers",
    "UC-9b7aDP6ZN0coj9-xFnrtw": "Astrum",
    "UCSTlOTcyUmzvhQi6F8lFi5w": "Atila Iamarino",
    "UCLXl1V6n82Dyg1VhVgSL0nw": "Aviões e Músicas com Lito Sousa",
    "UCGlmclFVnJCFQ_VO4kTgTig": "Azusa Barbie",
    "UCCj956IF62FbT7Gouszaj9w": "BBC",
    "UCvZe6ZCbF9xgbbbdkiodPKQ": "Baumgartner Restoration",
    "UCxcnsr1R5Ge_fbTu5ajt8DQ": "Bob Ross",
    "UCQRPDZMSwXFEDS67uc7kIdg": "Buenas Ideias",
    "UCcyq283he07B7_KUX07mmtA": "Business Insider",
    "UCCYX4s1DCn51Hpf1peHS30Q": "Cinema Therapy",
    "UCvJJ_dzjViJCoLf5uKUTwoA": "CNBC",
    "UCWq1xltHB2fDe6YkYoOrryg": "Canal do Schwarza",
    "UCEOXxzW2vU0P-0THehuIIeg": "Captain Disillusion",
    "UCwnKziETDbHJtx78nIkfYug": "CaspianReport",
    "UCR1-7g_y2YcYlh1W9y_1LUg": "Chaves Estranho",
    "UC04KsGq3npibMCE9Td3mVDg": "Cheddar",
    "UC6HDXr-sNPnWLF_Q-y3KduA": "Chess Talk",
    "UCEwIUtFBhaI2L2PuKv0KL2g": "Classic Mr Bean",
    "UCvmijL-eepDVHYSJHDY3d6w": "Cole and Marmalade",
    "UC9-y-6csu5WGm29I7JiwpnA": "Computerphile",
    "UCyAMV63OY0DRbKk6gGv6wJg": "Cradle Of Filth",
    "UCe_vXdMrHHseZ_esYUskSBw": "CrazyRussianHacker",
    "UC726J5A0LLFRxQ0SZqr2mYQ": "Curious Droid",
    "UCFuIUoyHB12qpYa8Jpxoxow": "Código Fonte TV",
    "UCW39zufHfsuGgpLviKh297Q": "DW Documentary",
    "UCd-swDW3HCs4LTZhq8Cf7Fg": "DW Euromaxx",
    "UCQIcXQ2n0sa-7CD0NtqnrrA": "Darko Audio",
    "UCa37IMrH8BGS_pO6CKbTL-A": "Dinosaurs",
    "UCEf5U1dB5a2e2S-XUlnhxSA": "Diolinux",
    "UC_5niPa-d35gg88HaS7RrIw": "Disney",
    "UC1xwwLwm6WSMbUn_Tp597hQ": "Disney Parks",
    "UCKUNiU0D2u3yxNZZxBqa6bQ": "DutchPilotGirl",
    "UCVRrGAcUc7cblUzOhI1KfFg": "Earthling Ed",
    "UCIBaDdAbGlFDeS33shmlD0A": "European Space Agency, ESA",
    "UC6uKrU_WqJ1R2HMTY3LIx5Q": "Everyday Astronaut",
    "UCZ6JHFBaDUJ9wfo41HSII_w": "Fabio Chaves",
    "UC6zbvGt_jJVgw7-MWkVUC5A": "Fala Vegan",
    "UCxGiUjep8KiihvHldRousPA": "Fancy Fairy Wings & Things",
    "UCweCc7bSMX5J4jEH7HFImng": "GMHikaru",
    "UCuTaETsuCOkJ0H_GAztWt0Q": "Global Cycling Network",
    "UCeSRjhfeeqIgr--AcP9qhyg": "Guinness World Records",
    "UCLAcytNR3gdw44yzoSiKluA": "Hello Korea",
    "UC26YLK0OEbLB3TCYxGh8xVQ": "Huygens Optics",
    "UCcMDMoNu66_1Hwi5-MeiQgw": "Hydraulic Press Channel",
    "UC3uAjWoLZ4bSi6qI9SjALxA": "Imperial War Museums",
    "UCHJuQZuzapBh-CuhRYxIZrg": "Insider",
    "UCkAbeyJZqp6JFYfKy-98NjA": "Integrando Conhecimento",
    "UCJ8bjM5yQSkOP-_99n1zKkw": "Intensivo Pedagógico",
    "UC8p-P1qJ312lZD30GoVCH7Q": "It's Black Friday",
    "UCZ9l_6_f0PWRYXN5Y7Lcl2A": "Jacques Slade",
    "UCbbQalJ4OaC0oQ0AqRaOJ9g": "Jay Foreman",
    "UCkP2CvRubyU0MTZv_Qo712g": "Jessica in the Kitchen - Vegan Recipes",
    "UC-2YHgc363EdcusLIBbgxzg": "Joe Scott",
    "UCBbnbBWJtwsf0jLGUwX5Q3g": "Journey to the Microcosmos",
    "UCLQWhXC5aOZ_tQNloBg40BA": "Kirsten & Joerg",
    "UCuCuEKq1xuRA0dFQj1qg9-Q": "Knowledgia",
    "UCsXVk37bltHxD1rDPwtNM8Q": "Kurzgesagt – In a Nutshell",
    "UCNqNkZ7kKfqimqHkgbWMNYA": "Launch Pad Astronomy",
    "UCoNTMWgGuXtGPLv9UeJZwBw": "Living Big In A Tiny House",
    "UCm9K6rby98W8JigLoZOh6FQ": "LockPickingLawyer",
    "UCAcZNVPEQ8K7JhMDwOcf8Pg": "Loira na Estrada",
    "UCt2WVZXVrHaFgZ432s33fLw": "MAKE UP FOR EVER",
    "UCbK5Us4E-HsXw6fQ1PYUuog": "MW Informática",
    "UCXgxNzAgZ1GExhTW4X1mUrg": "Marc Rebillet",
    "UCBlXovStrlQkVA2xJEROUNg": "Marcel Vos",
    "UCr22xikWUK2yUW4YxOKXclQ": "MathTheBeautiful",
    "UC6nZXhOz_gHr4mlb5UNl9ug": "Matt Jordan",
    "UCjZfdrnSOrh4iFL2GCNvWVw": "Maven of the Eventide",
    "UCQwFuQLnLocj5F7ZcmcuWYQ": "MetaBallStudios",
    "UCk5BcU1rOy6hepflk7_q_Pw": "Meteoro Brasil",
    "UCqONzeACDBaF6FfKjh7ndAQ": "Microsoft Flight Simulator",
    "UChHS4NI6U4XgCuYgsrygVCA": "Mintfaery",
    "UC1ZBQ-F-yktYD4m5AzM6pww": "Mustard",
    "UCLA_DiR1FfKNvjuUpBHmylQ": "NASA",
    "UCSuHzQ3GrHSzoBbwrIq3LLA": "Naomi Brockwell",
    "UCPZvqkyXoYDlzBClxpxVIzg": "Naomi Farr",
    "UCggHsHce2n3vvbJf_8YKrMA": "Nerdforge",
    "UCWOA1ZGywLbqmigxE4Qlvuw": "Netflix",
    "UCD57tGnYPW1twCohgIVkJpw": "Nox et Lux",
    "UCoxcjq-8xIDTYp3uz647V5A": "Numberphile",
    "UCroDJPcFCf6DBmHns6Xeb8g": "Nyma Tang",
    "UCjivwB8MrrGCMlIuoSdkrQg": "Nátaly Neri",
    "UCQ3JxE-NOyZaJ3m3qIkZbhA": "O Pimentinha",
    "UCK1XzxcKXB_v_dcw_tflC1A": "OBF",
    "UCtb2f3j_sB4Z4Ik2YJtbzsg": "OneikaTraveller",
    "UCwK-svlNEASA7g_smu1_d8g": "OnlyConnect Fan",
    "UCpLQXR116cLVUa1LRY8KS4w": "OwlKitty",
    "UCzR-rom72PHN9Zg7RML9EbA": "PBS Eons",
    "UC7_gcs09iThXybpVgjHZ_7g": "PBS Space Time",
    "UCddYq41_tZ1FnLlguLT6-Ow": "Parafernalha",
    "UCOVkkVaUP0Xop6IIjyk86QA": "Perception",
    "UCtESv1e7ntJaLJYKIO1FoYw": "Periodic Videos",
    "UC7DdEm33SyaTDtWYGO2CwdA": "Physics Girl",
    "UCQwHYU3ZbSLG6LOVwtvuozg": "Planeta Aves",
    "UCEWHPFNilsT0IfQfutVzsag": "Porta dos Fundos",
    "UCNJe8uQhM2G4jJFRWiM89Wg": "Potato Jet",
    "UClo-U5gvXPRy-4VqWEkkRDg": "Primitive Building",
    "UCKKJpBveT8vWVNfLQ-MvZMg": "Prof. André Azevedo da Fonseca",
    "UCXOzDw_X92bNqSCqw-NojiA": "Professor Julio Borbo",
    "UCwSxSJqGpSRpEsq5-YUbM8g": "Professor Noslen",
    "UCNzul4dnciIlDg8BAcn5-cQ": "Prowalk Tours",
    "UCvn_XCl_mgQmt3sD753zdJA": "Rachel's English",
    "UCR1IuLEqb6UEA_zQ81kwXfg": "Real Engineering",
    "UCP5tjEmvPItGyLhmjdwP7Ww": "RealLifeLore",
    "UCFJxE0l3cVYU4kHzi4qVEkw": "Rebecca Watson",
    "UCliwOcLaEfqkGJ7xq2ipq4w": "Reverse Engineering",
    "UC9Ep0Y4T5rvUuIfjKN1wqTw": "Riddle",
    "UCovtFObhY9NypXcyHxAS7-Q": "Roberto Blake",
    "UCZYTClx2T1of7BRZ86-8fow": "SciShow",
    "UCxzC4EngIsMrPmbm6Nxvb-A": "Scott Manley",
    "UC3KEoMzNz8eYnwBC34RaKCQ": "Simone Giertz",
    "UCT3wwJBh8fwePiNC63NlzCA": "Simple Flying",
    "UC6107grRI4m0o2-emgoDnAA": "SmarterEveryDay",
    "UC_Fk7hHbl7vv_7K8tYqJd5A": "SpaceToday",
    "UCtI0Hodo5o5dUb67FeUjDeA": "SpaceX",
    "UC00uG71I6iPyx15EX6i_GDA": "Star Wars Comics",
    "UCZ9jWH_8tJ-Nmaj8dSQdEYA": "Stefan Milo",
    "UCpQLC-evmUAon9BBpcW4kYg": "Steve Cutts",
    "UC6u6uY4VbvuNtU0BU7F9olw": "Studson Studio",
    "UCRXRbi80k0_vcIfgpOSerTg": "Subject Zero Science",
    "UCAuUUnT6oDeKwE6v1NGQxug": "TED",
    "UC7U72Ze-y8s4hvAmRvhXPgQ": "TecLab",
    "UCZdJE8KpuFm6NRafHTEIC-g": "Tempero Drag",
    "UCnUXq8mGmoHt0e6ItuTs10w": "That Pedal Show",
    "UC8uT9cgJorJPWu7ITLGo9Ww": "The 8-Bit Guy",
    "UC6n8I1UDTKP1IWjQMg6_TwA": "The B1M",
    "UCvpQ-l09fCVxJd3urZbxzHg": "The British Museum",
    "UCsQXQMAZTygXddj_I4Ud9xA": "The Green Witch",
    "UC42VsoDtra5hMiXZSsD6eGg": "The Modern Rogue",
    "UCUK0HBIBWgM2c4vsPhkYY4w": "The Slow Mo Guys",
    "UC2LVhJH_9cT2XKp0VAfsKOQ": "The Tim Traveller",
    "UCBa659QWEk1AI4Tg--mrJ2A": "Tom Scott",
    "UCN3aYbtQ7yCqk9DM56B0kEw": "Tomorrow's Build",
    "UClVbhSLxwws-KSsPKz135bw": "TopMovieClips",
    "UC29ju8bIPH5as8OGnQzwJyA": "Traversy Media",
    "UCgtq_tKnFFQyJ-7j0yr9utg": "Tyta Montrase",
    "UCSxFcMDGmwgDlSP9qmYVx1w": "TÁ NA MESA VEGG",
    "UCVEVuanoMK9tGclfWLghaKw": "Tá Querida",
    "UCv3fc4d_IhCK2g5Y_kjpvaw": "UM BOTÂNICO NO APARTAMENTO",
    "UCTlLFaGX4q2otO0ZVxLW0VQ": "Uma história a mais",
    "UC9fSZHEh6XsRpX-xJc6lT3A": "UrAvgConsumer",
    "UCbYFhcKSE2mWYB0yD_Qr_8A": "Urban Gardening",
    "UCc9M8_pF78tYblW8nMHXUqg": "VIEWGANAS",
    "UCp3iXxis9n_E_GfbE-_ksFw": "VeganBlackMetalChef",
    "UCHnyfMqiRRG1u-2MsSQLbXA": "Veritasium",
    "UCYulrOk4EbqL833oETw0bWQ": "Wanderlust Travel Videos",
    "UCs9MJ-TmN0-CLY2eNBHAltw": "Wanna Walk",
    "UCmVa-cbCpkd5Cd9Fr_4tCWg": "Your Dinosaurs Are Wrong",
    "UC9Ntx-EF3LzKY1nQ5rTUP2g": "cyriak",
    "UCtYKe7-XbaDjpUwcU5x0bLg": "neo",
    "UCVpankR4HtoAVtYnFDUieYA": "zefrank1"
 }

In [5]:
len(channels)

161

# Videos data

In [6]:
# Loop in the channels:
STATS_BATCH_SIZE = 40  # number of video ids sent per videos().list request


def fetch_uploads(client, upload_id):
    """Return all playlist items (part "snippet") of the playlist `upload_id`.

    Pages of up to 50 items are requested one after the other until a
    response comes back without a "nextPageToken".
    """
    items = []
    page_token = None
    while True:
        page = client.playlistItems().list(
            playlistId = upload_id,
            maxResults = 50,
            part = "snippet",
            pageToken = page_token
        ).execute()
        items += page["items"]
        page_token = page.get("nextPageToken")
        if page_token is None:
            return items


def fetch_statistics(client, video_ids):
    """Return a dict {video id: "statistics" dict} for `video_ids`.

    The ids are sent in batches of STATS_BATCH_SIZE. Ids for which the
    response contains no item are simply absent from the returned dict.
    """
    stats_by_id = {}
    for start in range(0, len(video_ids), STATS_BATCH_SIZE):
        res = client.videos().list(
            id = ",".join(video_ids[start:start + STATS_BATCH_SIZE]),
            part = "statistics"
        ).execute()
        for item in res["items"]:
            stats_by_id[item["id"]] = item.get("statistics", {})
    return stats_by_id


# Empty frame that fixes the column names (and their order) of the result:
df_videos = pd.DataFrame(
    {
        "channel_title": pd.Series([], dtype = "str"),
        "channel_id": pd.Series([], dtype = "str"),
        "video_title": pd.Series([], dtype = "str"),
        "video_id": pd.Series([], dtype = "str"),
        "video_upload_date": pd.Series([], dtype = "str"),
        "views": pd.Series([], dtype = "int"),
        "likes": pd.Series([], dtype = "int"),
        "dislikes": pd.Series([], dtype = "int"),
        "comments": pd.Series([], dtype = "int")
    }
)

# One frame per channel, concatenated once at the end (concatenating inside
# the loop copies all previously collected rows at every iteration).
channel_frames = []
for channel_id, channel_title in channels.items():

    print(channel_title)

    ### Extract the data from all the videos of the channel

    # Content details:
    content = yt.channels().list(
        id = channel_id,
        part = "contentDetails"
    ).execute()
    if not content.get("items"):
        # Nothing came back for this id: report it and keep going instead of
        # losing the whole run on an IndexError/KeyError.
        print("  -> no channel data returned for " + channel_id + ", skipped")
        continue

    # Upload Id:
    upload_id = content["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    # All the channel's videos:
    all_videos = fetch_uploads(yt, upload_id)

    # Videos statistics:
    videos_ids = [video["snippet"]["resourceId"]["videoId"] for video in all_videos]
    stats_by_id = fetch_statistics(yt, videos_ids)

    # Put it in a dataframe. Every row is built from ONE playlist item and the
    # statistics looked up by that item's video id, so titles, dates and
    # counters always belong to the same video; a video without statistics
    # gets NaN counters instead of shifting the rows that follow it.
    rows = []
    for video in all_videos:
        snippet = video["snippet"]
        video_id = snippet["resourceId"]["videoId"]
        stats_i = stats_by_id.get(video_id, {})
        rows.append(
            {
                "channel_title": channel_title,
                "channel_id": channel_id,
                "video_title": snippet["title"],
                "video_id": video_id,
                "video_upload_date": snippet["publishedAt"],
                "views": stats_i.get("viewCount", np.nan),
                "likes": stats_i.get("likeCount", np.nan),
                "dislikes": stats_i.get("dislikeCount", np.nan),
                "comments": stats_i.get("commentCount", np.nan)
            }
        )
    channel_frames.append(pd.DataFrame(rows, columns = df_videos.columns))

# The index is not reset here: as before, it restarts at 0 for every channel.
if channel_frames:
    df_videos = pd.concat(channel_frames)

3Blue1Brown
AWE me
Adam Savage’s Tested
Aero Por Trás da Aviação
Alec Steele
All Things Secured
Anglo-Link
Animalogic
Ask a Mortician
AstroTubers
Astrum
Atila Iamarino
Aviões e Músicas com Lito Sousa
Azusa Barbie
BBC
Baumgartner Restoration
Bob Ross
Buenas Ideias
Business Insider
Cinema Therapy
CNBC
Canal do Schwarza
Captain Disillusion
CaspianReport
Chaves Estranho
Cheddar
Chess Talk
Classic Mr Bean
Cole and Marmalade
Computerphile
Cradle Of Filth
CrazyRussianHacker
Curious Droid
Código Fonte TV
DW Documentary
DW Euromaxx
Darko Audio
Dinosaurs
Diolinux
Disney
Disney Parks
DutchPilotGirl
Earthling Ed
European Space Agency, ESA
Everyday Astronaut
Fabio Chaves
Fala Vegan
Fancy Fairy Wings & Things
GMHikaru
Global Cycling Network
Guinness World Records
Hello Korea
Huygens Optics
Hydraulic Press Channel
Imperial War Museums
Insider
Integrando Conhecimento
Intensivo Pedagógico
It's Black Friday
Jacques Slade
Jay Foreman
Jessica in the Kitchen - Vegan Recipes
Joe Scott
Journey to the Microco

In [7]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23T18:11:23Z,507643,29685,108,768
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15T16:41:50Z,603492,26984,150,1237
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07T02:19:39Z,1239697,63031,296,2852
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16T15:37:16Z,613816,29369,215,1724
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07T19:01:16Z,428473,17163,145,1150
...,...,...,...,...,...,...,...,...,...
191,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06T22:16:56Z,90902,1801,23,75
192,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30T00:13:46Z,103234,1571,47,18
193,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Optical Illusion,8mS5RK0Yo6w,2009-05-23T22:36:58Z,1072373,9861,1201,
194,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17T00:50:31Z,78272,1625,20,87


# Data cleaning

## Change the data types

In [8]:
def toint64(x):
    """Convert a Series of digit strings to the nullable "Int64" dtype.

    The first run of digits found in each value is kept; values without any
    digit (including missing values) become <NA>.

    Parameters
    ----------
    x : pd.Series
        Series holding strings (the ".str" accessor is used on it).

    Returns
    -------
    pd.Series
        Series of dtype "Int64".
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    # The cast goes through float so that missing values (NaN) are accepted
    # before the conversion to the nullable integer dtype.
    return x.str.extract(r"(\d+)", expand = False).astype("float").astype("Int64")

# The four counters go through toint64(); the upload date is parsed into a
# datetime column.
for count_column in ["views", "likes", "dislikes", "comments"]:
    df_videos[count_column] = toint64(df_videos[count_column])
df_videos["video_upload_date"] = pd.to_datetime(df_videos["video_upload_date"])

In [9]:
df_videos.dtypes

channel_title                     object
channel_id                        object
video_title                       object
video_id                          object
video_upload_date    datetime64[ns, UTC]
views                              Int64
likes                              Int64
dislikes                           Int64
comments                           Int64
dtype: object

## Remove videos with NaN or 0 in any variable

In [10]:
# First drop every row holding a NaN, then keep only the rows whose four
# counters are all strictly positive, and renumber the index from 0.
df_videos = df_videos.dropna(axis = 0)
df_videos = df_videos.loc[
    df_videos["views"].gt(0)
    & df_videos["likes"].gt(0)
    & df_videos["dislikes"].gt(0)
    & df_videos["comments"].gt(0)
].reset_index(drop = True)

In [11]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23 18:11:23+00:00,507643,29685,108,768
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15 16:41:50+00:00,603492,26984,150,1237
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07 02:19:39+00:00,1239697,63031,296,2852
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16 15:37:16+00:00,613816,29369,215,1724
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07 19:01:16+00:00,428473,17163,145,1150
...,...,...,...,...,...,...,...,...,...
124449,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Art Hour,C6hpMftAIs0,2009-06-13 18:32:12+00:00,172813,2163,31,136
124450,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06 22:16:56+00:00,90902,1801,23,75
124451,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30 00:13:46+00:00,103234,1571,47,18
124452,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17 00:50:31+00:00,78272,1625,20,87


# New features

## Age in days (until the last upload date)

In [12]:
# Filter the wrong upload dates (coming from the API as being the request date) considering a margin of 2 days:
# the age of a video is the number of whole days between its upload and the
# most recent upload in the sample; videos less than 2 days older than that
# reference get NaN and are removed by the dropna() below.
last_date = df_videos["video_upload_date"].max()
min_age = timedelta(days = 2)
elapsed = last_date - df_videos["video_upload_date"]
# Vectorised instead of a row-by-row loop: keep the day count where the margin
# is respected, NaN elsewhere (the column is float because it holds NaN).
age = elapsed.dt.days.astype("float64").where(elapsed >= min_age)
df_videos["age_days"] = age

df_videos = df_videos.dropna()
df_videos = df_videos.reset_index(drop = True)

In [13]:
df_videos["age_days"].describe()

count    124382.000000
mean       1646.102266
std        1137.286025
min           2.000000
25%         757.000000
50%        1487.000000
75%        2236.750000
max        5744.000000
Name: age_days, dtype: float64

In [14]:
# Show the two extremes: the rows with the smallest and the largest "age_days".
df_videos.sort_values("age_days").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days
106476,SpaceToday,UC_Fk7hHbl7vv_7K8tYqJd5A,TUDO SOBRE O GRANDIOSO ESPELHO DO TELESCÓPIO J...,zW1eJvIzUS8,2021-11-02 22:12:45+00:00,38372,5711,44,192,2.0
72154,Jacques Slade,UCZ9l_6_f0PWRYXN5Y7Lcl2A,Get Your Hands Up,FXLXBfmlaOk,2006-02-12 20:31:24+00:00,28240,207,9,108,5744.0


## Likes/dislikes

In [15]:
# Likes per dislike. Infinite ratios become NaN so that the dropna() below
# removes those rows (together with rows holding NaN in any other column);
# the index is then renumbered from 0.
df_videos["likes_dislikes_ratio"] = (
    df_videos["likes"] / df_videos["dislikes"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [16]:
df_videos["likes_dislikes_ratio"].describe()

count    124382.000000
mean         63.061805
std          79.354553
min           0.021739
25%          17.447467
50%          39.826922
75%          79.351905
max        2639.000000
Name: likes_dislikes_ratio, dtype: float64

In [17]:
# Show the two extremes: the rows with the smallest and the largest "likes_dislikes_ratio".
df_videos.sort_values("likes_dislikes_ratio").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio
32956,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Fired Google Employee Behind Anti-Diversity Me...,WyIW7hgmryw,2017-08-08 17:16:32+00:00,1385,1,46,3,1549.0,0.021739
120946,TÁ NA MESA VEGG,UCSxFcMDGmwgDlSP9qmYVx1w,"COMO ARMAZENAR FOLHAS, ORGANIZAR FEIRA + SAL T...",GGqRCQmtx-U,2021-05-21 01:16:50+00:00,15420,2639,1,261,168.0,2639.0


## Likes/views

In [18]:
# Likes per view. Infinite ratios become NaN so that the dropna() below
# removes those rows (together with rows holding NaN in any other column);
# the index is then renumbered from 0.
df_videos["likes_views_ratio"] = (
    df_videos["likes"] / df_videos["views"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [19]:
df_videos["likes_views_ratio"].describe()

count    124382.000000
mean          0.126377
std          31.061673
min           0.000058
25%           0.009411
50%           0.019332
75%           0.041366
max       10954.803987
Name: likes_views_ratio, dtype: float64

In [20]:
# Show the two extremes: the rows with the smallest and the largest "likes_views_ratio".
df_videos.sort_values("likes_views_ratio").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio
88862,Netflix,UCWOA1ZGywLbqmigxE4Qlvuw,"Discover, Relive and Watch TV from the Beginni...",iwrK1Mqao34,2013-09-05 00:38:00+00:00,790357,46,4,11,2983.0,11.5,5.8e-05
89727,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3297396,46484,291362,3422.0,70.93615,10954.803987


## Dislikes/views

In [21]:
# Dislikes per view. Infinite ratios become NaN so that the dropna() below
# removes those rows (together with rows holding NaN in any other column);
# the index is then renumbered from 0.
df_videos["dislikes_views_ratio"] = (
    df_videos["dislikes"] / df_videos["views"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [22]:
df_videos["dislikes_views_ratio"].describe()

count    1.243820e+05
mean     2.448951e-03
std      4.378931e-01
min      7.519085e-07
25%      2.952890e-04
50%      5.705980e-04
75%      1.111316e-03
max      1.544319e+02
Name: dislikes_views_ratio, dtype: float64

In [23]:
# Show the two extremes: the rows with the smallest and the largest "dislikes_views_ratio".
df_videos.sort_values("dislikes_views_ratio").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio
50066,Disney Parks,UC1xwwLwm6WSMbUn_Tp597hQ,Europe Family Vacations | Adventures by Disney,BAk5kgg70uo,2019-01-25 14:48:12+00:00,1329949,103,1,1,1014.0,103.0,7.7e-05,1e-06
89727,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3297396,46484,291362,3422.0,70.93615,10954.803987,154.431894


## Comments/views

In [24]:
# Comments per view. Infinite ratios become NaN so that the dropna() below
# removes those rows (together with rows holding NaN in any other column);
# the index is then renumbered from 0.
df_videos["comments_views_ratio"] = (
    df_videos["comments"] / df_videos["views"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [25]:
df_videos["comments_views_ratio"].describe()

count    1.243820e+05
mean     1.130643e-02
std      2.744653e+00
min      6.938809e-07
25%      8.144948e-04
50%      1.829491e-03
75%      3.930209e-03
max      9.679801e+02
Name: comments_views_ratio, dtype: float64

In [26]:
# Show the two extremes: the rows with the smallest and the largest "comments_views_ratio".
df_videos.sort_values("comments_views_ratio").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio
61150,Guinness World Records,UCeSRjhfeeqIgr--AcP9qhyg,Cutting the world's longest fingernails #Shorts,GQjeGkuTEg4,2021-08-19 14:32:05+00:00,23058712,560348,48634,16,77.0,11.521734,0.024301,0.002109,1e-06
89727,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3297396,46484,291362,3422.0,70.93615,10954.803987,154.431894,967.980066


## Comments/likes

In [27]:
# Comments per like. Infinite ratios become NaN so that the dropna() below
# removes those rows (together with rows holding NaN in any other column);
# the index is then renumbered from 0.
df_videos["comments_likes_ratio"] = (
    df_videos["comments"] / df_videos["likes"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [28]:
df_videos["comments_likes_ratio"].describe()

count    124382.000000
mean          0.139261
std           0.229329
min           0.000017
25%           0.049764
50%           0.082417
75%           0.148095
max          23.500000
Name: comments_likes_ratio, dtype: float64

In [29]:
# Show the two extremes: the rows with the smallest and the largest "comments_likes_ratio".
df_videos.sort_values("comments_likes_ratio").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio
23308,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,465839,57623,5429,1,141.0,10.613925,0.123697,0.011654,2e-06,1.7e-05
33990,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Rep. Steny Hoyer: President Trump's Budget Is ...,tvdHfHf-MKo,2017-03-16 12:57:23+00:00,1040,2,1,47,1694.0,2.0,0.001923,0.000962,0.045192,23.5


## Comments/dislikes

In [30]:
# Comments per dislike. Infinite ratios become NaN so that the dropna() below
# removes those rows (together with rows holding NaN in any other column);
# the index is then renumbered from 0.
df_videos["comments_dislikes_ratio"] = (
    df_videos["comments"] / df_videos["dislikes"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [31]:
df_videos["comments_dislikes_ratio"].describe()

count    124382.000000
mean          5.395370
std          10.121774
min           0.000184
25%           1.500000
50%           3.104201
75%           6.116570
max         726.423077
Name: comments_dislikes_ratio, dtype: float64

In [32]:
# Show the two extremes: the rows with the smallest and the largest "comments_dislikes_ratio".
df_videos.sort_values("comments_dislikes_ratio").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio
23308,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,465839,57623,5429,1,141.0,10.613925,0.123697,0.011654,2e-06,1.7e-05,0.000184
122611,UrAvgConsumer,UC9fSZHEh6XsRpX-xJc6lT3A,iPhone 6S + LG G4: DOUBLE GIVEAWAY!,RAiQ1MgG9fA,2016-02-09 21:38:37+00:00,105333,9710,52,37774,2095.0,186.730769,0.092184,0.000494,0.358615,3.890216,726.423077


## Mean views per day

In [33]:
# Views divided by the age in days. Infinite values become NaN so that the
# dropna() below removes those rows (together with rows holding NaN in any
# other column); the index is then renumbered from 0.
df_videos["mean_views_day"] = (
    df_videos["views"] / df_videos["age_days"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [34]:
df_videos["mean_views_day"].describe()

count    1.243820e+05
mean     9.784434e+02
std      8.972164e+03
min      7.918025e-03
25%      1.331161e+01
50%      6.849015e+01
75%      3.714058e+02
max      1.493597e+06
Name: mean_views_day, dtype: float64

In [35]:
# Show the two extremes: the rows with the smallest and the largest "mean_views_day".
df_videos.sort_values("mean_views_day").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day
28360,Business Insider,UCcyq283he07B7_KUX07mmtA,From Neutral To Buy- Oil Sector Upgrades,oi3w93LmW3w,2010-02-01 19:13:42+00:00,34,1,1,1,4294.0,1.0,0.029412,0.029412,0.029412,1.0,1.0,0.007918
123444,Veritasium,UCHnyfMqiRRG1u-2MsSQLbXA,How Imaginary Numbers Were Invented,cUzklzVXJwo,2021-11-01 04:53:29+00:00,4480791,267438,2268,13066,3.0,117.917989,0.059685,0.000506,0.002916,0.048856,5.761023,1493597.0


## Mean likes per day

In [36]:
# Likes divided by the age in days. Infinite values become NaN so that the
# dropna() below removes those rows (together with rows holding NaN in any
# other column); the index is then renumbered from 0.
df_videos["mean_likes_day"] = (
    df_videos["likes"] / df_videos["age_days"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [37]:
df_videos["mean_likes_day"].describe()

count    124382.000000
mean         33.757654
std         447.475257
min           0.000226
25%           0.195217
50%           1.556859
75%          10.769770
max       89146.000000
Name: mean_likes_day, dtype: float64

In [38]:
# Show the two extremes: the rows with the smallest and the largest "mean_likes_day".
df_videos.sort_values("mean_likes_day").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day
28383,Business Insider,UCcyq283he07B7_KUX07mmtA,Thomas Gensemer On E-Mail During The Obama Cam...,NsrimXL-Pq0,2009-10-02 21:42:20+00:00,124,1,2,2,4416.0,0.5,0.008065,0.016129,0.016129,2.0,1.0,0.02808,0.000226
123444,Veritasium,UCHnyfMqiRRG1u-2MsSQLbXA,How Imaginary Numbers Were Invented,cUzklzVXJwo,2021-11-01 04:53:29+00:00,4480791,267438,2268,13066,3.0,117.917989,0.059685,0.000506,0.002916,0.048856,5.761023,1493597.0,89146.0


## Mean dislikes per day

In [39]:
# Dislikes divided by the age in days. Infinite values become NaN so that the
# dropna() below removes those rows (together with rows holding NaN in any
# other column); the index is then renumbered from 0.
df_videos["mean_dislikes_day"] = (
    df_videos["dislikes"] / df_videos["age_days"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [40]:
df_videos["mean_dislikes_day"].describe()

count    124382.000000
mean          0.861399
std          17.881305
min           0.000191
25%           0.006423
50%           0.038818
75%           0.241714
max        5315.748330
Name: mean_dislikes_day, dtype: float64

In [41]:
# Show the two extremes: the rows with the smallest and the largest "mean_dislikes_day".
df_videos.sort_values("mean_dislikes_day").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day
22565,BBC,UCCj956IF62FbT7Gouszaj9w,Zulu Stick Fighting - Last Man Standing - BBC ...,zCAgEHqZGpI,2007-06-30 02:42:37+00:00,27953,53,1,17,5241.0,53.0,0.001896,3.6e-05,0.000608,0.320755,17.0,5.333524,0.010113,0.000191
85293,Netflix,UCWOA1ZGywLbqmigxE4Qlvuw,Cuties | Official Trailer | Netflix,M0O7lLe4SmA,2020-08-12 18:38:39+00:00,17868644,94058,2386771,205226,449.0,0.039408,0.005264,0.133573,0.011485,2.181909,0.085985,39796.534521,209.483296,5315.74833


## Mean comments per day

In [42]:
# Comments divided by the age in days. Infinite values become NaN so that the
# dropna() below removes those rows (together with rows holding NaN in any
# other column); the index is then renumbered from 0.
df_videos["mean_comments_day"] = (
    df_videos["comments"] / df_videos["age_days"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [43]:
df_videos["mean_comments_day"].describe()

count    124382.000000
mean          2.142196
std          26.059825
min           0.000192
25%           0.020990
50%           0.136882
75%           0.773225
max        4355.333333
Name: mean_comments_day, dtype: float64

In [44]:
# Show the two extremes: the rows with the smallest and the largest "mean_comments_day".
df_videos.sort_values("mean_comments_day").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day
22536,BBC,UCCj956IF62FbT7Gouszaj9w,Adil Ray with Heera - London Mela 2007 - BBC A...,aR8v1FrPBE0,2007-08-12 15:56:57+00:00,6305,7,1,1,5198.0,7.0,0.00111,0.000159,0.000159,0.142857,1.0,1.212967,0.001347,0.000192,0.000192
123444,Veritasium,UCHnyfMqiRRG1u-2MsSQLbXA,How Imaginary Numbers Were Invented,cUzklzVXJwo,2021-11-01 04:53:29+00:00,4480791,267438,2268,13066,3.0,117.917989,0.059685,0.000506,0.002916,0.048856,5.761023,1493597.0,89146.0,756.0,4355.333333


## Mean likes/dislikes per day

In [45]:
# Likes/dislikes ratio divided by the age in days. Infinite values become NaN
# so that the dropna() below removes those rows (together with rows holding
# NaN in any other column); the index is then renumbered from 0.
df_videos["mean_likes_dislikes_ratio_day"] = (
    df_videos["likes_dislikes_ratio"] / df_videos["age_days"]
).replace([np.inf, -np.inf], np.nan)
df_videos = df_videos.dropna().reset_index(drop = True)

In [46]:
df_videos["mean_likes_dislikes_ratio_day"].describe()

count    124382.000000
mean          0.227981
std           3.668131
min           0.000014
25%           0.009721
50%           0.028599
75%           0.085434
max         893.000000
Name: mean_likes_dislikes_ratio_day, dtype: float64

In [47]:
# Show the two extremes: the rows with the smallest and the largest "mean_likes_dislikes_ratio_day".
df_videos.sort_values("mean_likes_dislikes_ratio_day").pipe(lambda ordered: pd.concat([ordered.head(1), ordered.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day
32956,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Fired Google Employee Behind Anti-Diversity Me...,WyIW7hgmryw,2017-08-08 17:16:32+00:00,1385,1,46,3,1549.0,0.021739,0.000722,0.033213,0.002166,3.0,0.065217,0.894125,0.000646,0.029697,0.001937,1.4e-05
117915,The Green Witch,UCsQXQMAZTygXddj_I4Ud9xA,How to Craft a Home Protection Spell Candle,0KjjEThO-Io,2021-11-03 02:14:35+00:00,13509,1786,1,107,2.0,1786.0,0.132208,7.4e-05,0.007921,0.05991,107.0,6754.5,893.0,0.5,53.5,893.0


## Mean likes/views per day

In [48]:
# Likes/views ratio averaged over the video's age in days.
df_videos = (
    df_videos
    .assign(
        mean_likes_views_ratio_day = lambda frame: (
            frame["likes_views_ratio"]
            .div(frame["age_days"])
            # Infinite quotients (e.g. from a zero age_days) are treated as missing.
            .replace([np.inf, -np.inf], np.nan)
        )
    )
    # dropna() has no subset: a missing value in any column removes the row.
    .dropna()
    .reset_index(drop = True)
)

In [49]:
# Summary statistics of the per-day likes/views ratio.
df_videos.loc[:, "mean_likes_views_ratio_day"].describe()

count    1.243820e+05
mean     1.912174e-04
std      9.198568e-03
min      1.951108e-08
25%      4.767631e-06
50%      1.403487e-05
75%      6.062476e-05
max      3.201287e+00
Name: mean_likes_views_ratio_day, dtype: float64

In [50]:
# Show the two extremes: after an ascending sort, the first row holds the
# lowest per-day likes/views ratio and the last row the highest.
# pipe() runs the head/tail selection once on the whole sorted frame instead
# of once per column, which is what a column-wise apply() would do.
(
    df_videos
    .sort_values("mean_likes_views_ratio_day")
    .pipe(lambda ranked: pd.concat([ranked.head(1), ranked.tail(1)]))
)

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day
88862,Netflix,UCWOA1ZGywLbqmigxE4Qlvuw,"Discover, Relive and Watch TV from the Beginni...",iwrK1Mqao34,2013-09-05 00:38:00+00:00,790357,46,4,11,2983.0,11.5,5.8e-05,5e-06,1.4e-05,0.23913,2.75,264.953738,0.015421,0.001341,0.003688,0.003855,0.0
89727,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3297396,46484,291362,3422.0,70.93615,10954.803987,154.431894,967.980066,0.088361,6.268006,0.08796,963.587376,13.583869,85.143776,0.020729,3.201287


## Mean dislikes/views per day

In [51]:
# Dislikes/views ratio averaged over the video's age in days.
df_videos = (
    df_videos
    .assign(
        mean_dislikes_views_ratio_day = lambda frame: (
            frame["dislikes_views_ratio"]
            .div(frame["age_days"])
            # Infinite quotients (e.g. from a zero age_days) are treated as missing.
            .replace([np.inf, -np.inf], np.nan)
        )
    )
    # dropna() has no subset: a missing value in any column removes the row.
    .dropna()
    .reset_index(drop = True)
)

In [52]:
# Summary statistics of the per-day dislikes/views ratio.
df_videos.loc[:, "mean_dislikes_views_ratio_day"].describe()

count    1.243820e+05
mean     4.090752e-06
std      1.575083e-04
min      7.415272e-10
25%      1.604298e-07
50%      4.900447e-07
75%      1.470525e-06
max      4.512913e-02
Name: mean_dislikes_views_ratio_day, dtype: float64

In [53]:
# Show the two extremes: after an ascending sort, the first row holds the
# lowest per-day dislikes/views ratio and the last row the highest.
# pipe() runs the head/tail selection once on the whole sorted frame instead
# of once per column, which is what a column-wise apply() would do.
(
    df_videos
    .sort_values("mean_dislikes_views_ratio_day")
    .pipe(lambda ranked: pd.concat([ranked.head(1), ranked.tail(1)]))
)

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day
50066,Disney Parks,UC1xwwLwm6WSMbUn_Tp597hQ,Europe Family Vacations | Adventures by Disney,BAk5kgg70uo,2019-01-25 14:48:12+00:00,1329949,103,1,1,1014.0,103.0,7.7e-05,1e-06,1e-06,0.009709,1.0,1311.586785,0.101578,0.000986,0.000986,0.101578,0.0,0.0
89727,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3297396,46484,291362,3422.0,70.93615,10954.803987,154.431894,967.980066,0.088361,6.268006,0.08796,963.587376,13.583869,85.143776,0.020729,3.201287,0.045129


## Mean comments/views per day

In [54]:
# Comments/views ratio averaged over the video's age in days.
df_videos = (
    df_videos
    .assign(
        mean_comments_views_ratio_day = lambda frame: (
            frame["comments_views_ratio"]
            .div(frame["age_days"])
            # Infinite quotients (e.g. from a zero age_days) are treated as missing.
            .replace([np.inf, -np.inf], np.nan)
        )
    )
    # dropna() has no subset: a missing value in any column removes the row.
    .dropna()
    .reset_index(drop = True)
)

In [55]:
# Summary statistics of the per-day comments/views ratio.
df_videos.loc[:, "mean_comments_views_ratio_day"].describe()

count    1.243820e+05
mean     1.587804e-05
std      8.156732e-04
min      3.898766e-10
25%      4.318555e-07
50%      1.403768e-06
75%      4.820830e-06
max      2.828697e-01
Name: mean_comments_views_ratio_day, dtype: float64

In [56]:
# Show the two extremes: after an ascending sort, the first row holds the
# lowest per-day comments/views ratio and the last row the highest.
# pipe() runs the head/tail selection once on the whole sorted frame instead
# of once per column, which is what a column-wise apply() would do.
(
    df_videos
    .sort_values("mean_comments_views_ratio_day")
    .pipe(lambda ranked: pd.concat([ranked.head(1), ranked.tail(1)]))
)

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day
51018,Disney Parks,UC1xwwLwm6WSMbUn_Tp597hQ,"Tubestone Curl | Aulani, A Disney Resort & Spa",ZZUZ26lJQ6A,2014-07-31 14:43:45+00:00,2900393,206,18,3,2653.0,11.444444,7.1e-05,6e-06,1e-06,0.014563,0.166667,1093.250283,0.077648,0.006785,0.001131,0.004314,0.0,0.0,0.0
89727,Numberphile,UCoxcjq-8xIDTYp3uz647V5A,Why do YouTube views freeze at 301?,oIkhgagvrjI,2012-06-22 18:38:55+00:00,301,3297396,46484,291362,3422.0,70.93615,10954.803987,154.431894,967.980066,0.088361,6.268006,0.08796,963.587376,13.583869,85.143776,0.020729,3.201287,0.045129,0.28287


## Mean comments/likes per day

In [57]:
# Comments/likes ratio averaged over the video's age in days.
df_videos = (
    df_videos
    .assign(
        mean_comments_likes_ratio_day = lambda frame: (
            frame["comments_likes_ratio"]
            .div(frame["age_days"])
            # Infinite quotients (e.g. from a zero age_days) are treated as missing.
            .replace([np.inf, -np.inf], np.nan)
        )
    )
    # dropna() has no subset: a missing value in any column removes the row.
    .dropna()
    .reset_index(drop = True)
)

In [58]:
# Summary statistics of the per-day comments/likes ratio.
df_videos.loc[:, "mean_comments_likes_ratio_day"].describe()

count    1.243820e+05
mean     2.821932e-04
std      3.319270e-03
min      1.230793e-07
25%      3.278288e-05
50%      6.557355e-05
75%      1.506631e-04
max      5.727092e-01
Name: mean_comments_likes_ratio_day, dtype: float64

In [59]:
# Show the two extremes: after an ascending sort, the first row holds the
# lowest per-day comments/likes ratio and the last row the highest.
# pipe() runs the head/tail selection once on the whole sorted frame instead
# of once per column, which is what a column-wise apply() would do.
(
    df_videos
    .sort_values("mean_comments_likes_ratio_day")
    .pipe(lambda ranked: pd.concat([ranked.head(1), ranked.tail(1)]))
)

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day,mean_comments_likes_ratio_day
23308,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,465839,57623,5429,1,141.0,10.613925,0.123697,0.011654,2e-06,1.7e-05,0.000184,3303.822695,408.673759,38.503546,0.007092,0.075276,0.000877,8.3e-05,0.0,0.0
9474,BBC,UCCj956IF62FbT7Gouszaj9w,"Act now for our children, Queen urges climate ...",qa67aUgXU_A,2021-11-02 06:41:57+00:00,117961,1506,1839,1725,2.0,0.818923,0.012767,0.01559,0.014623,1.145418,0.93801,58980.5,753.0,919.5,862.5,0.409462,0.006383,0.007795,0.007312,0.572709


## Mean comments/dislikes per day

In [60]:
# Comments/dislikes ratio averaged over the video's age in days.
df_videos = (
    df_videos
    .assign(
        mean_comments_dislikes_ratio_day = lambda frame: (
            frame["comments_dislikes_ratio"]
            .div(frame["age_days"])
            # Infinite quotients (e.g. from a zero age_days) are treated as missing.
            .replace([np.inf, -np.inf], np.nan)
        )
    )
    # dropna() has no subset: a missing value in any column removes the row.
    .dropna()
    .reset_index(drop = True)
)

In [61]:
# Summary statistics of the per-day comments/dislikes ratio.
df_videos.loc[:, "mean_comments_dislikes_ratio_day"].describe()

count    124382.000000
mean          0.017020
std           0.288258
min           0.000001
25%           0.000886
50%           0.002296
75%           0.006518
max          53.500000
Name: mean_comments_dislikes_ratio_day, dtype: float64

In [62]:
# Show the two extremes: after an ascending sort, the first row holds the
# lowest per-day comments/dislikes ratio and the last row the highest.
# pipe() runs the head/tail selection once on the whole sorted frame instead
# of once per column, which is what a column-wise apply() would do.
(
    df_videos
    .sort_values("mean_comments_dislikes_ratio_day")
    .pipe(lambda ranked: pd.concat([ranked.head(1), ranked.tail(1)]))
)

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day,mean_comments_likes_ratio_day,mean_comments_dislikes_ratio_day
23308,Buenas Ideias,UCQRPDZMSwXFEDS67uc7kIdg,A GÊNESE DE LULA - EDUARDO BUENO,2myuXzu7IoM,2021-06-16 12:32:31+00:00,465839,57623,5429,1,141.0,10.613925,0.123697,0.011654,2e-06,1.7e-05,0.000184,3303.822695,408.673759,38.503546,0.007092,0.075276,0.000877,8.3e-05,0.0,0.0,1e-06
117915,The Green Witch,UCsQXQMAZTygXddj_I4Ud9xA,How to Craft a Home Protection Spell Candle,0KjjEThO-Io,2021-11-03 02:14:35+00:00,13509,1786,1,107,2.0,1786.0,0.132208,7.4e-05,0.007921,0.05991,107.0,6754.5,893.0,0.5,53.5,893.0,0.066104,3.7e-05,0.00396,0.029955,53.5


In [63]:
# Final look at the table with every derived per-day column in place.
# pandas abbreviates the printout when the frame has more rows than the
# "display.max_rows" option allows.
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,likes_views_ratio,dislikes_views_ratio,comments_views_ratio,comments_likes_ratio,comments_dislikes_ratio,mean_views_day,mean_likes_day,mean_dislikes_day,mean_comments_day,mean_likes_dislikes_ratio_day,mean_likes_views_ratio_day,mean_dislikes_views_ratio_day,mean_comments_views_ratio_day,mean_comments_likes_ratio_day,mean_comments_dislikes_ratio_day
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23 18:11:23+00:00,507643,29685,108,768,12.0,274.861111,0.058476,0.000213,0.001513,0.025872,7.111111,42303.583333,2473.75,9.0,64.0,22.905093,0.004873,0.000018,0.000126,0.002156,0.592593
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15 16:41:50+00:00,603492,26984,150,1237,20.0,179.893333,0.044713,0.000249,0.00205,0.045842,8.246667,30174.6,1349.2,7.5,61.85,8.994667,0.002236,0.000012,0.000102,0.002292,0.412333
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07 02:19:39+00:00,1239697,63031,296,2852,29.0,212.942568,0.050844,0.000239,0.002301,0.045248,9.635135,42748.172414,2173.482759,10.206897,98.344828,7.342847,0.001753,0.000008,0.000079,0.00156,0.332246
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16 15:37:16+00:00,613816,29369,215,1724,111.0,136.6,0.047847,0.00035,0.002809,0.058701,8.018605,5529.873874,264.585586,1.936937,15.531532,1.230631,0.000431,0.000003,0.000025,0.000529,0.07224
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07 19:01:16+00:00,428473,17163,145,1150,181.0,118.365517,0.040056,0.000338,0.002684,0.067005,7.931034,2367.254144,94.823204,0.801105,6.353591,0.653953,0.000221,0.000002,0.000015,0.00037,0.043818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124377,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Art Hour,C6hpMftAIs0,2009-06-13 18:32:12+00:00,172813,2163,31,136,4527.0,69.774194,0.012516,0.000179,0.000787,0.062876,4.387097,38.173846,0.4778,0.006848,0.030042,0.015413,0.000003,0.0,0.0,0.000014,0.000969
124378,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06 22:16:56+00:00,90902,1801,23,75,4534.0,78.304348,0.019813,0.000253,0.000825,0.041644,3.26087,20.048963,0.397221,0.005073,0.016542,0.01727,0.000004,0.0,0.0,0.000009,0.000719
124379,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30 00:13:46+00:00,103234,1571,47,18,4542.0,33.425532,0.015218,0.000455,0.000174,0.011458,0.382979,22.728754,0.345883,0.010348,0.003963,0.007359,0.000003,0.0,0.0,0.000003,0.000084
124380,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17 00:50:31+00:00,78272,1625,20,87,4555.0,81.25,0.020761,0.000256,0.001112,0.053538,4.35,17.183754,0.356751,0.004391,0.0191,0.017838,0.000005,0.0,0.0,0.000012,0.000955


# Save the dataset

In [64]:
# Write the table to a semicolon-separated file, leaving out the row index.
df_videos.to_csv(
    "data/videos_data.csv",
    index = False,
    sep = ";",
)