In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.figure_factory as ff
from matplotlib.colors import LinearSegmentedColormap, to_hex
from datetime import timedelta
import ipywidgets as widgets
from scipy.stats import skew

# Display options:
pd.set_option("display.width", 1200)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

In [2]:
# Read the data:
df_videos = pd.read_csv("data/videos_data.csv", sep = ";")

In [3]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23T18:11:23Z,361195.0,24407.0,82.0,659.0
1,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,How a Mandelbrot set arises from Newton’s work,LqbZpur38nw,2021-10-15T16:41:50Z,554691.0,25722.0,144.0,1205.0
2,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,Newton's Fractal (which Newton knew nothing ab...,-RdOwhmqP5s,2021-10-07T02:19:39Z,1179922.0,61162.0,285.0,2792.0
3,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,The Summer of Math Exposition,ojjzXyQCzso,2021-07-16T15:37:16Z,609396.0,29267.0,214.0,1719.0
4,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A quick trick for computing eigenvalues | Chap...,e50Bj7jn9IQ,2021-05-07T19:01:16Z,419586.0,16984.0,144.0,1145.0
...,...,...,...,...,...,...,...,...,...
203003,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Hole,LPfcGXMpKds,2009-06-06T22:16:56Z,90791.0,1791.0,23.0,75.0
203004,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Affordable Button,h3lvSflNixI,2009-05-30T00:13:46Z,103154.0,1566.0,47.0,18.0
203005,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Optical Illusion,8mS5RK0Yo6w,2009-05-23T22:36:58Z,1072148.0,9848.0,1201.0,
203006,zefrank1,UCVpankR4HtoAVtYnFDUieYA,HardTimes :: Outsource,zxIbQ6ZG4OI,2009-05-17T00:50:31Z,78193.0,1620.0,20.0,87.0


In [4]:
df_videos.dtypes

channel_title         object
channel_id            object
video_title           object
video_id              object
video_upload_date     object
views                float64
likes                float64
dislikes             float64
comments             float64
dtype: object

In [5]:
############################# Data wrangling #############################

In [5]:
### Age in days (until the last upload date)

# Change to datetime:
df_videos["video_upload_date"] = pd.to_datetime(df_videos["video_upload_date"])

# Filter the wrong upload dates (coming from the API as being the request date) considering a margin of 2 days:
last_date = max(df_videos["video_upload_date"])
age = []
for i in range(df_videos.shape[0]):
    date_i = df_videos.loc[i, "video_upload_date"]
    diff_days = last_date - date_i
    if diff_days < timedelta(days = 2):
        age += [np.nan]
    else:
        age += [diff_days.days]
df_videos["age_days"] = age

In [7]:
df_videos["age_days"].dropna().describe()

count    202896.000000
mean       1547.755387
std        1244.428607
min           2.000000
25%         444.000000
50%        1299.000000
75%        2219.000000
max        5734.000000
Name: age_days, dtype: float64

In [8]:
df_videos.dropna().sort_values("age_days").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days
0,3Blue1Brown,UCYO_jab_esuFRV4b17AJtAw,A few of the best math explainers from this su...,F3Qixy-r_rQ,2021-10-23 18:11:23+00:00,361195.0,24407.0,82.0,659.0,2.0
92000,Jacques Slade,UCZ9l_6_f0PWRYXN5Y7Lcl2A,Get Your Hands Up,FXLXBfmlaOk,2006-02-12 20:31:24+00:00,28226.0,206.0,9.0,108.0,5734.0


In [6]:
### Likes-dislikes ratio

df_videos["likes_dislikes_ratio"] = df_videos["likes"]/df_videos["dislikes"]
df_videos["likes_dislikes_ratio"] = df_videos["likes_dislikes_ratio"].replace([np.inf, -np.inf], np.nan)

In [10]:
df_videos["likes_dislikes_ratio"].dropna().describe()

count    172162.000000
mean         42.583515
std          68.247050
min           0.000000
25%           7.573576
50%          20.666667
75%          49.333333
max        2635.000000
Name: likes_dislikes_ratio, dtype: float64

In [11]:
df_videos.dropna(subset = ["likes_dislikes_ratio"]).sort_values("likes_dislikes_ratio").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio
57657,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Aetna CEO: Obamacare Risk-Adjustment Program I...,EK2lKEmSwLA,2016-10-27 20:07:55+00:00,559.0,0.0,1.0,1.0,1824.0,0.0
176509,TÁ NA MESA VEGG,UCSxFcMDGmwgDlSP9qmYVx1w,"COMO ARMAZENAR FOLHAS, ORGANIZAR FEIRA + SAL T...",GGqRCQmtx-U,2021-05-21 01:16:50+00:00,15330.0,2635.0,1.0,261.0,157.0,2635.0


In [7]:
### Comments-views ratio

df_videos["comments_views_ratio"] = df_videos["comments"]/df_videos["views"]
df_videos["comments_views_ratio"] = df_videos["comments_views_ratio"].replace([np.inf, -np.inf], np.nan)

# Here I decided to remove "Why do YouTube views freeze at 301?" from "Numberphile" because its views count was artificialy fixed:
df_videos = df_videos[df_videos["video_title"] != "Why do YouTube views freeze at 301?"]

In [13]:
df_videos["comments_views_ratio"].dropna().describe()

count    200742.000000
mean          0.003143
std           0.006700
min           0.000000
25%           0.000461
50%           0.001498
75%           0.003574
max           0.500000
Name: comments_views_ratio, dtype: float64

In [14]:
df_videos.sort_values("comments_views_ratio").dropna().apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,comments_views_ratio
34175,British Pathé,UCGp4u0WHLsK8OAxnvwiTyhA,British battlecruiser HMS Vanguard on the Rive...,EQ5uby5UiXU,2020-11-10 16:15:23+00:00,98.0,5.0,1.0,0.0,349.0,5.0,0.0
178158,UrAvgConsumer,UC9fSZHEh6XsRpX-xJc6lT3A,"Summer Giveaway Finale! (PS Vita, Nexus 7 2013...",T2LccukparY,2013-09-18 16:28:38+00:00,16217.0,2248.0,12.0,7141.0,2959.0,187.333333,0.44034


In [8]:
### Mean views per day

df_videos["mean_views_day"] = df_videos["views"]/df_videos["age_days"]
df_videos["mean_views_day"] = df_videos["mean_views_day"].replace([np.inf, -np.inf], np.nan)

In [16]:
df_videos["mean_views_day"].dropna().describe()

count    202839.000000
mean        703.974901
std        5960.601170
min           0.000000
25%           3.216458
50%          32.423693
75%         234.375439
max      791007.151515
Name: mean_views_day, dtype: float64

In [17]:
df_videos.sort_values("mean_views_day").dropna().apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,age_days,likes_dislikes_ratio,comments_views_ratio,mean_views_day
44332,Business Insider,UCcyq283he07B7_KUX07mmtA,IGNITION 2017 LIVE - Day One: Morning Session,mppUHGYL_Bk,2017-11-27 14:58:54+00:00,1.0,31.0,16.0,0.0,1428.0,1.9375,0.0,0.0007
88110,Guinness World Records,UCeSRjhfeeqIgr--AcP9qhyg,The fastest man on two hands - Guinness World ...,cEItmb_a20M,2021-09-22 14:18:34+00:00,26103236.0,593301.0,7578.0,40932.0,33.0,78.292557,0.001568,791007.151515


In [18]:
############################# Plots #############################

In [9]:
# Scatter of Mean views per day and Likes-disikes ratio:
x_var_name = "Likes-dislikes ratio"
y_var_name = "Mean views per day"
color_var_name = "Age (days)"
x_var = "likes_dislikes_ratio"
y_var = "mean_views_day"
color_var = "age_days"

channel = "Business Insider"
df_plot = df_videos[df_videos["channel_title"] == channel]

n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = df_plot[x_var],
        y = df_plot[y_var],
        mode = "markers",
        marker = {
            "size": 7,
            "color": df_plot[color_var],
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + color_var_name + "</b>"
            }
        },        
        text = df_plot["video_title"],
        customdata = df_plot[color_var],
        hovertemplate = "<b>" + x_var_name + ": %{x:}<br>" +
                         y_var_name + ": %{y:}<br>" +
                         color_var_name + ": %{customdata}<br>" +
                         "Video: %{text}</b><extra></extra>"
    )
)
fig.update_layout(
    title = "<b>" + channel + "</b>" ,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>" + y_var_name + "</b>",
    font = dict(
        size = 18
    ),
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 20,
        font_family = "Rockwell"
    )
)

In [78]:
# Scatter of likes and dislikes:
x_var_name = "Ages (days)"
y_var_name = "Comments-views ratio"
color_var_name = "Mean views per day"
size_var_name = "Likes-dislikes ratio"
x_var = "age_days"
y_var = "comments_views_ratio"
color_var = "mean_views_day"
size_var = "likes_dislikes_ratio"

channel = "Kurzgesagt – In a Nutshell"
df_plot = df_videos[df_videos["channel_title"] == channel]

n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = df_plot[x_var],
        y = df_plot[y_var],
        mode = "markers",
        marker = {
            "color": df_plot[color_var],
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + color_var_name + "</b>"
            },
            "size": df_plot[size_var],
            "opacity": 0.9,
            "sizemode": "area",
            "sizeref": 2.*max(df_plot[size_var])/(40.**2),
            "sizemin": 4            
        },
        text = df_plot["video_title"],
        customdata = df_plot[[color_var, size_var]],
        hovertemplate = "<b>" + x_var_name + ": %{x:.0f}<br>" +
                         y_var_name + ": %{y:.2g}<br>" +
                         color_var_name + ": %{customdata[0]:.0f}<br>" +
                         size_var_name + ": %{customdata[1]:.3f}<br>" +
                         "Video: %{text}</b><extra></extra>"
    )
)
fig.update_layout(
    title = "<b>" + channel + "</b>" ,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>" + y_var_name + "</b>",
    font = dict(
        size = 18
    ),
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 20,
        font_family = "Rockwell"
    )
)

In [None]:
### Dynamic plots (choosing the channel, the variables and some filters)

In [10]:
# Variables names:
vars_names = {
    "Views": "views",
    "Likes": "likes",
    "Dislikes": "dislikes",
    "Comments": "comments",
    "Likes-dislikes ratio": "likes_dislikes_ratio",
    "Age (days)": "age_days",
    "Comment-views ratio": "comments_views_ratio",
    "Mean views per day": "mean_views_day"
}

# Options:
opts_channel = ["All channels"] + np.sort(df_videos["channel_title"].unique()).tolist()
opts_vars = list(vars_names.keys())
opts_vars = [i for i in vars_names.keys()]
opts_vars.sort()

In [11]:
### 1D histogram

# Variable and filters widgets:
input_xvar = widgets.Dropdown(
    description = "Variable: ",
    options = list(vars_names.keys()),
    value = list(vars_names.keys())[0]
)
input_channels = widgets.Dropdown(
    description = "Channel: ",
    options = opts_channel,
    value = opts_channel[0]
)
input_bins = widgets.IntSlider(
    description = "Bins: ",
    min = 10,
    max = 1000,
    step = 10,
    value = 200
)
input_xmin = widgets.FloatText(
    value = np.min(df_videos[vars_names[input_xvar.value]]),
    description = "Min: "
)
input_xmax = widgets.FloatText(
    value = np.max(df_videos[vars_names[input_xvar.value]]),
    description = "Max: "
)

# Statistics in the title:
def plot_title(x):
    title = "Filtered sample statistics<br>" +\
            "<b style = 'color: #900c3f'>Mean</b>: " + f"{np.mean(x):.7g}      " +\
            "<b style = 'color: #ffc300'>Median</b>: " + f"{np.median(x):.7g}      " +\
            "<b>Standard deviation</b>: " + f"{np.std(x):.7g}      " +\
            "<b>Skewness</b>: " + f"{skew(x):.3g}"
    return(title)

# Mean and median lines:
def vert_lines(x):
    vertical_lines = [
        {
            'line': {
                'color': '#900c3f',
                'dash': 'dash',
                'width': 2
            },
            'type': 'line',
            'x0': np.mean(x),
            'x1': np.mean(x),
            'xref': 'x',
            'y0': 0,
            'y1': 1,
            'yref': 'paper'
        },
        {
            'line': {
                'color': '#ffc300',
                'dash': 'dash',
                'width': 2
            },
            'type': 'line',
            'x0': np.median(x),
            'x1': np.median(x),
            'xref': 'x',
            'y0': 0,
            'y1': 1,
            'yref': 'paper'
        }
    ]
    return(vertical_lines)

# Initialize the figure:
x_init = df_videos[vars_names[input_xvar.value]].dropna()
x_mean = np.mean(x_init)
x_median = np.median(x_init)
fig = go.FigureWidget(
    data = [
        go.Histogram(
            x = x_init,
            histfunc = "count",
            nbinsx = input_bins.value,
            marker_color = "#00baad",
            opacity = 0.9
        )
    ],
    layout = go.Layout(
        title = plot_title(x_init),
        xaxis_title = "<b>" + input_xvar.value + "</b>",
        yaxis_title = "<b>Counts</b>",
        font = dict(
            size = 18
        ),
        showlegend = False,
        plot_bgcolor = "white",
        hoverlabel = dict(
            font_size = 18,
            font_family = "Rockwell"
        ),
        margin = dict(
            l = 20,
            r = 20,
            t = 100,
            b = 20
        ),
        height = 600,
        shapes = vert_lines(x_init)
    )
)

# Filter and update function:
def filtering(chosen_xvar, chosen_channel, chosen_bins, chosen_xmin, chosen_xmax):
    # Filter by channel:
    if chosen_channel == "All channels":
        df_filtered = df_videos.copy()
    else:
        df_filtered = df_videos.copy()[df_videos["channel_title"] == chosen_channel]
    
    # Variable:
    x_vals = df_filtered[vars_names[chosen_xvar]]

    # Filter the range:
    if chosen_xmin >= chosen_xmax or chosen_xmax < np.min(x_vals) or chosen_xmin > np.max(x_vals):
        pass
    else:
        if chosen_xmin < np.min(x_vals):
            pass
        else:
            x_vals = x_vals[x_vals > chosen_xmin]
        if chosen_xmax > np.max(x_vals):
            pass
        else:
            x_vals = x_vals[x_vals < chosen_xmax]
    
    # Drop the nan:
    x_vals = x_vals.dropna()

    # Update the figure:
    with fig.batch_update():
        fig.data[0].x = x_vals
        fig.data[0].nbinsx = chosen_bins
        fig.layout.xaxis.title = "<b>" + chosen_xvar + "</b>"
        fig.layout.title = plot_title(x_vals)
        fig.layout.shapes = vert_lines(x_vals)

# Event handlers:
def eventhandler_xvar(change):
    filtering(chosen_xvar = change.new,
              chosen_channel = input_channels.value,
              chosen_bins = input_bins.value,
              chosen_xmin = input_xmin.value,
              chosen_xmax = input_xmax.value)
def eventhandler_channels(change):
    filtering(chosen_xvar = input_xvar.value,
              chosen_channel = change.new,
              chosen_bins = input_bins.value,
              chosen_xmin = input_xmin.value,
              chosen_xmax = input_xmax.value)
def eventhandler_bins(change):
    filtering(chosen_xvar = input_xvar.value,
              chosen_channel = input_channels.value,
              chosen_bins = change.new,
              chosen_xmin = input_xmin.value,
              chosen_xmax = input_xmax.value)
def eventhandler_xmin(change):
    filtering(chosen_xvar = input_xvar.value,
              chosen_channel = input_channels.value,
              chosen_bins = input_bins.value,
              chosen_xmin = change.new,
              chosen_xmax = input_xmax.value)
def eventhandler_xmax(change):
    filtering(chosen_xvar = input_xvar.value,
              chosen_channel = input_channels.value,
              chosen_bins = input_bins.value,
              chosen_xmin = input_xmin.value,
              chosen_xmax = change.new)

# Observes:
input_xvar.observe(eventhandler_xvar,
                   names = "value")
input_channels.observe(eventhandler_channels,
                       names = "value")
input_bins.observe(eventhandler_bins,
                   names = "value")
input_xmin.observe(eventhandler_xmin,
                   names = "value")
input_xmax.observe(eventhandler_xmax,
                   names = "value")

# Row of filters:
row_filters = widgets.HBox(
    [
        input_xvar,
        input_channels,
        input_bins,
        input_xmin,
        input_xmax
    ]
)

# Main box:
widgets.VBox(
    [
        row_filters,
        fig
    ]
)

VBox(children=(HBox(children=(Dropdown(description='Variable: ', options=('Views', 'Likes', 'Dislikes', 'Comme…

In [None]:
### Scatter with colors

# Variables and filters widgets:
input_xvar = widgets.Dropdown(
    description = "Variable on x: ",
    options = list(vars_names.keys()),
    value = list(vars_names.keys())[0]
)
input_yvar = widgets.Dropdown(
    description = "Variable on y: ",
    options = list(vars_names.keys()),
    value = list(vars_names.keys())[1]
)
input_cvar = widgets.Dropdown(
    description = "Variable on color: ",
    options = list(vars_names.keys()),
    value = list(vars_names.keys())[2]
)
input_channels = widgets.Dropdown(
    description = "Channel: ",
    options = opts_channel,
    value = opts_channel[0]
)

# Initialize the figure:
n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

x_init = df_videos[vars_names[input_xvar.value]]
y_init = df_videos[vars_names[input_yvar.value]]
c_init = df_videos[vars_names[input_cvar.value]]

fig = go.FigureWidget(
    data = [
        go.Scatter(
        x = x_init,
        y = y_init,
        mode = "markers",
        marker = {
            "size": 7,
            "color": c_init,
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + input_cvar.value + "</b>"
            }
        },        
        customdata = df_plot[[vars_names[input_cvar.value], "video_title"]],
        hovertemplate = "<b>" + input_xvar.value + ": %{x:}<br>" +
                         input_yvar.value + ": %{y:}<br>" +
                         input_cvar.value + ": %{customdata[0]}<br>" +
                         "Video: %{customdata[1]}</b><extra></extra>"
        )
    ],
    layout = go.Layout(
        xaxis_title = "<b>" + input_xvar.value + "</b>",
        yaxis_title = "<b>" + input_yvar.value + "</b>",
        font = dict(
            size = 18
        ),
        showlegend = False,
        plot_bgcolor = "white",
        hoverlabel = dict(
            font_size = 18,
            font_family = "Rockwell"
        ),
        margin = dict(
            l = 20,
            r = 20,
            t = 20,
            b = 20
        ),
        height = 600
    )
)

# Filter and update function:
def filtering(chosen_xvar, chosen_yvar, chosen_cvar, chosen_channel):
    # Filter by channel:
    if chosen_channel == "All channels":
        df_filtered = df_videos.copy()
    else:
        df_filtered = df_videos.copy()[df_videos["channel_title"] == chosen_channel]
    
    # Variables:
    x_vals = df_filtered[vars_names[chosen_xvar]]
    y_vals = df_filtered[vars_names[chosen_yvar]]
    c_vals = df_filtered[vars_names[chosen_cvar]]

    # Update the figure:
    with fig.batch_update():
        fig.data[0].x = x_vals
        fig.data[0].y = y_vals
        fig.layout.xaxis.title = "<b>" + chosen_xvar + "</b>"
        fig.layout.yaxis.title = "<b>" + chosen_yvar + "</b>"
        fig.





In [17]:


# Initialize the figure:
n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

x_init = df_videos[vars_names[list(vars_names.keys())[0]]]
y_init = df_videos[vars_names[list(vars_names.keys())[1]]]
c_init = df_videos[vars_names[list(vars_names.keys())[2]]]

fig = go.FigureWidget(
    data = [
        go.Scatter(
        x = x_init,
        y = y_init,
        mode = "markers",
        marker = {
            "size": 7,
            "color": c_init,
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + input_cvar.value + "</b>"
            }
        },        
        customdata = df_plot[[vars_names[input_cvar.value], "video_title"]],
        hovertemplate = "<b>" + input_xvar.value + ": %{x:}<br>" +
                         input_yvar.value + ": %{y:}<br>" +
                         input_cvar.value + ": %{customdata[0]}<br>" +
                         "Video: %{customdata[1]}</b><extra></extra>"
        )
    ],
    layout = go.Layout(
        xaxis_title = "<b>" + input_xvar.value + "</b>",
        yaxis_title = "<b>" + input_yvar.value + "</b>",
        font = dict(
            size = 18
        ),
        showlegend = False,
        plot_bgcolor = "white",
        hoverlabel = dict(
            font_size = 18,
            font_family = "Rockwell"
        ),
        margin = dict(
            l = 20,
            r = 20,
            t = 20,
            b = 20
        ),
        height = 600
    )
)


In [None]:
# Bubble with colors:

In [None]:
# 2D density:

In [None]:
# Heatmap numeric:

In [None]:
# Lines with 3 variables (the categoric being the channel, until a limit of say 5 channels):

In [None]:
# Correlation matrix continuous: