In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.figure_factory as ff
from dateutil import parser
from matplotlib.colors import LinearSegmentedColormap, to_hex
from datetime import datetime

# Display options:
pd.set_option("display.width", 1200)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 100)

In [2]:
# Read the data:
df_videos = pd.read_csv("data/videos_data.csv", sep = ";")

In [3]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The shooting range where you fire over a busy ...,2h1s6S4kotE,2021-10-14T11:25:18Z,1575580.0,108576.0,602.0,6421.0
1,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The world's only float-through McDonalds,A6F96xSoLPg,2021-10-08T16:43:23Z,2051954.0,130550.0,916.0,4573.0
2,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,I thought the Schmid Peoplemover was impossible,A2g4u9F9i90,2021-09-30T16:02:45Z,2639432.0,144361.0,815.0,5645.0
3,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The world's most useful model railway,6TLcaJdsRr0,2021-09-24T14:00:54Z,1960319.0,107866.0,623.0,3188.0
4,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The public toll road with no speed limit,10Y-gWNJ2Sw,2021-09-17T08:11:31Z,1954576.0,129773.0,819.0,3966.0
...,...,...,...,...,...,...,...,...,...
73087,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,OpenRCT2 Six Flags Magic Mountain Speedrun,Dv5k6Yd6CGs,2018-05-25T16:59:02Z,9187.0,103.0,4.0,10.0
73088,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,OpenRCT2 - Sherwood Forest Speedrun,mWRP2Thp2C4,2018-05-25T10:54:05Z,7455.0,90.0,5.0,5.0
73089,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,OpenRCT2 - Electric Fields Speedrun,FUlTbdqtvD8,2018-05-25T10:48:40Z,6580.0,79.0,5.0,2.0
73090,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,[Unofficial] OpenRCT2 speedrun: Amity Airfield...,Dw0e9tzLmA8,2018-03-23T20:23:25Z,29461.0,264.0,27.0,49.0


In [5]:
### Make some new variables

In [4]:
# Ratio between likes and dislikes:
df_videos["likes_dislikes_ratio"] = df_videos["likes"]/df_videos["dislikes"]

In [11]:
df_videos["likes_dislikes_ratio"].replace([np.inf, -np.inf], np.nan).dropna().describe()

count    66322.000000
mean        35.304122
std         56.862822
min          0.000000
25%          6.000000
50%         16.699682
75%         38.000000
max       2469.000000
Name: likes_dislikes_ratio, dtype: float64

In [12]:
df_videos.replace([np.inf, -np.inf], np.nan).dropna(subset = ["likes_dislikes_ratio"]).sort_values("likes_dislikes_ratio").\
apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days
14253,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,TV Legend Norman Lear Calls Out Hollywood | BI...,UX4ZJ0hVBuM,2016-08-23T14:35:59Z,193.0,0.0,1.0,0.0,0.0,1882.39169
28045,O Pimentinha,UCQ3JxE-NOyZaJ3m3qIkZbhA,O Pimentinha avançou no graxaim Véio.,ws2QgK9DYFA,2021-05-13T17:53:36Z,13544.0,2469.0,1.0,199.0,2469.0,158.254456


In [5]:
# Age in days (until today):
today = datetime.today().astimezone()
df_videos["age_days"] = [(today - parser.parse(i)).total_seconds()/(24*3600) for i in df_videos["video_upload_date"]]

In [51]:
df_videos["age_days"].describe()

count    73092.000000
mean      1656.387450
std       1160.520275
min          0.091022
25%        816.160964
50%       1503.069668
75%       2142.229772
max       5623.933696
Name: age_days, dtype: float64

In [52]:
df_videos.sort_values("age_days").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days
29840,Meteoro Brasil,UCk5BcU1rOy6hepflk7_q_Pw,LIVE: MAIS UMA ÚLTIMA SESSÃO DA CPI,UVFFi_KLWtk,2021-10-19T18:25:17Z,74674.0,9323.0,58.0,3.0,160.741379,0.091022
72932,cyriak,UC9Ntx-EF3LzKY1nQ5rTUP2g,dancing dave,Q9v-0rVeCIg,2006-05-27T22:11:50Z,637350.0,10749.0,605.0,1833.0,17.766942,5623.933696


In [36]:
# Ratio between comments and views:
df_videos["comments_views_ratio"] = df_videos["comments"]/df_videos["views"]

In [55]:
df_videos["comments_views_ratio"].replace([np.inf, -np.inf], np.nan).dropna().describe()

count    72815.000000
mean         0.003073
std          0.004719
min          0.000000
25%          0.000737
50%          0.001767
75%          0.003626
max          0.153484
Name: comments_views_ratio, dtype: float64

In [56]:
df_videos.sort_values("comments_views_ratio").replace([np.inf, -np.inf], np.nan).dropna().apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days,comments_views_ratio
16321,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Novavax CEO: Fighting The Biotech Bear | Mad M...,eC6HO0RRLDs,2015-09-30T23:11:21Z,3950.0,11.0,3.0,0.0,3.666667,2210.892365,0.0
52925,TODAY,UChDKyKQ59fYz3JO2fl0Z6sg,Capitol Police Exonerate Officer Who Shot And ...,4qiaPX42-3U,2021-08-21T12:18:12Z,15096.0,238.0,283.0,2317.0,0.840989,59.345941,0.153484


In [6]:
# Mean views per day:
df_videos["mean_views_day"] = df_videos["views"]/df_videos["age_days"]

In [58]:
df_videos["mean_views_day"].replace([np.inf, -np.inf], np.nan).dropna().describe()

count     73074.000000
mean        671.975995
std        6792.062150
min           0.000000
25%           3.258682
50%          42.981582
75%         281.769445
max      820395.772851
Name: mean_views_day, dtype: float64

In [59]:
df_videos.sort_values("mean_views_day").replace([np.inf, -np.inf], np.nan).dropna().apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days,comments_views_ratio,mean_views_day
5097,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,"Ahead of Netflix Earnings, Money Manager Pinpo...",fIqvMaoNLQY,2018-07-12T21:11:27Z,8.0,2.0,4.0,0.0,0.5,1194.975628,0.0,0.006695
29840,Meteoro Brasil,UCk5BcU1rOy6hepflk7_q_Pw,LIVE: MAIS UMA ÚLTIMA SESSÃO DA CPI,UVFFi_KLWtk,2021-10-19T18:25:17Z,74674.0,9323.0,58.0,3.0,160.741379,0.091022,4e-05,820395.772851


In [18]:
### Plots

In [13]:
# 1D Density of views:
x_var_name = "Views"
x_vals = df_videos["views"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[x_vals < 10000]
nbins = 200
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [16]:
# 1D Density of likes:
x_var_name = "Likes"
x_vals = df_videos["likes"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[x_vals < 1000]
nbins = 200
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [20]:
# 1D Density of dislikes:
x_var_name = "Dislikes"
x_vals = df_videos["dislikes"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[x_vals < 100]
nbins = 50
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [23]:
# 1D Density of comments:
x_var_name = "Comments"
x_vals = df_videos["comments"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[x_vals < 500]
nbins = 100
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [32]:
# 1D Density of likes_dislikes_ratio:
x_var_name = "Likes-Dislikes Ratio"
x_vals = df_videos["likes_dislikes_ratio"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[x_vals < 100]
nbins = 200
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [39]:
# 1D Density of comments_views_ratio:
x_var_name = "Comments-views Ratio"
x_vals = df_videos["comments_views_ratio"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[(x_vals < 0.02) & (x_vals > 0)]
nbins = 300
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [40]:
# Scatter of Mean views per day and Likes-disikes ratio:
x_var_name = "Likes-dislikes ratio"
y_var_name = "Mean views per day"
color_var_name = "Age (days)"
x_var = "likes_dislikes_ratio"
y_var = "mean_views_day"
color_var = "age_days"

channel = "O Pimentinha"
df_plot = df_videos[df_videos["channel_title"] == channel]

n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = df_plot[x_var],
        y = df_plot[y_var],
        mode = "markers",
        marker = {
            "size": 7,
            "color": df_plot[color_var],
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + color_var_name + "</b>"
            }
        },        
        text = df_plot["video_title"],
        customdata = df_plot[color_var],
        hovertemplate = "<b>" + x_var_name + ": %{x:}<br>" +
                         y_var_name + ": %{y:}<br>" +
                         color_var_name + ": %{customdata}<br>" +
                         "Video: %{text}</b><extra></extra>"
    )
)
fig.update_layout(
    title = "<b>" + channel + "</b>" ,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>" + y_var_name + "</b>",
    font = dict(
        size = 18
    ),
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 20,
        font_family = "Rockwell"
    )
)

In [41]:
# Scatter of likes and dislikes:
x_var_name = "Ages (days)"
y_var_name = "Comments-views ratio"
color_var_name = "Mean views per day"
size_var_name = "Likes-dislikes ratio"
x_var = "age_days"
y_var = "comments_views_ratio"
color_var = "mean_views_day"
size_var = "likes_dislikes_ratio"

channel = "Kurzgesagt – In a Nutshell"
df_plot = df_videos[df_videos["channel_title"] == channel]

n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = df_plot[x_var],
        y = df_plot[y_var],
        mode = "markers",
        marker = {
            "color": df_plot[color_var],
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + color_var_name + "</b>"
            },
            "size": df_plot[size_var],
            "opacity": 0.9,
            "sizemode": "area",
            "sizeref": 2.*max(df_plot[size_var])/(40.**2),
            "sizemin": 4            
        },
        text = df_plot["video_title"],
        customdata = df_plot[[color_var, size_var]],
        hovertemplate = "<b>" + x_var_name + ": %{x:}<br>" +
                         y_var_name + ": %{y:}<br>" +
                         color_var_name + ": %{customdata[0]}<br>" +
                         size_var_name + ": %{customdata[1]}<br>" +
                         "Video: %{text}</b><extra></extra>"
    )
)
fig.update_layout(
    title = "<b>" + channel + "</b>" ,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>" + y_var_name + "</b>",
    font = dict(
        size = 18
    ),
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 20,
        font_family = "Rockwell"
    )
)

In [None]:
### Dynamic plots (choosing the channel and the variables)

In [None]:
# 1D histogram with density:


In [None]:
# Scatter with colors:

In [None]:
# Bubble with colors:

In [None]:
# 2D density:

In [None]:
# Heatmap numeric:

In [None]:
# Lines with 3 variables (the categoric being the channel, until a limit of say 5 channels):

In [None]:
# Correlation matrix continuous: