In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.figure_factory as ff
from dateutil import parser
from matplotlib.colors import LinearSegmentedColormap, to_hex
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt


# Display options:
pd.set_option("display.width", 1200)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 100)

In [2]:
# Read the data:
df_videos = pd.read_csv("data/videos_data.csv", sep = ";")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df_videos["channel_title"].unique().shape

(119,)

In [4]:
df_videos

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments
0,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The shooting range where you fire over a busy ...,2h1s6S4kotE,2021-10-14T11:25:18Z,1575580.0,108576.0,602.0,6421.0
1,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The world's only float-through McDonalds,A6F96xSoLPg,2021-10-08T16:43:23Z,2051954.0,130550.0,916.0,4573.0
2,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,I thought the Schmid Peoplemover was impossible,A2g4u9F9i90,2021-09-30T16:02:45Z,2639432.0,144361.0,815.0,5645.0
3,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The world's most useful model railway,6TLcaJdsRr0,2021-09-24T14:00:54Z,1960319.0,107866.0,623.0,3188.0
4,Tom Scott,UCBa659QWEk1AI4Tg--mrJ2A,The public toll road with no speed limit,10Y-gWNJ2Sw,2021-09-17T08:11:31Z,1954576.0,129773.0,819.0,3966.0
...,...,...,...,...,...,...,...,...,...
73087,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,OpenRCT2 Six Flags Magic Mountain Speedrun,Dv5k6Yd6CGs,2018-05-25T16:59:02Z,9187.0,103.0,4.0,10.0
73088,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,OpenRCT2 - Sherwood Forest Speedrun,mWRP2Thp2C4,2018-05-25T10:54:05Z,7455.0,90.0,5.0,5.0
73089,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,OpenRCT2 - Electric Fields Speedrun,FUlTbdqtvD8,2018-05-25T10:48:40Z,6580.0,79.0,5.0,2.0
73090,Marcel Vos,UCBlXovStrlQkVA2xJEROUNg,[Unofficial] OpenRCT2 speedrun: Amity Airfield...,Dw0e9tzLmA8,2018-03-23T20:23:25Z,29461.0,264.0,27.0,49.0


In [5]:
### Make some new variables

In [3]:
# Ratio between likes and dislikes:
df_videos["likes_dislikes_ratio"] = df_videos["likes"]/df_videos["dislikes"]

In [11]:
df_videos["likes_dislikes_ratio"].replace([np.inf, -np.inf], np.nan).dropna().describe()

count    66322.000000
mean        35.304122
std         56.862822
min          0.000000
25%          6.000000
50%         16.699682
75%         38.000000
max       2469.000000
Name: likes_dislikes_ratio, dtype: float64

In [6]:
df_videos.replace([np.inf, -np.inf], np.nan).dropna(subset = ["likes_dislikes_ratio"]).sort_values("likes_dislikes_ratio").\
apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio
14253,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,TV Legend Norman Lear Calls Out Hollywood | BI...,UX4ZJ0hVBuM,2016-08-23T14:35:59Z,193.0,0.0,1.0,0.0,0.0
28045,O Pimentinha,UCQ3JxE-NOyZaJ3m3qIkZbhA,O Pimentinha avançou no graxaim Véio.,ws2QgK9DYFA,2021-05-13T17:53:36Z,13544.0,2469.0,1.0,199.0,2469.0


In [4]:
# Age in days (until today):
today = datetime.today().astimezone()
df_videos["age_days"] = [(today - parser.parse(i)).total_seconds()/(24*3600) for i in df_videos["video_upload_date"]]

In [9]:
df_videos["age_days"].describe()

count    73092.000000
mean      1657.350452
std       1160.520275
min          1.054024
25%        817.123966
50%       1504.032670
75%       2143.192774
max       5624.896697
Name: age_days, dtype: float64

In [10]:
df_videos.sort_values("age_days").apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days
29840,Meteoro Brasil,UCk5BcU1rOy6hepflk7_q_Pw,LIVE: MAIS UMA ÚLTIMA SESSÃO DA CPI,UVFFi_KLWtk,2021-10-19T18:25:17Z,74674.0,9323.0,58.0,3.0,160.741379,1.054024
72932,cyriak,UC9Ntx-EF3LzKY1nQ5rTUP2g,dancing dave,Q9v-0rVeCIg,2006-05-27T22:11:50Z,637350.0,10749.0,605.0,1833.0,17.766942,5624.896697


In [5]:
# Ratio between comments and views:
df_videos["comments_views_ratio"] = df_videos["comments"]/df_videos["views"]

In [12]:
df_videos["comments_views_ratio"].replace([np.inf, -np.inf], np.nan).dropna().describe()

count    72815.000000
mean         0.003073
std          0.004719
min          0.000000
25%          0.000737
50%          0.001767
75%          0.003626
max          0.153484
Name: comments_views_ratio, dtype: float64

In [13]:
df_videos.sort_values("comments_views_ratio").replace([np.inf, -np.inf], np.nan).dropna().apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days,comments_views_ratio
16321,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,Novavax CEO: Fighting The Biotech Bear | Mad M...,eC6HO0RRLDs,2015-09-30T23:11:21Z,3950.0,11.0,3.0,0.0,3.666667,2211.855366,0.0
52925,TODAY,UChDKyKQ59fYz3JO2fl0Z6sg,Capitol Police Exonerate Officer Who Shot And ...,4qiaPX42-3U,2021-08-21T12:18:12Z,15096.0,238.0,283.0,2317.0,0.840989,60.308943,0.153484


In [6]:
# Mean views per day:
df_videos["mean_views_day"] = df_videos["views"]/df_videos["age_days"]

In [15]:
df_videos["mean_views_day"].replace([np.inf, -np.inf], np.nan).dropna().describe()

count     73074.000000
mean        618.145329
std        5089.088170
min           0.000000
25%           3.256453
50%          42.895765
75%         281.045070
max      701947.267448
Name: mean_views_day, dtype: float64

In [16]:
df_videos.sort_values("mean_views_day").replace([np.inf, -np.inf], np.nan).dropna().apply(lambda x: pd.concat([x.head(1), x.tail(1)]))

Unnamed: 0,channel_title,channel_id,video_title,video_id,video_upload_date,views,likes,dislikes,comments,likes_dislikes_ratio,age_days,comments_views_ratio,mean_views_day
5097,CNBC,UCvJJ_dzjViJCoLf5uKUTwoA,"Ahead of Netflix Earnings, Money Manager Pinpo...",fIqvMaoNLQY,2018-07-12T21:11:27Z,8.0,2.0,4.0,0.0,0.5,1195.93863,0.0,0.006689
19025,Kurzgesagt – In a Nutshell,UCsXVk37bltHxD1rDPwtNM8Q,What Dinosaurs ACTUALLY Looked Like?,xaQJbozY_Is,2021-10-12T10:03:46Z,5897971.0,388289.0,3412.0,17565.0,113.800996,8.402299,0.002978,701947.267448


In [18]:
### Plots

In [7]:
# 1D Density of views:
x_var_name = "Views"
x_vals = df_videos["views"].replace([np.inf, -np.inf], np.nan).dropna()
x_vals = x_vals[x_vals < 10000]
nbins = 200
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = ["#813DDA"]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:,}<br>" + 
                    x_var_name + ": %{x:,}</b><extra></extra>"
)
fig.update_layout(
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)

In [40]:
# Scatter of Mean views per day and Likes-disikes ratio:
x_var_name = "Likes-dislikes ratio"
y_var_name = "Mean views per day"
color_var_name = "Age (days)"
x_var = "likes_dislikes_ratio"
y_var = "mean_views_day"
color_var = "age_days"

channel = "O Pimentinha"
df_plot = df_videos[df_videos["channel_title"] == channel]

n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = df_plot[x_var],
        y = df_plot[y_var],
        mode = "markers",
        marker = {
            "size": 7,
            "color": df_plot[color_var],
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + color_var_name + "</b>"
            }
        },        
        text = df_plot["video_title"],
        customdata = df_plot[color_var],
        hovertemplate = "<b>" + x_var_name + ": %{x:}<br>" +
                         y_var_name + ": %{y:}<br>" +
                         color_var_name + ": %{customdata}<br>" +
                         "Video: %{text}</b><extra></extra>"
    )
)
fig.update_layout(
    title = "<b>" + channel + "</b>" ,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>" + y_var_name + "</b>",
    font = dict(
        size = 18
    ),
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 20,
        font_family = "Rockwell"
    )
)

In [78]:
# Scatter of likes and dislikes:
x_var_name = "Ages (days)"
y_var_name = "Comments-views ratio"
color_var_name = "Mean views per day"
size_var_name = "Likes-dislikes ratio"
x_var = "age_days"
y_var = "comments_views_ratio"
color_var = "mean_views_day"
size_var = "likes_dislikes_ratio"

channel = "Kurzgesagt – In a Nutshell"
df_plot = df_videos[df_videos["channel_title"] == channel]

n_colors = 100
my_colors = ["#000000", "#E008F8", "#F81D08", "#F88A08", "#F7FE04"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [to_hex(j) for j in  [cmap(i/n_colors) for i in np.array(range(n_colors))]]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = df_plot[x_var],
        y = df_plot[y_var],
        mode = "markers",
        marker = {
            "color": df_plot[color_var],
            "colorscale": my_palette,
            "showscale": True,
            "colorbar": {
                "title": "<b>" + color_var_name + "</b>"
            },
            "size": df_plot[size_var],
            "opacity": 0.9,
            "sizemode": "area",
            "sizeref": 2.*max(df_plot[size_var])/(40.**2),
            "sizemin": 4            
        },
        text = df_plot["video_title"],
        customdata = df_plot[[color_var, size_var]],
        hovertemplate = "<b>" + x_var_name + ": %{x:.0f}<br>" +
                         y_var_name + ": %{y:.2g}<br>" +
                         color_var_name + ": %{customdata[0]:.0f}<br>" +
                         size_var_name + ": %{customdata[1]:.3f}<br>" +
                         "Video: %{text}</b><extra></extra>"
    )
)
fig.update_layout(
    title = "<b>" + channel + "</b>" ,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>" + y_var_name + "</b>",
    font = dict(
        size = 18
    ),
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 20,
        font_family = "Rockwell"
    )
)

In [None]:
### Dynamic plots (choosing the channel, the variables and filters)

In [9]:
# Variables names:
numvars = {
    "views": "View",
    "likes": "Likes",
    "dislikes": "Dislikes",
    "comments": "Comments",
    "likes_dislikes_ratio": "Likes-dislikes ratio",
    "age_days": "Age (days)",
    "comments_views_ratio": "Comment-views ratio",
    "mean_views_day": "Mean views per day"
}
catvars = {
    "channel_title": "Channel",
    "video_title": "Video"
}

In [10]:
# Options:
opts_channel = ["All channels"] + np.sort(df_videos["channel_title"].unique()).tolist()
opts_vars = list(numvars.keys())
opts_vars = [i for i in numvars.keys()]
opts_vars.sort()

In [11]:
### 1D histogram with density

# Widgets:




channel = opts_channel[0]
x_var = opts_vars[0]
nbins = 100
x_min = 10
x_max = 4000
plot_color = "#813DDA"

# Nice name:
x_var_name = numvars[x_var]

# Filter the channel:
if channel == "All channels":
    x_vals = df_videos[x_var].replace([np.inf, -np.inf], np.nan).dropna()
else:
    x_vals = df_videos.loc[df_videos["channel_title"] == channel, x_var].replace([np.inf, -np.inf], np.nan).dropna()

# Checks the range:
msng = ""
warning_msng = "*Choose valid filter values."
if x_min >= x_max or x_max < np.min(x_vals) or x_min > np.max(x_vals):
    msng = warning_msng
    pass
else:
    if x_min < np.min(x_vals):
        msng = warning_msng
        pass
    else:
        x_vals = x_vals[x_vals > x_min]
    if x_max > np.max(x_vals):
        msng = warning_msng
        pass
    else:
        x_vals = x_vals[x_vals < x_max]

# Bin size:
binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

# Plot:
fig = ff.create_distplot(
    hist_data = [x_vals],
    group_labels = ["x"],
    curve_type = "kde",
    show_curve = True,
    show_hist = True,
    bin_size = binsize,
    show_rug = False,
    colors = [plot_color]
)
fig.update_traces(
    hovertemplate = "<b>Density: %{y:.2g}<br>" + 
                    x_var_name + ": %{x:.0f}</b><extra></extra>"
)
fig.update_layout(
    title = msng,
    xaxis_title = "<b>" + x_var_name + "</b>" ,
    yaxis_title = "<b>Density</b>",
    font = dict(
        size = 18
    ),
    showlegend = False,
    plot_bgcolor = "white",
    hoverlabel = dict(
        font_size = 18,
        font_family = "Rockwell"
    )
)



In [234]:
plot_output = widgets.Output()

dropdown_channels = widgets.Dropdown(
    description = "Channel: ",
    options = opts_channel,
    value = opts_channel[0]
)

slide_likes = widgets.IntSlider(
    description = "Likes: ",
    min = 1000,
    max = 1000000,
    step = 1000,
    value = 50000
)

fig = go.FigureWidget(data = [go.Histogram(x = df_videos["likes"])])


def filtering(chosen_channel, chosen_likes):
    # plot_output.clear_output()

    if chosen_channel == "All channels":
        df_filtered = df_videos[df_videos["likes"] > chosen_likes]
    else:
        df_filtered = df_videos[(df_videos["likes"] > chosen_likes) & (df_videos["channel_title"] == chosen_channel)]    
    
    with fig.batch_update: #plot_output:
        x_var = "likes"
        nbins = 100
        x_min = 10
        x_max = 4000
        plot_color = "#813DDA"

        # Nice name:
        x_var_name = numvars[x_var]

        # Values:
        x_vals = df_filtered[x_var].replace([np.inf, -np.inf], np.nan).dropna()

        # Checks the range:
        msng = ""
        warning_msng = "*Choose valid filter values."
        if x_min >= x_max or x_max < np.min(x_vals) or x_min > np.max(x_vals):
            msng = warning_msng
            pass
        else:
            if x_min < np.min(x_vals):
                msng = warning_msng
                pass
            else:
                x_vals = x_vals[x_vals > x_min]
            if x_max > np.max(x_vals):
                msng = warning_msng
                pass
            else:
                x_vals = x_vals[x_vals < x_max]

        # Bin size:
        binsize = np.abs(np.max(x_vals) - np.min(x_vals))/nbins

        # Plot:
       
        fig.data[0].x = x_vals

        # fig = go.FigureWidget(
        #     data = ff.create_distplot(
                # hist_data = [x_vals],
                # group_labels = ["x"],
                # curve_type = "kde",
                # show_curve = True,
                # show_hist = True,
                # bin_size = binsize,
                # show_rug = False,
        #         colors = [plot_color]
        #     )
        # )
        # fig.update_traces(
        #     hovertemplate = "<b>Density: %{y:.2g}<br>" + 
        #                     x_var_name + ": %{x:.0f}</b><extra></extra>"
        # )
        # fig.update_layout(
        #     title = msng,
        #     xaxis_title = "<b>" + x_var_name + "</b>" ,
        #     yaxis_title = "<b>Density</b>",
        #     font = dict(
        #         size = 18
        #     ),
        #     showlegend = False,
        #     plot_bgcolor = "white",
        #     hoverlabel = dict(
        #         font_size = 18,
        #         font_family = "Rockwell"
        #     )
        # )
        # fig.show()

        # sns.kdeplot(df_filtered["likes"], shade = True)
        # plt.show()



        
def dropdown_channels_eventhandler(change):
    filtering(chosen_channel = change.new,
              chosen_likes = slide_likes.value)

def slide_likes_eventhandler(change):
    filtering(chosen_channel = dropdown_channels.value,
              chosen_likes = change.new)

dropdown_channels.observe(dropdown_channels_eventhandler,
                          names = "value")
slide_likes.observe(slide_likes_eventhandler,
                    names = "value")

display(dropdown_channels)
display(slide_likes)
display(plot_output)

Dropdown(description='Channel: ', options=('All channels', '3Blue1Brown', 'Aero Por Trás da Aviação', 'Anglo-L…

IntSlider(value=50000, description='Likes: ', max=1000000, min=1000, step=1000)

Output()

In [15]:
df = pd.read_csv(
    'https://raw.githubusercontent.com/yankev/testing/master/datasets/nycflights.csv')
df = df.drop(df.columns[[0]], axis=1)
month = widgets.IntSlider(
    value=1.0,
    min=1.0,
    max=12.0,
    step=1.0,
    description='Month:',
    continuous_update=False
)

use_date = widgets.Checkbox(
    description='Date: ',
    value=True,
)

container = widgets.HBox(children=[use_date, month])

textbox = widgets.Dropdown(
    description='Airline:   ',
    value='DL',
    options=df['carrier'].unique().tolist()
)

origin = widgets.Dropdown(
    options=list(df['origin'].unique()),
    value='LGA',
    description='Origin Airport:',
)


# Assign an empty figure widget with two traces
trace1 = go.Histogram(x=df['arr_delay'], opacity=0.75, name='Arrival Delays')
trace2 = go.Histogram(x=df['dep_delay'], opacity=0.75, name='Departure Delays')
g = go.FigureWidget(data=[trace1, trace2],
                    layout=go.Layout(
                        title=dict(
                            text='NYC FlightDatabase'
                        ),
                        barmode='overlay'
                    ))

def validate():
    if origin.value in df['origin'].unique() and textbox.value in df['carrier'].unique():
        return True
    else:
        return False


def response(change):
    if validate():
        if use_date.value:
            filter_list = [i and j and k for i, j, k in
                           zip(df['month'] == month.value, df['carrier'] == textbox.value,
                               df['origin'] == origin.value)]
            temp_df = df[filter_list]

        else:
            filter_list = [i and j for i, j in
                           zip(df['carrier'] == 'DL', df['origin'] == origin.value)]
            temp_df = df[filter_list]
        x1 = temp_df['arr_delay']
        x2 = temp_df['dep_delay']
        with g.batch_update():
            g.data[0].x = x1
            g.data[1].x = x2
            g.layout.barmode = 'overlay'
            g.layout.xaxis.title = 'Delay in Minutes'
            g.layout.yaxis.title = 'Number of Delays'


origin.observe(response, names="value")
textbox.observe(response, names="value")
month.observe(response, names="value")
use_date.observe(response, names="value")

In [16]:
container2 = widgets.HBox([origin, textbox])
widgets.VBox([container,
              container2,
              g])

VBox(children=(HBox(children=(Checkbox(value=True, description='Date: '), IntSlider(value=1, continuous_update…

In [None]:
# Scatter with colors:

In [None]:
# Bubble with colors:

In [None]:
# 2D density:

In [None]:
# Heatmap numeric:

In [None]:
# Lines with 3 variables (the categoric being the channel, until a limit of say 5 channels):

In [None]:
# Correlation matrix continuous: