In [742]:
import pandas as pd
import numpy as np
from collections import defaultdict
from bs4 import BeautifulSoup as bs4
from tqdm import tqdm
tqdm.pandas()
import seaborn as sns
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import matplotlib.pyplot as plt
from calendar import monthrange
from wordcloud import WordCloud

In [None]:
# Open HTML
# Read the whole export into memory as UTF-8 text; the `with` block closes the
# file handle once the read completes.
# NOTE(review): `path` is not assigned in any visible cell of this notebook —
# it must be defined (e.g. in a config cell) before this cell runs, otherwise
# this raises NameError on a fresh kernel.
with open(path, 'r', encoding='utf-8') as f:
    html = f.read()

In [None]:
# Parse HTML
# Build a BeautifulSoup tree from the raw markup with the 'html.parser' backend;
# the following cell queries `soup` for the content cells.
soup = bs4(html, 'html.parser')

In [433]:
# Create DataFrame from parsed HTML
# One record is collected per matching content cell: the link text, the link
# target, and the text found five nodes after the cell.
df_dict = defaultdict(list)

data = soup.find_all('div', {'class':'content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1'})
for cell in data:
    link = cell.find('a')
    if link is None:
        # find() returns None when the cell contains no <a>; skip such cells
        # instead of failing on None.get_text().
        continue
    df_dict['search'].append(link.get_text())
    df_dict['url'].append(link['href'])
    # Five .next hops in document order from the cell.
    # NOTE(review): presumably this lands on the timestamp text after the link —
    # confirm against the export's markup.
    df_dict['datetime'].append(str(cell.next.next.next.next.next.string))

# Naming the columns keeps all three present even when no cell matched, so the
# following cells do not fail with KeyError on an empty export.
df = pd.DataFrame(df_dict, columns=['search', 'url', 'datetime'])

In [434]:
# Remove non-searches
# Any entry whose timestamp text contains a "Watched at H:MMam/pm" match is
# replaced by NaN, and then every row that contains a NaN is dropped.
df = df.assign(
    datetime=df['datetime'].replace(
        r'(Watched at )(\d|\d\d):\d\d(pm|am)', np.nan, regex=True
    )
).dropna()

In [435]:
# Convert to datetime
# Parse the timestamp strings into pandas datetimes. No explicit format is
# passed, so pandas infers it from the text.
df['datetime'] = pd.to_datetime(df['datetime'])



In [436]:
# Extract datetime
# Split the timestamp into one column per component using the vectorised
# `.dt` accessor instead of a per-row Python lambda.
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['second'] = df['datetime'].dt.second
df['minute'] = df['datetime'].dt.minute

# Persist the prepared frame; a later cell reloads it from this file.
df.to_csv('youtube_data.csv')

df.head()

Unnamed: 0,search,url,datetime,year,month,day,hour,second,minute
0,world's hardest game aaron,https://www.youtube.com/results?search_query=w...,2022-11-12 22:35:20,2022,11,12,22,20,35
1,world's hardest game shiroma,https://www.youtube.com/results?search_query=w...,2022-11-12 22:07:26,2022,11,12,22,26,7
2,3blue1brown mandlebrot,https://www.youtube.com/results?search_query=3...,2022-11-12 20:15:40,2022,11,12,20,40,15
3,niggas in parris,https://www.youtube.com/results?search_query=n...,2022-11-12 19:52:54,2022,11,12,19,54,52
4,xps 15 9520 short circuit,https://www.youtube.com/results?search_query=x...,2022-11-12 19:31:06,2022,11,12,19,6,31


In [438]:
# Reload the prepared data written by the extraction cell.
# - parse_dates restores the datetime dtype, which a CSV stores only as text.
# - keep_default_na=False stops read_csv from converting search terms such as
#   "null", "NA" or "nan" (and empty searches) into NaN.
df = pd.read_csv('youtube_data.csv', index_col = 0, parse_dates=['datetime'], keep_default_na=False)

In [1033]:
# Functions
def int_to_month(month):
    """Return the three-letter English abbreviation for a month number.

    Args:
        month: Month number, 1 (January) through 12 (December).

    Returns:
        str: The abbreviation, e.g. 'Jan' for 1 and 'Dec' for 12.

    Raises:
        KeyError: If ``month`` is not one of 1..12.
    """
    abbreviations = (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    )
    # A keyed lookup (not tuple indexing) so that values outside 1..12 raise
    # KeyError rather than wrapping around or raising IndexError.
    lookup = dict(enumerate(abbreviations, start=1))
    return lookup[month]

def fill_hours(grouped_df):
    """Expand per-hour values to a full 24-row frame, zero-filling absent hours.

    Args:
        grouped_df: DataFrame with an 'hour' column and a 'search' column (in
            this notebook: the result of grouping by 'hour' and counting
            'search').

    Returns:
        pandas.DataFrame: Two columns, 'hour' (0..23 in order) and 'search'
        (the value from ``grouped_df`` for that hour, or 0 when the hour does
        not occur in ``grouped_df``).
    """
    # Build the hour -> value lookup in a single pass instead of rescanning the
    # frame for each of the 24 hours.
    searches_by_hour = {}
    for hour, count in zip(grouped_df['hour'], grouped_df['search']):
        # setdefault keeps the first row should an hour appear more than once.
        searches_by_hour.setdefault(hour, count)

    hours = list(range(24))
    return pd.DataFrame({
        'hour': hours,
        'search': [searches_by_hour.get(hour, 0) for hour in hours],
    })

def df_to_single_string(df):
    """Join every search term in ``df`` into one lower-cased string.

    Args:
        df: DataFrame with a 'search' column.

    Returns:
        str: The terms lower-cased and separated by single spaces; an empty
        string when there are no usable terms.
    """
    # Missing values are dropped and the rest coerced to str first, because
    # str.join raises TypeError on any non-string item (e.g. NaN).
    terms = df['search'].dropna().astype(str).str.lower()
    return ' '.join(terms)

def string_to_wordcloud(text):
    """Render ``text`` as a word cloud in a new matplotlib figure.

    Args:
        text: The words to draw, as a single string.

    Returns:
        None. The figure is shown as a side effect. When ``text`` is empty or
        only whitespace, a short notice is printed and nothing is drawn.
    """
    # A word cloud cannot be generated from zero words, so bail out before
    # creating a figure that would be left empty.
    if not text or not text.strip():
        print('No search terms to build a word cloud from.')
        return

    wc = WordCloud(background_color='white', width=700,height=350)
    wc.generate(text)

    # The figure is created only after generation has succeeded.
    plt.figure(figsize=(15,10))
    plt.axis('off')
    plt.imshow(wc, interpolation='bilinear')
    plt.show()

# string_to_wordcloud(df_to_single_string(df))

def high_overview(min_max = (2015, 2022), toggle_search = False, toggle_wc = False):
    """Plot the number of searches per year for a range of years.

    Reads the notebook-level ``df``.

    Args:
        min_max: ``(first_year, last_year)`` pair; rows whose 'year' lies
            between the two (bounds included) are shown.
        toggle_search: When truthy, also show a slider-controlled listing of
            the selected searches.
        toggle_wc: When truthy, also draw a word cloud of the selected
            searches.
    """
    first_year, last_year = min_max
    show_df = df[df['year'].between(first_year, last_year)]
    plot_title = str(first_year) + ' - ' + str(last_year)

    def plot_searches_per_year():
        # Count searches per year and draw them as a bar chart.
        yearly_counts = show_df.groupby('year')['search'].count().reset_index()
        sns.barplot(data=yearly_counts, x='year', y='search').set(title=plot_title)

    def window_size(size=10):
        # Slider callback: list the first `size` searches, then the chart.
        display(show_df[['search', 'datetime']][:size])
        plot_searches_per_year()

    if toggle_wc:
        string_to_wordcloud(df_to_single_string(show_df))

    if toggle_search:
        size_slider = widgets.IntSlider(value=10, min=0, max=100)
        interact(window_size, show_df=show_df, size=size_slider)

    plot_searches_per_year()

    

def select_year(year, toggle_search = False, toggle_wc = False):
    """Plot searches per month for one year and offer a month drill-down.

    Reads the notebook-level ``df``. Does nothing when ``year`` is falsy
    (e.g. ``None`` while no year is selected).

    Args:
        year: The year to show.
        toggle_search: When truthy, also show a slider-controlled listing of
            that year's searches.
        toggle_wc: When truthy, also draw a word cloud of that year's searches.
    """
    if not year:
        return

    show_df = df[df['year'] == year]

    def plot_searches_per_month():
        # Count searches per month and draw them as a bar chart titled with
        # the year.
        per_month = show_df.groupby('month')['search'].count().reset_index()
        sns.barplot(data = per_month, x='month',y='search').set(title=year)

    def window_size(size=10):
        # Slider callback: list the first `size` searches, then the chart.
        # The chart title is the selected year; it must not be built from
        # `min`/`max`, which in this scope are the builtin functions.
        display(show_df[['search','datetime']][:size])
        plot_searches_per_month()

    if toggle_wc:
        string_to_wordcloud(df_to_single_string(show_df))

    if toggle_search:
        size_slider = widgets.IntSlider(value=10, min = 0, max = 100)
        interact(window_size, show_df = show_df, size = size_slider)

    plot_searches_per_month()

    # A hidden dropdown carries the chosen year into the month-level controls.
    min_year, max_year = df['year'].min(), df['year'].max()
    years = list(range(min_year, max_year+1))
    year_dropdown = widgets.Dropdown(value=year, options=years)
    year_dropdown.layout.visibility='hidden'

    # The month dropdown starts with no selection.
    months = list(range(1,13))
    select_month_dropdown = widgets.Dropdown(options=months, value=None)
    toggle_search_button = widgets.ToggleButton(value = False)
    toggle_wc_button = widgets.ToggleButton(value = False)

    interact(select_month, year = year_dropdown, month = select_month_dropdown, toggle_search = toggle_search_button, toggle_wc = toggle_wc_button)

    

def select_month(year, month, toggle_search = False, toggle_wc = False):
    """Plot searches per day for one month and offer a day drill-down.

    Reads the notebook-level ``df``. Does nothing when ``month`` is ``None``.

    Args:
        year: The year to show.
        month: The month number to show.
        toggle_search: When truthy, also show a slider-controlled listing of
            that month's searches.
        toggle_wc: When truthy, also draw a word cloud of that month's
            searches.
    """
    if month is None:
        return

    show_df = df[(df['year'] == year) & (df['month'] == month)]

    def plot_searches_per_day():
        # Count searches per day and draw them as a bar chart.
        daily_counts = show_df.groupby('day')['search'].count().reset_index()
        chart = sns.barplot(data=daily_counts, x='day', y='search')
        chart.set(title=str(year) + ', ' + int_to_month(month))

    def window_size(size=10):
        # Slider callback: list the first `size` searches, then the chart.
        display(show_df[['search', 'datetime']][:size])
        plot_searches_per_day()

    if toggle_wc:
        string_to_wordcloud(df_to_single_string(show_df))

    if toggle_search:
        size_slider = widgets.IntSlider(value=10, min=0, max=100)
        interact(window_size, show_df=show_df, size=size_slider)

    plot_searches_per_day()

    # Hidden dropdowns carry the chosen year and month into the day-level
    # controls.
    first_year, last_year = df['year'].min(), df['year'].max()
    year_dropdown = widgets.Dropdown(
        value=year,
        options=list(range(first_year, last_year + 1)),
    )
    year_dropdown.layout.visibility = 'hidden'

    select_month_dropdown = widgets.Dropdown(options=list(range(1, 13)), value=month)
    select_month_dropdown.layout.visibility = 'hidden'

    # The day dropdown offers every day of the selected month and starts with
    # no selection.
    days_in_month = monthrange(year, month)[1]
    select_days_dropdown = widgets.Dropdown(
        options=range(1, days_in_month + 1),
        value=None,
    )
    toggle_search_button = widgets.ToggleButton(value=False)
    toggle_wc_button = widgets.ToggleButton(value=False)

    interact(
        select_day,
        year=year_dropdown,
        month=select_month_dropdown,
        day=select_days_dropdown,
        toggle_search=toggle_search_button,
        toggle_wc=toggle_wc_button,
    )
    

def select_day(year, month, day, toggle_search = False, toggle_wc = False):
    """Plot searches per hour for one day and offer an hour drill-down.

    Reads the notebook-level ``df``. Does nothing when ``day`` is ``None``.

    Args:
        year: The year to show.
        month: The month number to show.
        day: The day of the month to show.
        toggle_search: When truthy, also show a slider-controlled listing of
            that day's searches.
        toggle_wc: When truthy, also draw a word cloud of that day's searches.
    """
    if day is None:
        return

    on_selected_day = (
        (df['year'] == year) & (df['month'] == month) & (df['day'] == day)
    )
    show_df = df[on_selected_day]

    def plot_searches_per_hour():
        # Count searches per hour, pad to all 24 hours, and draw a bar chart.
        hourly_counts = show_df.groupby('hour')['search'].count().reset_index()
        all_hours = fill_hours(hourly_counts)
        chart = sns.barplot(data=all_hours, x='hour', y='search')
        chart.set(title=str(year) + ', ' + int_to_month(month) + ', ' + str(day))

    def window_size(size=10):
        # Slider callback: list the first `size` searches, then the chart.
        display(show_df[['search', 'datetime']][:size])
        plot_searches_per_hour()

    if toggle_wc:
        string_to_wordcloud(df_to_single_string(show_df))

    if toggle_search:
        size_slider = widgets.IntSlider(value=10, min=0, max=100)
        interact(window_size, show_df=show_df, size=size_slider)

    plot_searches_per_hour()

    # Hidden dropdowns carry the chosen year, month and day into the
    # hour-level controls.
    first_year, last_year = df['year'].min(), df['year'].max()
    year_dropdown = widgets.Dropdown(
        value=year,
        options=list(range(first_year, last_year + 1)),
    )
    year_dropdown.layout.visibility = 'hidden'

    select_month_dropdown = widgets.Dropdown(options=list(range(1, 13)), value=month)
    select_month_dropdown.layout.visibility = 'hidden'

    days_in_month = monthrange(year, month)[1]
    select_days_dropdown = widgets.Dropdown(
        options=range(1, days_in_month + 1),
        value=day,
    )
    select_days_dropdown.layout.visibility = 'hidden'

    # The hour dropdown starts with no selection.
    select_hours_dropdown = widgets.Dropdown(options=list(range(0, 24)), value=None)
    toggle_wc_button = widgets.ToggleButton(value=False)

    interact(
        select_hour,
        year=year_dropdown,
        month=select_month_dropdown,
        day=select_days_dropdown,
        hour=select_hours_dropdown,
        toggle_wc=toggle_wc_button,
    )
        
        

def select_hour(year, month, day, hour, toggle_wc = False):
    """Display the searches made during one specific hour.

    Reads the notebook-level ``df``. Does nothing when ``hour`` is ``None``.

    Args:
        year: The year to show.
        month: The month number to show.
        day: The day of the month to show.
        hour: The hour of the day to show.
        toggle_wc: When truthy, draw a word cloud of the matching searches
            before the table.
    """
    if hour is None:
        return

    same_year = df['year'] == year
    same_month = df['month'] == month
    same_day = df['day'] == day
    same_hour = df['hour'] == hour
    show_df = df[same_year & same_month & same_day & same_hour]

    if toggle_wc:
        string_to_wordcloud(df_to_single_string(show_df))

    display(show_df[['search', 'url']])


# Example: list the searches made on 27 Nov 2020 during hour 13.
select_hour(year=2020, month=11, day=27, hour=13)

Unnamed: 0,search,url
1999,bayes probability examples,https://www.youtube.com/results?search_query=b...
2000,conditionalisation example,https://www.youtube.com/results?search_query=c...
2001,bayes theorem example,https://www.youtube.com/results?search_query=b...
2002,bayes theorem conditionalisation,https://www.youtube.com/results?search_query=b...


In [1034]:
# Testing
def f(x):
    """Display the first ``x`` rows of the notebook-level ``df``."""
    display(df.iloc[:x])

# Slider-driven preview of the frame, starting at 10 rows.
x_dr = widgets.IntSlider(value=10)
interact(f, x=x_dr)

interactive(children=(IntSlider(value=10, description='x'), Output()), _dom_classes=('widget-interact',))

<function __main__.f(x)>

In [1035]:
# Initialization
# The year bounds come from the data; `years` is every year between them,
# bounds included.
min_year = df['year'].min()
max_year = df['year'].max()
years = list(range(min_year, max_year + 1))
months = list(range(1, 13))
years

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [1036]:
# Widgets
# The range slider is bounded by min/max only. IntRangeSlider has no `options`
# setting, so none is passed.
min_max_slider = widgets.IntRangeSlider(value=[min_year, max_year], min=min_year, max=max_year, description='Filter years')
# The year dropdown starts with no selection.
select_year_dropdown = widgets.Dropdown(options=years, value=None, description = 'Select a year')
# One search/word-cloud toggle pair per interactive view.
toggle_search_button = widgets.ToggleButton(value = False)
toggle_wc_button = widgets.ToggleButton(value = False)
toggle_search_button2 = widgets.ToggleButton(value = False)
toggle_wc_button2 = widgets.ToggleButton(value = False)

In [1037]:
# Overview across a range of years, driven by the range slider.
interact(
    high_overview,
    min_max=min_max_slider,
    toggle_search=toggle_search_button,
    toggle_wc=toggle_wc_button,
)
# Drill-down that starts from a single selected year.
interact(
    select_year,
    year=select_year_dropdown,
    toggle_search=toggle_search_button2,
    toggle_wc=toggle_wc_button2,
)

interactive(children=(IntRangeSlider(value=(2015, 2022), description='Filter years', max=2022, min=2015), Togg…

interactive(children=(Dropdown(description='Select a year', options=(2015, 2016, 2017, 2018, 2019, 2020, 2021,…

<function __main__.select_year(year, toggle_search=False, toggle_wc=False)>

# TODO:
- Add days with no usage to the plot
- Format month integer to readable month
- Dynamic dropdown options
- Render plots next to each other
- Add button to show list of searches at year, month, day and hour level
- Fix disappearance of the next dropdown when search is toggled at the year stage and at every deeper stage (month, day)

In [985]:
def df_to_single_string(df):
    """Join every search term in ``df`` into one lower-cased string.

    NOTE(review): a function with this same name is also defined in the
    "Functions" cell earlier in this notebook; whichever cell runs last wins,
    so the two should be kept in agreement or one of them removed.

    Args:
        df: DataFrame with a 'search' column.

    Returns:
        str: The terms lower-cased and separated by single spaces; an empty
        string when there are no usable terms.
    """
    # Drop missing values and coerce the rest to str before joining, since
    # str.join raises TypeError on any non-string item (e.g. NaN).
    usable_terms = df['search'].dropna().astype(str)
    return ' '.join(usable_terms.str.lower())