In [1]:
from pathlib import Path

import bs4 as bs
import pandas as pd
import requests as req

In [2]:
# Location of the watch-history export, relative to the working directory.
# Built with pathlib so the notebook also runs where "\" is not a path separator.
history_html_path = Path('data') / 'raw' / 'history' / 'watch-history.html'

# Read the whole file first so the handle is released before the slow parse.
with open(history_html_path, 'r', encoding='utf-8') as f:
    html = f.read()
print("File read\nParsing HTML...")

# Build the tree from the in-memory string using the lxml parser.
parsed = bs.BeautifulSoup(html, 'lxml')

# NOTE: this is a second name for the SAME tree, not an independent copy;
# any in-place modification made through `parsed` is visible through
# `parsed_safe` as well. Use copy.copy(parsed) if a real duplicate is needed.
parsed_safe = parsed

File read
Parsing HTML...


In [3]:
# Collect every div that wraps a single watch-history item
watched_videos_divs = parsed.find_all(
    name='div',
    attrs={'class': 'outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp'},
)

print("Number of videos watched:", len(watched_videos_divs))

Number of videos watched: 37063


In [4]:
from dateutil.parser import parse
import pytz

# Daylight-saving abbreviations mapped to a pytz zone name that carries both
# the standard and the daylight offset. pytz has no zone called e.g. "CEST",
# but localizing a summer timestamp in "CET" yields the +02:00 offset.
_DST_ABBREVIATION_ZONES = {
    'CEST': 'CET',
    'EEST': 'EET',
    'WEST': 'WET',
    'EDT': 'EST5EDT',
    'CDT': 'CST6CDT',
    'MDT': 'MST7MDT',
    'PDT': 'PST8PDT',
}


def parse_date(date_string):
    """Parse a watch-history timestamp into a timezone-aware datetime.

    The last whitespace-separated token is taken as the timezone label and
    everything before it as the date/time, e.g. ``18 Jan 2023, 20:57:44 CET``.

    Args:
        date_string: Timestamp text ending in a timezone abbreviation.

    Returns:
        datetime.datetime: The parsed moment, localized to the named zone.

    Raises:
        ValueError: If the text does not contain both a date and a zone label.
        pytz.UnknownTimeZoneError: If the zone label is not known to pytz.
    """
    # Split on any whitespace instead of slicing fixed widths: the old
    # ``[-3:]`` slice turned "CEST" into "EST" and silently shifted summer
    # entries to US Eastern time. str.split() also treats non-breaking
    # spaces as separators.
    parts = date_string.split()
    if len(parts) < 2:
        raise ValueError(
            f"Expected '<date> <time> <timezone>', got {date_string!r}"
        )

    tz_label = parts[-1]
    stamp = ' '.join(parts[:-1])

    # A daylight-saving label is resolved through its parent zone.
    is_summer_label = tz_label in _DST_ABBREVIATION_ZONES
    tz = pytz.timezone(_DST_ABBREVIATION_ZONES.get(tz_label, tz_label))

    # Convert the date/time part to a naive datetime object
    date_object = parse(stamp)

    # Attach the zone; is_dst only matters for the ambiguous hour when clocks
    # are set back, where the label tells us which of the two offsets applies.
    return tz.localize(date_object, is_dst=is_summer_label)


def extract_info(watched_video):
    """Extract title, channel and watch date from one watch-history entry.

    Args:
        watched_video: The outer ``div`` element of a single history entry.

    Returns:
        tuple: ``(video_title, video_url, channel_name, channel_url,
        watch_date)``. The video fields are ``None`` when the entry holds no
        link at all; the channel fields are ``None`` unless the entry holds
        exactly two links (video followed by channel).

    Raises:
        ValueError: If the entry has no content cell.
        IndexError: If the content cell contains no ``<br>`` element.
    """
    # Extract the div that carries the actual entry text
    video_info = watched_video.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
    if video_info is None:
        raise ValueError("Watch-history entry has no content cell")

    # The watch date is the node that follows the last <br> in the cell
    date_string = video_info.find_all('br')[-1].next_sibling
    watch_date = parse_date(date_string)

    # Look the links up once and branch on how many there are, instead of
    # driving the control flow through unpacking/indexing exceptions.
    links = video_info.find_all('a')

    video_title = video_url = channel_name = channel_url = None

    if links:
        # The first link is the video itself
        title = links[0]
        video_title = title.text
        # .get() yields None for an anchor without href instead of raising
        video_url = title.get('href')

    if len(links) == 2:
        # Only a video/channel pair counts as a complete entry; any other
        # link count leaves the channel fields empty.
        uploader = links[1]
        channel_name = uploader.text
        channel_url = uploader.get('href')

    return video_title, video_url, channel_name, channel_url, watch_date

In [5]:
from tqdm import tqdm

# Run the extractor over every entry div, showing a progress bar meanwhile
watched_videos = [extract_info(entry_div) for entry_div in tqdm(watched_videos_divs)]

100%|██████████| 37063/37063 [00:07<00:00, 4831.43it/s]


In [6]:
# Turn the extracted tuples into a table, one row per watched video
watched_videos_df = pd.DataFrame(
    data=watched_videos,
    columns=[
        'video_title',
        'video_url',
        'channel_name',
        'channel_url',
        'date_watched',
    ],
)

In [8]:
# Export the dataframe to a CSV file
# The path is built with pathlib so it resolves on every operating system.
csv_path = Path('data') / 'processed' / 'history' / 'watch-history.csv'
# Create the output folder if needed so the export works on a fresh checkout.
csv_path.parent.mkdir(parents=True, exist_ok=True)
watched_videos_df.to_csv(csv_path, index=False)

In [9]:
# Export the dataframe to a JSON file (one object per row)
# The path is built with pathlib so it resolves on every operating system.
json_path = Path('data') / 'processed' / 'history' / 'watch-history.json'
# Create the output folder if needed so the export works on a fresh checkout.
json_path.parent.mkdir(parents=True, exist_ok=True)
watched_videos_df.to_json(json_path, orient='records')