In [79]:
import copy
from datetime import datetime

import bs4 as bs
import pandas as pd
import pytz
import requests as req

In [5]:
# Read the exported watch history into memory. Forward slashes keep the
# relative path valid on Windows as well as on POSIX systems (a backslash
# path only resolves on Windows).
with open('data/history/watch-history.html', 'r', encoding='utf-8') as f:
    html = f.read()
print("File read\nParsing HTML...")

# Parse once the file handle is closed; only the in-memory string is needed.
parsed = bs.BeautifulSoup(html, 'lxml')

# Keep an independent copy of the parsed tree. A plain assignment would only
# bind a second name to the same object, so any later mutation of `parsed`
# would also show up in `parsed_safe`.
# NOTE: copying a large document costs extra time and memory.
parsed_safe = copy.copy(parsed)

File read
Parsing HTML...


In [68]:
# Class attribute of the div that wraps a single watch-history item
ENTRY_DIV_CLASS = 'outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp'

# Collect every watch-history item from the parsed document
watched_videos = parsed.find_all('div', class_=ENTRY_DIV_CLASS)

video_count = len(watched_videos)
print("Number of videos watched:", video_count)

Number of videos watched: 37063


In [88]:
# Summer-time abbreviations that are not pytz zone names themselves, mapped to
# (pytz zone name, is_dst). Any abbreviation not listed here is passed to
# pytz.timezone() unchanged, exactly as before.
_DST_ABBREVIATIONS = {
    'CEST': ('CET', True),
    'EEST': ('EET', True),
    'WEST': ('WET', True),
}


def parse_date(date_string):
    """Parses a watch-date string into a timezone-aware datetime object.

    The string is expected to look like ``'18 Jan 2023, 20:57:44 CET'``: a
    timestamp followed by whitespace and a timezone abbreviation. The
    abbreviation may have any length (e.g. ``'CET'`` or ``'CEST'``); it is
    taken as the last whitespace-separated token rather than a fixed number
    of characters.

    Args:
        date_string: The date text, e.g. ``'18 Jul 2022, 20:57:44 CEST'``.

    Returns:
        A datetime localized with the pytz timezone named by the abbreviation.

    Raises:
        ValueError: If the string has no separate timezone token or the
            timestamp does not match ``'%d %b %Y, %H:%M:%S'``.
        pytz.UnknownTimeZoneError: If the abbreviation is neither a pytz zone
            name nor one of the known summer-time abbreviations.
    """
    # Split on the LAST run of whitespace so the abbreviation length does not matter
    parts = date_string.strip().rsplit(None, 1)
    if len(parts) != 2:
        raise ValueError(
            "Expected '<date> <timezone>', got {!r}".format(date_string)
        )
    timestamp, tz_abbreviation = parts

    # Resolve the abbreviation to a pytz zone; remember whether it denotes
    # summer time so ambiguous wall-clock times get the right UTC offset
    zone_name, is_dst = _DST_ABBREVIATIONS.get(
        tz_abbreviation, (tz_abbreviation, False)
    )
    tz = pytz.timezone(zone_name)

    # Convert the timestamp part to a naive datetime (18 Jan 2023, 20:57:44)
    naive = datetime.strptime(timestamp, '%d %b %Y, %H:%M:%S')

    # Attach the timezone; pytz requires localize() rather than tzinfo=
    return tz.localize(naive, is_dst=is_dst)


def extract_info(watched_video):
    """Extracts the video title, channel name, and watch date from a watched video div.

    Args:
        watched_video: A parsed element for one watch-history item; it must
            contain the 'content-cell ... mdl-typography--body-1' div.

    Returns:
        A tuple ``(video_title, video_url, channel_name, channel_url, watch_date)``.
        ``watch_date`` is whatever ``parse_date`` returns. When the item has
        fewer than two links, the fields that cannot be determined are ``None``
        (channel fields for a single link, all four fields for no link).

    Raises:
        IndexError: If the item contains no ``<br>`` to locate the date.
    """
    # Extract the correct div
    video_info = watched_video.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')

    # Unpacking find_all() into exactly two names fails for any other link
    # count, so index defensively instead.
    # NOTE(review): presumably items for removed/private videos carry no
    # channel link -- confirm against the export.
    links = video_info.find_all('a')
    title = links[0] if links else None
    uploader = links[1] if len(links) > 1 else None

    # Get the video title and link
    video_title = title.text if title is not None else None
    video_url = title['href'] if title is not None else None

    # Get the channel name and link
    channel_name = uploader.text if uploader is not None else None
    channel_url = uploader['href'] if uploader is not None else None

    # Get the watch date: it follows the second <br>; when there is only one
    # <br> (no channel line), fall back to the text after that one.
    line_breaks = video_info.find_all('br')
    date_break = line_breaks[1] if len(line_breaks) > 1 else line_breaks[-1]
    watch_date = parse_date(date_break.next_sibling)

    return video_title, video_url, channel_name, channel_url, watch_date

In [89]:
# Smoke-test the extraction on a single watch-history item
print("Extracting video info...")
sample_info = extract_info(watched_videos[3])
print(sample_info)

Extracting video info...
('30000€ DE PRÉPA SUR CETTE GOLF 3 AVEC UN MOTEUR SPÉCIAL!!', 'https://www.youtube.com/watch?v=ZjWqytNE2RA', 'Fresh Detailing', 'https://www.youtube.com/channel/UCDJMvF9-pN-rF9Lo6BIMIEQ', datetime.datetime(2023, 1, 18, 20, 57, 44, tzinfo=<DstTzInfo 'CET' CET+1:00:00 STD>))
