# News

In [1]:
!pip install google-api-python-client google-auth google-auth-oauthlib google-auth-httplib2 
!pip install --upgrade google-auth-oauthlib



In [2]:
import re
import html
from googleapiclient.discovery import build
import os
import pandas as pd
from datetime import datetime, timedelta
import pytz

## Channel Definitions and Setups

In [3]:
# Channels 
CNBC_TV = "UCrp_UI8XtuYfpiqluWLD7Lw"
MY_TIMEZONE='America/Chicago'

In [4]:
# Youtube Data API v3
api_key = os.getenv('YOUTUBE_API_KEY')
if not api_key:
    raise ValueError("No API key found. Make sure the YOUTUBE_API_KEY environment variable is set.")

youtube = build('youtube', 'v3', developerKey=api_key)

## Utils

In [5]:
# Get date ranges (timezone aware)
from datetime import datetime, timedelta
import pytz

def get_date_range(period_type, number=1):
    local_timezone = pytz.timezone(MY_TIMEZONE)
    now = datetime.now(pytz.utc).astimezone(local_timezone)    
    
    if period_type == 'today':
        start_date = datetime(now.year, now.month, now.day, 0, 0, 0)
        end_date = datetime(now.year, now.month, now.day, 23, 59, 59, 999999)
    elif period_type == 'days':
        start_date = datetime(now.year, now.month, now.day, 0, 0, 0) - timedelta(days=number-1)
        end_date = datetime(now.year, now.month, now.day, 23, 59, 59, 999999)
    elif period_type == 'weeks':
        start_date = datetime(now.year, now.month, now.day, 0, 0, 0) - timedelta(weeks=number)
        end_date = datetime(now.year, now.month, now.day, 23, 59, 59, 999999)
    elif period_type == 'months':
        start_date = datetime(now.year, now.month, now.day, 0, 0, 0) - timedelta(days=30*number)
        end_date = datetime(now.year, now.month, now.day, 23, 59, 59, 999999)
    else:
        raise ValueError("Unsupported period type specified.")
    
    # Localize the datetime objects
    start_date = local_timezone.localize(start_date)
    end_date = local_timezone.localize(end_date)
    
    return start_date, end_date

# # Test get_date_range
# start_date, end_date = get_date_range('days',3)
# print(start_date, end_date)

def iso_duration_to_minutes(iso_duration):
    # Parse ISO 8601 duration format to total minutes
    pattern = re.compile(r'PT((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+)S)?')
    matches = pattern.match(iso_duration)
    if not matches:
        return 0  # Return 0 if the pattern does not match

    hours = int(matches.group('hours') or 0)
    minutes = int(matches.group('minutes') or 0)
    seconds = int(matches.group('seconds') or 0)

    # Calculate total minutes, rounding up if there are any seconds
    total_minutes = hours * 60 + minutes
    if seconds > 0:
        total_minutes += 1  # Round up if there are any remaining seconds

    return total_minutes

# Get formated date
def get_formated_date_today():
    timezone = pytz.timezone(MY_TIMEZONE)
    now = datetime.now(timezone)
    formatted_date = now.strftime('%Y-%m-%d')
    return formatted_date

In [6]:
def fetch_videos(start_date, end_date, channel_id):
    video_data = []
    page_token = None
    local_timezone = pytz.timezone('America/Chicago')  # Define the local timezone

    while True:
        request = youtube.search().list(
            part="snippet",
            channelId=channel_id,
            publishedAfter=start_date.isoformat(),
            publishedBefore=end_date.isoformat(),
            maxResults=50,
            pageToken=page_token,
            type="video"
        )
        response = request.execute()

        video_ids = [item['id']['videoId'] for item in response.get("items", []) if 'videoId' in item['id']]
        if video_ids:
            video_request = youtube.videos().list(
                part='contentDetails, snippet',
                id=','.join(video_ids)
            )
            video_details_response = video_request.execute()

            for video in video_details_response.get("items", []):
                video_id = video['id']  # Ensure video_id is defined here
                content_details = video['contentDetails']
                snippet = video['snippet']
                video_data.append({
                    "Title": html.unescape(snippet['title']),
                    "Published At": datetime.strptime(snippet['publishedAt'], '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC),
                    "Duration (Min)": iso_duration_to_minutes(content_details.get('duration', 'PT0S')),
                    "Video ID":video_id,
                    "URL": f"https://www.youtube.com/watch?v={video_id}"
                })

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    df = pd.DataFrame(video_data)
    if not df.empty:
        df_sorted = df.sort_values(by=['Published At'])
        return df_sorted
    else:
        print("No videos found.")
        return pd.DataFrame()  # Return an empty DataFrame if no videos were added

In [20]:
import pandas as pd

def filter_videos_by_date_and_time(df_videos, period_type='today', number=1, hour_range=None):
    """
    Filters videos based on the 'Published At' column within the specified date and optional time range.

    Parameters:
    - df_videos (DataFrame): DataFrame containing video data with 'Published At' as datetime.
    - period_type (str): 'today', 'days', 'weeks', or 'months'.
    - number (int): Number of days, weeks, or months to look back from today.
    - hour_range (str, optional): Hour range in the format 'HH-HH'. Only applicable if period_type is 'today'.

    Returns:
    - DataFrame: Filtered DataFrame based on the date and optional time range.
    """
    # Use the existing function to get the date range
    start_date, end_date = get_date_range(period_type, number)

    # Ensure datetime is timezone-aware for comparison
    df_videos['Published At'] = pd.to_datetime(df_videos['Published At']).dt.tz_localize(None)
    start_date = start_date.replace(tzinfo=None)
    end_date = end_date.replace(tzinfo=None)

    # Filter the DataFrame based on the date range
    filtered_df = df_videos[(df_videos['Published At'] >= start_date) & (df_videos['Published At'] <= end_date)]

    # Apply additional filtering by hour range if specified and period_type is 'today'
    if period_type == 'today' and hour_range:
        start_hour, end_hour = map(int, hour_range.split('-'))
        filtered_df = filtered_df[filtered_df['Published At'].dt.hour >= start_hour]
        filtered_df = filtered_df[filtered_df['Published At'].dt.hour <= end_hour]

    return filtered_df

In [8]:
import pandas as pd
from IPython.display import HTML

def make_clickable(title, url):
    return f'<a href="{url}" target="_blank">{title}</a>'

def display_and_save_df(df, file_name=None):
    """
    Modifies the DataFrame to make titles clickable, hides the URL in the display, 
    and optionally saves the original DataFrame to an HTML file.

    Parameters:
    - df (DataFrame): DataFrame to display and save.
    - file_name (str, optional): Name of the HTML file to save the DataFrame. If None, does not save to file.
    """
    # Create a copy for display to avoid altering the original DataFrame
    df_display = df.copy()
    
    # Make titles clickable
    df_display['Title'] = df_display.apply(lambda x: make_clickable(x['Title'], x['URL']), axis=1)

    # Display the DataFrame using HTML in Jupyter Notebook without the URL column
    display(HTML(df_display.drop('URL', axis=1).to_html(escape=False, index=False)))

    # Check if a file name is provided and save the DataFrame with clickable titles to HTML
    if file_name:
        df_display.drop('URL', axis=1).to_html(file_name, index=False, escape=False)
        print(f"DataFrame has been saved to {file_name}.")

## Get videos from Youtube CNBC TV Channel

### Get Today's videos

In [9]:
## Getting the videos from the CNBC TV channel
period_type = 'today'  # 'today', 'days', 'weeks', 'months'
number = 2  # The 'today' setting does not use 'number', adjust if using other settings
channel_id = CNBC_TV  # Channel ID as a parameter
start_date, end_date = get_date_range(period_type, number)
df_videos_today = fetch_videos(start_date, end_date, channel_id)

In [10]:
# ## Save today's videos to an Excel file

# # Create a copy of the DataFrame for Excel export
# # The reason for creating a copy is to convert the timezone-aware datetime objects to timezone-naive for file saving 
# df_videos_for_excel = df_videos_today.copy()

# # Convert timezone-aware datetime objects to timezone-naive in the 'Published At' column
# df_videos_for_excel['Published At'] = df_videos_for_excel['Published At'].dt.tz_localize(None)

# # Append the formatted date to the filename
# filename = f'youtube_videos{get_formated_date_today()}.xlsx'

# # Now save the cloned DataFrame to an Excel file
# df_videos_for_excel.to_excel(filename, index=False)

In [28]:
# Assume df_videos is your initial DataFrame loaded with video data

# Filter videos from the last 3 days
filtered_df = filter_videos_by_date_and_time(df_videos_today, 'today', 1,'12-14')

# Append the formatted date to the filename
filename = f'videos_{get_formated_date_today()}.html'

# Display and optionally save the results to HTML
display_and_save_df(filtered_df, filename)

Title,Published At,Duration (Min),Video ID
Sen. Coons: President Biden made a choice that very few presidents have had the courage to make,2024-07-23 12:25:14,6,DQNMDUExkdQ
"Really optimistic for the sectors that have been left out, says Ariel Investments' John Rogers",2024-07-23 12:43:47,7,W5rRxDIWQ94
Rep. Comer: Pharmacy benefit managers are price gouging and not saving patients money,2024-07-23 12:53:24,8,yu7ADkNQreA
AMC CEO on new debt refinancing deal: Gives us 'years and years' of breathing room,2024-07-23 13:17:04,7,gNe9Vcigsmc
CBO Director on growing U.S. debt: Even modestly low interest rates won't change fiscal trajectory,2024-07-23 13:28:01,5,Ci072vgWEY4
"Opening Bell: July 23, 2024",2024-07-23 14:07:05,2,Tk4kYXriwF8
Cramer’s Mad Dash: Spotify,2024-07-23 14:08:40,2,60zqd-whXIQ
"Galaxy CEO Mike Novogratz on ether ETF approval, Pres. Biden dropping out and state of 2024 race",2024-07-23 14:10:25,8,x3qewqgd8sg
Cramer’s Stop Trading: MSCI,2024-07-23 14:18:32,2,q7utAcqjEhA
FTC demands information on how AI is used in pricing from 8 companies,2024-07-23 14:21:45,4,am5nr3MVS68


DataFrame has been saved to videos_2024-07-23.html.


## Send emails

In [23]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import config  # Import the configuration file
import pandas as pd

# Get today's formatted date
formatted_date = get_formated_date_today()

# Append the formatted date to the filename
filename = f'videos_{formatted_date}.html'

# Load the HTML content from the file
with open(filename, 'r') as file:
    html_content = file.read()

# Set up the MIME
message = MIMEMultipart()
message['From'] = config.email_user
message['To'] = config.email_send
message['Subject'] = f'CNBC_Videos_{formatted_date}'

# Attach the HTML content to the email
message.attach(MIMEText(html_content, 'html'))

# Function to send email
def send_email():
    try:
        server = smtplib.SMTP('smtp-mail.outlook.com', 587)  # Outlook SMTP server
        server.set_debuglevel(1)  # Enable debugging output
        server.starttls()
        server.login(config.email_user, config.email_password)
        text = message.as_string()
        server.sendmail(config.email_user, config.email_send, text)
        server.quit()
        print("Email sent successfully!")
    except Exception as e:
        print(f"Failed to send email: {e}")

# Send the email
send_email()

send: 'ehlo steves-mini-2.lan\r\n'
reply: b'250-SN1PR12CA0114.outlook.office365.com Hello [24.55.39.248]\r\n'
reply: b'250-SIZE 157286400\r\n'
reply: b'250-PIPELINING\r\n'
reply: b'250-DSN\r\n'
reply: b'250-ENHANCEDSTATUSCODES\r\n'
reply: b'250-STARTTLS\r\n'
reply: b'250-8BITMIME\r\n'
reply: b'250-BINARYMIME\r\n'
reply: b'250-CHUNKING\r\n'
reply: b'250 SMTPUTF8\r\n'
reply: retcode (250); Msg: b'SN1PR12CA0114.outlook.office365.com Hello [24.55.39.248]\nSIZE 157286400\nPIPELINING\nDSN\nENHANCEDSTATUSCODES\nSTARTTLS\n8BITMIME\nBINARYMIME\nCHUNKING\nSMTPUTF8'
send: 'STARTTLS\r\n'
reply: b'220 2.0.0 SMTP server ready\r\n'
reply: retcode (220); Msg: b'2.0.0 SMTP server ready'
send: 'ehlo steves-mini-2.lan\r\n'
reply: b'250-SN1PR12CA0114.outlook.office365.com Hello [24.55.39.248]\r\n'
reply: b'250-SIZE 157286400\r\n'
reply: b'250-PIPELINING\r\n'
reply: b'250-DSN\r\n'
reply: b'250-ENHANCEDSTATUSCODES\r\n'
reply: b'250-AUTH LOGIN XOAUTH2\r\n'
reply: b'250-8BITMIME\r\n'
reply: b'250-BINARYMIME\r

Email sent successfully!


reply: b'221 2.0.0 Service closing transmission channel\r\n'
reply: retcode (221); Msg: b'2.0.0 Service closing transmission channel'


## Get transcripts for each video

In [13]:
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
import os
import pickle

def authenticate():
    """Authenticate the user using OAuth2 and save the credentials for later reuse."""
    creds = None
    # The file token.pickle stores the user's access and refresh tokens.
    # It is created automatically when the authorization flow completes for the first time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If there are no valid credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())  # Refresh the access token (requires `from google.auth.transport.requests import Request`)
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'client_secret_447470198174-bqv62db8l4lnsasqdahafbdes1fgm0ha.apps.googleusercontent.com.json',
                scopes=['https://www.googleapis.com/auth/youtube.force-ssl'])
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return build('youtube', 'v3', credentials=creds)

In [14]:
def get_captions(youtube, video_id):
    """Fetch captions for a given video using the authenticated YouTube client."""
    try:
        caption_list = youtube.captions().list(part='snippet', videoId=video_id).execute()
        print("Caption list fetched successfully")
        
        for item in caption_list['items']:
            if item['snippet']['language'] == 'en':  # Assuming we want English captions
                caption_id = item['id']
                print(f"Found English captions with ID: {caption_id}")
                caption = youtube.captions().download(id=caption_id, tfmt='srt').execute()
                return caption.decode('utf-8')
        
        print("No English captions found.")
        return None

    except HttpError as error:
        print(f"An HTTP error occurred: {error}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [15]:
# Ensure that necessary imports are available
from googleapiclient.errors import HttpError

# Authenticate and create a YouTube service object
youtube = authenticate()

# Specify the video ID for which to retrieve captions
video_id = 'LC6e25x3V6Y'  # Replace with an actual YouTube video ID

# Fetch captions
captions = get_captions(youtube, video_id)
if captions:
    print("Captions retrieved successfully!")
    print(captions)
else:
    print("No captions found or an error occurred.")

NameError: name 'Request' is not defined