# In [None]:
'''
Part 1
Goal: Find gaming videos.
Description: I run many search queries to find a large number of videos. Multiple queries are needed because each search is limited to 550 videos. Here are all the search parameters I looped through:
Additional Queries: These are words that are popular through Google Trends.
Video Durations: I used Short, Medium, and Long to find videos of varying lengths.
Published After/Published Before: This was used to find many videos per day during the time period of 1/29 to 2/5.
These are the parameters of the search that never change
videoCategoryId=20: This never changed and this is the Category for Gaming Videos
regionCode=US: This is for videos found in the US
relevanceLanguage=en: This is for English Videos
maxResults=50: This is the most videos you can find per page
Output: Here is the csv it leads to (https://drive.google.com/file/d/1hCP06fC4PFFZjzsy1_MNrhHc4s3-wLaK/view)

'''

import requests
import pandas as pd
from datetime import datetime, timedelta
import os
from google.colab import drive

# Mount Google Drive at /content/drive (uses the google.colab `drive` helper imported above)
drive.mount('/content/drive')

# Search URL template for the yt4.lemnoslife.com "noKey" search endpoint.
# The six {} placeholders are filled positionally by url_template.format(...), in this order:
# videoDuration, publishedAfter, publishedBefore, order, pageToken, q.
url_template = 'https://yt4.lemnoslife.com/noKey/search?part=snippet&type=video&videoDuration={}&videoCategoryId=20&publishedAfter={}&publishedBefore={}&order={}&regionCode=US&relevanceLanguage=en&maxResults=50&pageToken={}&q={}'
# Category id 20 is built into the URL to get gaming videos. My goal is to see what characteristics a video needs to be a popular video
# Search terms substituted into the `q` parameter, one search per term.
# NOTE(review): the description says an empty query was also used, but '' is not in this list - confirm.
additional_queries = ['game', 'sonic', 'gameplay', 'games', 'gta', 'mario', 'kids', 'tekken', 'pokemon', 'tekken 8', 'play', 'roblox', 'fortnite', 'music', 'warzone', 'fnf', 'type beat', 'xbox', 'gta 5', 'minecraft', 'madden', 'call of duty', 'freestyle', 'slime', 'guide']

# Values substituted into the `order` parameter. Searching with several orderings surfaces different
# videos, which matters because each search is limited to 550 videos and 11 page tokens
orders = ['date', 'rating', 'relevance', 'title', 'videoCount', 'viewCount']

# Values substituted into the `videoDuration` parameter, so videos of every length bucket are collected
video_durations = ['short', 'medium', 'long']

def fetch_data(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Fully formatted request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, freezing the whole collection run.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors explicitly instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()

def process_response(published_after, published_before, query, order, video_duration):
    """Collect every result page for one combination of search parameters.

    The URL is built from the module-level ``url_template`` and fetched with
    ``fetch_data``; pages are followed through ``nextPageToken`` until the
    response has no further token, has no ``items`` key, or a request fails.

    Args:
        published_after: Lower bound timestamp string for ``publishedAfter``.
        published_before: Upper bound timestamp string for ``publishedBefore``.
        query: Free-text search term for the ``q`` parameter.
        order: Value for the ``order`` parameter.
        video_duration: Value for the ``videoDuration`` parameter.

    Returns:
        A list of dicts with the keys 'Video ID', 'Published At',
        'Channel ID', 'Title' and 'Description'. On a request error the rows
        gathered so far are returned (best effort).
    """
    from urllib.parse import quote  # local import: only this function needs it

    # Escape the free-text values so spaces or reserved characters (&, #, ...)
    # cannot corrupt the query string.
    encoded_query = quote(query, safe='')
    next_page_token = ''
    all_videos_data = []

    while True:
        url = url_template.format(
            video_duration,
            published_after,
            published_before,
            order,
            quote(next_page_token, safe=''),
            encoded_query,
        )
        print("URL:", url)  # Print the URL being used

        # Only the network call is guarded; a failure ends pagination but keeps
        # what was already collected.
        try:
            data = fetch_data(url)
        except Exception as e:
            print("Error:", e)
            break

        if not isinstance(data, dict) or 'items' not in data:
            print("No items found in response. End of results")
            break

        for item in data['items']:
            identifier = item.get('id')
            video_id = identifier.get('videoId') if isinstance(identifier, dict) else None
            if not video_id:
                # Skip a malformed entry instead of abandoning the remaining pages.
                continue
            snippet = item.get('snippet', {})
            all_videos_data.append({
                'Video ID': video_id,
                'Published At': snippet.get('publishedAt', ''),
                'Channel ID': snippet.get('channelId', ''),
                'Title': snippet.get('title', ''),
                'Description': snippet.get('description', ''),
            })

        next_page_token = data.get('nextPageToken', '')
        if not next_page_token:
            break

    return all_videos_data

def main():
    """Run the Part 1 search for each day from 2024-01-29 through 2024-02-05.

    For every day, all combinations of ``additional_queries``, ``orders`` and
    ``video_durations`` are searched through ``process_response``. The day's
    rows are de-duplicated on 'Video ID' and appended to one CSV file; the
    header is written only when the file is missing or empty.
    """
    start_date = datetime(2024, 1, 29)
    end_date = datetime(2024, 2, 5)

    # NOTE(review): Part 2 reads search_results.csv from the mounted Google
    # Drive folder, not from this path - confirm which location is intended.
    csv_filename = r'C:\Users\sulli\Downloads\Youtube Folder\search_results.csv'

    for current_date in pd.date_range(start_date, end_date, freq='D'):
        published_after = current_date.strftime('%Y-%m-%dT00:00:00Z')
        published_before = (current_date + timedelta(days=1)).strftime('%Y-%m-%dT00:00:00Z')
        print("Published After:", published_after)
        print("Published Before:", published_before)

        # Gather plain dicts and build the DataFrame once; concatenating a
        # DataFrame inside the loop copies all earlier rows on every iteration.
        day_rows = []
        for query in additional_queries:
            for order in orders:
                for video_duration in video_durations:
                    day_rows.extend(
                        process_response(published_after, published_before, query, order, video_duration)
                    )

        if not day_rows:
            # An empty frame has no 'Video ID' column, so drop_duplicates would
            # raise KeyError; there is also nothing to write.
            print(f"No videos found for {current_date.strftime('%Y-%m-%d')}. Nothing written.")
            continue

        final_result_df = pd.DataFrame(day_rows).drop_duplicates(subset=['Video ID'])

        # Mode 'a' creates the file when it does not exist yet.
        write_header = not os.path.exists(csv_filename) or os.path.getsize(csv_filename) == 0
        final_result_df.to_csv(csv_filename, mode='a', index=False, header=write_header)
        print(f"Data for {current_date.strftime('%Y-%m-%d')} appended to {csv_filename}")


# Entry point for Part 1: run the collection only when this file is executed directly
if __name__ == "__main__":
    main()

'''
    Part 2
    Goal: Find information on each video
Description: I use this to find information points on Youtube Videos. Here are some examples of what I collected:
What Channel the video is from
What tags are used on the video
What is the description of the video
What is the title of the video
I am trying to collect features for each video to understand what works and what doesn’t for a highly viewed YouTube video
Output: Here is the csv it leads to (https://drive.google.com/file/d/1kJ3E83pcbLmw8nn2__r3J_JbnLG4LEkL/view)
'''

import pandas as pd
import requests
import time


# File paths in Google Drive: Part 1 search results (input) and per-video details (output)
search_results_file_path = '/content/drive/My Drive/Capstone Project 2/search_results.csv'
video_info_file_path = '/content/drive/My Drive/Capstone Project 2/search_results_video_info.csv'

# Load CSV file containing new video IDs
search_results_df = pd.read_csv(search_results_file_path)

# Extract video IDs; a set keeps each ID once
video_ids_all = set(search_results_df['Video ID'])

# If there are no new video IDs, exit
if not video_ids_all:
    print("No new video IDs to fetch.")
    exit()

# Convert video IDs to a comma-separated string
# NOTE(review): video_id_string is not used by any code visible in this file - confirm it is still needed
video_id_string = ','.join(video_ids_all)

# Determine the batch size: at most 50 IDs per request (never 0 here because of the early exit above)
batch_size = min(len(video_ids_all), 50)

# Calculate the total number of batches (ceiling of len(video_ids_all) / batch_size)
num_batches = ((len(video_ids_all) - 1) // batch_size) + 1

# Function to fetch data from URL synchronously with retries
def fetch_data_with_retries(url, retries=3, timeout=60):
    """GET ``url`` and return its JSON body, retrying on failure.

    Args:
        url: Request URL.
        retries: Maximum number of attempts.
        timeout: Seconds to wait for each response. Without a timeout a
            stalled connection would block forever and the retry logic would
            never run; the default matches the Part 5 helper in this file.

    Returns:
        The parsed JSON payload of the first response with status 200, or
        ``None`` when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            print("Request failed. Retrying...")
        except Exception as e:
            # Best effort: any network or JSON decoding error counts as a failed attempt.
            print(f"Error occurred: {e}. Retrying...")
        if attempt < retries:
            time.sleep(1)  # Small delay before the next attempt; pointless after the last one
    print("Failed after multiple retries. Skipping this batch.")
    return None

# Main function to process batches synchronously
def process_batches():
    """Fetch details for every ID in ``video_ids_all`` and append them to the CSV.

    IDs are requested in groups of ``batch_size`` through
    ``fetch_data_with_retries``. Only videos without a stream start time are
    kept, and each batch's rows are appended to ``video_info_file_path``. The
    header is written only when that file is missing or empty, so a failed
    first batch or a re-run cannot lose or duplicate it.
    """
    import os  # local import: used only for the header check below

    # Materialise the set once; converting it to a list per batch repeats O(n) work.
    video_id_list = list(video_ids_all)
    total_videos = len(video_id_list)

    for batch_index in range(num_batches):
        start_index = batch_index * batch_size
        video_ids_batch = video_id_list[start_index:start_index + batch_size]
        video_id_string_batch = ','.join(video_ids_batch)

        # Construct URL for batch request
        url = f"https://yt.lemnoslife.com/noKey/videos?part=snippet,contentDetails,statistics,liveStreamingDetails,status&id={video_id_string_batch}"

        data = fetch_data_with_retries(url)
        if data is not None and 'items' in data:
            # The start time defaults to '' (not None) when absent, so test for
            # emptiness; an `is None` check would reject every video.
            batch_rows = [
                row
                for row in (_video_row(item) for item in data['items'])
                if not row['Stream Start Time']
            ]

            if batch_rows:
                write_header = (
                    not os.path.exists(video_info_file_path)
                    or os.path.getsize(video_info_file_path) == 0
                )
                pd.DataFrame(batch_rows).to_csv(
                    video_info_file_path, mode='a', index=False, header=write_header
                )

        # Clamp at zero because the last batch may be shorter than batch_size.
        remaining_videos = max(total_videos - (batch_index + 1) * batch_size, 0)
        print(f"Batch {batch_index + 1} completed. {remaining_videos} videos remaining.")


def _video_row(item):
    """Flatten one item of the videos response into a CSV row dict."""
    snippet = item.get('snippet', {})
    content_details = item.get('contentDetails', {})
    statistics = item.get('statistics', {})
    live_streaming_details = item.get('liveStreamingDetails', {})
    status = item.get('status', {})

    return {
        'Video ID': item.get('id', ''),
        'Channel ID': snippet.get('channelId', ''),  # Allows looking up channel information later
        'Channel Title': snippet.get('channelTitle', ''),
        'Tags': ','.join(snippet.get('tags', [])),  # Tags help people find or browse for a kind of video
        'Duration': content_details.get('duration', ''),  # To see if video length matters for popularity
        'Content Rating': content_details.get('contentRating', ''),  # Similar to movie ratings
        'YT Rating': content_details.get('contentRating', {}).get('ytRating', ''),  # Subset of content rating
        'Made For Kids': status.get('madeForKids', ''),  # Decides whether the video is on the YouTube kids app
        'Self Declared Made For Kids': status.get('selfDeclaredMadeForKids', ''),  # Generally empty
        'View Count': statistics.get('viewCount', ''),  # The main target: views
        'Like Count': statistics.get('likeCount', ''),  # Thumbs-up clicks on the video
        'Stream Start Time': live_streaming_details.get('actualStartTime', ''),  # Shows whether it was streamed
        'Stream End Time': live_streaming_details.get('actualEndTime', ''),  # Helps detect the Premiere function
        'Published At': snippet.get('publishedAt', ''),  # When the video was created in UTC-0
        'Video Title': snippet.get('title', ''),
        'Video Description': snippet.get('description', ''),
        'Video Definition': content_details.get('definition', ''),  # Video quality
        'Video Captions': content_details.get('caption', ''),  # Whether captions are available
        'Video Licensed': content_details.get('licensedContent', ''),  # https://support.google.com/youtube/answer/2797468?hl=en
        'Video Projection': content_details.get('projection', ''),  # Normal video or 360 video
        'Video Dimension': content_details.get('dimension', ''),  # 2d or 3d
    }

# Run the process_batches function and report the wall-clock time it took
start_time = time.time()
process_batches()
end_time = time.time()
print(f"Total processing time: {end_time - start_time} seconds.")

'''
Part 3
Goal: Convert duration to seconds and remove Shorts
Description: Convert duration into seconds and remove anything that is less than 60 seconds. Anything less than 60 seconds counts as a short and has a different advertising model. Therefore, I am only concerned with videos over 60 seconds.
Output: Here is the csv it leads to https://drive.google.com/file/d/1d5qPg3Fu974CQlknLQb_kGrWJKORj2nH/view

'''
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Read the per-video details CSV (the file Part 2 writes to video_info_file_path) into a DataFrame
search_results_df = pd.read_csv('/content/drive/My Drive/Capstone Project 2/search_results_video_info.csv')

#This will convert duration into seconds. I will call this field duration_seconds
def extract_seconds(duration_str):
    """Convert an ISO 8601 duration such as ``PT1H2M3S`` into whole seconds.

    Each component is parsed in full, so values with three or more digits
    (for example ``PT100S`` or ``PT120M``) are handled correctly. Week, day,
    hour, minute and second components are supported; a fractional part on
    the seconds is truncated.

    Args:
        duration_str: Duration text, or a null/empty value.

    Returns:
        Total seconds as an int. Null, empty, 'P0D' and unparseable values
        yield 0.
    """
    import re  # local import: only this function needs it

    print("Processing duration:", duration_str)

    if pd.isna(duration_str) or duration_str == '' or duration_str == 'P0D':
        print("Output:", 0)
        return 0

    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:\.\d+)?S)?)?',
        str(duration_str).strip(),
    )

    if match is None:
        # Unrecognised text is treated like a missing duration.
        total_seconds = 0
    else:
        weeks, days, hours, minutes, seconds = (
            int(group) if group else 0 for group in match.groups()
        )
        total_seconds = (
            weeks * 604800
            + days * 86400
            + hours * 3600
            + minutes * 60
            + seconds
        )

    print("Output:", total_seconds)
    return total_seconds

# Apply extract_seconds to every value of the Duration column; the result is stored as duration_seconds
search_results_df['duration_seconds'] = search_results_df['Duration'].apply(extract_seconds)

# Keep only videos that are at least 60 seconds long (shorter ones are treated as Shorts)
search_results_df = search_results_df[search_results_df['duration_seconds'] >= 60]

# Output CSV file path. I have been storing things in csvs so I have a database to pull from
output_csv_file = '/content/drive/My Drive/Capstone Project 2/search_results_video_info_no_shorts.csv'

# Write the final results to the CSV file (without the DataFrame index)
search_results_df.to_csv(output_csv_file, index=False)

# DataFrame.info() prints its summary itself and returns None, so this also prints "None"
print(search_results_df.info())

'''
Part 4
Goal: Find information on the channel for each video
Description: Trying to find info on the channel that should help understand a successful Youtube video. Here are some examples of the info being collected:
Country Origin of the Channel
Date the channel was created
Is the Channel Made for Kids
Channel Subscriber Count
Channel Views
Output: Here is the csv it leads to https://drive.google.com/file/d/10MrSXtNPSAYpOs3BDFdqXVcegw42Lr7_/view
'''
import pandas as pd
import requests
import time
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# File paths: the Part 3 output without Shorts (input) and the per-channel details (output).
# Despite its name, video_info_file_path holds channel rows in this part.
search_results_file_path = '/content/drive/My Drive/Capstone Project 2/search_results_video_info_no_shorts.csv'
video_info_file_path = '/content/drive/My Drive/Capstone Project 2/search_results_video_info_no_shorts_with_channel_info.csv'

# Function to fetch data from URL synchronously with retries
def fetch_data_with_retries(url, retries=3, timeout=60):
    """GET ``url`` and return its JSON body, retrying on failure.

    Args:
        url: Request URL; it is printed before every attempt.
        retries: Maximum number of attempts.
        timeout: Seconds to wait for each response. Without a timeout a
            stalled connection would block forever and the retry logic would
            never run; the default matches the Part 5 helper in this file.

    Returns:
        The parsed JSON payload of the first response with status 200, or
        ``None`` when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            print(f"Fetching data from URL: {url}")  # Print the URL being accessed
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            print("Request failed. Retrying...")
        except Exception as e:
            # Best effort: any network or JSON decoding error counts as a failed attempt.
            print(f"Error occurred: {e}. Retrying...")
        if attempt < retries:
            time.sleep(1)  # Small delay before the next attempt; pointless after the last one
    print("Failed after multiple retries. Skipping this batch.")
    return None

# Function to process all batches and append to CSV
def main():
    """Fetch channel details for every channel not yet present in the output CSV.

    Channel IDs are read from ``search_results_file_path``; IDs already found
    in ``video_info_file_path`` are skipped so the run can be resumed. The
    remaining IDs are requested 50 at a time through
    ``fetch_data_with_retries`` and each batch's rows are handed to
    ``append_to_csv``.
    """
    search_results_df = pd.read_csv(search_results_file_path)
    # Blank cells load as NaN (a float), which cannot be joined into the URL.
    channel_ids_all = set(search_results_df['Channel ID'].dropna().astype(str))
    if not channel_ids_all:
        print("No new channel IDs to fetch.")
        return

    # Check the CSV file to see which channels still need to be processed
    try:
        processed_channel_df = pd.read_csv(video_info_file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        processed_channel_df = pd.DataFrame()

    processed_channel_ids = set(processed_channel_df['Channel ID']) if not processed_channel_df.empty else set()
    channel_ids_remaining = sorted(channel_ids_all - processed_channel_ids)

    total_channels_remaining = len(channel_ids_remaining)
    # Report the count; printing the collection itself would dump every ID.
    print(f"Total channels remaining to be processed: {total_channels_remaining}")

    if total_channels_remaining == 0:
        # Nothing left to do; a batch size of 0 would otherwise divide by zero.
        print("All videos have been added to the CSV file.")
        return

    batch_size = 50
    for start_index in range(0, total_channels_remaining, batch_size):
        channel_ids_batch = channel_ids_remaining[start_index:start_index + batch_size]
        channel_id_string_batch = ','.join(channel_ids_batch)
        url = f"https://yt4.lemnoslife.com/noKey/channels?part=snippet,statistics,status&id={channel_id_string_batch}"
        data = fetch_data_with_retries(url)

        batch_rows = []
        if data is not None and 'items' in data:
            for item in data['items']:
                # A missing section yields empty values rather than a KeyError.
                statistics = item.get('statistics', {})
                status = item.get('status', {})
                snippet = item.get('snippet', {})

                batch_rows.append({
                    'Channel ID': item.get('id'),
                    'Subscriber Count': statistics.get('subscriberCount', None),  # Sub count
                    'Channel Views': statistics.get('viewCount', None),  # Total views in the channel's history
                    'Channel Made for Kids': status.get('madeForKids', None),  # Is the channel made for kids
                    'Country': snippet.get('country', None),  # The search asks for US videos but the channel may be elsewhere
                    'Channel Published Date': snippet.get('publishedAt', None)  # Age of the channel/account
                })

        # Persist after every batch so an interrupted run keeps its progress.
        append_to_csv(batch_rows)

        remaining_channels = max(total_channels_remaining - (start_index + len(channel_ids_batch)), 0)
        print(f"Remaining videos to be processed: {remaining_channels}")

    print("All videos have been added to the CSV file.")

# Function to append to CSV
def append_to_csv(batch_rows, path=None):
    """Append ``batch_rows`` to a CSV file, writing the header only once.

    Args:
        batch_rows: List of row dicts. An empty list is a no-op.
        path: Destination file. Defaults to the module-level
            ``video_info_file_path``.

    The header is written only when the file is missing or zero bytes long.
    Otherwise the rows are appended without a header, so existing data is
    never overwritten.
    """
    import os  # local import: used only for the existence/size check

    if not batch_rows:
        return

    target = video_info_file_path if path is None else path
    has_content = os.path.exists(target) and os.path.getsize(target) > 0

    # Mode 'a' creates the file when it does not exist yet.
    pd.DataFrame(batch_rows).to_csv(target, mode='a', index=False, header=not has_content)
    print("Videos have been added to the CSV file.")

# Run the main function defined just above (channel details) and time it
start_time = time.time()
main()
end_time = time.time()

# Total processing time (wall-clock seconds)
print(f"Total processing time: {end_time - start_time} seconds.")

'''
Part 5
Goal: Finding the Game Played and Game Year for the Video
Description: Wanted to understand if the Game Played and the Game Year matters when having a video over 100k
Note: This has a big delay. Ran multiple processes for this to go faster but it did eventually work. Because it took a while, I added a part where it double checks a CSV to see what else needs to be added.
Output: Here is the csv it leads to https://drive.google.com/file/d/1yD3hD2rcFz27lKkCl00LyL2_UrXZbiub/view
'''
import pandas as pd
import requests
import time
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# File paths: the Part 3 output without Shorts (input) and the per-video game activity (output)
search_results_file_path = '/content/drive/My Drive/Capstone Project 2/search_results_video_info_no_shorts.csv'
video_info_file_path = '/content/drive/My Drive/Capstone Project 2/video_activity_channel.csv'

# Function to fetch data from URL synchronously with retries
def fetch_data_with_retries(url, retries=3, timeout=60):
    """GET ``url`` and return its JSON body, retrying up to ``retries`` times.

    Status 200 returns the parsed JSON and 404 returns ``None`` immediately.
    A 429 waits 60 seconds before the next attempt; any other status, a
    timeout, or any other error waits 30 seconds. ``None`` is returned once
    all attempts are used up.
    """
    attempts_made = 0
    while attempts_made < retries:
        attempts_made += 1
        try:
            print(f"Fetching data from URL: {url}")  # Print the URL being accessed
            response = requests.get(url, timeout=timeout)
            status = response.status_code
            if status == 200:
                return response.json()
            if status == 404:
                print("URL not found.")
                return None
            if status == 429:
                print("Rate limit exceeded. Waiting before retrying...")
                pause = 60  # Wait for a minute before retrying
            else:
                print(f"Unexpected status code: {status}. Retrying...")
                pause = 30
        except requests.Timeout:
            print("Timeout occurred. Retrying...")
            pause = 30
        except Exception as e:
            print(f"Error occurred: {e}. Retrying...")
            pause = 30
        # Every failed attempt is followed by its delay before looping again
        time.sleep(pause)

    print("Failed after multiple retries. Skipping this batch.")
    return None

# Function to process all batches and append to CSV
def main():
    """Fetch the game name and year for every video not yet in the output CSV.

    Video IDs are read from ``search_results_file_path``; IDs already present
    in ``video_info_file_path`` are skipped so the run can be resumed. The
    remaining IDs are requested sequentially, 50 per call, through
    ``fetch_data_with_retries`` and the rows are written with
    ``append_to_csv``.
    """
    search_results_df = pd.read_csv(search_results_file_path)
    # Blank cells load as NaN (a float), which cannot be joined into the URL.
    video_ids_all = set(search_results_df['Video ID'].dropna().astype(str))
    if not video_ids_all:
        print("No new video IDs to fetch.")
        return

    # Check the CSV file to see which videos still need to be processed
    processed_video_ids = set()
    try:
        processed_video_df = pd.read_csv(video_info_file_path)
        processed_video_ids = set(processed_video_df['Video ID'])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable output yet: every video still has to be fetched.
        pass

    video_ids_remaining = list(video_ids_all - processed_video_ids)
    total_videos_remaining = len(video_ids_remaining)
    print(f"Total videos remaining to be processed: {total_videos_remaining}")

    # Reverse the order of video IDs
    video_ids_remaining = video_ids_remaining[::-1]

    batch_rows = []
    for i in range(0, total_videos_remaining, 50):
        video_ids_batch = video_ids_remaining[i:i + 50]
        video_id_string_batch = ','.join(video_ids_batch)
        url = f"https://yt4.lemnoslife.com/videos?part=activity&id={video_id_string_batch}"
        data = fetch_data_with_retries(url)

        if data is not None and 'items' in data:
            for item in data['items']:
                video_id = item.get('id')
                # Skip entries without an ID and anything already recorded.
                if video_id is None or video_id in processed_video_ids:
                    continue
                activity = item.get('activity', {})
                batch_rows.append({
                    'Video ID': video_id,
                    'Game Played': activity.get('name', ''),  # The game played; creators sometimes leave it blank
                    'Game Year': activity.get('year', '')  # Interested to know if year of the game mattered
                })
                processed_video_ids.add(video_id)

        # Flush once 50 rows have accumulated, and always after the last batch.
        if len(batch_rows) >= 50 or i + 50 >= total_videos_remaining:
            append_to_csv(batch_rows)
            batch_rows = []

        # Count requested IDs, not written rows. Measuring batch_rows after it
        # has been cleared would always subtract zero.
        remaining_videos = max(total_videos_remaining - (i + len(video_ids_batch)), 0)
        print(f"Remaining videos to be processed: {remaining_videos}")

    print("All videos have been added to the CSV file.")

# Function to append to CSV
def append_to_csv(batch_rows, path=None):
    """Append ``batch_rows`` to a CSV file, writing the header only once.

    Args:
        batch_rows: List of row dicts. An empty list is a no-op.
        path: Destination file. Defaults to the module-level
            ``video_info_file_path``.

    The header is written only when the file is missing or zero bytes long.
    Otherwise the rows are appended without a header, so existing data is
    never overwritten.
    """
    import os  # local import: used only for the existence/size check

    if not batch_rows:
        return

    target = video_info_file_path if path is None else path
    has_content = os.path.exists(target) and os.path.getsize(target) > 0

    # Mode 'a' creates the file when it does not exist yet.
    pd.DataFrame(batch_rows).to_csv(target, mode='a', index=False, header=not has_content)
    print("Videos have been added to the CSV file.")

# Run the main function defined just above (game activity) and time it
start_time = time.time()
main()
end_time = time.time()

# Total processing time (wall-clock seconds)
print(f"Total processing time: {end_time - start_time} seconds.")

'''
Part 6
Goal: Collecting past videos for each channel to better understand if a channel creator is consistent and see if past views matter
Description: For these past views, I collected numerous fields. The main fields I eventually focus on are the following
Likes: Trying to understand if past likes factor into understanding a good video
Views: Trying to understand if past views matter
Videos: Trying to understand if you post videos consistently
Note: This also took a while but it definitely paid dividends in the end. Also, the file ends up being over 1 GB. I ran the code multiple times because sometimes a channel returned no past videos, which could mean it was someone's first video ever posted, so I re-ran it to confirm that was the case.
Output: Here is the csv it leads to https://drive.google.com/file/d/1Rafp0nQd6jGXO-VstrqAgqGMSRviW92a/view
'''
import csv
import pandas as pd
from datetime import datetime
import requests
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Search URL template for one channel's uploads. The four {} placeholders are filled positionally:
# channelId, publishedAfter, publishedBefore, pageToken.
base_url_template = "https://yt4.lemnoslife.com/noKey/search?part=snippet&type=video&channelId={}&publishedAfter={}&publishedBefore={}&maxResults=50&pageToken={}"

# File paths: CSV that supplies the channel IDs (input) and the past-videos CSV (output)
all_csv_file = '/content/drive/My Drive/Capstone Project 2/video_activity_channel.csv'
output_csv_file = '/content/drive/My Drive/Capstone Project 2/past_channel_videos.csv'

# Read the CSV file containing all channel IDs
# NOTE(review): the Part 5 code above writes only 'Video ID', 'Game Played' and 'Game Year' to this
# path; the 'Channel ID' column read below must come from a step not visible here - confirm.
all_df = pd.read_csv(all_csv_file)

# Unique channel IDs to process.
# NOTE(review): no filtering against the existing output CSV happens here; unique_remaining_ids is
# simply another name for the full list of unique IDs.
unique_channel_ids = all_df['Channel ID'].unique()
unique_remaining_ids = unique_channel_ids

# Start and end of the date range as timestamp strings: 2023-11-01T00:00:00Z to 2024-01-29T00:00:00Z
start_date = datetime(2023, 11, 1).strftime('%Y-%m-%dT%H:%M:%SZ')
end_date = datetime(2024, 1, 29).strftime('%Y-%m-%dT%H:%M:%SZ')

# Function to get the list of videos uploaded by a channel within the date range
def get_videos(channel_id):
    """Fetch all search results for one channel inside the configured date window.

    Pages through the endpoint described by the module-level
    ``base_url_template`` (with ``start_date`` / ``end_date``) until the
    response no longer carries a ``nextPageToken``.

    Args:
        channel_id: Channel ID to query.

    Returns:
        A ``(channel_id, rows)`` tuple. ``channel_id`` is the ID that was
        passed in. ``rows`` is a list of dicts, one per returned item. If a
        request times out or fails, the rows collected so far are discarded
        and an empty list is returned instead.
    """
    page_token = ''  # An empty token requests the first page
    all_videos_data = []  # One dict per video, across all pages

    while True:  # Loop until all pages are fetched
        url = base_url_template.format(channel_id, start_date, end_date, page_token)

        try:
            print("Fetching data for channel ID:", channel_id)
            print("URL:", url)  # Print the URL being used for each channel ID
            response = requests.get(url, timeout=10)  # Set timeout to 10 seconds
            response.raise_for_status()  # Raise an error for non-2xx status codes
            json_data = response.json()
        except requests.Timeout:
            print(f"Timeout error: Request timed out for channel {channel_id}")
            return channel_id, []
        except requests.RequestException as e:
            print(f"Error retrieving data for channel {channel_id}: {e}")
            return channel_id, []

        # NOTE(review): the request URL only asks for part=snippet, so the
        # contentDetails / statistics / status / liveStreamingDetails lookups
        # below presumably always fall back to '' - confirm against the API's
        # search response; a follow-up per-video lookup may be needed.
        for item in json_data.get('items', []):
            # Search results wrap the ID in an object ({'kind': ..., 'videoId': ...});
            # store the plain video ID string when that shape is present.
            raw_id = item.get('id', '')
            video_id = raw_id.get('videoId', '') if isinstance(raw_id, dict) else raw_id

            snippet = item.get('snippet', {})
            content_details = item.get('contentDetails', {})
            statistics = item.get('statistics', {})
            live_streaming_details = item.get('liveStreamingDetails', {})
            status = item.get('status', {})

            content_rating = content_details.get('contentRating', '')
            yt_rating = content_rating.get('ytRating', '') if isinstance(content_rating, dict) else ''

            all_videos_data.append({
                'Video ID': video_id,
                # Use a per-row value and never rebind the channel_id parameter:
                # it is still needed to build the URL of the next page.
                'Channel ID': snippet.get('channelId', channel_id),
                'Channel Title': snippet.get('channelTitle', ''),
                'Published At': snippet.get('publishedAt', ''),
                'Tags': ','.join(snippet.get('tags', [])),
                'Duration': content_details.get('duration', ''),
                'Content Rating': content_rating,
                'YT Rating': yt_rating,
                'Made For Kids': status.get('madeForKids', ''),
                'Self Declared Made For Kids': status.get('selfDeclaredMadeForKids', ''),
                'View Count': statistics.get('viewCount', ''),
                'Like Count': statistics.get('likeCount', ''),
                'Stream Start Time': live_streaming_details.get('actualStartTime', '')
            })

        # Check if there are more pages to fetch
        next_page_token = json_data.get('nextPageToken')
        if not next_page_token:
            break  # Break the loop if no more pages

        page_token = next_page_token  # Update page token for the next request

    return channel_id, all_videos_data


# Columns exported to the CSV. The rows returned by get_videos() carry more
# keys than these.
fieldnames = ['Channel ID', 'Video ID', 'Published At','View Count','Like Count']

# The output file is recreated on the first write of a run and appended to
# afterwards; this flag flips once the header has been written.
file_exists = False

for idx, channel_id in enumerate(unique_remaining_ids, start=1):
    # Keep the loop's own channel_id for the progress message; only the rows
    # are needed from the result.
    _, videos_data = get_videos(channel_id)

    # Re-open per channel so every finished channel is closed out to disk
    # before the next request starts.
    with open(output_csv_file, 'a' if file_exists else 'w', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore' drops row keys that are not exported; the
        # default ('raise') aborts with ValueError on the first such row.
        # Keys missing from a row are written as empty strings.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')

        # Write the header only if the file is being created for the first time
        if not file_exists:
            writer.writeheader()
            file_exists = True  # Set flag to indicate that the file now exists

        writer.writerows(videos_data)

    remaining_ids = unique_remaining_ids[idx:]
    print(f"[{idx}/{len(unique_remaining_ids)}] Video data for channel {channel_id} appended to channel_videos.csv. {len(remaining_ids)} unique channel IDs remain.")

print("Video data for all unique remaining channels appended to channel_videos.csv")
'''
Part 7
Goal: Merging Video Info Captured, Channel Info Captured, Channel Stats Captured, and the Video Information
Description: I wanted to merge all the info into one place so I can then have a CSV where I can add new features where necessary
Note: Nov_Jan_Channel_Stats.csv is a shortened version of past_channel_videos. I did this because past_channel_videos was 1 GB.
Output: This will be the outputted CSV. The link points to a different CSV because I overwrote the original.
https://drive.google.com/file/d/1yD3hD2rcFz27lKkCl00LyL2_UrXZbiub/view
'''

import csv
import pandas as pd
from datetime import datetime
import requests
from google.colab import drive

# Make the Drive folder that holds the project CSVs visible to the notebook.
drive.mount('/content/drive')

# Search endpoint template. The four positional slots are, in order:
# channel ID, publishedAfter, publishedBefore and page token.
base_url_template = (
    "https://yt4.lemnoslife.com/noKey/search"
    "?part=snippet&type=video&channelId={}"
    "&publishedAfter={}&publishedBefore={}"
    "&maxResults=50&pageToken={}"
)

# Input (all captured videos with their channel IDs) and output locations.
all_csv_file = '/content/drive/My Drive/Capstone Project 2/video_activity_channel.csv'
output_csv_file = '/content/drive/My Drive/Capstone Project 2/past_channel_videos.csv'

# Every distinct channel in the input is queued; nothing is filtered out here,
# so both names refer to the same array of IDs.
all_df = pd.read_csv(all_csv_file)
unique_channel_ids = unique_remaining_ids = all_df['Channel ID'].unique()

# Date window for the look-back search, formatted as ISO-8601 timestamps.
iso_timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
start_date, end_date = (
    boundary.strftime(iso_timestamp_format)
    for boundary in (datetime(2023, 11, 1), datetime(2024, 1, 29))
)

# Function to get the list of videos uploaded by a channel within the date range
def get_videos(channel_id):
    """Fetch all search results for one channel inside the configured date window.

    Pages through the endpoint described by the module-level
    ``base_url_template`` (with ``start_date`` / ``end_date``) until the
    response no longer carries a ``nextPageToken``.

    Args:
        channel_id: Channel ID to query.

    Returns:
        A ``(channel_id, rows)`` tuple. ``channel_id`` is the ID that was
        passed in. ``rows`` is a list of dicts, one per returned item. If a
        request times out or fails, the rows collected so far are discarded
        and an empty list is returned instead.
    """
    page_token = ''  # An empty token requests the first page
    all_videos_data = []  # One dict per video, across all pages

    while True:  # Loop until all pages are fetched
        url = base_url_template.format(channel_id, start_date, end_date, page_token)

        try:
            print("Fetching data for channel ID:", channel_id)
            print("URL:", url)  # Print the URL being used for each channel ID
            response = requests.get(url, timeout=10)  # Set timeout to 10 seconds
            response.raise_for_status()  # Raise an error for non-2xx status codes
            json_data = response.json()
        except requests.Timeout:
            print(f"Timeout error: Request timed out for channel {channel_id}")
            return channel_id, []
        except requests.RequestException as e:
            print(f"Error retrieving data for channel {channel_id}: {e}")
            return channel_id, []

        # NOTE(review): the request URL only asks for part=snippet, so the
        # contentDetails / statistics / status / liveStreamingDetails lookups
        # below presumably always fall back to '' - confirm against the API's
        # search response; a follow-up per-video lookup may be needed.
        for item in json_data.get('items', []):
            # Search results wrap the ID in an object ({'kind': ..., 'videoId': ...});
            # store the plain video ID string when that shape is present.
            raw_id = item.get('id', '')
            video_id = raw_id.get('videoId', '') if isinstance(raw_id, dict) else raw_id

            snippet = item.get('snippet', {})
            content_details = item.get('contentDetails', {})
            statistics = item.get('statistics', {})
            live_streaming_details = item.get('liveStreamingDetails', {})
            status = item.get('status', {})

            content_rating = content_details.get('contentRating', '')
            yt_rating = content_rating.get('ytRating', '') if isinstance(content_rating, dict) else ''

            all_videos_data.append({
                'Video ID': video_id,
                # Use a per-row value and never rebind the channel_id parameter:
                # it is still needed to build the URL of the next page.
                'Channel ID': snippet.get('channelId', channel_id),
                'Channel Title': snippet.get('channelTitle', ''),
                'Published At': snippet.get('publishedAt', ''),
                'Tags': ','.join(snippet.get('tags', [])),
                'Duration': content_details.get('duration', ''),
                'Content Rating': content_rating,
                'YT Rating': yt_rating,
                'Made For Kids': status.get('madeForKids', ''),
                'Self Declared Made For Kids': status.get('selfDeclaredMadeForKids', ''),
                'View Count': statistics.get('viewCount', ''),
                'Like Count': statistics.get('likeCount', ''),
                'Stream Start Time': live_streaming_details.get('actualStartTime', '')
            })

        # Check if there are more pages to fetch
        next_page_token = json_data.get('nextPageToken')
        if not next_page_token:
            break  # Break the loop if no more pages

        page_token = next_page_token  # Update page token for the next request

    return channel_id, all_videos_data


# Columns exported to the CSV. The rows returned by get_videos() carry more
# keys than these.
fieldnames = ['Channel ID', 'Video ID', 'Published At','View Count','Like Count']

# The output file is recreated on the first write of a run and appended to
# afterwards; this flag flips once the header has been written.
file_exists = False

for idx, channel_id in enumerate(unique_remaining_ids, start=1):
    # Keep the loop's own channel_id for the progress message; only the rows
    # are needed from the result.
    _, videos_data = get_videos(channel_id)

    # Re-open per channel so every finished channel is closed out to disk
    # before the next request starts.
    with open(output_csv_file, 'a' if file_exists else 'w', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore' drops row keys that are not exported; the
        # default ('raise') aborts with ValueError on the first such row.
        # Keys missing from a row are written as empty strings.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')

        # Write the header only if the file is being created for the first time
        if not file_exists:
            writer.writeheader()
            file_exists = True  # Set flag to indicate that the file now exists

        writer.writerows(videos_data)

    remaining_ids = unique_remaining_ids[idx:]
    print(f"[{idx}/{len(unique_remaining_ids)}] Video data for channel {channel_id} appended to channel_videos.csv. {len(remaining_ids)} unique channel IDs remain.")

print("Video data for all unique remaining channels appended to channel_videos.csv")

'''
Part 8
Goal: Create new features to better understand how a good youtube video is made. Also filled in null values
Description: Created a variety of new features while also filling in null values. For filling null values, this is what I did:
If Channel Views were null, I filled it in with the View Count of the current video
If the Channel Country was null, I filled it in with US
If Channel Published Date was null, I filled it in with Published at date due to the amount of nulls being low
If any past channel stats were null, I filled it in as 0 because they had no videos
For Game Played, if it was null, I tried to look at title and tags to fill in the correct game. If it was null, I kept it that way because it usually was a rare game or it actually ended up not being a game. I also made games the same name where necessary
For Game Year, I filled in for any games that I already had or that I knew
Used Google Trends to find where a trending word was located in the video details. Separated it into Top Terms and Rising Words because Google trends has each
Found top games in (https://steamdb.info/). I looked at top games sold from Jan 15th to Feb 15th because those were the games that would potentially be popular
Found Top Selling Games for 2024 (https://www.gamespot.com/gallery/2024s-best-selling-games-in-the-us/2900-5106/) to understand whether videos of people playing these games would do better
Found the Avg Views, Likes and Videos for last 3 months to understand how popular the channel was in the past. Also found the percentage of likes for past videos to better understand if the videos have been consistently popular over time
Found the hour that the video got launched using EST to see if that matters. Also put hours into different blocks to see if that mattered
Put Day of Week to better understand if launching on a certain day mattered
I found channel age to see if older channels would do better because they are around longer
I wanted to see how a video was launched, so I checked whether it used the Premiere feature or was a live stream
I checked what kind of Social Media links used in the description to see if Promoting their own videos mattered
For tags, I checked if they existed and how many tags you would have to see if adding more tags would mean more views and if tags mattered at all
For a description, I was trying to understand if that mattered and found if a description mattered or not
Finally, I put a number code to everything so it would work for a linear regression model
Output: Here is the CSV it gets outputted to
'''
import pandas as pd
import numpy as np
from pytz import timezone

# Load the merged video / activity / channel table.
activity_df = pd.read_csv("C:/Users/sulli/Downloads/Youtube Folder/video_activity_channel.csv")

# A channel without recorded channel views is most likely on its first video,
# so fall back to the view count of the current video.
activity_df['Channel Views'] = activity_df['Channel Views'].fillna(activity_df['View Count'])
activity_df['Country'] = activity_df['Country'].fillna('US')

# Missing flags count as False. fillna() has to run BEFORE astype(bool):
# NaN is truthy, so casting first turns every missing value into True and
# leaves nothing for fillna() to replace.
for flag_column in ('Channel Made for Kids', 'Video Captions', 'Video Licensed'):
    activity_df[flag_column] = activity_df[flag_column].fillna(False).astype(bool)


#This is from the Nov_Jan_Channel CSV
activity_df['Channel Published Date'] = activity_df['Channel Published Date'].fillna(activity_df['Published At'])
fields_to_fill = [
    'January Views', 'November Views', 'December Views',
    'January Likes', 'November Likes', 'December Likes',
    'January Videos', 'November Videos', 'December Videos',
    'Last Two Months Views', 'Last Two Months Likes', 'Last Two Months Videos',
    'Last Three Months Views', 'Last Three Months Likes', 'Last Three Months Videos',
    'Subscriber Count','Like Count'
]

# Fill all specified fields with 0 (no past videos means no past stats)
activity_df[fields_to_fill] = activity_df[fields_to_fill].fillna(0)

# Working copies of the game columns; rows without a game stay null and are
# kept in the data set.
# NOTE: str.title() also lower-cases the tail of acronyms ('GTA' becomes
# 'Gta'), so any later matching against these names must not rely on the
# original capitalisation.
activity_df['Game Played Corrected'] = activity_df['Game Played'].str.title()
activity_df['Game Year Corrected'] = activity_df['Game Year']


# If game exists in title or tags, fill it in.

unique_games = activity_df['Game Played Corrected'].dropna().unique()  # Get unique non-null games

# Function to find if any game exists in the tags
def find_matching_game(tags, games=None):
    """Return the single known game mentioned in ``tags``, or None.

    Args:
        tags: Tag text to search. Anything that is not a string (for example
            the NaN of an empty cell) yields None.
        games: Iterable of candidate game names. Defaults to the module-level
            ``unique_games``.

    Returns:
        The candidate that occurs in ``tags`` (case-insensitive substring
        match) when exactly one candidate matches; None when no candidate or
        more than one candidate matches.
    """
    if not isinstance(tags, str):
        return None

    candidates = unique_games if games is None else games
    haystack = tags.lower()  # Lower-case once instead of once per candidate

    match = None
    for game in candidates:
        if str(game).lower() in haystack:
            if match is not None:
                return None  # More than one game found in tags: ambiguous
            match = game
    return match

# --- Normalise the game names -------------------------------------------------

# Spelling fixes for known game names. Keys are matched case-insensitively
# because the column has already been title-cased (e.g. 'UNO!™' is stored as
# 'Uno!™', 'Modern Warfare III' as 'Modern Warfare Iii').
name_mapping = {
    'UNO!™': 'Uno',
    'The Simpsons': 'The Simpsons Game',
    'GTA: San Andreas - Definitive': 'Grand Theft Auto: San Andreas - The Definitive Edition',
    'Call of Duty: Modern Warfare III': 'Call of Duty: Modern Warfare 3',
    'Call of Duty: Modern Warfare II': 'Call of Duty: Modern Warfare 2'

}
_folded_name_mapping = {raw.casefold(): fixed for raw, fixed in name_mapping.items()}

corrected_games = activity_df['Game Played Corrected']
corrected_games = corrected_games.str.replace('®', '', regex=False)
# NOTE(review): after title-casing, this mojibake may be stored as 'ã©' and
# slip past this replacement - confirm against the data.
corrected_games = corrected_games.str.replace('Ã©', 'é', regex=False)

# Apply the whole-name mapping before expanding 'GTA', otherwise the
# 'GTA: San Andreas - Definitive' key could never match.
corrected_games = corrected_games.map(
    lambda name: _folded_name_mapping.get(name.casefold(), name) if isinstance(name, str) else name
)
# Case-insensitive, whole-word match so the title-cased 'Gta' is expanded too.
corrected_games = corrected_games.str.replace(r'\bGTA\b', 'Grand Theft Auto', case=False, regex=True)
activity_df['Game Played Corrected'] = corrected_games

# Rebuild the list of known games from the normalised names so that title and
# tag matching below fills in the cleaned spellings.
unique_games = activity_df['Game Played Corrected'].dropna().unique()

# --- Fill in missing games from the title, then from the tags -----------------

# Keyword rules for titles, checked in order against the lower-cased title.
# Each rule is (alternatives, game): a rule fires when every keyword of at
# least one alternative occurs in the title.
_TITLE_KEYWORD_RULES = [
    ([('madden', '24')], 'Madden NFL 24'),
    ([('buckshot roulette',)], 'Buckshot Roulette'),
    ([('mw3',), ('modern warfare iii',), ('modern warfare 3',)], 'Call of Duty: Modern Warfare 3'),
    ([('mw2',)], 'Call of Duty: Modern Warfare 2'),
    ([('black ops 2',)], 'Call of Duty: Black Ops II'),
    ([('black ops cold war',)], 'Call of Duty: Black Ops Cold War'),
    ([('gta 5',)], 'Grand Theft Auto V'),
    ([('ww2', 'cod'), ('wwii', 'cod'), ('ww2', 'call of'), ('wwii', 'call of')], 'Call of Duty: WWII'),
    ([('call of', 'mobile')], 'Call of Duty: Mobile'),
    ([('fc 24',), ('fc24',)], 'EA Sports FC 24'),
]

# Keyword rules for tags, checked in order against the lower-cased tags.
_TAG_KEYWORD_RULES = [
    ('nopixel', 'Grand Theft Auto V'),
    ('madden 24', 'Madden NFL 24'),
]


def _game_from_title(title):
    """Return the game inferred from a video title, or None if nothing matches."""
    if not isinstance(title, str):
        return None
    title_lower = title.replace('®', '').lower()

    # A keyword rule takes precedence over a plain name match.
    for alternatives, canonical_name in _TITLE_KEYWORD_RULES:
        if any(all(keyword in title_lower for keyword in keywords) for keywords in alternatives):
            return canonical_name

    # Otherwise use the first known game whose name appears in the title.
    for known_game in unique_games:
        if str(known_game).lower() in title_lower:
            return known_game
    return None


def _game_from_tags(tags):
    """Return the game inferred from a video's tags, or None if nothing matches."""
    if not isinstance(tags, str):
        return None
    tags_lower = tags.lower()

    # A keyword rule takes precedence over a plain name match.
    for keyword, canonical_name in _TAG_KEYWORD_RULES:
        if keyword in tags_lower:
            return canonical_name

    # Otherwise accept a known game only if it is the single one mentioned.
    return find_matching_game(tags)


#I looked at what was empty and created this to fill in the game where I could along with what was popular and missing
missing_game_rows = activity_df.loc[activity_df['Game Played Corrected'].isna(), ['Video Title', 'Tags']]
for index, row in missing_game_rows.iterrows():
    # The tags are only a fallback for rows whose title gave no answer.
    game = _game_from_title(row['Video Title'])
    if game is None:
        game = _game_from_tags(row['Tags'])

    if game is not None:
        activity_df.at[index, 'Game Played Corrected'] = game
        print(f"Row {index}: Game Played Corrected set to: {game}")

'''
Part 9
Goal: Trying to see which model is better
Description: I printed out all the models and imported them into the following Google Sheet. Here are the Important Tabs:
Fields: Explaining the name of each field and then categorizing them into why they are important. I use this to ensure I am ideally using all the categories to come up with a good model
Feature Importance (EX): Understanding which features were important so I know what to tweak. For “All Features” Model, this was my way to understand which features were important and knowing what to keep. For each version I tried to keep the top features per categories. In my model names, it explains what it was doing
Compare 3 (Classification): For each model, I captured which ones had the best metrics. For Overall Score, it is a weighted average of Precision, Recall, F1 Score, and ROC AUC. I weighted Precision and ROC AUC higher because, if I received a FP, it would be worse than receiving an FN. I want to be conservative and not predict a video will hit 100k and it won’t
Note: I kept Regression metrics as well because I thought I had a regression problem. But, because I want to know if a video hits 100k or not, it is a Classification problem. Hence, Classification metrics were used. I also tweaked the models to better understand the features to better understand the relationship between features.
What Model did I choose and Why: I ended up choosing a model with 10 features. It wasn’t the best performing but it was only worse than a model with 16 features by .11%. Because it would take less computational power, I stuck with a model with 10 features.  I chose XGBoost because it performed the best overall. Decision Tree was a close second but XGBoost performed better overall
Output: I put all coefficients/feature importance into CSVs, then would put them in this Sheet to better understand what models were better and why
(https://docs.google.com/spreadsheets/d/14jlshDILuqAhJBXq2VUmHmBKPk5Fhhhka_UIY8qREiI/edit#gid=201532733)
'''

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score, precision_score, recall_score, f1_score

# Expose the Drive folder that holds the engineered dataset.
drive.mount('/content/drive')

# Read the feature-engineered table that the models are trained on.
model_csv_path = "/content/drive/My Drive/Capstone Project 2/video_activity_channel_altered.csv"
model_df = pd.read_csv(model_csv_path)

# Define categorical and continuous features.
# Each column name appears exactly once across the two lists.
categorical_features = ['Channel ID', 'Channel Title', 'Tags', 'Made For Kids', 'Stream End Time', 'Published At',
                        'Video Title', 'Video Description', 'Video Definition', 'Video Captions', 'Video Licensed',
                        'Video Projection', 'Video Dimension', 'Game Played', 'Game Year', 'Channel Made for Kids',
                        'Country', 'Channel Published Date','Game Played Corrected','Game Year Corrected',
                        'Top Search Term in Title','Top Search Term in Game Listed','Top Search Term in Description',
                        'Top Search Term in Tags','YouTube Rising Word in Title','YouTube Rising Word in Game Listed',
                        'YouTube Rising Word in Description','YouTube Rising Word in Tags','Top Selling Steam Game',
                        'Top Selling Game 2024','Day of Week','Time of Day', 'Channel Age', 'Hour','Video Definition Code',
                        'Video Projection Code','Video Dimension Code','Game Played Corrected Code','Country Code',
                        'Day of Week Code','Time of Day Code','Links Twitter','Links Instagram','Links Facebook',
                        'Links Twitch','Premier Stream Video Code','Primary Social Media Link','Tags Exist',
                        'Video Licensed Code','Description Exist','Game Played Listed in Activity']
# 'Avg Views Last 3 Months' used to be listed twice; the duplicate is removed
# so that selecting these columns does not return the same column twice.
continuous_features = ['View Count', 'Like Count', 'duration_seconds', 'Subscriber Count', 'Channel Views',
                        'January Views','November Views','December Views','January Likes','November Likes','December Likes',
                        'January Videos','November Videos','December Videos','Last Two Months Views','Last Two Months Likes',
                        'Last Two Months Videos','Last Three Months Views','Last Three Months Likes','Last Three Months Videos',
                        'View Count Percentile','Avg Views Last 3 Months','Avg Views Last 2 Months',
                        'Avg Views Last 1 Month','Amount of Tags','Avg Likes Last 3 Months','Avg Likes Last 2 Months',
                        'Avg Likes Last 1 Month','Likes to Views Ratio','Likes to Views Ratio Last 3 Month',
                        'Likes to Views Ratio Last 2 Month','Likes to Views Ratio Last 1 Month']
# Candidate feature sets, written as (label, columns) pairs and expanded into
# the list of {'name': ..., 'features': ...} dicts stored in `versions`.
_version_specs = [
    ('All Features', [
        'Like Count', 'duration_seconds', 'Subscriber Count', 'Channel Views',
        'January Views', 'November Views', 'December Views',
        'January Likes', 'November Likes', 'December Likes',
        'January Videos', 'November Videos', 'December Videos',
        'Last Two Months Views', 'Last Two Months Likes', 'Last Two Months Videos',
        'Last Three Months Views', 'Last Three Months Likes', 'Last Three Months Videos',
        'Avg Views Last 3 Months', 'Avg Views Last 2 Months', 'Avg Views Last 1 Month',
        'Made For Kids', 'Video Definition Code', 'Video Captions', 'Video Licensed Code',
        'Video Projection Code', 'Video Dimension Code', 'Channel Made for Kids',
        'Country Code', 'Game Played Corrected Code', 'Game Year Corrected',
        'Top Search Term in Title', 'Top Search Term in Game Listed',
        'Top Search Term in Description', 'Top Search Term in Tags',
        'YouTube Rising Word in Title', 'YouTube Rising Word in Game Listed',
        'YouTube Rising Word in Description', 'YouTube Rising Word in Tags',
        'Top Selling Steam Game', 'Top Selling Game 2024',
        'Day of Week Code', 'Time of Day Code', 'Channel Age', 'Hour',
        'Links Twitter', 'Links Instagram', 'Links Facebook', 'Links Twitch',
        'Primary Social Media Link', 'Tags Exist', 'Amount of Tags',
        'Premier Stream Video Code', 'Description Exist', 'Game Played Listed in Activity',
        'Avg Likes Last 3 Months', 'Avg Likes Last 2 Months', 'Avg Likes Last 1 Month',
        'Likes to Views Ratio', 'Likes to Views Ratio Last 3 Month',
        'Likes to Views Ratio Last 2 Month', 'Likes to Views Ratio Last 1 Month',
    ]),
    ('Top Two Important Features from each category (XGB Boost)', [
        'Like Count', 'Likes to Views Ratio', 'Video Definition Code', 'Video Licensed Code',
        'Top Selling Game 2024', 'Time of Day Code', 'Links Twitter', 'Channel Age',
        'Tags Exist', 'Premier Stream Video Code', 'Video Projection Code',
        'Channel Made for Kids', 'Country Code', 'Links Instagram', 'Links Twitch',
        'Amount of Tags',
    ]),
    ('Top Two Important Features from each category (Random Forest)', [
        'Like Count', 'Avg Views Last 2 Months', 'duration_seconds', 'Channel Age',
        'Day of Week Code', 'Hour', 'Amount of Tags', 'Game Played Corrected Code',
        'Country Code', 'Channel Made for Kids', 'Primary Social Media Link',
        'Links Twitch', 'Video Projection Code', 'Video Licensed Code',
    ]),
    ('Top Two Important Features from each category (Decision Tree)', [
        'Like Count', 'Likes to Views Ratio', 'duration_seconds', 'Channel Age',
        'Game Played Corrected Code', 'Hour', 'Day of Week Code', 'Amount of Tags',
        'Country Code', 'Links Instagram', 'Made For Kids', 'Video Definition Code',
        'Video Projection Code', 'Video Dimension Code', 'Channel Made for Kids',
    ]),
    ('Top Two Important Features from each category (Linear Regression)', [
        'Video Projection Code', 'Video Licensed Code', 'Description Exist',
        'Links Facebook', 'Links Twitter', 'Tags Exist', 'Video Definition Code',
        'Hour', 'Day of Week Code', 'Made For Kids', 'Like Count',
        'November Videos', 'Channel Made for Kids',
    ]),
    ('Top 1 Important Features from each category (XGB Boost)', [
        'Like Count', 'Video Definition Code', 'Video Licensed Code', 'Time of Day Code',
        'Links Twitter', 'Channel Age', 'Tags Exist', 'Channel Made for Kids',
        'Country Code',
    ]),
    ('Top 1 Important Features from each category (Random Forest)', [
        'Like Count', 'duration_seconds', 'Channel Age', 'Day of Week Code', 'Hour',
        'Amount of Tags', 'Country Code', 'Links Twitter', 'Video Definition Code',
    ]),
    ('Top 1 Important Features from each category (Decision Tree)', [
        'Like Count', 'duration_seconds', 'Channel Age', 'Hour', 'Amount of Tags',
        'Country Code', 'Links Instagram', 'Video Definition Code',
    ]),
    ('Top 1 Important Features from each category (Linear Regression)', [
        'Video Projection Code', 'Video Licensed Code', 'Links Facebook', 'Tags Exist',
        'Hour', 'Made For Kids', 'Like Count', 'YouTube Rising Word in Title',
    ]),
    ('Tweaking 2 Important Features from Decision Tree', [
        'Like Count', 'Likes to Views Ratio', 'Avg Views Last 3 Months', 'duration_seconds',
        'Video Licensed Code', 'Game Played Corrected Code', 'Primary Social Media Link',
        'Premier Stream Video Code', 'Time of Day Code', 'Top Selling Game 2024',
    ]),
    ('Top Coefficients from Each Category', [
        'Video Projection Code', 'Primary Social Media Link', 'Made For Kids', 'Like Count',
        'Top Search Term in Description', 'Channel Age', 'Avg Views Last 1 Month',
    ]),
]

versions = [{'name': label, 'features': columns} for label, columns in _version_specs]

# Initialize lists to store results for each version
results = []           # One summary row per version
all_coefficients = []  # One row per (version, model term)

# Fit one OLS regression per feature set and keep its fit statistics and
# coefficients, so the fitted model is not lost when the next version is trained.
for version in versions:
    # Define features and target variable.
    # A frame that mixes bool and numeric columns becomes an object-dtype
    # design matrix, which statsmodels refuses; cast everything to float.
    X = model_df[version['features']].astype(float)
    # NOTE(review): the target column is 'Views', while the feature lists in
    # this file use 'View Count' - presumably 'Views' is created upstream;
    # confirm that it exists in the CSV.
    y = model_df['Views']

    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the linear regression model. has_constant='add' always adds the
    # intercept column, even if a feature happens to be constant in this split.
    model = sm.OLS(y_train, sm.add_constant(X_train, has_constant='add')).fit()

    results.append({
        'Version': version['name'],
        'Model': 'Linear Regression',
        'Feature Count': len(version['features']),
        'R2 (train)': model.rsquared,
        'Adjusted R2 (train)': model.rsquared_adj,
    })
    for term, coefficient in model.params.items():
        all_coefficients.append({
            'Version': version['name'],
            'Model': 'Linear Regression',
            'Feature': term,
            'Coefficient': coefficient,
            'P Value': model.pvalues[term],
        })

'''
Part 10
Goal: Trying to see which parameters to use. Then based on best parameters, show the final model I will use
Description: The script would loop through about 6,500 permutations (the grid below has 3^8 = 6,561 combinations) to see the best parameters. All parameters would be stored into a CSV which I would then look into and calculate what the best parameters would be to use. Similar to how I chose the model, I weighted Precision and ROC AUC at 30% each and Recall and F1 score at 20% each. Based on that weighting, I chose which parameters I would be using
Output: Here is the way I decided what parameters to use. It improved every metric with ROC AUC seeing the biggest jump. The parameters found in Row 2 are the ones I ended up using


'''
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import ParameterGrid
from google.colab import drive

# Make Google Drive available to this Colab session
drive.mount('/content/drive')


# Read the dataset used for modelling from Drive
model_df = pd.read_csv("/content/drive/My Drive/Youtube_Folder/video_activity_channel_altered.csv")

# A deliberately small feature list that still covers several different
# aspects of a YouTube video
feature_set = [
    'Like Count',
    'Likes to Views Ratio',
    'Avg Views Last 3 Months',
    'duration_seconds',
    'Video Licensed Code',
    'Game Played Corrected Code',
    'Primary Social Media Link',
    'Premier Stream Video Code',
    'Time of Day Code',
    'Top Selling Game 2024',
]

# Hyperparameter search space: three candidate values for each of the eight
# parameters, i.e. 3**8 = 6561 combinations in total
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2, 3],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[0, 0.1, 0.5],
)

# Hyperparameter search: fit one XGBoost regressor per grid combination and
# record regression and classification metrics on a fixed hold-out split.
# NOTE(review): the classification metrics compare y_test with 0/1 predictions,
# so this assumes 'Views' is already a 0/1 label — confirm against the CSV.

# Initialize lists to store results
results = []
# NOTE(review): never appended to in the code visible here.
feature_importance_results = []

# Initialize XGBoost regressor (re-configured via set_params on each iteration)
xgb_reg = XGBRegressor(random_state=42)

# Build the grid once so its length is not recomputed for every progress message
param_combinations = ParameterGrid(param_grid)
total_combinations = len(param_combinations)

# Train-test split. It depends only on the data and the fixed random_state,
# so it is identical for every hyperparameter set and is computed once here
# instead of once per iteration.
X_train, X_test, y_train, y_test = train_test_split(
    model_df[feature_set], model_df['Views'], test_size=0.2, random_state=42
)

# Iterate over hyperparameter tests
for i, params in enumerate(param_combinations):
    print(f"Testing Hyperparameter Set {i + 1}/{total_combinations}")

    # Apply the current hyperparameters to the shared regressor
    xgb_reg.set_params(**params)

    # Train the model
    xgb_reg.fit(X_train, y_train)

    # Make predictions (continuous)
    y_pred_continuous = xgb_reg.predict(X_test)

    # Convert to binary predictions using a threshold of 0.5
    y_pred_binary = (y_pred_continuous > 0.5).astype(int)

    # Calculate evaluation metrics for regression
    mae = mean_absolute_error(y_test, y_pred_continuous)
    mse = mean_squared_error(y_test, y_pred_continuous)
    r2 = r2_score(y_test, y_pred_continuous)

    # Calculate evaluation metrics for classification.
    # ROC AUC is threshold-free, so it uses the continuous scores rather than
    # the thresholded predictions.
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    roc_auc = roc_auc_score(y_test, y_pred_continuous)

    # Store results for both regression and classification metrics
    results.append({
        'Hyperparameters': params,
        'MAE': mae,
        'MSE': mse,
        'R2': r2,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    print(f"  MAE: {mae}, MSE: {mse}, R2: {r2}")
    print(f"  Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC AUC: {roc_auc}")

# Convert results to DataFrame (one row per hyperparameter set)
results_df = pd.DataFrame(results)

# Persist the full table to Drive so the parameter sets can be compared afterwards
results_df.to_csv("/content/drive/My Drive/Youtube_Folder/xgboost_results_for_parameters.csv", index=False)

# Display results
print(results_df)

# Final model: chosen for its small number of features and for its scores on
# the metrics used to measure performance.

# Selected features
feature_set = [
    'Like Count',
    'Likes to Views Ratio',
    'Avg Views Last 3 Months',
    'duration_seconds',
    'Video Licensed Code',
    'Game Played Corrected Code',
    'Primary Social Media Link',
    'Premier Stream Video Code',
    'Time of Day Code',
    'Top Selling Game 2024',
]

# Selected hyperparameters
params = dict(
    # Fraction of features sampled for each tree; 1.0 means every tree sees all features
    colsample_bytree=1.0,
    # Minimum loss reduction required to split a leaf further; 0 imposes no minimum
    gamma=0,
    # Shrinkage applied to each boosting step; 0.2 is the largest of the values searched
    learning_rate=0.2,
    # Maximum depth of each tree, i.e. at most 5 splits from root to leaf
    # (the author describes the prediction as higher/lower than 100k views)
    max_depth=5,
    # Minimum sum of instance weights required in a child node; larger values
    # make splitting more conservative, which is intended to limit overfitting
    min_child_weight=2,
    # L1 regularization on the weights; 0 disables it
    reg_alpha=0,
    # L2 regularization on the weights; a moderate amount
    reg_lambda=0.5,
    # Fraction of training rows sampled for each tree; 1.0 means all rows are used
    subsample=1.0,
)

# Build the XGBoost regressor from the selected hyperparameters with a fixed seed
xgb_reg = XGBRegressor(random_state=42, **params)

# Fit on the entire dataset (no hold-out split at this stage)
xgb_reg.fit(model_df[feature_set], model_df['Views'])

# Show the configured model
print(xgb_reg)

# For XGBoost, these parameters ended up being the best: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 2, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 1.0}, so this is the model I go with









