In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tabulate import tabulate
import csv
import platform


import sys
import os

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)

# Internal functions
from utills import process_thumbnail


def clear_console():
    # Check the platform
    current_os = platform.system()

    if current_os == "Windows":
        os.system('cls')
    else:  # Linux and macOS
        os.system('clear')

# Main function to iterate through the dataset and save results to a CSV file
def process_youtube_data(dataset):
    error_log = []
    total_rows = len(dataset)
    processed_rows = 0

    # Check if the file already exists to determine if headers should be written
    file_exists = os.path.exists('../data/youtube_data_processed.csv')

    file_path = '../data/youtube_data_processed.csv'

    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"{file_path} has been removed.")
    else:
        print(f"{file_path} does not exist.")

    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        # Initialize the CSV writer
        writer = None

        for index, row in dataset.iterrows():
            clear_console()
            print(f"Processing row {processed_rows + 1} of {total_rows}...")
            row_data, error = process_thumbnail(row)
            
            if row_data:
                if writer is None:
                    # Write the header only if the file doesn't exist
                    writer = csv.DictWriter(file, fieldnames=row_data.keys())
                    if not file_exists:
                        writer.writeheader()

                # Write the processed row data
                writer.writerow(row_data)
                processed_rows += 1

            if error:
                error_log.append(error)

            print(f"Processed row {processed_rows + 1}")

    clear_console()

    print("Feature extraction complete and saved to CSV.")
    print(f"Total rows processed: {processed_rows} of {total_rows}")
    print(f"Total errors: {len(error_log)}")

    # Convert the error log list into a list of lists for tabulate
    table_data = [[entry['video_id'], entry['error']] for entry in error_log]

    # Define headers
    headers = ['Video ID', 'Error']

    # Display the table using tabulate
    print(tabulate(table_data, headers=headers, tablefmt='grid'))

print("Processing the YouTube data now...")
dataset = pd.read_csv('../data/youtube_data.csv')

# Sample 5 random rows from the dataset, comment this line to process the entire dataset
dataset = dataset.sample(n=5, random_state=1) 

process_youtube_data(dataset)
print("Finished processing the YouTube data. Run the script again and choose 'Create model' to proceed.")


Processing the YouTube data now...
../data/youtube_data_processed.csv does not exist.
[H[2JProcessing row 1 of 5...
Downloading image from URL:  https://i.ytimg.com/vi/9J_BZG30GWo/maxresdefault.jpg

0: 384x640 1 person, 6 books, 62.1ms
Speed: 2.4ms preprocess, 62.1ms inference, 5.1ms postprocess per image at shape (1, 3, 384, 640)
Processed row 2
[H[2JProcessing row 2 of 5...
Downloading image from URL:  https://i.ytimg.com/vi/ll8f05AeDys/maxresdefault.jpg

0: 384x640 2 persons, 59.0ms
Speed: 1.3ms preprocess, 59.0ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
Processed row 3
[H[2JProcessing row 3 of 5...
Downloading image from URL:  https://i.ytimg.com/vi/L9QcQKbZUvk/maxresdefault.jpg

0: 384x640 1 person, 52.4ms
Speed: 1.3ms preprocess, 52.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
Processed row 4
[H[2JProcessing row 4 of 5...
Downloading image from URL:  https://i.ytimg.com/vi/VBw0L-pYIHM/maxresdefault.jpg

0: 384x640 1 person, 