# Generative Video Caption Processing
- Set date, input directory, and output directory parameters
- Load and stack data from input directory
- Find relevant rows for export 
    - New `callback_metadata` (the value appears exactly once across all stacked exports)
    - Status of `yes` or `neutral`
- Export

## Set Parameters

In [5]:
# Tag for this run; it is embedded in the name of the exported CSV.
p_date = "061623"

# Folder containing the exported CSV files that will be loaded and stacked.
p_dir_in = "/Users/smarshall/Desktop/projects/generative/video_captioning/hive_exports/"
# Folder that receives the filtered "videos to caption" CSV.
p_dir_out = "/Users/smarshall/Desktop/projects/generative/video_captioning/videos_to_caption/"

## Write Processing Function

In [3]:
def new_videos_only(input_directory, output_directory, date):
    """Export the videos that are new and have a ``yes``/``neutral`` status.

    Every ``.csv`` file in ``input_directory`` is loaded and stacked into one
    frame, with the source file name (without extension) recorded in an
    ``export_id`` column. A row is kept when its ``callback_metadata`` value
    occurs exactly once across all stacked files and its ``status`` contains
    ``"yes"`` or ``"neutral"``. The kept rows (columns ``callback_metadata``,
    ``export_id``, ``download_url``) are written to
    ``<output_directory>/videos_to_caption__<date>.csv``.

    Parameters
    ----------
    input_directory : str
        Directory containing the CSV files to read.
    output_directory : str
        Existing directory that the result CSV is written to.
    date : str
        Tag embedded in the output file name. Must not contain a path
        separator, since it becomes part of a file name.

    Raises
    ------
    ValueError
        If ``date`` contains a path separator, or if ``input_directory``
        contains no ``.csv`` files.
    """

    import os
    import pandas as pd

    # the date tag becomes part of a file name, so a path separator in it
    # almost always means the wrong argument was passed
    date_tag = str(date)
    separators = [sep for sep in (os.sep, os.altsep) if sep]
    if any(sep in date_tag for sep in separators):
        raise ValueError(
            "date must be a plain tag without path separators, got: " + repr(date)
        )

    # pull list of all filenames (sorted so the output row order is reproducible)
    csv_files = sorted(
        file_name for file_name in os.listdir(input_directory) if file_name.endswith(".csv")
    )
    if not csv_files:
        raise ValueError("No .csv files found in " + repr(input_directory))

    # load each file and tag its rows with the export it came from
    dataframes = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(input_directory, csv_file))
        df["export_id"] = os.path.splitext(csv_file)[0]
        dataframes.append(df)

    # stack data
    stacked_files = pd.concat(dataframes, axis=0, ignore_index=True)

    # count callback metadata occurrences across all exports
    stacked_files["callback_metadata_count"] = (
        stacked_files.groupby("callback_metadata")["callback_metadata"].transform("count")
    )

    # define subset conditions; na=False treats a missing status as "no match"
    # instead of putting NaN into the boolean mask
    callback_count_1 = stacked_files["callback_metadata_count"] == 1
    status_yes_neutral = (
        stacked_files["status"].str.contains("yes", na=False)
        | stacked_files["status"].str.contains("neutral", na=False)
    )

    # subset on conditions and by desired columns
    new_videos = stacked_files.loc[
        callback_count_1 & status_yes_neutral,
        ["callback_metadata", "export_id", "download_url"],
    ]

    # export (os.path.join works with or without a trailing slash on the directory)
    output_path = os.path.join(output_directory, "videos_to_caption__" + date_tag + ".csv")
    new_videos.to_csv(output_path, index=False)

    # print n new videos
    print("Generating CSV with ", len(new_videos), " new videos for captioning")

## Run Function

In [6]:
# third argument is the date tag for the output file name, not a directory
new_videos_only(p_dir_in, p_dir_out, p_date)

Generating CSV with  2169  new videos for captioning
