<a href="https://colab.research.google.com/github/steinhaug/stable-diffusion/blob/main/BLIP-2/AutoCaptioner-postWorker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download .txt files created by BLIP-2 notebook, create the caption files and parquet file.

## AutoCaptioner using LLaVA - Post Work

After running first part, run this one to put together the captions.

[![Buy me a beer](https://raw.githubusercontent.com/steinhaug/stable-diffusion/main/assets/buy-me-a-beer.png ) ](https://steinhaug.com/donate/)

In [15]:
#@markdown 1.0 load notebook functions
import tarfile
import os
import glob

def compress_tar(directory_path, output_tar_file, inclusion_pattern=None):
    if output_tar_file is None:
        output_tar_file = f"{directory_path}/{return__folderName(directory_path)}.tar"
    if not output_tar_file.endswith('.gz'):
        output_tar_file += '.gz'
    with tarfile.open(output_tar_file, 'w:gz') as tar:
        for root, dirs, files in os.walk(directory_path):
            if inclusion_pattern:
                files_to_include = glob.glob(os.path.join(root, inclusion_pattern))
            else:
                files_to_include = [os.path.join(root, file) for file in files]
            for file_path in files_to_include:
                arcname = os.path.relpath(file_path, directory_path)
                print(f"{arcname}")
                tar.add(file_path, arcname=arcname)

def decompress_tar_(file_path, output_dir='.', tar_params='r:gz'):
    os.makedirs(output_dir, exist_ok=True)
    try:
        with tarfile.open(file_path, tar_params) as tar:
            tar.extractall(output_dir)
        print(f"Successfully decompressed '{file_path}' to '{output_dir}'.")
    except tarfile.TarError as e:
        print(f"Error decompressing '{file_path}': {e}")

def decompress_tar_gz(file_path, output_dir='.'):
    decompress_tar_(file_path, output_dir, 'r:gz')
def decompress_tar(file_path, output_dir='.'):
    decompress_tar_(file_path, output_dir, 'r')

def __decompress_tar(tar_file, destination=None, flatten_structure=False):
    with tarfile.open(tar_file, 'r') as tar:
        if destination is not None:
            os.makedirs(destination, exist_ok=True)

        for member in tar.getmembers():
            if flatten_structure:
                # Use just the filename without directories
                member.name = os.path.basename(member.name)
            if destination is not None:
                # Join the destination directory with the member's name
                member_path = os.path.join(destination, os.path.dirname(member.name))
            else:
                member_path = os.path.dirname(member.name)

            if flatten_structure and destination==None:
                member_path = os.path.dirname(tar_file)

            tar.extract(member, path=member_path)

def ensure_array(input_var):
    if isinstance(input_var, list):
        return input_var
    elif isinstance(input_var, str):
        return [input_var]
    else:
        raise ValueError("Input must be a string or a list")

def array__prefix_with(filter_extensions, prefix='.'):
    return [ext if ext.startswith(prefix) else prefix + ext for ext in filter_extensions]

def return__fileCount(directory_path, extensions=None):
    matching_files_count = 0

    if extensions is not None:
        extensions = array__prefix_with(ensure_array(extensions)) #['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']

    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if extensions is None:
                matching_files_count += 1
            elif any(file.lower().endswith(extension.lower()) for extension in extensions):
                matching_files_count += 1

    return matching_files_count

def write_the_file(path, data_string):
    if len(str(data_string)):
        with open(path, 'w+') as fw:
            fw.write(str(data_string))
    else:
        if os.path.exists(path):
            os.remove(path)
    return '';

def fill_array_length(arr, length=10):
    length = length + 1
    if len(arr) < length:
        arr += [''] * (length - len(arr))
    return arr

def contains_any_substring(input_string, substrings, force_lowercase=True):
    for substring in substrings:
        if substring in input_string.lower():
            return True
    return False

def extract_first_string_in_quotes(input_string):
    start_quote = input_string.find('"')
    end_quote = input_string.find('"', start_quote + 1)

    if start_quote != -1 and end_quote != -1:
        contents_in_quotes = input_string[start_quote + 1:end_quote]
        return contents_in_quotes
    else:
        return None

def string__return_left_from_match(input_string, array_of_strings, replacement=''):
    for match in array_of_strings:
        if match in input_string:
            left_part = input_string.split(match)[0]
            return left_part + replacement
    return input_string

def remove_prefix(input_string, prefixes):
    for prefix in prefixes:
        if input_string.startswith(prefix):
            input_string = input_string[len(prefix):]

    return input_string

def remove_matches(main_string, substrings_to_remove):
    for substring in substrings_to_remove:
        main_string = main_string.replace(substring, '')
    return main_string

# Strip ordered list, number - prefix
def strip_ol(input_string):
    index_of_space = input_string.find('-')
    if index_of_space != -1:
        result_string = input_string[index_of_space + 1:]
        return result_string.strip()
    else:
        return input_string

def true_or_false(string, matches=["yes"]):
    if contains_any_substring(string, matches):
        return True
    else:
        return False


In [2]:
#@markdown 1.1 Caption generator from questionaire
def generate_caption(text):

    caption = ''

    text = text.strip()
    lines = fill_array_length( text.split('\n') )
    lineout = fill_array_length([])

    lineout[0] = extract_first_string_in_quotes(lines[0])
    caption += lineout[0] + '. '

    lineout[1] = remove_prefix(lines[1][4:], ["The subject is "]).capitalize()
    caption += lineout[1] + ' '

    result = remove_prefix(lines[2][4:], ["The subject is "])
    if contains_any_substring(result, ["completely naked"]):
        lineout[2] = 'NSWF, totally naked.'.capitalize()
    else:
        lineout[2] = remove_matches(result, [" as she is",", which are visible in the image"]).capitalize()
    caption += lineout[2] + ' '

    if contains_any_substring(strip_ol(lines[3]), ["not possible"]):
        result = ''
    elif contains_any_substring(strip_ol(lines[3]), ["shaved", "shaven"]):
        result = 'Shaved pussy.'
    elif contains_any_substring(strip_ol(lines[3]), ["trimmed"]):
        result = 'Trimmed pussy.'
    elif contains_any_substring(strip_ol(lines[3]), ["hairy"]):
        result = 'Hairy pussy.'
    else:
        result = ''
    lineout[3] = result
    caption += lineout[3] + ' '

    if contains_any_substring(strip_ol(lines[4]), ["not possible"]):
        result = ''
    elif contains_any_substring(strip_ol(lines[4]), ["tiny"]):
        result = 'Tiny tits, tiny breasts.'
    elif contains_any_substring(strip_ol(lines[4]), ["small"]):
        result = 'Small tits, small breasts.'
    elif contains_any_substring(strip_ol(lines[4]), ["normal"]):
        result = 'Normal tits, normal breasts.'
    elif contains_any_substring(strip_ol(lines[4]), ["large"]):
        result = 'Large tits, large breasts.'
    elif contains_any_substring(strip_ol(lines[4]), ["huge","big"]):
        result = 'Big tits, big breasts.'
    else:
        result = ''
    lineout[4] = result
    caption += lineout[4] + ' '

    if contains_any_substring(strip_ol(lines[5]), ["slender and slim"]):
        result = 'Slim figure.'
    elif contains_any_substring(strip_ol(lines[5]), ["appears to be normal","normal body"]):
        result = 'Normal figure.'
    elif contains_any_substring(strip_ol(lines[5]), ["thin"]):
        result = 'Thin figure.'
    elif contains_any_substring(strip_ol(lines[5]), ["skinny"]):
        result = 'Skinny figure.'
    elif contains_any_substring(strip_ol(lines[5]), ["large", "overweight", "fat", "fuller figure"]):
        result = 'Large figure.'
    else:
        result = ''
    lineout[5] = result
    caption += lineout[5] + ' '

    if not contains_any_substring(strip_ol(lines[10]), ['not visible']):
        result = remove_matches(strip_ol(lines[10]), ["The woman's ","The woman is ","The woman ", "likely engaging in "])
        result = string__return_left_from_match(result, [', possibly',', exposing it to',', adding to the'], '.').capitalize()
    else:
        result = ''
    lineout[10] = result
    caption += lineout[10]

    return caption


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
captions_file = '/content/drive/MyDrive/datasets/SkinnyHotGirls/skinny_hot_girls_captions2.tar.gz'

In [5]:
output_dir = '/content/datasets'
os.makedirs(output_dir, exist_ok=True)
decompress_tar_gz(captions_file, output_dir)

Successfully decompressed '/content/drive/MyDrive/datasets/SkinnyHotGirls/skinny_hot_girls_captions2.tar.gz' to '/content/datasets'.


In [12]:
!rm -Rf /content/datasets/slim-camel-toe-girl-jemma-8.txt
!rm -Rf /content/datasets/slim-camel-toe-girl-jemma-8.png

In [None]:
#@markdown Create .caption from .txt files
extensions = ['.txt']
total_count = return__fileCount(output_dir, extensions)
caption_count = 0
for root, dirs, files in os.walk(output_dir):
    for file in files:
        #print( f"{root}/{file}" )
        file_path = os.path.join(root, file)
        file_root, file_ext = os.path.splitext(file_path)
        if file_ext.lower() in extensions:
            caption_count = caption_count + 1
            #print( f"{caption_count}/{total_count} {root}/{file}" )

            caption_file = f"{file_root}.caption"
            if not os.path.isfile(caption_file):

                with open(file_path, 'r') as file:
                    file_content = file.read()
                print( file_path )
                caption_result = generate_caption(file_content)

                write_the_file(caption_file, caption_result)

                print( caption_result )
                #print( file_content )
#            else:
#                print ( 'skip' )
            #break


In [8]:
#@markdown Login to Huggingface hub
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

!huggingface-cli login --token {HF_TOKEN}
from huggingface_hub import snapshot_download

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
#@markdown Download image sets
import os
SAVE_PATH = '/content/datasets'
REPO_ID = 'steinhaug/onceUponAtimeInPornVille'
os.makedirs(f"{SAVE_PATH}/{REPO_ID}", exist_ok=True)
path = snapshot_download(repo_id=REPO_ID, repo_type="dataset", revision="main", allow_patterns="SkinnyHotGirls/*", local_dir=f"{SAVE_PATH}/{REPO_ID}", local_dir_use_symlinks=False)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

SkinnyHotGirls/README.md:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

skinny_hot_girls.tar:   0%|          | 0.00/723M [00:00<?, ?B/s]

In [10]:
image_file = '/content/datasets/steinhaug/onceUponAtimeInPornVille/SkinnyHotGirls/skinny_hot_girls.tar'
decompress_tar(image_file)

Successfully decompressed '/content/datasets/steinhaug/onceUponAtimeInPornVille/SkinnyHotGirls/skinny_hot_girls.tar' to '.'.


In [11]:
image_folder = '/content/skinny_hot_girls'
!cp $output_dir/*.caption $image_folder
# remove missing file
!rm -Rf /content/skinny_hot_girls/slim-camel-toe-girl-jemma-8.txt
!rm -Rf /content/skinny_hot_girls/slim-camel-toe-girl-jemma-8.caption
!rm -Rf /content/skinny_hot_girls/slim-camel-toe-girl-jemma-8.png

In [13]:
#@markdown Create .parquet file: /content/skinny_hot_girls.parquet
# Needs High-RAM to complete

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image
import numpy as np
import os

dataset_folder = "/content/skinny_hot_girls"

# Function to read caption from text file
def read_caption(file_path):
    with open(file_path, 'r') as file:
        caption = file.read().strip()
    return caption

# Function to process each image and caption
def process_image(image_path):
    image = np.array(Image.open(image_path))  # Convert PIL Image to NumPy array
    flattened_image = image.flatten()  # Flatten the 2D array to 1D
    caption_path = os.path.splitext(image_path)[0] + '.caption'
    caption = read_caption(caption_path)
    return {'image': flattened_image, 'caption': caption}

# List to store processed data
data_list = []

# Iterate through the dataset folder
for filename in os.listdir(dataset_folder):
    if filename.endswith(".png"):
        image_path = os.path.join(dataset_folder, filename)
        data = process_image(image_path)
        data_list.append(data)

# Create a Pandas DataFrame from the list
df = pd.DataFrame(data_list)

# Convert the DataFrame to a PyArrow Table
table = pa.Table.from_pandas(df)

# Save the PyArrow Table to a Parquet file
parquet_file_path = "/content/skinny_hot_girls.parquet"
pq.write_table(table, parquet_file_path)

In [None]:
# Create .tar.gz file for captions
compress_tar('/content/skinny_hot_girls', '/content/skinny_hot_girls.captions.tar', '*.caption')

In [17]:
#@markdown Upload .parquet and .tar.gz file to Huggingface dataset
from huggingface_hub import HfApi, HfFolder, CommitOperationAdd
from huggingface_hub import create_repo

hf_token = userdata.get('HF_TOKEN')

api = HfApi()
your_username = api.whoami(token=hf_token)["name"]
repo_id = f"{your_username}/onceUponAtimeInPornVille"

operations = [
    CommitOperationAdd(path_in_repo="SkinnyHotGirls/skinny_hot_girls.parquet",path_or_fileobj="/content/skinny_hot_girls.parquet"),
    CommitOperationAdd(path_in_repo="SkinnyHotGirls/skinny_hot_girls.captions.tar.gz",path_or_fileobj="/content/skinny_hot_girls.captions.tar.gz"),
]

api.create_commit(
    repo_id=repo_id,
    repo_type="dataset",
    operations=operations,
    commit_message=f"Adding skinny_hot_girls.parquet",
    token=hf_token
)


skinny_hot_girls.captions.tar.gz:   0%|          | 0.00/45.9k [00:00<?, ?B/s]

ok


Complete.

### String work for "Caption generator from questionaire"

In [None]:
text = '''1 - If this was a photo for an adult movie, the title would be "Sultry Seduction."
2 - The subject is a beautiful, blonde woman with a captivating gaze.
3 - The woman is wearing a black bra and panties, which are visible in the image.
4 - The woman's pussy is shaved, giving her a clean and polished appearance.
5 - The woman has large breasts, which are prominently displayed in the photo.
6 - The woman has a normal body build, with a healthy and attractive physique.
7 - Yes
8 - Yes
9 - No
10 - Lying on her back
11 - The woman's hands are resting on her thighs, adding to the sensual and alluring atmosphere of the image.'''

In [None]:
text = text.strip()
lines = fill_array_length( text.split('\n') )
lineout = fill_array_length([])

In [None]:
lineout[0] = extract_first_string_in_quotes(lines[0])
print(lineout[0])

Sultry Seduction.


In [None]:
lineout[1] = remove_prefix(lines[1][4:], ["The subject is "]).capitalize()
print( lineout[1] )

A beautiful, blonde woman with a captivating gaze.


In [None]:
result = remove_prefix(lines[2][4:], ["The subject is "])
if contains_any_substring(result, ["completely naked"]):
    lineout[2] = 'NSWF, totally naked.'.capitalize()
else:
    lineout[2] = remove_matches(result, [" as she is",", which are visible in the image"]).capitalize()

print( lineout[2] )

The woman is wearing a black bra and panties.


In [None]:
if contains_any_substring(strip_ol(lines[3]), ["not possible"]):
    result = '-'
elif contains_any_substring(strip_ol(lines[3]), ["shaved", "shaven"]):
    result = 'Shaved pussy.'
elif contains_any_substring(strip_ol(lines[3]), ["trimmed"]):
    result = 'Trimmed pussy.'
elif contains_any_substring(strip_ol(lines[3]), ["hairy"]):
    result = 'Hairy pussy.'
else:
    result = '-'

lineout[3] = result
print( lineout[3] )

Shaved pussy.


In [None]:
if contains_any_substring(strip_ol(lines[4]), ["not possible"]):
    result = '-'
elif contains_any_substring(strip_ol(lines[4]), ["tiny"]):
    result = 'Tiny tits, tiny breasts.'
elif contains_any_substring(strip_ol(lines[4]), ["small"]):
    result = 'Small tits, small breasts.'
elif contains_any_substring(strip_ol(lines[4]), ["normal"]):
    result = 'Normal tits, normal breasts.'
elif contains_any_substring(strip_ol(lines[4]), ["large"]):
    result = 'Large tits, large breasts.'
elif contains_any_substring(strip_ol(lines[4]), ["huge","big"]):
    result = 'Big tits, big breasts.'
else:
    result = '-'

lineout[4] = result
print( lineout[4] )

Large tits, large breasts.


In [None]:
if contains_any_substring(strip_ol(lines[5]), ["slender and slim"]):
    result = 'Slim figure.'
elif contains_any_substring(strip_ol(lines[5]), ["appears to be normal","normal body"]):
    result = 'Normal figure.'
elif contains_any_substring(strip_ol(lines[5]), ["thin"]):
    result = 'Thin figure.'
elif contains_any_substring(strip_ol(lines[5]), ["skinny"]):
    result = 'Skinny figure.'
elif contains_any_substring(strip_ol(lines[5]), ["large", "overweight", "fat", "fuller figure"]):
    result = 'Large figure.'
else:
    result = '-'

lineout[5] = result
print( lineout[5] )

Normal figure.


In [None]:
x1 = true_or_false(strip_ol(lines[6]))
x2 = true_or_false(strip_ol(lines[7]), ["hands are visible","yes"])
x3 = true_or_false(strip_ol(lines[8]))

print(f"{x1} {x2} {x3}")

True True False


In [None]:
result = remove_matches(strip_ol(lines[9]), ["The woman's ","The woman is ","The woman "]).capitalize()
lineout[9] = result
print( lineout[9] )

Lying on her back


In [None]:
#print( lines[10] )
if not contains_any_substring(strip_ol(lines[10]), ['not visible']):
    result = remove_matches(strip_ol(lines[10]), ["The woman's ","The woman is ","The woman ", "likely engaging in "])
    result = string__return_left_from_match(result, [', possibly',', exposing it to',', adding to the'], '.').capitalize()
else:
    result = '-'


lineout[10] = result
print( lineout[10] )

Hands are resting on her thighs, adding to the sensual and alluring atmosphere of the image.
