In [2]:
###############################################
# Extract the circumstance from the image
################################################

import requests
import ollama
# Ollama 서버 설정
ollama_host = "http://sg035:11434"
client = ollama.Client(host=ollama_host)  # 클라이언트 인스턴스 생성
prompt = "Don't guess it is drama or tv show. Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."
# "Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."

try:
    response = requests.get(ollama_host)
    print("Server connected")
    print(response.text)
    
except requests.ConnectionError:
    print("Not connected")

Server connected
Ollama is running


In [3]:
###############################################
# Test llama vision working
# Must choose a100:2, for llama3.2-vision:90b
# a100:1 works for llama3.2-vision
################################################

import base64

with open("./image.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
response = client.chat(
    model='llama3.2-vision', # need a100:1 gpu
    # model='llama3.2-vision:90b', # need a100:2 gpu
    # model='llama3.2-vision', # need a100:1 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

The image depicts a woman sitting at a desk, engaged in conversation with a man across from her.

In this scene, the woman is seated behind a dark-colored desk, with her hands clasped together on its surface. She wears a dark grey blazer over a black shirt and has her hair pulled back into a neat bun. The man opposite her faces away from the camera, his head turned towards the woman as he speaks to her.

The background of the image features white walls, beige curtains, and large windows that allow natural light to pour in through sheer white blinds. A dark wood desk sits on the left side of the room, accompanied by a chair with curved legs.


In [3]:
###############################################
# Test llama vision working
# Must choose a100:2, for llama3.2-vision:90b
# a100:1 works for llama3.2-vision
################################################

import base64

with open("./image.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
response = client.chat(
    # model='llama3.2-vision', # need a100:1 gpu
    model='llama3.2-vision:90b', # need a100:2 gpu
    # model='llama3.2-vision', # need a100:1 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

The image depicts a woman sitting at a desk with her hands clasped together in front of her. She has dark hair pulled back into a bun and is wearing a black turtleneck sweater paired with a gray blazer.

In the foreground, there are several papers on the desk, including what appears to be a document or report, as well as a pen holder containing pens and pencils. The background of the image shows a window with white curtains drawn closed, suggesting that it may be daytime outside but not bright enough for natural light to illuminate the room.

Overall, the atmosphere suggests a professional setting, possibly an office or meeting room, where important discussions or decisions are being made.


In [6]:
import os
import json
import base64
import ollama
import time
import traceback
from pathlib import Path
from datetime import datetime

# 경로 설정
drama_folder_name = "Juilliard•NYC VLOG 석사 2학년 시이작!"
image_dir = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_Images/{drama_folder_name}')
output_file = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_llama_vision/{drama_folder_name}.json')

#######################
# model choose
#######################
used_model = "llama3.2-vision"

# 출력 디렉토리 생성
output_file.parent.mkdir(parents=True, exist_ok=True)

# 로깅 설정
def log(message, level="INFO"):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] [{level}] {message}")

# 초기화
results = []
VALID_EXTENSIONS = ('.png', '.jpg', '.jpeg')
total_images = len([f for f in image_dir.iterdir() if f.suffix.lower() in VALID_EXTENSIONS])
processed = 0

log(f"Starting image processing for {total_images} images")

# 이미지 처리
for image_path in image_dir.iterdir():
    if not (image_path.is_file() and image_path.suffix.lower() in VALID_EXTENSIONS):
        continue

    processed += 1
    log(f"Processing image ({processed}/{total_images}): {image_path.name}")
    start_time = time.time()
    
    try:
        # 이미지 인코딩
        encode_start = time.time()
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")
        encode_time = time.time() - encode_start
        log(f"Image encoded in {encode_time:.2f}s")

        # API 요청
        api_start = time.time()
        used_model = "llama3.2-vision"
        response = client.chat(
        ###################### Choose Model ###################
            model="llama3.2-vision",
            # "llama3.2-vision:90b"
            # llama3.2-vision"
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [base64_image]
            }]
        )
        api_time = time.time() - api_start
        log(f"API response received in {api_time:.2f}s")

        # 결과 저장
        results.append({
            'image': str(image_path),
            'response': response['message']['content'],
            'processing_time': {
                'encoding': encode_time,
                'api_call': api_time,
                'total': time.time() - start_time
            },
            'status': 'success'
        })

    except Exception as e:
        error_msg = f"Error processing {image_path.name}: {str(e)}"
        error_trace = traceback.format_exc()
        log(error_msg, "ERROR")
        log(f"Error details:\n{error_trace}", "DEBUG")
        
        results.append({
            'image': str(image_path),
            'error': error_msg,
            'error_trace': error_trace,
            'status': 'failed'
        })

# 결과 저장
save_start = time.time()
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
save_time = time.time() - save_start

log(f"Results saved to {output_file} in {save_time:.2f}s")
log(f"Processing completed. Success: {len([x for x in results if x['status']=='success'])}, Failed: {len([x for x in results if x['status']=='failed'])}")

[2025-03-07 23:20:56] [INFO] Starting image processing for 734 images
[2025-03-07 23:20:56] [INFO] Processing image (1/734): frame_0506.png
[2025-03-07 23:20:56] [INFO] Image encoded in 0.09s
[2025-03-07 23:21:00] [INFO] API response received in 3.34s
[2025-03-07 23:21:00] [INFO] Processing image (2/734): frame_0396.png
[2025-03-07 23:21:00] [INFO] Image encoded in 0.07s
[2025-03-07 23:21:03] [INFO] API response received in 3.45s
[2025-03-07 23:21:03] [INFO] Processing image (3/734): frame_0565.png
[2025-03-07 23:21:03] [INFO] Image encoded in 0.04s
[2025-03-07 23:21:07] [INFO] API response received in 3.86s
[2025-03-07 23:21:07] [INFO] Processing image (4/734): frame_0495.png
[2025-03-07 23:21:07] [INFO] Image encoded in 0.09s
[2025-03-07 23:21:10] [INFO] API response received in 3.01s
[2025-03-07 23:21:10] [INFO] Processing image (5/734): frame_0264.png
[2025-03-07 23:21:10] [INFO] Image encoded in 0.02s
[2025-03-07 23:21:14] [INFO] API response received in 4.03s
[2025-03-07 23:21:14

In [9]:
import json
from pathlib import Path

# 파일 경로 설정
input_file = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_llama_vision/{drama_folder_name}.json')
output_file = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_llama_vision/{drama_folder_name}_organized.json')

# JSON 파일 읽기
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 아이템 처리 함수
def process_item(item):
    try:
        filename = Path(item['image']).name
        
        # 파일명 구조: frame_0001.png → ['frame', '0001.png']
        parts = filename.split('_')
        
        # 숫자 부분 추출 (frame_0001.png → 0001)
        frame_number = int(parts[1].split('.')[0])  # frame_0001.png → 0001
        
        return {
            'image': filename,
            'frame_number': frame_number,
            'response': item['response'],
            'status': item['status']
        }
    except Exception as e:
        print(f"파일명 형식 오류: {filename} → {str(e)}")
        return None

# 데이터 처리 (오류 항목 필터링)
processed_data = [item for item in (process_item(i) for i in data) if item is not None]

# 숫자 순으로 정렬 (frame_number 기준)
sorted_data = sorted(processed_data, key=lambda x: x['frame_number'])

# 최종 출력 형식
final_data = [
    {
        'used_model': 'llama3.2-vision',  # 모델 이름을 하드코딩 (필요 시 수정)
        'image': sorted_data[0]['image'],
        'response': sorted_data[0]['response'],
    }
] + [
    {
        'image': item['image'],
        'response': item['response'],
    }
    for item in sorted_data[1:]
]

# JSON 파일로 저장
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)

print(f"정렬 완료! 결과 파일: {output_file}")


파일명 형식 오류: frame_0001.png → 'status'
파일명 형식 오류: frame_0002.png → 'status'
파일명 형식 오류: frame_0003.png → 'status'
파일명 형식 오류: frame_0004.png → 'status'
파일명 형식 오류: frame_0005.png → 'status'
파일명 형식 오류: frame_0006.png → 'status'
파일명 형식 오류: frame_0007.png → 'status'
파일명 형식 오류: frame_0008.png → 'status'
파일명 형식 오류: frame_0009.png → 'status'
파일명 형식 오류: frame_0010.png → 'status'
파일명 형식 오류: frame_0011.png → 'status'
파일명 형식 오류: frame_0012.png → 'status'
파일명 형식 오류: frame_0013.png → 'status'
파일명 형식 오류: frame_0014.png → 'status'
파일명 형식 오류: frame_0015.png → 'status'
파일명 형식 오류: frame_0016.png → 'status'
파일명 형식 오류: frame_0017.png → 'status'
파일명 형식 오류: frame_0018.png → 'status'
파일명 형식 오류: frame_0019.png → 'status'
파일명 형식 오류: frame_0020.png → 'status'
파일명 형식 오류: frame_0021.png → 'status'
파일명 형식 오류: frame_0022.png → 'status'
파일명 형식 오류: frame_0023.png → 'status'
파일명 형식 오류: frame_0024.png → 'status'
파일명 형식 오류: frame_0025.png → 'status'
파일명 형식 오류: frame_0026.png → 'status'
파일명 형식 오류: frame_0027.png → 'status'
파

IndexError: list index out of range

In [19]:
import json
from pathlib import Path

# 원본 파일 경로와 출력 파일 경로 설정
input_file = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_llama_vision/{drama_folder_name}.json')
output_file = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_llama_vision/{drama_folder_name}_organized.json')

# JSON 파일 읽기
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# 데이터 처리 함수
def process_item(item):
    try:
        filename = Path(item['image']).name  # 이미지 파일 이름만 추출 (경로 제거)

        # 파일명 구조: frame_0506.png → ['frame', '0506.png']
        parts = filename.split('_')
        
        # 숫자 부분 추출 (frame_0506.png → 0506)
        frame_number = int(parts[1].split('.')[0])  # frame_0506.png → 0506
        formatted_frame_number = str(frame_number).zfill(4)
        return {
            'image': filename,  # 'frame_0506.png'
            'frame_number': formatted_frame_number,
            'response': item['response'],
        }
    except Exception as e:
        print(f"파일명 형식 오류: {item['image']} → {str(e)}")
        return None

# 데이터 처리 (오류 항목 필터링)
processed_data = [item for item in (process_item(i) for i in data) if item is not None]

# 처리된 데이터가 없는 경우 처리
if not processed_data:
    print("처리할 데이터가 없습니다.")
else:
    # 숫자 순으로 정렬 (frame_number 기준)
    sorted_data = sorted(processed_data, key=lambda x: x['frame_number'])

    # 최종 출력 형식
    final_data = [
        {
            'image': sorted_data[0]['image'],
            'response': sorted_data[0]['response'],
        }
    ] + [
        {
            'image': item['image'],
            'response': item['response'],
        }
        for item in sorted_data[1:]
    ]

    # 정렬된 데이터를 새로운 JSON 파일로 저장
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)

    print(f"정렬된 데이터가 {output_file}로 저장되었습니다.")


0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200


In [30]:
################################################
# merge_subtitle_w_vision_result.py
# Merging llama-vision result + subtitle + timestamp
################################################
import json
from pathlib import Path

output_file = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_llama_vision/{drama_folder_name}_organized.json')


# llama_vision_data.json 파일 로드
with open(output_file, "r", encoding="utf-8") as f:
    llama_vision_data = json.load(f)

subtitle = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_Subtitles/{drama_folder_name}_organized.ko.json')
# subtitle.json 파일 로드
with open(subtitle, "r", encoding="utf-8") as f:
    subtitle_data = json.load(f)

# final_output.json으로 저장할 데이터 리스트 초기화
dataset = []

# 두 파일의 데이터를 매칭하여 dataset 생성
for result_item, subtitle_item in zip(llama_vision_data, subtitle_data):
    input_text = subtitle_item.get("context", "")  # subtitle_data.json의 "context"를 input으로
    timestamp = subtitle_item.get("timestamp", "")  # subtitle_data.json의 "timestamp"를 timestamp
    output_text = result_item.get("response", "")  # llama_vision_data.json의 "response"를 output으로
    
    # input, output, timestamp가 모두 비어있지 않은 경우만 추가
    if input_text and output_text:
        dataset.append({"timestamp": timestamp, "input": input_text, "output": output_text})


final_output = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_Final/{drama_folder_name}_final.json')
# dataset.json 파일로 저장
with open(final_output, "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print(f"{drama_folder_name}_final.json 파일이 생성되었습니다.")

Juilliard•NYC VLOG 석사 2학년 시이작!_final.json 파일이 생성되었습니다.


In [1]:
###############################################
# Reverse input <-> output
# Professor suggestion
################################################

import json
from pathlib import Path
drama_folder_name="Juilliard•NYC VLOG 석사 2학년 시이작!"
# final_output 파일 경로 설정
final_output = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_Final/{drama_folder_name}_final.json')

# final_output에서 데이터 읽기
with open(final_output, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# input과 output 값을 교환하는 작업
reversed_dataset = []
for item in dataset:
    # input과 output을 서로 바꾸기
    reversed_item = {
        'timestamp': item['timestamp'],
        'input': item['output'],  # output을 input으로
        'output': item['input'],  # input을 output으로
    }
    reversed_dataset.append(reversed_item)

# 기존 dataset과 reversed_dataset을 합치기
combined_dataset = dataset + reversed_dataset

# reversed_final_output 파일 경로 설정
reversed_final_output = Path(f'/scratch/jsong132/Can_LLM_Learn_New_Language/Data_Reversed_Final/{drama_folder_name}_reversed_final.json')

# combined dataset을 새로운 JSON 파일로 저장
with open(reversed_final_output, "w", encoding="utf-8") as f:
    json.dump(combined_dataset, f, ensure_ascii=False, indent=4)

print(f"{drama_folder_name}_reversed_final.json 파일이 생성되었습니다.")


NameError: name 'drama_folder_name' is not defined

In [10]:
###############################################
# json_pretty.py
# Make json file readible.
################################################

import json
import textwrap

# JSON 파일 로드
with open(final_output, 'r', encoding='utf-8') as f:
    data = json.load(f)

# response 값을 50자씩 나누어 배열로 분할
for item in data:
    if 'response' in item:
        # 50자씩 나누기
        wrapped_text = textwrap.wrap(item['response'], width=100)
        item['response'] = wrapped_text

# 수정된 JSON 파일 저장
with open(output_pretty, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("JSON 파일이 수정되어 저장되었습니다.")


JSON 파일이 수정되어 저장되었습니다.
