In [5]:
import requests
import ollama
# Ollama 서버 설정
ollama_host = "http://sg014:11434"
client = ollama.Client(host=ollama_host)  # 클라이언트 인스턴스 생성
prompt = "Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."
# "Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."

try:
    response = requests.get(ollama_host)
    print("Server connected")
    print(response.text)
    
except requests.ConnectionError:
    print("Not connected")

Server connected
Ollama is running


In [6]:
import base64

with open("./image.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
response = client.chat(
    model='llama3.2-vision:90b', # need a100:1 gpu
    # model='llama3.2-vision:90b', # need a100:2 gpu
    # model='llama3.2-vision', # need a100:1 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

The image presents a split-screen composition, with two distinct scenes unfolding simultaneously.

**Top Image:**

* A woman sits at a desk, her hands clasped together in front of her.
* She wears a black turtleneck sweater and has dark hair pulled back into a ponytail.
* The background features a window with white curtains, accompanied by a table and chair to the left.
* The overall atmosphere suggests a professional or business setting.

**Bottom Image:**

* A man stands in front of a wall adorned with framed pictures and a clock.
* He wears a dark suit jacket over a light-colored shirt and tie.
* His hands are clasped behind his back, conveying confidence or authority.
* The background features a doorway on the right side of the image.

**Comparison:**

* Both images depict individuals in professional attire, suggesting a work-related context.
* The top image focuses on the woman's upper body, while the bottom image shows the man from head to toe.
* The backgrounds differ significan

In [9]:
import os
import json
import base64
import ollama
import time
import traceback
from pathlib import Path
from datetime import datetime

# 경로 설정
image_dir = Path('/scratch/jsong132/Can_LLM_Learn_New_Language/cropped_img')
output_file = Path('/scratch/jsong132/Can_LLM_Learn_New_Language/result.json')

# 출력 디렉토리 생성
output_file.parent.mkdir(parents=True, exist_ok=True)

# 로깅 설정
def log(message, level="INFO"):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] [{level}] {message}")

# 초기화
results = []
VALID_EXTENSIONS = ('.png', '.jpg', '.jpeg')
total_images = len([f for f in image_dir.iterdir() if f.suffix.lower() in VALID_EXTENSIONS])
processed = 0

log(f"Starting image processing for {total_images} images")

# 이미지 처리
for image_path in image_dir.iterdir():
    if not (image_path.is_file() and image_path.suffix.lower() in VALID_EXTENSIONS):
        continue

    processed += 1
    log(f"Processing image ({processed}/{total_images}): {image_path.name}")
    start_time = time.time()
    
    try:
        # 이미지 인코딩
        encode_start = time.time()
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")
        encode_time = time.time() - encode_start
        log(f"Image encoded in {encode_time:.2f}s")

        # API 요청
        api_start = time.time()
        response = client.chat(
        ###################### Choose Model ###################
            model="llama3.2-vision:90b",
            # "llama3.2-vision:90b"
            # llama3.2-vision"
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [base64_image]
            }]
        )
        api_time = time.time() - api_start
        log(f"API response received in {api_time:.2f}s")

        # 결과 저장
        results.append({
            'image': str(image_path),
            'response': response['message']['content'],
            'processing_time': {
                'encoding': encode_time,
                'api_call': api_time,
                'total': time.time() - start_time
            },
            'status': 'success'
        })

    except Exception as e:
        error_msg = f"Error processing {image_path.name}: {str(e)}"
        error_trace = traceback.format_exc()
        log(error_msg, "ERROR")
        log(f"Error details:\n{error_trace}", "DEBUG")
        
        results.append({
            'image': str(image_path),
            'error': error_msg,
            'error_trace': error_trace,
            'status': 'failed'
        })

# 결과 저장
save_start = time.time()
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
save_time = time.time() - save_start

log(f"Results saved to {output_file} in {save_time:.2f}s")
log(f"Processing completed. Success: {len([x for x in results if x['status']=='success'])}, Failed: {len([x for x in results if x['status']=='failed'])}")

[2025-03-03 22:34:57] [INFO] Starting image processing for 824 images
[2025-03-03 22:34:57] [INFO] Processing image (1/824): cropped_00_37_23.png
[2025-03-03 22:34:57] [INFO] Image encoded in 0.00s
[2025-03-03 22:35:14] [INFO] API response received in 16.68s
[2025-03-03 22:35:14] [INFO] Processing image (2/824): cropped_00_34_42.png
[2025-03-03 22:35:14] [INFO] Image encoded in 0.00s
[2025-03-03 22:35:17] [INFO] API response received in 3.50s
[2025-03-03 22:35:17] [INFO] Processing image (3/824): cropped_00_14_58.png
[2025-03-03 22:35:17] [INFO] Image encoded in 0.00s
[2025-03-03 22:35:20] [INFO] API response received in 2.56s
[2025-03-03 22:35:20] [INFO] Processing image (4/824): cropped_00_03_16.png
[2025-03-03 22:35:20] [INFO] Image encoded in 0.00s
[2025-03-03 22:35:23] [INFO] API response received in 2.77s
[2025-03-03 22:35:23] [INFO] Processing image (5/824): cropped_00_50_59.png
[2025-03-03 22:35:23] [INFO] Image encoded in 0.00s
[2025-03-03 22:35:26] [INFO] API response receive

In [10]:
# Sort by the timestamp

import json
from pathlib import Path

input_file = Path('result.json')
output_file = Path('result_origin_v7.json')

with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

def process_item(item):
    try:
        filename = Path(item['image']).name
        
        # 파일명 구조: cropped_00_02_45.png → ['cropped', '00', '02', '45.png']
        parts = filename.split('_')
        
        # 시간 정보 추출 (시, 분, 초)
        hours = int(parts[1])          # 첫 번째 숫자 (00)
        mins = int(parts[2])           # 두 번째 숫자 (02)
        secs = int(parts[3].split('.')[0])  # 세 번째 숫자 (45.png → 45)

        return {
            'image': filename,
            'time_seconds': hours*3600 + mins*60 + secs,
            'response': item['response'],
            'status': item['status']
        }
    except Exception as e:
        print(f"파일명 형식 오류: {filename} → {str(e)}")
        return None

# 데이터 처리 (오류 항목 필터링)
processed_data = [item for item in (process_item(i) for i in data) if item is not None]

# 시간 순 정렬
sorted_data = sorted(processed_data, key=lambda x: x['time_seconds'])

# 최종 출력 형식
final_data = [
    {
        'image': item['image'],
        'response': item['response'],
        'status': item['status']
    }
    for item in sorted_data
]

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)

print(f"정렬 완료! 결과 파일: {output_file}")

정렬 완료! 결과 파일: result_origin.json


In [12]:
# merge_subtitle_w_vision_result.PY
import json

# result_origin.json 파일 로드
with open("result_origin_v7.json", "r", encoding="utf-8") as f:
    result_data = json.load(f)

# subtitle.json 파일 로드
with open("subtitle.json", "r", encoding="utf-8") as f:
    subtitle_data = json.load(f)

# dataset.json으로 저장할 데이터 리스트 초기화
dataset = []

# 두 파일의 데이터를 매칭하여 dataset 생성
for result_item, subtitle_item in zip(result_data, subtitle_data):
    input_text = subtitle_item.get("context", "")  # subtitle.json의 "context"를 input으로
    output_text = result_item.get("response", "")  # result_origin_v4.json의 "response"를 output으로
    
    # input과 output이 모두 비어있지 않은 경우만 추가
    if input_text and output_text:
        dataset.append({"input": input_text, "output": output_text})

# dataset.json 파일로 저장
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("dataset.json 파일이 생성되었습니다.")

dataset.json 파일이 생성되었습니다.


In [14]:
# json_pretty.py
# Make json file readible.

import json
import textwrap

# JSON 파일 로드
with open('result_origin_v7.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# response 값을 50자씩 나누어 배열로 분할
for item in data:
    if 'response' in item:
        # 50자씩 나누기
        wrapped_text = textwrap.wrap(item['response'], width=100)
        item['response'] = wrapped_text

# 수정된 JSON 파일 저장
with open('result_pretty_v6.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("JSON 파일이 수정되어 저장되었습니다.")


JSON 파일이 수정되어 저장되었습니다.
