In [64]:
###############################################
# Extract the circumstance from the image
################################################

import requests
import ollama
# Ollama 서버 설정
ollama_host = "http://sg029:11434"
client = ollama.Client(host=ollama_host)  # 클라이언트 인스턴스 생성

prompt = """Analyze the given image and describe the specific actions and interactions of the people in this circumstance.
Focus on what they are doing, their gestures, expressions, and interactions, and provide general details about the environment or objects.
Guess what kind of conversation might typically occur in this situation. Ignore any text visible in the image.

Example 1:
[Image shows: A group of people are gathered around a table, looking at documents and gesturing.]
Description: This circumstance shows a business meeting in progress. There are four people sitting around a wooden conference table, reviewing printed documents. The woman in the red blazer appears to be presenting, gesturing with her hands as she explains something on the paper. The others are listening attentively, with one man taking notes on his laptop.
**Possible conversation:
"Let's review the project updates"
"We need to discuss the new marketing strategy" 
"What's the status of the client proposal?"
"I think we should prioritize the eastern market expansion"
"Has everyone received the quarterly forecast report?"
"Could you explain the decline in these numbers?"
"What feedback did we get from the focus group?"

Example 2:
[Image shows: A person is sitting on a bench in a park, reading a book and looking relaxed.]
Description: This circumstance shows someone enjoying leisure time outdoors. A young woman with curly hair is sitting on a wooden bench in a public park. She's reading a paperback book while smiling, suggesting she's enjoying the content. The park around her has green trees and a walking path visible in the background.
**Possible conversation:
"This book is really interesting"
"The weather is perfect today"
"I'm glad I took some time to relax"
"I've been meaning to finish this novel for weeks"
"This park is my favorite spot in the city"
"The author's perspective on climate change is fascinating"
"I should come here more often to disconnect from work"

Example 3:
[Image shows: Two friends having coffee at an outdoor café, one showing something on their phone to the other.]
Description: This circumstance shows a casual social meetup between friends. Two young adults are sitting at a small round table outside a café. The person on the left is holding up their smartphone and showing something on the screen to their friend, who is leaning in with interest. There are two coffee cups on the table along with a small plate of pastries. The café has a striped awning overhead and there are other patrons visible in the background.
**Possible conversation:
"Look at these photos from my weekend trip"
"Have you seen this funny video that's going viral?"
"What do you think about this apartment I'm considering?"
"I can't believe what our old classmate posted on social media"
"This coffee shop has the best pastries in town"
"Should I buy these shoes? They're on sale"
"I'm thinking about applying for this job opportunity"

Example 4:
[Image shows: A classroom with a teacher standing at the front and students engaged in a group activity.]
Description: This circumstance shows an educational environment in progress. A middle-aged teacher with glasses is standing at the whiteboard, pointing to diagrams while explaining a concept. The students are seated in small groups of three to four, with notebooks open and colorful project materials spread across their desks. Some students are raising their hands to ask questions while others are collaborating on what appears to be a science project with small models on their tables. The classroom has educational posters on the walls and a digital projector displaying related content.
**Possible conversation:
"Can someone explain how photosynthesis works in your own words?"
"For your group project, focus on demonstrating the carbon cycle"
"Does anyone have questions before we move on to the next section?"
"Remember to include your research sources in your final presentation"
"Let's brainstorm solutions to this environmental challenge"
"Make sure everyone in your group has a chance to contribute"
"I like how your team approached this problem differently"
"We'll need to finish this activity before the bell rings"
"Can you explain your reasoning behind this conclusion?"
"Excellent question! Let's explore that concept further"

[Now, describe the image I've shared and guess possible conversation:]
"""

try:
    response = requests.get(ollama_host)
    print("Server connected")
    print(response.text)
    
except requests.ConnectionError:
    print("Not connected")

Server connected
Ollama is running


In [67]:
###############################################
# Test llama vision working
# Must choose a100:2, for llama3.2-vision:90b
# a100:1 works for llama3.2-vision
################################################

import base64

with open("./Data_Images/고물가시대 혜자로운 코스트코 장보기/frame_0013.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
response = client.chat(
    model='llama3.2-vision', # need a100:1 gpu
    # model='llama3.2-vision:90b', # need a100:2 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

This image depicts a man sitting in front of a white desk with a computer monitor on it. The room is well-lit and appears to be an office or workspace.

**Possible Conversation:**

* "What's that new software you're using?"
* "Have you tried the latest version of Adobe Creative Cloud?"
* "I'm having trouble finding a specific file, can you help me locate it?"
* "Do you think I should invest in a new graphics tablet for my work?"
* "How do you stay organized with all your projects and deadlines?"

The conversation is likely focused on discussing computer-related topics or asking for technical assistance.


In [14]:
###############################################
# Test llama vision working
# Must choose a100:2, for llama3.2-vision:90b
# a100:1 works for llama3.2-vision
################################################
response = client.chat(
    # model='llama3.2-vision', # need a100:1 gpu
    model='llama3.2-vision:90b', # need a100:2 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

The image depicts a living room with a man standing by the window, gazing outside. The room is furnished with a couch, coffee table, and TV stand. A large window dominates one wall, allowing natural light to flood the space.

Possible conversation topics in this situation could include:

* "I love how much sunlight we get in here during the day."
* "Have you seen that new show everyone's talking about? We should watch it tonight."
* "We need to replace those curtains; they're getting old."
* "This room feels so cozy when it rains outside like this."
* "Let's have our guests sit on the couch and we'll take the armchairs."

These topics reflect typical conversations that might occur in a living room setting, such as discussing the ambiance, watching TV together, or making plans for hosting guests.


In [72]:
#######################
# Check all the images in one drama folder
# Read and analyze through llama3.2-vision
#######################

import os
import json
import base64
import ollama
import time
import traceback
from pathlib import Path
from datetime import datetime


def process(base_path, drama_folder_name, version):
    # 경로 설정
    base_path = base_path
    drama_folder_name = drama_folder_name
    version = version
    
    image_dir = Path(f'{base_path}/Data_Images/{drama_folder_name}')
    output_file = Path(f'{base_path}/Refined_Datas/{version}/Data_llama_vision/{drama_folder_name}.json')
    
    #######################
    # model choose
    #######################
    used_model = "llama3.2-vision"
    
    # 출력 디렉토리 생성
    output_file.parent.mkdir(parents=True, exist_ok=True)
    
    # 로깅 설정
    def log(message, level="INFO"):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{level}] {message}")
    
    # 초기화
    results = []
    VALID_EXTENSIONS = ('.png', '.jpg', '.jpeg')
    total_images = len([f for f in image_dir.iterdir() if f.suffix.lower() in VALID_EXTENSIONS])
    processed = 0
    
    log(f"Starting image processing for {total_images} images")
    
    # 이미지 처리
    for image_path in image_dir.iterdir():
        if not (image_path.is_file() and image_path.suffix.lower() in VALID_EXTENSIONS):
            continue
    
        processed += 1
        log(f"Processing image ({processed}/{total_images}): {image_path.name}")
        start_time = time.time()
        
        try:
            # 이미지 인코딩
            encode_start = time.time()
            with open(image_path, "rb") as img_file:
                base64_image = base64.b64encode(img_file.read()).decode("utf-8")
            encode_time = time.time() - encode_start
            log(f"Image encoded in {encode_time:.2f}s")
    
            # API 요청
            api_start = time.time()
            used_model = "llama3.2-vision"
            response = client.chat(
            ###################### Choose Model ###################
                model="llama3.2-vision",
                # "llama3.2-vision:90b"
                # llama3.2-vision"
                messages=[{
                    'role': 'user',
                    'content': prompt,
                    'images': [base64_image]
                }]
            )
            api_time = time.time() - api_start
            log(f"API response received in {api_time:.2f}s")
            
            # print("API Response:")
            # print(response['message']['content'])
    
            # 결과 저장
            results.append({
                'image': str(image_path),
                'response': response['message']['content'],
                'processing_time': {
                    'encoding': encode_time,
                    'api_call': api_time,
                    'total': time.time() - start_time
                },
                'status': 'success'
            })
        
        except Exception as e:
            error_msg = f"Error processing {image_path.name}: {str(e)}"
            error_trace = traceback.format_exc()
            log(error_msg, "ERROR")
            log(f"Error details:\n{error_trace}", "DEBUG")
            
            results.append({
                'image': str(image_path),
                'error': error_msg,
                'error_trace': error_trace,
                'status': 'failed'
            })
            

    
    # 결과 저장
    save_start = time.time()
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    save_time = time.time() - save_start
    
    log("=========================================")
    log(f"Results saved to {output_file} in {save_time:.2f}s")
    log(f"Processing completed. Success: {len([x for x in results if x['status']=='success'])}, Failed: {len([x for x in results if x['status']=='failed'])}")
    log("=========================================")
    
    #######################
    # Organize by frame number
    #######################
    
    # 파일 경로 설정
    input_file = Path(f'{base_path}/Refined_Datas/{version}/Data_llama_vision/{drama_folder_name}.json')
    output_file = Path(f'{base_path}/Refined_Datas/{version}/Data_llama_vision/{drama_folder_name}_organized.json')
    
    # JSON 파일 읽기
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # 아이템 처리 함수
    def process_item(item):
        try:
            filename = Path(item['image']).name
            
            # 파일명 구조: frame_0001.png → ['frame', '0001.png']
            parts = filename.split('_')
            
            # 숫자 부분 추출 (frame_0001.png → 0001)
            frame_number = int(parts[1].split('.')[0])  # frame_0001.png → 0001
            
            return {
                'image': filename,
                'frame_number': frame_number,
                'response': item['response'],
                'status': item['status']
            }
        except Exception as e:
            print(f"파일명 형식 오류: {filename} → {str(e)}")
            return None
    
    # 데이터 처리 (오류 항목 필터링)
    processed_data = [item for item in (process_item(i) for i in data) if item is not None]
    
    # 숫자 순으로 정렬 (frame_number 기준)
    sorted_data = sorted(processed_data, key=lambda x: x['frame_number'])
    
    # 최종 출력 형식
    final_data = [
        {
            'used_model': 'llama3.2-vision',  # 모델 이름을 하드코딩 (필요 시 수정)
            'image': sorted_data[0]['image'],
            'response': sorted_data[0]['response'],
        }
    ] + [
        {
            'image': item['image'],
            'response': item['response'],
        }
        for item in sorted_data[1:]
    ]
    
    # JSON 파일로 저장
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)
        
    # 파일 삭제
    if input_file.exists():  # 파일이 존재하는지 확인
        input_file.unlink()  # 파일 삭제
        print(f"파일 제거: {input_file}")
    else:
        print(f"{input_file} 파일이 존재하지 않습니다.")
        
    print(f"정렬 완료! 결과 파일: {output_file}")
    
    ################################################
    # Merging llama-vision result + subtitle + timestamp
    ################################################
    output_file = Path(f'{base_path}/Refined_Datas/{version}/Data_llama_vision/{drama_folder_name}_organized.json')
    
    # llama_vision_data.json 파일 로드
    with open(output_file, "r", encoding="utf-8") as f:
        llama_vision_data = json.load(f)
    
    subtitle = Path(f'{base_path}/Data_Subtitles/{drama_folder_name}_organized.ko.json')
    # subtitle.json 파일 로드
    with open(subtitle, "r", encoding="utf-8") as f:
        subtitle_data = json.load(f)
    
    # final_output.json으로 저장할 데이터 리스트 초기화
    dataset = []
    
    # 두 파일의 데이터를 매칭하여 dataset 생성
    for result_item, subtitle_item in zip(llama_vision_data, subtitle_data):
        input_text = subtitle_item.get("context", "")  # subtitle_data.json의 "context"를 input으로
        timestamp = subtitle_item.get("timestamp", "")  # subtitle_data.json의 "timestamp"를 timestamp
        output_text = result_item.get("response", "")  # llama_vision_data.json의 "response"를 output으로
        
        # input, output, timestamp가 모두 비어있지 않은 경우만 추가
        if input_text and output_text:
            dataset.append({"timestamp": timestamp, "input": input_text, "output": output_text})
    
    # 디렉토리가 없으면 생성
    final_output = Path(f'{base_path}/Refined_Datas/{version}/Data_Final/')
    final_output.mkdir(parents=True, exist_ok=True)
    print(f"디렉토리가 생성되었습니다: {final_output}")
    
    final_output = Path(f'{base_path}/Refined_Datas/{version}/Data_Final/{drama_folder_name}_final.json')
    # dataset.json 파일로 저장
    with open(final_output, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    
    print(f"/Refined_Datas/{version}/Data_Final/{drama_folder_name}_final.json 파일이 생성되었습니다.")
    
    ###############################################
    # Reverse input <-> output
    # Professor suggestion
    ################################################
    
    # final_output 파일 경로 설정
    final_output = Path(f'{base_path}/Refined_Datas/{version}/Data_Final/{drama_folder_name}_final.json')
    
    # final_output에서 데이터 읽기
    with open(final_output, "r", encoding="utf-8") as f:
        dataset = json.load(f)
    
    # input과 output 값을 교환하는 작업
    reversed_dataset = []
    for item in dataset:
        # input과 output을 서로 바꾸기
        reversed_item = {
            'timestamp': item['timestamp'],
            'input': item['output'],  # output을 input으로
            'output': item['input'],  # input을 output으로
        }
        reversed_dataset.append(reversed_item)
    
    # 기존 dataset과 reversed_dataset을 합치기
    combined_dataset = dataset + reversed_dataset
    
    # 디렉토리가 없으면 생성
    reversed_final_output = Path(f'{base_path}/Refined_Datas/{version}/Data_Final_Reversed/')
    reversed_final_output.mkdir(parents=True, exist_ok=True)
    print(f"디렉토리가 생성되었습니다: {final_output}")
    
    # reversed_final_output 파일 경로 설정
    reversed_final_output = Path(f'{base_path}/Refined_Datas/{version}/Data_Final_Reversed/{drama_folder_name}_reversed_final.json')
    
    # combined dataset을 새로운 JSON 파일로 저장
    with open(reversed_final_output, "w", encoding="utf-8") as f:
        json.dump(combined_dataset, f, ensure_ascii=False, indent=4)
    
    print(f"/Refined_Datas/{version}/Data_Final_Reversed/{drama_folder_name}_reversed_final.json 파일이 생성되었습니다.")


In [None]:
# base_path = "/scratch/jsong132/Can_LLM_Learn_New_Language"
# version = "v2"

# drama_list = ["고물가시대 혜자로운 코스트코 장보기", "그때 그 시절 우리가 사랑했던 원두", "나 가을 타나봐", "나폴리 맛피아 PICK 최애 스패니시 다이닝 맛집"]

# for drama_folder_name in drama_list:
#     process(base_path, drama_folder_name, version)

base_path = "/scratch/jsong132/Can_LLM_Learn_New_Language"
version = "v2"
process(base_path, "100명, 100개의 동그라미, 50만 달러", version)
process(base_path, "Juilliard•NYC VLOG 석사 2학년 시이작!", version)
process(base_path, "고물가시대 혜자로운 코스트코 장보기", version)
process(base_path, "그때 그 시절 우리가 사랑했던 원두", version)
process(base_path, "나 가을 타나봐", version)
process(base_path, "나폴리 맛피아 PICK 최애 스패니시 다이닝 맛집", version)
process(base_path, "난이도 최상인 미슐랭 셰프 사위 생일상", version)
process(base_path, "내가 생각한 여행이 아니야", version)

[2025-03-17 21:07:31] [INFO] Starting image processing for 986 images
[2025-03-17 21:07:31] [INFO] Processing image (1/986): frame_0895.png
[2025-03-17 21:07:31] [INFO] Image encoded in 0.00s
[2025-03-17 21:07:35] [INFO] API response received in 4.35s
[2025-03-17 21:07:35] [INFO] Processing image (2/986): frame_0840.png
[2025-03-17 21:07:35] [INFO] Image encoded in 0.00s
[2025-03-17 21:07:41] [INFO] API response received in 5.60s
[2025-03-17 21:07:41] [INFO] Processing image (3/986): frame_0258.png
[2025-03-17 21:07:41] [INFO] Image encoded in 0.00s
[2025-03-17 21:07:44] [INFO] API response received in 3.03s
[2025-03-17 21:07:44] [INFO] Processing image (4/986): frame_0400.png
[2025-03-17 21:07:44] [INFO] Image encoded in 0.00s
[2025-03-17 21:07:47] [INFO] API response received in 3.47s
[2025-03-17 21:07:47] [INFO] Processing image (5/986): frame_0266.png
[2025-03-17 21:07:47] [INFO] Image encoded in 0.00s
[2025-03-17 21:07:51] [INFO] API response received in 3.65s
[2025-03-17 21:07:51

In [4]:
from pathlib import Path

def check_img(drama_folder_name):
    # 경로 설정
    base_path = "/scratch/jsong132/Can_LLM_Learn_New_Language"
    version = "v2"
    image_dir = Path(f'{base_path}/Data_Images/{drama_folder_name}')
    
    # 이미지 확장자 설정
    VALID_EXTENSIONS = ('.png', '.jpg', '.jpeg')
    
    # 이미지 파일 개수 확인
    image_files = [f for f in image_dir.iterdir() if f.is_file() and f.suffix.lower() in VALID_EXTENSIONS]
    
    # 이미지 파일 개수 출력
    print(drama_folder_name)
    print(f"이미지 파일 개수: {len(image_files)}")
    print("예상 시간: ", len(image_files)*4
          /60/60, "시간")

check_img("내향인 짝사랑 몰아보기")
check_img("리얼타임러브몰아보기")
check_img("물가 급등한 발리, 집 구하고 살아본 솔직 후기")
check_img("미국 음식 먹는 브이로그")
check_img("미래에 간다면 이런 느낌일까")
check_img("베스트셀러가 되면 돈을 얼마나 벌까")


내향인 짝사랑 몰아보기
이미지 파일 개수: 1296
예상 시간:  1.4400000000000002 시간
리얼타임러브몰아보기
이미지 파일 개수: 1537
예상 시간:  1.7077777777777778 시간
물가 급등한 발리, 집 구하고 살아본 솔직 후기
이미지 파일 개수: 306
예상 시간:  0.33999999999999997 시간
미국 음식 먹는 브이로그
이미지 파일 개수: 335
예상 시간:  0.3722222222222222 시간
미래에 간다면 이런 느낌일까
이미지 파일 개수: 471
예상 시간:  0.5233333333333333 시간
베스트셀러가 되면 돈을 얼마나 벌까
이미지 파일 개수: 433
예상 시간:  0.4811111111111111 시간


In [10]:
###############################################
# Don't need to do
# Extra, this is not a main process
# Make .json file readible
# json_pretty.py
# Make json file readible.
################################################

import json
import textwrap

# JSON 파일 로드
with open(final_output, 'r', encoding='utf-8') as f:
    data = json.load(f)

# response 값을 50자씩 나누어 배열로 분할
for item in data:
    if 'response' in item:
        # 50자씩 나누기
        wrapped_text = textwrap.wrap(item['response'], width=100)
        item['response'] = wrapped_text

# 수정된 JSON 파일 저장
with open(output_pretty, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("JSON 파일이 수정되어 저장되었습니다.")


JSON 파일이 수정되어 저장되었습니다.
