In [None]:
# Mount Google Drive at /gdrive so the dataset and model paths used below resolve.
# force_remount=True requests a fresh mount even if one is already present.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

In [None]:
!pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Runtime keep-alive helper
def keep_alive():
    """Inject a JavaScript timer that clicks the Colab connect button.

    The script registers ``setInterval(ClickConnect, 60000)``, i.e. every
    60 000 ms it logs a message and clicks the element matched by the
    ``colab-connect-button`` selector. Returns nothing; the only effect is
    the JavaScript displayed in the notebook front end.
    """
    # Imported locally: these names are only needed by this front-end helper
    # and were previously missing, which made the call fail with NameError.
    from IPython.display import Javascript, display

    display(Javascript('''
        function ClickConnect(){
            console.log("클릭 연결 버튼");
            document.querySelector("colab-connect-button").click()
        }
        setInterval(ClickConnect, 60000)
    '''))

# Data loading and preprocessing
def load_and_preprocess_data():
    """Load the annotation CSV, keep the Training split and preprocess it.

    Steps:
      1. Read the CSV from the fixed Google Drive path.
      2. Convert the string cells of ``info``, ``images``, ``annotations``
         and ``equipment`` back into Python objects.
      3. Keep only rows whose ``split`` column equals ``'Training'``.
      4. Hand the result to ``preprocess_data`` and return what it returns.

    Returns:
        The DataFrame returned by ``preprocess_data``.
    """
    df = pd.read_csv('/gdrive/MyDrive/Final project/1_Red/3_데이터수집_저장/0_데이터수집폴더/피부 데이터/json to df.csv')

    # SECURITY NOTE(review): eval() executes whatever expression is stored in
    # the CSV cell. If the file can ever come from an untrusted source, switch
    # to ast.literal_eval after confirming every cell is a plain literal.
    for col in ['info', 'images', 'annotations', 'equipment']:
        df[col] = df[col].apply(lambda x: eval(x) if isinstance(x, str) else x)

    # Boolean filtering yields a slice of the original frame; preprocess_data
    # assigns columns on it, so take an explicit copy to avoid
    # SettingWithCopy warnings and ambiguous writes.
    df = df[df['split'] == 'Training'].copy()
    return preprocess_data(df)

# Data preprocessing
def preprocess_data(df):
    """Normalise the ``annotations`` and ``equipment`` columns, then merge faceparts.

    * ``annotations``: for dict cells, every list value is replaced by its
      length and other values are kept; non-dict cells become ``{}``.
    * ``equipment``: dict cells are kept, anything else becomes ``{}``.

    The columns are assigned on the frame that was passed in, and the result
    of ``merge_eye_cheek_data(df)`` is returned.
    """
    def process_annotations(anno):
        # Anything that is not a dict carries no usable annotation data.
        if not isinstance(anno, dict):
            return {}
        summarised = {}
        for key, value in anno.items():
            # Lists are reduced to their element count; scalars pass through.
            summarised[key] = len(value) if isinstance(value, list) else value
        return summarised

    def dict_or_empty(value):
        # Keep dictionaries untouched, replace every other value with {}.
        if isinstance(value, dict):
            return value
        return {}

    df['annotations'] = df['annotations'].apply(process_annotations)
    df['equipment'] = df['equipment'].apply(dict_or_empty)
    return merge_eye_cheek_data(df)

# Merge the paired facepart codes: 3 and 4 become 34, 5 and 6 become 56
def merge_eye_cheek_data(df):
    """Rewrite the ``facepart`` code inside every ``images`` dict.

    Codes 3 and 4 are replaced by 34, codes 5 and 6 by 56; every other cell
    is left as it is. Rewritten cells are new dicts (the original dict
    objects are not modified), and the ``images`` column is reassigned on
    the frame that was passed in.

    Compared with a masked ``.loc`` assignment this needs a single pass, does
    not depend on index alignment (so duplicate index labels are fine), and
    tolerates cells that are not dicts or lack a ``facepart`` key.

    Args:
        df: DataFrame with an ``images`` column.

    Returns:
        The same DataFrame object, with ``images`` updated.
    """
    def remap(image):
        # Leave missing / malformed cells alone instead of raising.
        if not isinstance(image, dict):
            return image
        code = image.get('facepart')
        if code in (3, 4):
            return {**image, 'facepart': 34}
        if code in (5, 6):
            return {**image, 'facepart': 56}
        return image

    df['images'] = df['images'].apply(remap)
    return df

# Random oversampling helper
def oversample_data(X, y):
    """Resample ``X`` and ``y`` with ``RandomOverSampler(random_state=42)``.

    ``X`` is converted to a one-column frame with ``to_frame()`` before being
    passed to ``fit_resample``; the first column of the resampled frame is
    returned together with the resampled targets.

    NOTE(review): callers in this file pass a multi-column DataFrame as ``y``;
    confirm that the sampler accepts that target layout.
    """
    sampler = RandomOverSampler(random_state=42)
    features_frame = X.to_frame()
    resampled_features, resampled_targets = sampler.fit_resample(features_frame, y)
    first_column = resampled_features.iloc[:, 0]
    return first_column, resampled_targets

# Class-weight helper
def compute_class_weights(y):
    """Return a ``{class_label: weight}`` mapping for the labels in ``y``.

    The distinct labels are taken from ``np.unique(y)`` and the weights come
    from ``compute_class_weight('balanced', ...)``.
    """
    labels = np.unique(y)
    balanced = compute_class_weight('balanced', classes=labels, y=y)
    return {label: weight for label, weight in zip(labels, balanced)}

# Learning-rate schedule for use with a LearningRateScheduler callback
def lr_schedule(epoch, lr):
    """Keep ``lr`` for the first 10 epochs, then scale it by exp(-0.1).

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate in effect before this epoch.

    Returns:
        A plain Python ``float``. The previous version returned a
        ``tf.Tensor`` (via ``tf.math.exp``), which Keras'
        ``LearningRateScheduler`` rejects in releases that require the
        schedule to return a float.
    """
    if epoch < 10:
        return float(lr)
    return float(lr * np.exp(-0.1))

# Data augmentation
def augment_data(image):
    """Apply three random transforms to ``image`` and return the result.

    In order: a random up/down flip, a random contrast change with factor
    bounds 0.8 and 1.2, and a rotation by ``k`` quarter turns where ``k`` is
    drawn uniformly from the integers 0..3.
    """
    flipped = tf.image.random_flip_up_down(image)
    contrasted = tf.image.random_contrast(flipped, lower=0.8, upper=1.2)
    # maxval is exclusive, so k is one of 0, 1, 2, 3.
    quarter_turns = tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)
    return tf.image.rot90(contrasted, k=quarter_turns)

# Polynomial learning-rate decay
def lr_poly(initial_lr, iter, max_iter, power):
    """Polynomial decay: ``initial_lr * (1 - iter / max_iter) ** power``.

    Args:
        initial_lr: Learning rate at ``iter == 0``.
        iter: Current step (epoch) index.
        max_iter: Step at which the rate reaches zero. A value of 0 raises
            ``ZeroDivisionError``, as before.
        power: Exponent of the decay curve.

    Returns:
        The decayed learning rate. The remaining fraction is clamped at 0 so
        that ``iter > max_iter`` yields 0.0 rather than a complex number
        (a negative float raised to a fractional power is complex in Python).
    """
    remaining = max(0.0, 1 - float(iter) / max_iter)
    return initial_lr * (remaining ** power)

# Image loading and preprocessing
def load_and_preprocess_image(image_path):
    """Read a JPEG from ``image_path`` and prepare it for ResNet50.

    The file is decoded with 3 channels, resized to 224x224 and passed
    through ``tf.keras.applications.resnet50.preprocess_input``; the
    preprocessed tensor is returned.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    return tf.keras.applications.resnet50.preprocess_input(resized)

# Build a batched tf.data pipeline from image paths and label columns
def create_data_generator(X, y, directory, batch_size=32, is_training=True):
    """Create a ``tf.data.Dataset`` yielding ``(image, labels)`` batches.

    Args:
        X: Series of image file paths (indexed positionally with ``iloc``).
        y: DataFrame of labels; each column becomes one entry of the label
            dict, declared as a scalar ``float32``.
        directory: Unused; kept so existing callers keep working.
        batch_size: Number of samples per batch.
        is_training: When true, samples are visited in a fresh random order
            every time the dataset is iterated.

    Returns:
        A batched, prefetched dataset whose elements are a
        ``(224, 224, 3)`` float32 image and a dict of scalar labels.

    Shuffling is done on sample indices inside the generator rather than with
    ``Dataset.shuffle(buffer_size=len(X))``: the latter has to hold every
    decoded image in its buffer before emitting the first one, which costs
    memory proportional to the whole dataset.
    """
    label_columns = list(y.columns)
    n_samples = len(X)

    def gen():
        # The generator is invoked anew for each pass over the dataset, so a
        # new permutation here gives a different order per epoch.
        if is_training:
            order = np.random.permutation(n_samples)
        else:
            order = range(n_samples)
        for i in order:
            img = load_and_preprocess_image(X.iloc[i])
            # Fetch the row once instead of once per label column.
            row = y.iloc[i]
            yield img, {col: row[col] for col in label_columns}

    dataset = tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(224, 224, 3), dtype=tf.float32),
            {col: tf.TensorSpec(shape=(), dtype=tf.float32) for col in label_columns}
        )
    )

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Validation data pipeline
def create_val_data_generator(X, y, directory, batch_size=32):
    """Build the dataset for validation by delegating to ``create_data_generator``.

    All arguments are forwarded unchanged and ``is_training`` is set to
    ``False``.
    """
    validation_dataset = create_data_generator(
        X,
        y,
        directory,
        batch_size,
        is_training=False,
    )
    return validation_dataset

def create_model(output_dims):
    """Build a multi-head model on top of an ImageNet-initialised ResNet50.

    Args:
        output_dims: Mapping of output name to dimension. A dimension of 1
            produces a ``Dense(1)`` head without activation (regression);
            any other value produces a softmax ``Dense(dim)`` head
            (classification). Each head layer is named after its key.

    Returns:
        A Keras ``Model`` from the ResNet50 input to the list of heads, in
        the iteration order of ``output_dims``.
    """
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    # Collapse the spatial dimensions of the backbone feature map.
    pooled = GlobalAveragePooling2D()(backbone.output)

    def build_head(head_name, units):
        # A single unit is treated as a regression output.
        if units == 1:
            return Dense(1, name=head_name)(pooled)
        # Everything else is a classification output.
        return Dense(units, activation='softmax', name=head_name)(pooled)

    heads = [build_head(head_name, units) for head_name, units in output_dims.items()]
    return Model(inputs=backbone.input, outputs=heads)

def plot_performance(history, output_dims, facepart, model_type):
    """Plot per-output loss and metric curves and save them as a PNG.

    One row of two panels is drawn per entry of ``output_dims``: training and
    validation loss on the left, and accuracy (``dim > 1``) or MAE
    (otherwise) on the right. The figure is written to the model directory on
    Google Drive and then closed.

    Args:
        history: Object with a ``history`` dict of per-epoch values, as
            returned by ``model.fit``.
        output_dims: Mapping of output name to dimension.
        facepart: Facepart identifier, used in the file name.
        model_type: Model kind, used in the file name.

    Raises:
        KeyError: If an expected curve is missing from ``history.history``.
    """
    n_metrics = len(output_dims)
    if n_metrics == 0:
        # Nothing to draw; subplots(0, 2) would raise.
        return

    logs = history.history

    def curve(prefix, name, suffix):
        # Multi-output models prefix keys with the output name; a model with
        # a single output is expected to log the bare key ('loss', 'mae', ...)
        # instead, so fall back to that.
        for key in (f'{prefix}{name}_{suffix}', f'{prefix}{suffix}'):
            if key in logs:
                return logs[key]
        raise KeyError(f"No '{prefix}{name}_{suffix}' or '{prefix}{suffix}' in history; available: {sorted(logs)}")

    # squeeze=False keeps axes 2-D even for a single row, so axes[i, j]
    # indexing works when there is only one output.
    fig, axes = plt.subplots(n_metrics, 2, figsize=(15, 5*n_metrics), squeeze=False)

    for i, (name, dim) in enumerate(output_dims.items()):
        # Loss plot
        loss_ax = axes[i, 0]
        loss_ax.plot(curve('', name, 'loss'), label='Train Loss')
        loss_ax.plot(curve('val_', name, 'loss'), label='Validation Loss')
        loss_ax.set_title(f'{name} Loss')
        loss_ax.set_xlabel('Epoch')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend()

        # Metric plot
        metric = 'accuracy' if dim > 1 else 'mae'
        metric_ax = axes[i, 1]
        metric_ax.plot(curve('', name, metric), label=f'Train {metric.upper()}')
        metric_ax.plot(curve('val_', name, metric), label=f'Validation {metric.upper()}')
        metric_ax.set_title(f'{name} {metric.upper()}')
        metric_ax.set_xlabel('Epoch')
        metric_ax.set_ylabel(metric.upper())
        metric_ax.legend()

    fig.tight_layout()
    fig.savefig(f'/gdrive/MyDrive/Final project/1_Red/5_분석모델링/피부진단/model/facepart_{facepart}_{model_type}_performance.png')
    # Close explicitly so figures do not accumulate across faceparts.
    plt.close(fig)

# Compile, train and save a model
def train_model(model, train_data, val_data, output_dims, facepart, model_type, epochs=100, batch_size=1):
    """Compile ``model``, fit it with checkpointing/early stopping and save it.

    Args:
        model: Keras model whose output layers are named after the keys of
            ``output_dims``.
        train_data: Training dataset passed to ``model.fit``.
        val_data: Validation dataset passed to ``model.fit`` (may be None).
        output_dims: Mapping of output name to dimension. ``1`` selects mean
            squared error with an MAE metric; anything else selects sparse
            categorical cross-entropy with an accuracy metric.
        facepart: Facepart identifier, used in the saved file names.
        model_type: Model kind, used in the saved file names.
        epochs: Maximum number of epochs.
        batch_size: Unused; batching is done by the datasets. Kept for
            backward compatibility.

    Returns:
        The ``History`` object returned by ``model.fit``.

    Changes from the earlier version:
      * Only one ``LearningRateScheduler`` is registered (polynomial decay
        from ``initial_lr``). Two schedulers were registered before, so the
        second one rescaled whatever the first had just set.
      * Checkpoints are evaluated once per epoch (``save_freq='epoch'``).
        An integer ``save_freq`` counts batches, not epochs.
      * Checkpointing and early stopping monitor ``val_loss`` when
        validation data is supplied, instead of the training loss.
    """
    initial_lr = 1e-3
    power = 0.9
    model_dir = '/gdrive/MyDrive/Final project/1_Red/5_분석모델링/피부진단/model'
    optimizer = Adam(learning_rate=initial_lr)

    losses = {}
    metrics = {}
    for name, dim in output_dims.items():
        if dim == 1:  # regression head
            losses[name] = 'mean_squared_error'
            metrics[name] = 'mae'
        else:  # classification head
            losses[name] = 'sparse_categorical_crossentropy'
            metrics[name] = 'accuracy'

    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)

    # Fall back to the training loss only when there is nothing to validate on.
    monitor = 'val_loss' if val_data is not None else 'loss'

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=f'{model_dir}/facepart_{facepart}_{model_type}_checkpoint_{{epoch:02d}}.keras',
        save_best_only=True,
        save_weights_only=False,
        monitor=monitor,
        mode='min',
        save_freq='epoch')

    # Polynomial decay; accepts (epoch, lr) and ignores the current lr because
    # the schedule is a pure function of the epoch index.
    lr_callback = tf.keras.callbacks.LearningRateScheduler(
        lambda epoch, lr: lr_poly(initial_lr, epoch, epochs, power))

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor=monitor, patience=20, restore_best_weights=True)

    history = model.fit(
        train_data,
        validation_data=val_data,
        epochs=epochs,
        verbose=1,
        callbacks=[lr_callback, checkpoint_callback, early_stopping])

    model.save(f'{model_dir}/facepart_{facepart}_{model_type}_final_model.keras')
    return history

# Train the classification and/or regression model for one facepart
def train_facepart_models(facepart, train_classification=True, train_regression=True):
    """Select the rows of one facepart and train the requested models.

    Reads the module-level DataFrame ``df``. Rows are filtered by facepart
    code and by a valid ``bbox``; image paths are built from
    ``info['filename']``; label frames are built from the ``annotations``
    (classification) and ``equipment`` (regression) dicts. Each requested
    model is trained with ``train_model`` and plotted with
    ``plot_performance``.

    Args:
        facepart: Facepart code. 34 and 56 denote the merged codes
            (3/4 and 5/6 respectively).
        train_classification: Train the annotation model.
        train_regression: Train the equipment model.

    Fixes compared with the earlier version:
      * ``merge_eye_cheek_data`` rewrites codes 3/4 to 34 and 5/6 to 56, so
        filtering for the raw codes only selected nothing. Both raw and
        merged codes are accepted now.
      * ``preprocess_data`` already turns annotation lists into counts, so
        ``len()`` on the ``acne`` value raised ``TypeError``. Counts and
        lists are both handled.
      * The regression branch passed an ``ImageDataGenerator`` as the first
        argument of ``create_data_generator`` and depended on a variable
        that only existed if the classification branch had run.
      * Unused ``ImageDataGenerator`` objects and the unused class-weight
        dict were removed.
      * Label columns are iterated in sorted order so the model's output
        order does not depend on set iteration order.
    """
    print(f"Processing facepart {facepart}")

    # Accept both the raw codes and the merged code for the paired faceparts.
    merged_sources = {34: (3, 4, 34), 56: (5, 6, 56)}
    accepted_codes = merged_sources.get(facepart, (facepart,))
    facepart_df = df[df['images'].apply(lambda x: x['facepart'] in accepted_codes)]

    def valid_bbox(bbox):
        # A usable bbox is a list of four strictly positive ints.
        # NOTE(review): this rejects boxes with a 0 coordinate — confirm intended.
        if bbox is None:
            return False
        if isinstance(bbox, list) and len(bbox) == 4:
            if bbox == ['None', 'None', 'None', 'None']:
                return False
            return all(isinstance(b, int) and b > 0 for b in bbox)
        return False

    if not facepart_df.empty:
        facepart_df = facepart_df[facepart_df['images'].apply(lambda x: valid_bbox(x.get('bbox')))]

    if facepart_df.empty:
        print(f"No usable rows for facepart {facepart}; skipping")
        return

    train_directory = f'/gdrive/MyDrive/Final project/1_Red/4_데이터탐색_전처리/facepart별 피부 이미지/Training_cropped/{facepart}'

    X = facepart_df['info'].apply(lambda x: str(os.path.join(train_directory, f"{x['filename'].split('.')[0]}_{facepart}.jpg")))

    # Sorted for a deterministic column (and therefore model output) order.
    anno_columns = sorted(set().union(*facepart_df['annotations']), key=str)
    equip_columns = sorted(set().union(*facepart_df['equipment']), key=str)

    y_class = pd.DataFrame()
    y_reg = pd.DataFrame()
    output_dims_class = {}
    output_dims_reg = {}

    # NOTE(review): the head size is the number of distinct label values, and
    # sparse categorical cross-entropy expects labels in [0, dim). Confirm the
    # annotation values are contiguous 0-based class ids.
    for col in anno_columns:
        y_class[col] = facepart_df['annotations'].apply(lambda x: x.get(col, None))
        n_distinct = len(set(y_class[col].dropna()))
        output_dims_class[col] = n_distinct if n_distinct > 1 else 1

    for col in equip_columns:
        y_reg[col] = facepart_df['equipment'].apply(lambda x: x.get(col, None))
        output_dims_reg[col] = 1  # regression

    def acne_count(anno):
        # Annotation lists were already reduced to counts during
        # preprocessing; still accept a raw list for safety.
        value = anno.get('acne', 0)
        if isinstance(value, (list, tuple)):
            return len(value)
        return value or 0

    if facepart == 0:
        y_class['skin_type'] = facepart_df['info'].apply(lambda x: x['skin_type'])
        y_class['sensitive'] = facepart_df['info'].apply(lambda x: x['sensitive'])
        output_dims_class['skin_type'] = len(set(y_class['skin_type'].dropna()))
        output_dims_class['sensitive'] = len(set(y_class['sensitive'].dropna()))
        y_class['acne_count'] = facepart_df['annotations'].apply(acne_count)
        output_dims_class['acne_count'] = 1  # regression

    if train_classification and not y_class.empty:
        print(f"Starting classification training for facepart {facepart}")

        X_train, X_val, y_train_class, y_val_class = train_test_split(X, y_class, test_size=0.1, random_state=42)

        # Oversample the training split only.
        # NOTE(review): y is multi-column here; confirm the sampler supports it.
        X_train_resampled, y_train_class_resampled = oversample_data(X_train, y_train_class)

        train_generator_class = create_data_generator(X_train_resampled, y_train_class_resampled, train_directory)
        val_generator_class = create_val_data_generator(X_val, y_val_class, train_directory)

        model_class = create_model(output_dims_class)
        history_class = train_model(model_class, train_generator_class, val_generator_class, output_dims_class, facepart, 'classification')
        plot_performance(history_class, output_dims_class, facepart, 'classification')

    if train_regression and not y_reg.empty and not all(y_reg.isnull().all()):
        print(f"Starting regression training for facepart {facepart}")
        X_train, X_val, y_train_reg, y_val_reg = train_test_split(X, y_reg, test_size=0.1, random_state=42)

        # NOTE(review): random oversampling is designed for class labels;
        # confirm it is appropriate (and accepted) for these regression targets.
        X_train_resampled, y_train_reg_resampled = oversample_data(X_train, y_train_reg)

        train_generator_reg = create_data_generator(X_train_resampled, y_train_reg_resampled, train_directory)
        val_generator_reg = create_val_data_generator(X_val, y_val_reg, train_directory)

        model_reg = create_model(output_dims_reg)
        history_reg = train_model(model_reg, train_generator_reg, val_generator_reg, output_dims_reg, facepart, 'regression')
        plot_performance(history_reg, output_dims_reg, facepart, 'regression')

# Main entry point
if __name__ == "__main__":
    keep_alive()
    # df must stay a module-level name: train_facepart_models reads it as a global.
    df = load_and_preprocess_data()

    # Menu option -> facepart codes to process.
    facepart_choices = {
        '1': [0, 1, 2],
        '2': [34, 56],
        '3': [7, 8],
    }
    user_input = input("처리할 facepart 범위를 선택하세요 (1: 0-2, 2: 34,56, 3: 7-8): ").strip()
    facepart_range = facepart_choices.get(user_input)

    if facepart_range is None:
        # Report and fall through instead of calling exit(), which would take
        # the notebook kernel down along with all loaded state.
        print("잘못된 입력입니다.")
    else:
        train_class = input("분류 모델을 학습하시겠습니까? (y/n): ").strip().lower() == 'y'
        train_reg = input("회귀 모델을 학습하시겠습니까? (y/n): ").strip().lower() == 'y'
        for facepart in facepart_range:
            train_facepart_models(facepart, train_classification=train_class, train_regression=train_reg)