# 2024 데이터 크리에이터 캠프

문제: 인공지능은 사람의 마음을 이해할 수 있을까?

## Mission3. 패션스타일 선호 여부 예측

## 라이브러리 불러오기

In [1]:
import os
import torch
import json
import numpy as np
import pandas as pd
from torch import Tensor
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from sklearn.metrics import accuracy_score
from typing import Type
from collections import defaultdict
import torchvision.transforms as transforms
from sklearn.metrics.pairwise import cosine_similarity

## Resnet

In [9]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions with batch normalisation.

    The main path is conv -> BN -> ReLU -> conv -> BN. Its result is added to
    the shortcut (the input itself, or ``downsample(input)`` when a
    ``downsample`` module is given) and passed through a final ReLU.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the first convolution.
        stride: Stride of the first convolution.
        expansion: Multiplier applied to ``out_channels`` for the second
            convolution's output width.
        downsample: Optional module applied to the input to build the shortcut.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        expansion: int = 1,
        downsample: nn.Module = None
    ) -> None:
        super().__init__()
        expanded_channels = out_channels * expansion

        self.expansion = expansion
        self.downsample = downsample

        # First 3x3 convolution carries the (optional) spatial stride.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        # Second 3x3 convolution keeps the resolution and applies the expansion.
        self.conv2 = nn.Conv2d(
            out_channels, expanded_channels,
            kernel_size=3, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(expanded_channels)

    def forward(self, x: Tensor) -> Tensor:
        """Run the block on ``x`` and return the activated residual sum."""
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))

        # Project the input only when a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)

In [10]:
class ResNet(nn.Module):
    """ResNet backbone followed by a linear classification head.

    Only the 18-layer configuration (two blocks per stage, expansion 1) is
    implemented.

    Args:
        img_channels: Number of channels of the input images.
        num_layers: Network depth; must be 18.
        block: Residual block class. It is called as
            ``block(in_channels, out_channels, stride, expansion, downsample)``
            for the first block of a stage and as
            ``block(in_channels, out_channels, expansion=expansion)`` afterwards.
        num_classes: Output size of the final fully connected layer.

    Raises:
        ValueError: If ``num_layers`` is not 18.
    """

    def __init__(
        self,
        img_channels: int,
        num_layers: int,
        block: Type[nn.Module],
        num_classes: int = 1000
    ) -> None:
        super().__init__()
        # Only ResNet-18 is used in this competition, so only the 18-layer
        # configuration is implemented. Fail loudly instead of leaving
        # `layers` / `self.expansion` undefined for other depths.
        if num_layers != 18:
            raise ValueError(
                f"Unsupported num_layers={num_layers!r}; only 18 is implemented."
            )
        layers = [2, 2, 2, 2]
        self.expansion = 1

        self.in_channels = 64
        self.conv1 = nn.Conv2d(
            in_channels=img_channels,
            out_channels=self.in_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False
        )
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512*self.expansion, num_classes)

    def _make_layer(
        self,
        block: Type[nn.Module],
        out_channels: int,
        blocks: int,
        stride: int = 1
    ) -> nn.Sequential:
        """Build one stage of ``blocks`` residual blocks.

        The first block applies ``stride``; the remaining blocks keep the
        resolution. ``self.in_channels`` is updated to the stage's output width.
        """
        expanded_channels = out_channels * self.expansion

        # A projection shortcut is required whenever the residual branch changes
        # the spatial size or the channel count. For ResNet-18 (expansion 1)
        # this creates exactly the same modules as checking the stride alone.
        downsample = None
        if stride != 1 or self.in_channels != expanded_channels:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    expanded_channels,
                    kernel_size=1,
                    stride=stride,
                    bias=False
                ),
                nn.BatchNorm2d(expanded_channels),
            )

        layers = [
            block(
                self.in_channels, out_channels, stride, self.expansion, downsample
            )
        ]
        self.in_channels = expanded_channels

        for _ in range(1, blocks):
            layers.append(block(
                self.in_channels,
                out_channels,
                expansion=self.expansion
            ))
        return nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        """Return the class scores computed by the final linear layer for ``x``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # The per-call debug print of the feature-map shape was removed: it
        # wrote to stdout on every forward pass.

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

### ResNet-18 모델 정의

주어진 ResNet-18 모델을 사용하여 각 이미지의 feature vector를 추출

In [11]:
class ResNet18FeatureExtractor(nn.Module):
    """Wrap the file's ResNet-18 so that it outputs a flat feature vector.

    The child modules of the ResNet are reused in definition order with the
    last one (the fully connected head) left out. No weights are loaded in
    this block, so the network keeps whatever initialisation ``ResNet`` gives it.
    """

    def __init__(self):
        super().__init__()
        self.resnet18 = ResNet(img_channels=3, num_layers=18, block=BasicBlock)

        # Keep every child except the final FC layer.
        backbone_layers = list(self.resnet18.children())
        backbone_layers.pop()
        self.features = nn.Sequential(*backbone_layers)

    def forward(self, x):
        """Return one flattened feature row per sample in the batch ``x``."""
        pooled = self.features(x)
        batch_size = pooled.size(0)
        return pooled.view(batch_size, -1)

In [12]:
# Load the Mission 2-2 result CSV into a DataFrame.
MISSION2_RESULT_CSV = '../dataset/mission2-2_result_all.csv'
mission2_result = pd.read_csv(MISSION2_RESULT_CSV)

# Print the column / dtype / non-null summary of the loaded frame.
mission2_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4116 entries, 0 to 4115
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   응답자 ID     4116 non-null   int64 
 1   train 선호   3081 non-null   object
 2   train 비선호  3416 non-null   object
 3   valid 선호   1396 non-null   object
 4   valid 비선호  1595 non-null   object
dtypes: int64(1), object(4)
memory usage: 160.9+ KB


In [13]:
# Notebook echo: preview the first rows of the preference table.
mission2_result.head()

Unnamed: 0,응답자 ID,train 선호,train 비선호,valid 선호,valid 비선호
0,52002,W_24111_70_hippie_M.jpg,"T_00004_90_hiphop_M.jpg, T_03007_10_sportiveca...",,
1,66699,"T_00004_90_hiphop_M.jpg, T_01568_50_ivy_M.jpg,...","T_03643_00_metrosexual_M.jpg, T_06009_10_sport...",,
2,66797,"T_01259_10_sportivecasual_M.jpg, T_16092_10_sp...","T_00004_90_hiphop_M.jpg, W_15467_70_hippie_M.j...","T_08486_10_sportivecasual_M.jpg, W_23958_60_mo...",
3,66684,"T_00047_19_normcore_M.jpg, T_03699_90_hiphop_M...","T_00007_19_normcore_M.jpg, W_51917_00_metrosex...",W_15341_60_mods_M.jpg,
4,66817,"T_00012_19_normcore_M.jpg, T_04506_90_hiphop_M...","T_03624_90_hiphop_M.jpg, T_04522_90_hiphop_M.j...",,W_17135_00_metrosexual_M.jpg


### 이미지 전처리

ResNet-18 모델을 사용하여 이미지의 feature vector를 추출 및 저장

In [14]:
# Image preprocessing: resize to 224x224, convert to a tensor, then normalise
# each channel with the fixed mean / std below (these look like the standard
# ImageNet statistics -- confirm if it matters).
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
]
transform = transforms.Compose(_preprocessing_steps)

# Image feature-vector extraction helper
def extract_features(image_path, model, transform):
    """Return the flattened feature vector ``model`` produces for one image.

    Args:
        image_path: Path of the image file to read.
        model: Callable (e.g. an ``nn.Module``) applied to a single-image batch.
        transform: Preprocessing callable that turns an RGB PIL image into a
            tensor.

    Returns:
        A 1-D NumPy array holding the model output for the image.
    """
    # Image.open is lazy and keeps the file open; the context manager makes
    # sure the handle is released once the pixel data has been converted.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    batch = transform(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        features = model(batch)
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    return features.cpu().numpy().flatten()

# Image directory paths
train_image_directory = '../dataset/training_image'
valid_image_directory = '../dataset/validation_image'

# Initialise the ResNet-18 feature extractor and switch it to eval mode
model = ResNet18FeatureExtractor()
model.eval()

# Extracted feature vectors, keyed by the second '_'-separated token of the
# file name (used as the image ID).
# NOTE(review): two files sharing that token would overwrite each other --
# confirm the IDs are unique within each directory.
train_features = {}
valid_features = {}

for directory, feature_store in (
    (train_image_directory, train_features),
    (valid_image_directory, valid_features),
):
    for image_name in os.listdir(directory):
        if not image_name.endswith('.jpg'):
            continue
        image_path = os.path.join(directory, image_name)
        image_id = image_name.split('_')[1]
        feature_store[image_id] = extract_features(image_path, model, transform)


이미지 간 유사도를 계산하고 Validation 데이터 내 응답자의 스타일 선호 여부를 예측

In [15]:

# Similarity helper
def calculate_similarity(feature1, feature2):
    """Return the cosine similarity of two feature vectors as a float.

    The value is computed directly with NumPy rather than through the global
    name ``cosine_similarity``: later cells in this file redefine that name
    with a single-argument signature, which would break this helper.

    Args:
        feature1: First vector (any array-like; flattened before use).
        feature2: Second vector, with the same number of elements.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero length.
    """
    first = np.asarray(feature1, dtype=float).ravel()
    second = np.asarray(feature2, dtype=float).ravel()

    denominator = np.linalg.norm(first) * np.linalg.norm(second)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity".
        return 0.0
    return float(np.dot(first, second) / denominator)

def _unit_rows(vectors):
    """Stack 1-D vectors into a matrix whose rows have unit length (zero rows stay zero)."""
    stacked = np.stack([np.asarray(vector, dtype=float).ravel() for vector in vectors])
    norms = np.linalg.norm(stacked, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return stacked / norms


def _mean_top_similarity(valid_feature, normalized_train, top_k):
    """Return the mean of the ``top_k`` largest cosine similarities to the train rows."""
    query = np.asarray(valid_feature, dtype=float).ravel()
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        # A zero vector has similarity 0 to everything.
        return 0.0
    similarities = normalized_train @ (query / query_norm)
    k = min(top_k, similarities.size)
    return float(np.sort(similarities)[-k:].mean())


# Predict the style preference of each respondent in the validation data
def predict_preference(mission2_result, valid_features, train_features, threshold=0.5, top_k=5):
    """Label every validation image of every respondent as '선호' or '비선호'.

    For each validation image the cosine similarity to every training feature
    is computed, the ``top_k`` largest values are averaged, and the image is
    labelled '선호' when that mean exceeds ``threshold`` (otherwise '비선호').
    The score depends only on the image, not on the respondent, so it is
    computed once per image ID and reused.

    Args:
        mission2_result: DataFrame with the columns '응답자 ID', 'valid 선호'
            and 'valid 비선호'; the last two hold ', '-separated image file
            names or NaN.
        valid_features: Mapping of validation image ID to feature vector.
        train_features: Mapping of training image ID to feature vector.
        threshold: Decision threshold on the mean top-k similarity.
        top_k: Number of most similar training images to average.

    Returns:
        ``{respondent_id: {image_file_name: label}}``. Rows with no validation
        images are skipped; images without a feature vector are left out, as is
        every image when ``train_features`` is empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Normalise the training features once instead of once per comparison.
    normalized_train = _unit_rows(train_features.values()) if train_features else None
    score_cache = {}

    predictions = {}
    for _, row in mission2_result.iterrows():
        liked_cell = row['valid 선호']
        disliked_cell = row['valid 비선호']
        if pd.isna(liked_cell) and pd.isna(disliked_cell):
            continue

        valid_images = []
        for cell in (liked_cell, disliked_cell):
            if not pd.isna(cell):
                valid_images.extend(cell.split(', '))

        respondent_predictions = {}
        for valid_image in valid_images:
            valid_image_id = valid_image.split('_')[1]
            if valid_image_id not in valid_features or normalized_train is None:
                continue
            if valid_image_id not in score_cache:
                score_cache[valid_image_id] = _mean_top_similarity(
                    valid_features[valid_image_id], normalized_train, top_k
                )
            preference_score = score_cache[valid_image_id]
            respondent_predictions[valid_image] = '선호' if preference_score > threshold else '비선호'
        predictions[row['응답자 ID']] = respondent_predictions
    return predictions

# Run the prediction for every respondent with a 0.5 decision threshold.
predictions = predict_preference(mission2_result, valid_features, train_features, threshold=0.5)

성능 확인

In [17]:
# Performance measurement (accuracy, as an example metric)
def calculate_accuracy(predictions, valid_labels):
    """Return the fraction of predicted labels that match ``valid_labels``.

    Args:
        predictions: ``{respondent_id: {image_file_name: predicted_label}}``.
        valid_labels: ``{image_id: true_label}``, where the image ID is the
            second '_'-separated token of the file name.

    Returns:
        ``correct / total`` over the predictions whose image ID has a label,
        or ``0`` when there is none. The respondent ID is not used for the
        lookup.
    """
    labelled_predictions = (
        (image_name.split('_')[1], predicted_label)  # extract the image ID
        for respondent_predictions in predictions.values()
        for image_name, predicted_label in respondent_predictions.items()
    )

    hits = 0
    seen = 0
    for image_id, predicted_label in labelled_predictions:
        if image_id not in valid_labels:
            continue
        seen += 1
        if predicted_label == valid_labels[image_id]:
            hits += 1

    if seen > 0:
        return hits / seen
    return 0

# Load the ground-truth labels of the validation data.
# Each JSON file contributes one entry keyed by the image ID taken from
# item.imgName; survey answer Q5 == 2 is mapped to '선호', anything else to '비선호'.
valid_labels = {}
valid_label_directory = '../dataset/validation_label'
json_filenames = [
    name for name in os.listdir(valid_label_directory) if name.endswith('.json')
]
for filename in json_filenames:
    filepath = os.path.join(valid_label_directory, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    item = data['item']
    image_id = item['imgName'].split('_')[1]
    Q5 = item['survey']['Q5']
    valid_labels[image_id] = '선호' if Q5 == 2 else '비선호'

# Compute and report the accuracy
accuracy = calculate_accuracy(predictions, valid_labels)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.40


In [6]:
import pandas as pd
import numpy as np

In [None]:
import numpy as np

# Build {respondent_id: {item: rating}} from the training columns of the
# preference table, then turn it into a respondent x item matrix.
processed_data = {}

# Iterate over the preference table loaded earlier. The previous code iterated
# `data`, which at this point is the last JSON dict read while loading the
# validation labels and has no `iterrows`.
for index, row in mission2_result.iterrows():
    respondent_id = row['응답자 ID']

    # Get lists of preferred and non-preferred items
    preferred_items = str(row['train 선호']).split(', ') if pd.notna(row['train 선호']) else []
    non_preferred_items = str(row['train 비선호']).split(', ') if pd.notna(row['train 비선호']) else []

    # Reuse the respondent's entry if the ID was already seen, so a repeated
    # ID adds to the earlier ratings instead of discarding them.
    respondent_ratings = processed_data.setdefault(respondent_id, {})

    # Mark preferred items with 1
    for item in preferred_items:
        respondent_ratings[item] = 1

    # Mark non-preferred items with -1 (wins if an item is listed in both)
    for item in non_preferred_items:
        respondent_ratings[item] = -1

# Create DataFrame from the processed dictionary; unrated items become 0
matrix_df = pd.DataFrame.from_dict(processed_data, orient='index').fillna(0)

In [3]:
# Load the saved item-user matrix.
# NOTE(review): this is an absolute, machine-specific path; it will not
# resolve on another machine.
ITEM_USER_MATRIX_CSV = 'C:/Users/KimDongyoung/Desktop/my_git/mygit/K-ICT/DCC-amaranth/mission/mission3/동영특징벡터csv/item_user_matrix.csv'
df = pd.read_csv(ITEM_USER_MATRIX_CSV)

In [13]:
# Notebook echo: show the loaded item-user matrix.
df

Unnamed: 0.1,Unnamed: 0,W_24111_70_hippie_M.jpg,T_00004_90_hiphop_M.jpg,T_03007_10_sportivecasual_M.jpg,T_03118_19_normcore_M.jpg,W_02699_60_mods_M.jpg,W_23983_60_mods_M.jpg,T_01568_50_ivy_M.jpg,T_15877_10_sportivecasual_M.jpg,T_16259_10_sportivecasual_M.jpg,...,W_29108_90_kitsch_W.jpg,W_31559_90_kitsch_W.jpg,W_34808_10_sportivecasual_W.jpg,W_40520_19_genderless_W.jpg,W_40817_70_military_W.jpg,W_43763_00_oriental_W.jpg,W_45749_90_grunge_W.jpg,W_47546_60_minimal_W.jpg,W_17799_19_normcore_M.jpg,W_21986_60_space_W.jpg
0,52002,1,-1,-1,-1,-1,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,66563,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7905,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,61468,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,63110,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,26548,0,0,0,0,0,0,0,0,0,...,0,0,0,0,-1,0,0,0,0,0
4028,29728,0,0,0,0,0,0,0,0,0,...,0,0,0,0,-1,0,0,0,0,0
4029,22347,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4030,28789,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1,0


In [8]:
# Cosine-similarity helper
def cosine_similarity(matrix):
    """Return the cosine similarity between every pair of columns of ``matrix``.

    NOTE: this definition shadows ``sklearn.metrics.pairwise.cosine_similarity``
    imported at the top of the file.

    Args:
        matrix: 2-D array-like; each column is treated as one item vector.

    Returns:
        A square float array with one row/column per input column. The diagonal
        is 1, and any pair involving an all-zero column has similarity 0.
    """
    values = np.asarray(matrix, dtype=float)

    # Column norms are computed once instead of once per pair.
    column_norms = np.linalg.norm(values, axis=0)
    # Dividing zero columns by 1 leaves them all-zero, so their similarity to
    # every other column comes out as 0.
    safe_norms = np.where(column_norms > 0, column_norms, 1.0)
    unit_columns = values / safe_norms

    similarity_matrix = unit_columns.T @ unit_columns
    # Self-similarity is defined as 1, including for all-zero columns.
    np.fill_diagonal(similarity_matrix, 1.0)
    return similarity_matrix

# Convert the DataFrame into a NumPy matrix.
# NOTE(review): every column of `df` is used here, including 'Unnamed: 0';
# a later cell drops that column before computing similarities.
matrix = df.to_numpy()

# Compute the column-by-column similarity matrix
similarity_matrix = cosine_similarity(matrix)

# Print the result
print("코사인 유사도 행렬:", similarity_matrix, sep="\n")

코사인 유사도 행렬:
[[ 1.         -0.02431743 -0.00842362 ...  0.00360324 -0.0080621
  -0.01851289]
 [-0.02431743  1.         -0.23570226 ...  0.          0.
   0.        ]
 [-0.00842362 -0.23570226  1.         ...  0.          0.
   0.        ]
 ...
 [ 0.00360324  0.          0.         ...  1.          0.
   0.        ]
 [-0.0080621   0.          0.         ...  0.          1.
   0.        ]
 [-0.01851289  0.          0.         ...  0.          0.
   1.        ]]


In [20]:
# Notebook echo: show the raw matrix used for the similarity computation.
matrix

array([[ 1, -1, -1, ...,  0,  0,  0],
       [-1,  0,  0, ...,  0,  0,  0],
       [-1,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  1,  0,  0],
       [ 0,  0,  0, ...,  0, -1,  0],
       [ 0,  0,  0, ...,  0,  0, -1]], dtype=int64)

In [11]:
# Wrap similarity_matrix in a DataFrame labelled on both axes with the
# DataFrame's column names (assumed to be the item IDs).
item_ids = df.columns
similarity_df = pd.DataFrame(data=similarity_matrix, index=item_ids, columns=item_ids)

# Print the result
print("코사인 유사도 데이터프레임:", similarity_df, sep="\n")

코사인 유사도 데이터프레임:
                                 Unnamed: 0  W_24111_70_hippie_M.jpg  \
Unnamed: 0                         1.000000                -0.024317   
W_24111_70_hippie_M.jpg           -0.024317                 1.000000   
T_00004_90_hiphop_M.jpg           -0.008424                -0.235702   
T_03007_10_sportivecasual_M.jpg    0.002870                -0.288675   
T_03118_19_normcore_M.jpg         -0.014563                -0.408248   
...                                     ...                      ...   
W_43763_00_oriental_W.jpg         -0.019030                 0.000000   
W_45749_90_grunge_W.jpg           -0.031294                 0.000000   
W_47546_60_minimal_W.jpg           0.003603                 0.000000   
W_17799_19_normcore_M.jpg         -0.008062                 0.000000   
W_21986_60_space_W.jpg            -0.018513                 0.000000   

                                 T_00004_90_hiphop_M.jpg  \
Unnamed: 0                                     -0.008424   

In [12]:
# Notebook echo: show the labelled similarity DataFrame.
similarity_df

Unnamed: 0.1,Unnamed: 0,W_24111_70_hippie_M.jpg,T_00004_90_hiphop_M.jpg,T_03007_10_sportivecasual_M.jpg,T_03118_19_normcore_M.jpg,W_02699_60_mods_M.jpg,W_23983_60_mods_M.jpg,T_01568_50_ivy_M.jpg,T_15877_10_sportivecasual_M.jpg,T_16259_10_sportivecasual_M.jpg,...,W_29108_90_kitsch_W.jpg,W_31559_90_kitsch_W.jpg,W_34808_10_sportivecasual_W.jpg,W_40520_19_genderless_W.jpg,W_40817_70_military_W.jpg,W_43763_00_oriental_W.jpg,W_45749_90_grunge_W.jpg,W_47546_60_minimal_W.jpg,W_17799_19_normcore_M.jpg,W_21986_60_space_W.jpg
Unnamed: 0,1.000000,-0.024317,-0.008424,0.002870,-0.014563,0.001374,-0.047104,0.023407,0.018574,0.003465,...,-0.010494,-0.031394,-0.018854,-0.017247,-0.019553,-0.019030,-0.031294,0.003603,-0.008062,-0.018513
W_24111_70_hippie_M.jpg,-0.024317,1.000000,-0.235702,-0.288675,-0.408248,-0.154303,-0.154303,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
T_00004_90_hiphop_M.jpg,-0.008424,-0.235702,1.000000,0.408248,0.577350,0.218218,0.218218,0.235702,0.408248,0.408248,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
T_03007_10_sportivecasual_M.jpg,0.002870,-0.288675,0.408248,1.000000,0.707107,0.267261,0.267261,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
T_03118_19_normcore_M.jpg,-0.014563,-0.408248,0.577350,0.707107,1.000000,0.377964,0.377964,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_43763_00_oriental_W.jpg,-0.019030,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.333333,0.258199,0.408248,0.333333,1.000000,0.333333,0.333333,0.000000,0.000000
W_45749_90_grunge_W.jpg,-0.031294,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.666667,0.258199,0.408248,0.333333,0.333333,1.000000,0.333333,0.000000,0.000000
W_47546_60_minimal_W.jpg,0.003603,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.333333,0.258199,0.408248,0.333333,0.333333,0.333333,1.000000,0.000000,0.000000
W_17799_19_normcore_M.jpg,-0.008062,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [14]:
# Notebook echo: number of columns of the similarity matrix.
similarity_matrix.shape[1]

6657

In [15]:
# Cosine-similarity helper
def cosine_similarity(matrix):
    """Return the cosine similarity between every pair of columns of ``matrix``.

    NOTE: this definition shadows ``sklearn.metrics.pairwise.cosine_similarity``
    imported at the top of the file.

    Args:
        matrix: 2-D array-like; each column is treated as one item vector.

    Returns:
        A square float array with one row/column per input column. The diagonal
        is 1, and any pair involving an all-zero column has similarity 0.
    """
    values = np.asarray(matrix, dtype=float)

    # Column norms are computed once instead of once per pair.
    column_norms = np.linalg.norm(values, axis=0)
    # Dividing zero columns by 1 leaves them all-zero, so their similarity to
    # every other column comes out as 0.
    safe_norms = np.where(column_norms > 0, column_norms, 1.0)
    unit_columns = values / safe_norms

    similarity_matrix = unit_columns.T @ unit_columns
    # Self-similarity is defined as 1, including for all-zero columns.
    np.fill_diagonal(similarity_matrix, 1.0)
    return similarity_matrix

# Drop the user-ID column and convert the remaining item columns to a matrix
item_frame = df.drop(columns=['Unnamed: 0'])
matrix = item_frame.to_numpy()

# Compute the item-by-item similarity matrix
similarity_matrix = cosine_similarity(matrix)

# Label both axes of the similarity matrix with the remaining column names
# (assumed to be the item IDs)
item_ids = item_frame.columns
similarity_df = pd.DataFrame(data=similarity_matrix, index=item_ids, columns=item_ids)

# Print the result
print("코사인 유사도 데이터프레임:", similarity_df, sep="\n")

코사인 유사도 데이터프레임:
                                 W_24111_70_hippie_M.jpg  \
W_24111_70_hippie_M.jpg                         1.000000   
T_00004_90_hiphop_M.jpg                        -0.235702   
T_03007_10_sportivecasual_M.jpg                -0.288675   
T_03118_19_normcore_M.jpg                      -0.408248   
W_02699_60_mods_M.jpg                          -0.154303   
...                                                  ...   
W_43763_00_oriental_W.jpg                       0.000000   
W_45749_90_grunge_W.jpg                         0.000000   
W_47546_60_minimal_W.jpg                        0.000000   
W_17799_19_normcore_M.jpg                       0.000000   
W_21986_60_space_W.jpg                          0.000000   

                                 T_00004_90_hiphop_M.jpg  \
W_24111_70_hippie_M.jpg                        -0.235702   
T_00004_90_hiphop_M.jpg                         1.000000   
T_03007_10_sportivecasual_M.jpg                 0.408248   
T_03118_19_normcore_M.j

In [17]:
# Notebook echo: show the item-only similarity DataFrame.
similarity_df

Unnamed: 0,W_24111_70_hippie_M.jpg,T_00004_90_hiphop_M.jpg,T_03007_10_sportivecasual_M.jpg,T_03118_19_normcore_M.jpg,W_02699_60_mods_M.jpg,W_23983_60_mods_M.jpg,T_01568_50_ivy_M.jpg,T_15877_10_sportivecasual_M.jpg,T_16259_10_sportivecasual_M.jpg,W_00901_60_mods_M.jpg,...,W_29108_90_kitsch_W.jpg,W_31559_90_kitsch_W.jpg,W_34808_10_sportivecasual_W.jpg,W_40520_19_genderless_W.jpg,W_40817_70_military_W.jpg,W_43763_00_oriental_W.jpg,W_45749_90_grunge_W.jpg,W_47546_60_minimal_W.jpg,W_17799_19_normcore_M.jpg,W_21986_60_space_W.jpg
W_24111_70_hippie_M.jpg,1.000000,-0.235702,-0.288675,-0.408248,-0.154303,-0.154303,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
T_00004_90_hiphop_M.jpg,-0.235702,1.000000,0.408248,0.577350,0.218218,0.218218,0.235702,0.408248,0.408248,0.218218,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
T_03007_10_sportivecasual_M.jpg,-0.288675,0.408248,1.000000,0.707107,0.267261,0.267261,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
T_03118_19_normcore_M.jpg,-0.408248,0.577350,0.707107,1.000000,0.377964,0.377964,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
W_02699_60_mods_M.jpg,-0.154303,0.218218,0.267261,0.377964,1.000000,0.142857,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_43763_00_oriental_W.jpg,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.333333,0.258199,0.408248,0.333333,1.000000,0.333333,0.333333,0.0,0.0
W_45749_90_grunge_W.jpg,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.666667,0.258199,0.408248,0.333333,0.333333,1.000000,0.333333,0.0,0.0
W_47546_60_minimal_W.jpg,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.333333,0.258199,0.408248,0.333333,0.333333,0.333333,1.000000,0.0,0.0
W_17799_19_normcore_M.jpg,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.0


In [None]:
# Cosine-similarity helper
def cosine_similarity(matrix):
    """Return the cosine similarity between every pair of columns of ``matrix``.

    NOTE: this definition shadows ``sklearn.metrics.pairwise.cosine_similarity``
    imported at the top of the file.

    Args:
        matrix: 2-D array-like; each column is treated as one item vector.

    Returns:
        A square float array with one row/column per input column. The diagonal
        is 1, and any pair involving an all-zero column has similarity 0.
    """
    values = np.asarray(matrix, dtype=float)

    # Column norms are computed once instead of once per pair.
    column_norms = np.linalg.norm(values, axis=0)
    # Dividing zero columns by 1 leaves them all-zero, so their similarity to
    # every other column comes out as 0.
    safe_norms = np.where(column_norms > 0, column_norms, 1.0)
    unit_columns = values / safe_norms

    similarity_matrix = unit_columns.T @ unit_columns
    # Self-similarity is defined as 1, including for all-zero columns.
    np.fill_diagonal(similarity_matrix, 1.0)
    return similarity_matrix

# Preference prediction
def predict_preferences(user_item_matrix, similarity_matrix):
    """Predict a +1 / -1 preference for every unrated (zero) user-item cell.

    For an unrated cell ``(user, item)`` the user's ratings are weighted by the
    item's similarity row; the cell is predicted +1 when the weighted sum,
    divided by the sum of the item's positive similarities, is greater than 0,
    and -1 otherwise. Cells that already hold a rating, and items whose
    positive similarities sum to 0, are left at 0.

    Args:
        user_item_matrix: 2-D array (users x items) holding ratings, 0 = unrated.
        similarity_matrix: 2-D array (items x items) of item similarities.

    Returns:
        A tuple ``(predictions, predicted_values, actual_values_list)``:
        ``predictions`` has the shape and dtype of ``user_item_matrix``;
        ``predicted_values`` lists the predicted +1 / -1 values in row-major
        order; ``actual_values_list`` lists the corresponding entries of
        ``user_item_matrix``.

    NOTE(review): only cells whose value is 0 are predicted, so every entry of
    ``actual_values_list`` is 0 and an accuracy computed against it cannot
    match a +1 / -1 prediction -- confirm the intended evaluation.
    """
    ratings = np.asarray(user_item_matrix)
    similarities = np.asarray(similarity_matrix, dtype=float)

    # weighted[u, i] = dot(similarity_matrix[i], user_item_matrix[u])
    weighted = ratings @ similarities.T
    # Per-item normaliser: the sum of that item's positive similarities.
    normalization = np.where(similarities > 0, similarities, 0.0).sum(axis=1)

    # Predict only where the user has no rating and the normaliser is usable.
    predictable = (ratings == 0) & (normalization > 0)
    scores = weighted[predictable] / np.broadcast_to(normalization, weighted.shape)[predictable]
    signs = np.where(scores > 0, 1, -1)

    predictions = np.zeros_like(ratings)
    predictions[predictable] = signs

    predicted_values = signs.tolist()
    # Read the actual values from the argument; the previous code read the
    # global `matrix`, which is not a parameter of this function.
    actual_values_list = ratings[predictable].tolist()

    return predictions, predicted_values, actual_values_list

# Accuracy helper
def calculate_accuracy(predicted_values, actual_values_list):
    """Return the share of positions where prediction and actual value agree.

    Pairs are formed with ``zip``, so extra elements of the longer sequence are
    ignored, while the denominator is ``len(actual_values_list)``. Returns 0
    when ``actual_values_list`` is empty.
    """
    matches = 0
    for predicted, actual in zip(predicted_values, actual_values_list):
        matches = matches + (predicted == actual)

    total_predictions = len(actual_values_list)
    if total_predictions != 0:
        return matches / total_predictions
    return 0  # nothing to evaluate

# `user_item_matrix` is not defined anywhere else in this file, so build it
# here from the item columns of `df` (the same conversion the earlier
# similarity cell applies before calling cosine_similarity).
user_item_matrix = df.drop(columns=['Unnamed: 0']).values

# Build the item-by-item cosine-similarity matrix
similarity_matrix = cosine_similarity(user_item_matrix)

# Predict preferences for the unrated cells
predictions, predicted_values, actual_values_list = predict_preferences(user_item_matrix, similarity_matrix)

# Compute the accuracy
accuracy = calculate_accuracy(predicted_values, actual_values_list)

# Print the results
print("코사인 유사도 행렬:")
print(similarity_matrix)
print("예측된 선호도 행렬:")
print(predictions)
print("예측된 값 리스트:", predicted_values)
print("실제 값 리스트:", actual_values_list)
print(f"정확도: {accuracy:.2f}")