In [1]:
from transformers import BertTokenizer, BertModel
import torch
from torchvision import transforms

import pandas as pd
import os ,re
import ast
from PIL import Image
import h5py  # for .h5 file

from collections import Counter

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
names = ['Sectional_Sofas', 'Sleeper_Sofas', 'Reclining_Sofas', 'LoveSeats', 'Futons', 'Settles', 'Convertibles', 
         'Accent_Chairs', 'Coffee_Tables', 'TV_Stands', 'End_Tables', 'Console_Tables', 'Ottomans', 'Living_Room_Sets', 
         'Decorative_Pillows', 'Throw_Blankets', 'Area_Rugs', 'Wall_Arts', 'Table_Lamps', 'Floor_Lamps', 
         'Pendants_and_Chandeliers', 'Sconces', 'Baskets_and_Storage', 'Candles', 'Live_Plants', 'Artificial_Plants', 
         'Planters', 'Decorative_Accessories', 'Window_Coverings', 'Decorative_Mirrors', 'Dining_Sets', 
         'Dining_Tables', 'Dining_Chairs', 'Bar_Stools', 'Kitchen_Islands', 'Buffets_and_Sideboards', 'China_Cabinets', 
         'Bakers_Recks', 'Bedroom_Sets', 'Mattresses', 'Nightstands', 'Dressers', 'Beds', 'Bedframes', 'Bases', 'Vanities', 
         'Entryway_Furnitures', 'Desks', 'Desk_Chairs', 'Bookcases', 
         'File_Cabinets', 'Computer_Armoires', 'Drafting_Tables', 'Cabinets', 'Furniture_Sets']

In [3]:
# 기본 경로 설정(01-4.Crawling_integrated를 넣었던 폴더)
base_path = '/home/all'

# 파일 이름을 안전하게 만드는 함수
def sanitize_filename(filename):
    return re.sub(r'[^a-zA-Z]', '', filename)

# 이미지 경로를 가져오는 함수
def get_image_path(title, category):
    sanitized_title = sanitize_filename(title[:200])
    file_path = os.path.join(base_path, 'imgs', category, f"{sanitized_title}.jpg")
    return file_path if os.path.exists(file_path) else "File not found."

# 모든 CSV 파일을 처리하고 하나의 데이터프레임으로 합치는 함수
def process_all_csv_files():
    all_dfs = []
    for name in names:
        csv_file = os.path.join(base_path, f'product_infos/{name}_product_infos.csv')
        if os.path.exists(csv_file):
            df = pd.read_csv(csv_file)
            df['img_path'] = df['Title'].apply(lambda title: get_image_path(title, name))
            all_dfs.append(df)
    return pd.concat(all_dfs, ignore_index=True)

# 모든 데이터를 하나의 데이터프레임으로 합치기
combined_df = process_all_csv_files()


In [4]:
# path에 주소가 없는 경우(크롤링이 실패한 사례)
combined_df = combined_df[combined_df['img_path'] != 'File not found.'].reset_index(drop=True)

In [5]:
# Style 라벨이 없으면 버리기
def extract_style(row):
    for col in ['Product_Info', 'Product_Feature']:
        try:
            info_dict = ast.literal_eval(row[col])
            if 'Style' in info_dict:
                return info_dict['Style']
            if 'style' in info_dict:
                return info_dict['style']
        except (ValueError, SyntaxError):
            continue
    return None

# Apply the function to each row
combined_df['Style'] = combined_df.apply(extract_style, axis=1)
# Drop rows where 'Style' is None
combined_df.dropna(subset=['Style'], inplace=True)

In [6]:
# valid style 아니면 버리기
# Define valid styles
valid_styles = ['modern', 'contemporary', 'classic', 'urban', 'country', 'unique', 'minimalism']

# Modified categorize_style function
def map_style(style):
    categories = {
        "Modern": ["Modern",'Contemporary,Modern','French','Copenhagen','Modern Contemporary','Italian', "European",'Mid-Century Modern, Contemporary','Eclectic, modern, traditional','Modern, Classic', 'Modern couch','contemporary and traditional, modern','Casual, Modern','Modern, Contemporary',"Modern Minimalist", "High Gloss", "Scandinavian", "Nordic", "European", "Japanese", "Mid Century Modern",'Mid-Century Modern,Contemporary','Mid-Centuryum', "Contemporary Modern", "Minimalist Modern"],
        "Contemporary": ["Contemporary", "Streamlined", "Unadorned", "Sleek", "Understated", "Clean Lines", "Modern Contemporary", "Contemporary Chic"],
        "Classic": [ "Classic",'Classic Contemporary', "Antique", "Art Deco", "Colonial", "Baroque", "Vintage", "French", "Victorian", "Traditional", "Retro","Traditional Classic", "Vintage Classic"],
        "Urban": ["Urban", "Metropolitan", "City Style", "Modern Urban", "Urban Contemporary", "Industrial", "Loft", "Modern Industrial", "Rustic Industrial", "Industrial Retro Style", "Metropolitan","Urban Industrial", "Industrial Chic",'Retro'],
        "Country": ["Country", "Rustic Country", "Country Style", "Rural", "Pastoral", "Provincial","Rustic", "Farmhouse", "Country Rustic", "Shabby Chic", "Lodge", "Reclaimed Wood","Country Rustic", "Rustic Charm"],
        "Unique": ["Unique",  "One-of-a-Kind", "Unique Design","Free Style", "Wild", "Fantasy Plus", "Boho Style", "Bohemian","Eclectic","Bohemian Eclectic", "Eclectic Mix","Fusion", "Quirky", "Galaxy", "Stars",'Bold eclectic'],
        "Minimalism": ["Minimalism", "Simple", "Zen", "Bare", "Sparse", "Minimalist","Simplistic Minimalism"]
    }

    for key, values in categories.items():
        if style in values:
            return key
    return style
 
# Simplified is_valid_style function
def is_valid_style(style):
    mapped_style = map_style(style)
    if mapped_style.lower() in valid_styles:
        return mapped_style
    else:
        return None

# Apply the function to the 'Style' column of the DataFrame
combined_df['Style'] = combined_df['Style'].apply(is_valid_style)
# Drop rows where 'Style' is None
combined_df.dropna(subset=['Style'], inplace=True)

In [7]:
# 너무 적은 갯수를 가진 스타일 버리기
# 1. Calculate the frequency of each style
style_counts = Counter(combined_df['Style'])
print(combined_df['Style'].map(style_counts).unique())

# 2. Filter out styles with only one member
combined_df = combined_df[combined_df['Style'].map(style_counts) > 10]
len(combined_df)

[8132  866 1277  159  733  455   66    2]


11688

In [8]:
texts = combined_df['Product_Text']

In [9]:
# Load the tokenizer and base BERT model (not the sequence classification variant)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')

In [10]:
# Encode text
text = texts[0]
inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

# Get model output
outputs = model(**inputs)

# You can obtain the pooled output (representation of the entire sentence)
# by using the `pooler_output` attribute.
# This is typically done by applying a pooling operation to the last hidden layer.
pooled_output = outputs.pooler_output

# Alternatively, you can work with the last hidden states directly.
# hidden_states = outputs.last_hidden_state

# The `pooled_output` is a fixed-size vector representation of your input text.
print(pooled_output)
print(pooled_output.shape)

tensor([[-0.9996, -0.9987,  1.0000,  ..., -1.0000,  0.9901, -0.9963]],
       grad_fn=<TanhBackward0>)
torch.Size([1, 1024])


In [11]:
# Ensure all elements are strings
texts = combined_df['Product_Text'].astype(str)

# Container for all pooled outputs
all_pooled_outputs = []

for text in texts:
    # Check if the text is empty or NaN
    if text.strip():  # This will be False for empty or NaN values
        # Encode text and get model output
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():  # Disable gradient calculation for efficiency
            outputs = model(**inputs)

        # Extract pooled output and add to list
        pooled_output = outputs.pooler_output
        # all_pooled_outputs.append(pooled_output)

# Optionally, convert list of tensors to a single tensor
# all_pooled_outputs_tensor = torch.cat(all_pooled_outputs, dim=0)

# Save the tensor to a file
# torch.save(all_pooled_outputs_tensor, 'pooled_outputs.pt')

KeyboardInterrupt: 

In [13]:

# Load the image
image_path = combined_df['img_path'][0]
image = Image.open(image_path)

# Define the transformation of the image to tensor and normalization
transform = transforms.Compose([
    transforms.ToTensor(),  # Converts to a tensor with values between 0 and 1
    transforms.Resize((1024, 1024)),  # Resize to a smaller size for demonstration purposes
])

# Apply the transformation to the image
image_tensor = transform(image)
print(image_tensor.shape)
# Flatten the image tensor to concatenate with text tensor later
# We will use a dummy flattened size here for demonstration; this will need to match the actual text tensor size for real application
flattened_size = 1024  # Example size
image_tensor_flat = image_tensor.view(1, -1)[:,:flattened_size]  # Reshape to [1, 1024] for concatenation

image_tensor_flat.shape  # Should be torch.Size([1, 1024])


torch.Size([3, 1024, 1024])




torch.Size([1, 1024])

In [51]:
# img 랑 text 각각 tensor로 바꾼 후 이어붙여서 여러파일에 나눠서 저장(램 용량 딸림)

def process_dataframe_and_save(df, tokenizer, model, chunk_size=1000, save_dir='/home/bae/Big_Project/'):
    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Process and save in chunks
    for start_idx in tqdm(range(0, len(df), chunk_size)):
        # Container for the current chunk's concatenated outputs
        chunk_concatenated_outputs = []

        # Process the current chunk
        end_idx = min(start_idx + chunk_size, len(df))
        for index in range(start_idx, end_idx):
            row = df.iloc[index]
            # Process text and image as before...
             # Process text
            text = row['Product_Text'].astype(str)
            if text.strip():  # If the text is not empty
                inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
                with torch.no_grad():
                    outputs = model(**inputs)
                text_tensor = outputs['pooler_output']
            else:
                text_tensor = torch.zeros(1, 1024)  # A tensor of zeros if the text is empty

            # Process image
            image_path = row['img_path']
            image = Image.open(image_path)
            image_tensor_flat = transform(image).view(1, -1)[:,:flattened_size]  # Reshape to [1, 1024]

            # Concatenate text_tensor and image_tensor_flat
            concatenated_tensor = torch.cat((text_tensor, image_tensor_flat), dim=1)

            # Concatenate and append to the chunk list
            chunk_concatenated_outputs.append(concatenated_tensor)

        # Convert chunk list of tensors to a single tensor
        chunk_tensor = torch.cat(chunk_concatenated_outputs, dim=0)

        # Save the current chunk tensor to a .pt file
        chunk_file_path = os.path.join(save_dir, f'torch_tensor_img_text_chunk_{start_idx//chunk_size}.pt')
        torch.save(chunk_tensor, chunk_file_path)

    # Return the directory where the chunks were saved
    return save_dir

# Usage example
file_path = process_dataframe_and_save(combined_df, tokenizer, model)


  0%|          | 0/12 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'astype'