In [2]:
import re
import pandas as pd

def normalize_brand(brand_string):
    """Remove bracketed notes from a name and tidy the remaining spacing.

    Every ``[...]`` and ``(...)`` group is deleted (non-greedy, so each group
    is removed separately), surrounding whitespace is stripped, and any run of
    two or more spaces is collapsed to a single space.

    Args:
        brand_string: The raw name as a ``str``.

    Returns:
        The cleaned name.
    """
    without_notes = re.sub(r'\[.*?\]|\(.*?\)', '', brand_string)
    # A single str.replace("  ", " ") pass leaves two spaces behind when
    # adjacent groups are removed ("A (x) (y) B"); collapse whole runs instead.
    return re.sub(r' {2,}', ' ', without_notes.strip())

def preprocess_product_name(df) -> pd.DataFrame:
    """Reduce each ``product_name`` to the text that precedes its volume token.

    Names are first cleaned with ``normalize_brand``. Each cleaned name is then
    split on single spaces and cut just before the first token containing
    ``ml`` or ``mL``. Rows whose name contains no such token are dropped.

    Args:
        df: DataFrame with a string ``product_name`` column. It is left
            unmodified.

    Returns:
        A new DataFrame with the same columns as ``df`` and a fresh 0..n-1
        index, in which ``product_name`` holds the truncated name. Rows are
        grouped by cleaned name in ``value_counts`` order rather than in the
        input order.
    """
    helper_col = '_truncated_name'

    def name_before_volume(name):
        """Return the words before the first ml/mL token, or None if absent."""
        tokens = name.split(' ')
        for idx, token in enumerate(tokens):
            if 'ml' in token or 'mL' in token:
                return ' '.join(tokens[:idx])
        return None

    # assign() builds a new frame, so the caller's DataFrame is not overwritten.
    cleaned = df.assign(product_name=df['product_name'].apply(normalize_brand))

    # One row per distinct cleaned name, in value_counts order. Reading the
    # names from the index avoids relying on the column labels produced by
    # value_counts().reset_index(), which changed in pandas 2.0.
    lookup = pd.DataFrame({'product_name': cleaned['product_name'].value_counts().index})
    lookup[helper_col] = lookup['product_name'].map(name_before_volume)
    # Names without a volume token map to None and are removed here, which in
    # turn removes their rows in the right join below.
    lookup = lookup.dropna(subset=[helper_col])

    merged = pd.merge(cleaned, lookup, on='product_name', how='right')

    # Replace the 'product_name' column with the truncated name, then discard
    # the helper column.
    merged['product_name'] = merged[helper_col]
    return merged.drop(columns=[helper_col])


In [3]:
import pandas as pd

# Review table used as the lookup source: get_brand_id reads its "brand" and
# "brand_id" columns.
df_review = pd.read_csv("../../resource/data/collabo_filtering_dataset2.csv")

def get_brand_id(text: str) -> int:
    """Return the ``brand_id`` of the first ``df_review`` row whose ``brand`` equals ``text``.

    Reads the module-level ``df_review`` frame. Returns -1 when no row matches.
    """
    matching_rows = df_review[df_review["brand"] == text]
    if not matching_rows.empty:
        # Several rows may share the same brand; the first one is used.
        return matching_rows["brand_id"].iloc[0]
    return -1

# Load the raw product table and report its size.
df_ingredient_dataset = pd.read_csv("../../resource/data/제품데이터셋.csv")
print(f"원본 데이터 사이즈: {len(df_ingredient_dataset)}")

# Clean and truncate the product names, then report the size again.
df_ingredient_dataset = preprocess_product_name(df_ingredient_dataset)
print(f"정제 데이터 사이즈: {len(df_ingredient_dataset)}")

# Look up a brand_id for every product name; -1 marks names with no match.
df_brand_ids = df_ingredient_dataset["product_name"].apply(get_brand_id)
valid_count = (df_brand_ids.values != -1).sum()
print(f"유효한 데이터 사이즈: {valid_count}")

# Attach the ids and keep only the rows that were matched.
df_ingredient_dataset = (
    df_ingredient_dataset
    .assign(brand_id=df_brand_ids)
    .loc[lambda frame: frame["brand_id"] != -1]
)
print(len(df_ingredient_dataset))

# Keep the first row for each brand_id.
df_ingredient_dataset = df_ingredient_dataset.drop_duplicates(subset=["brand_id"])
print(f"최종 전성분 및 이미지 정보 사이즈: {len(df_ingredient_dataset)}")

df_ingredient_dataset.to_csv("../../resource/data/mapped_ingredient_dataset.csv", index=False)

원본 데이터 사이즈: 817
정제 데이터 사이즈: 691
유효한 데이터 사이즈: 443
443
최종 전성분 및 이미지 정보 사이즈: 251


In [34]:
# Re-load the product table, preprocess it, and compare the number of rows
# with the number of distinct product names.
df = preprocess_product_name(pd.read_csv("../../resource/data/제품데이터셋.csv"))
print(df.shape[0])
print(df["product_name"].nunique(dropna=False))

691
448


In [36]:
# Count the review rows, then the rows left after keeping the first row per
# distinct "brand" value.
review = pd.read_csv("../../resource/data/collabo_filtering_dataset2.csv")
print(review.shape[0])
distinct_review = review.drop_duplicates(subset=["brand"])
distinct_review.shape[0]

565