In [1]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send
)
from skt.github_utils import GithubUtil
from skt.vault_utils import get_secrets


In [2]:
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from dateutil.relativedelta import relativedelta
from pyhive import hive

from copy import deepcopy
from joblib import Parallel, delayed
import os
import sys
from git import Repo
from contextlib import contextmanager
from tqdm.notebook import tqdm
import warnings

In [3]:
# Pull the GitHub credential bundle once; both the token and the proxy URL live in it.
secrets = get_secrets('github/sktaiflow')
token = secrets['token']

# The same proxy endpoint serves both schemes.
proxies = dict.fromkeys(('http', 'https'), secrets['proxy'])

# Custom helper functions for cloning git modules

In [4]:
@contextmanager
def proxy(proxies):
    """Temporarily point ``HTTP_PROXY`` / ``HTTPS_PROXY`` at the given proxies.

    The whole process environment is snapshotted on entry and restored on exit,
    including when the body of the ``with`` block raises.

    Args:
        proxies: Mapping with ``"http"`` and ``"https"`` keys holding proxy URLs.

    Yields:
        None.
    """
    env_backup = dict(os.environ)
    try:
        os.environ["HTTP_PROXY"] = proxies["http"]
        os.environ["HTTPS_PROXY"] = proxies["https"]
        yield
    finally:
        # Restore unconditionally: without this, an exception inside the
        # ``with`` body would leave the proxy variables set for the rest of
        # the kernel session.
        os.environ.clear()
        os.environ.update(env_backup)

In [5]:
def slack_sending(channel_name:str, msg:str="test", is_adot:bool=True):
    """Post a plain-text message to a Slack channel through ``slack_send``.

    Args:
        channel_name: Target channel; a ``#`` is prepended when the name does
            not contain one.
        msg: Message text.
        is_adot: Forwarded to ``slack_send`` as its ``adot`` argument.
    """
    if "#" not in channel_name:
        # Prefix the marker. The previous ``+=`` appended "#<name>" to the
        # name itself and produced "<name>#<name>".
        channel_name = "#" + channel_name

    slack_send(
        text=msg,
        username="SKT",
        channel=channel_name,
        icon_emoji=":large_blue_circle:",
        blocks=None,
        dataframe=False,
        adot=is_adot
    )


In [6]:
class GithubUtil_custom(GithubUtil):
    """``GithubUtil`` extension that can clone a repository through a proxy.

    Args:
        token: Access token, forwarded to ``GithubUtil``.
        proxies: Mapping with ``"http"`` / ``"https"`` proxy URLs, forwarded to
            ``GithubUtil``.
        **kwargs: Optional settings. ``channel_name`` is the Slack channel that
            is notified when a clone fails; without it no Slack message is
            sent. Any other key is ignored.
    """

    def __init__(self, token, proxies, **kwargs):
        super().__init__(token, proxies)
        # The failure handler used to read an undefined ``channel_name``
        # variable; the channel is now an explicit, optional setting.
        self._channel_name = kwargs.get("channel_name")

    def clone_from_repo(self, git_url, branch="main", git_save_path="/temp"):
        """Clone ``branch`` of ``git_url`` into ``git_save_path`` via the proxy.

        Args:
            git_url: Repository URL.
            branch: Branch to check out.
            git_save_path: Destination directory of the clone.

        Returns:
            ``{"code": "200", "response": <Repo returned by Repo.clone_from>}``.

        Raises:
            Exception: When no proxy is configured or the clone fails. The
                original error is attached as ``__cause__``.
        """
        try:
            # presumably ``GithubUtil.__init__`` stores the proxies on
            # ``self._proxies`` -- TODO confirm against the parent class.
            if not self._proxies:
                raise Exception("proxy must be passed")
            with proxy(self._proxies):
                response = Repo.clone_from(git_url, git_save_path, branch=branch)
            return {"code": "200", "response": response}
        except Exception as e:
            msg = f"cloning git repo:{git_url} branch:{branch} failed {e}"
            if self._channel_name:
                slack_sending(msg=msg, channel_name=self._channel_name, is_adot=True)
            raise Exception(msg) from e

In [7]:
git_url ='https://github.com/sktaiflow/onemodelV3-opensearch-engine.git'
branch = 'develop'
git_save_path = '/home/x1112436/shared/1112436/git'

In [8]:
import shutil

# Start from an empty checkout directory. Only remove it when it is already
# there: on a first run the path does not exist and an unconditional rmtree
# raises FileNotFoundError.
if os.path.isdir(git_save_path):
    shutil.rmtree(git_save_path)
os.makedirs(git_save_path, exist_ok=True)

In [9]:
gitobj= GithubUtil_custom(token=token, proxies=proxies)

In [10]:
response = gitobj.clone_from_repo(git_url=git_url, branch=branch, git_save_path=git_save_path)

In [11]:
module_path = os.path.join(git_save_path, "dags")
sys.path.append(module_path)
sys.path.append(git_save_path)

In [12]:
# !pip install datasets==2.19.1
# !pip install pydantic==2.7.1
# !pip install loguru==0.7.2

In [13]:
## import from module
from onemodelV3.opensearch_engine.indexing_engine.preprocessor import OpensearchPreprocessor
from onemodelV3.opensearch_engine.indexing_engine.func import *





In [14]:
# get file list
def get_gzip_files(directory='./temp/indexing/input'):
    """Recursively collect the paths of all ``*.gzip`` files under ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        Sorted list of matching file paths. Sorting makes slices such as
        ``file_list[:3]`` pick the same files on every run, because the raw
        ``os.walk`` order depends on the filesystem. The list is empty when
        nothing matches.
    """
    return sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith(".gzip")
    )

file_list = get_gzip_files(directory="/home/x1112436/shared/1112436/indexing_data")

In [15]:
from typing import List, Tuple
from torch.utils.data import (
    IterableDataset, 
    Dataset
)
from datasets import (
    load_dataset, 
    Dataset, 
    DatasetDict,
    IterableDatasetDict
)

In [16]:
class BaseParquetProcessor:
    """Common entry point for reading parquet files with ``load_dataset``."""

    @classmethod
    def load(
            cls, 
            file_path_list:List, 
            split:str=None, 
            stream:bool=True, 
            keep_in_memory:bool=False, 
            cache_dir:str='./.cache'
        ) -> Dataset:
        """Read the given parquet files as a dataset.

        Args:
            file_path_list: List of parquet file paths.
            split: Split passed to ``load_dataset``; when ``None`` the
                ``'train'`` entry of the result is returned.
            stream: Forwarded as ``streaming``.
            keep_in_memory: Forwarded unchanged.
            cache_dir: Forwarded unchanged.

        Raises:
            TypeError: When ``file_path_list`` is not a list.
        """
        # Reject bad input first so the happy path stays unindented.
        if not isinstance(file_path_list, List):
            raise TypeError("path should be in type (List)")

        loaded = load_dataset(
            path="parquet",
            data_files=file_path_list,
            split=split,
            keep_in_memory=keep_in_memory,
            streaming=stream,
            cache_dir=cache_dir,
        )

        return loaded['train'] if split is None else loaded


class OpensearchPreprocessor(BaseParquetProcessor):
    """Parquet loading plus document preparation helpers for the OpenSearch index.

    NOTE(review): this notebook-local class rebinds the ``OpensearchPreprocessor``
    name imported from the cloned repository earlier in the notebook -- confirm
    that the shadowing is intended.
    """
    index_name = "onemodelV3"

    def __init__(self, args, **kwargs):
        """Create an instance; ``args`` and ``kwargs`` are accepted but unused."""
        # No base class defines __init__, so forwarding ``args`` ended up in
        # object.__init__ and always raised TypeError.
        super().__init__()

    @classmethod
    def set_index_name(cls, new_name):
        """Replace the class-wide ``index_name``."""
        cls.index_name = new_name

    @classmethod
    def load(
            cls, 
            file_path_list:List, 
            split:str=None, 
            stream=True,
            keep_in_memory:bool=False,
            cache_dir:str='./.cache'
        ) -> IterableDataset:
        """Load parquet files by delegating to ``BaseParquetProcessor.load``."""
        return super().load(
            file_path_list=file_path_list,
            split=split,
            stream=stream,
            keep_in_memory=keep_in_memory,
            cache_dir=cache_dir,
        )

    @classmethod
    def _validate_component_inputs(cls, doc_body:Dict) -> Dict:
        """Validate ``doc_body`` against ``IndexingSchema``.

        Returns:
            Dict with ``data`` (the schema instance, or ``None`` on a
            validation error), ``code``, ``message`` and the original ``doc``.
        """
        # The result used to be returned from a ``finally`` clause, which
        # silently discards any exception other than ValidationError. Only the
        # expected error is handled now; everything else propagates.
        try:
            data = IndexingSchema(**doc_body)
        except ValidationError as e:
            data = None
            code = InternalCodes.PYDANTIC_VALIDATION_ERROR
            message = InternalCodes.get_message(code=code, e=e)
        else:
            code = InternalCodes.SUCCESS
            message = InternalCodes.get_message(code=code)
        return {"data": data, "code": code, "message": message, "doc": doc_body}

    @classmethod
    def profile_normalize(cls, data:Dict, delimiter='<|n|>'):
        """Parse the MNO and Adot profile feature strings of one record.

        Args:
            data: Mapping holding ``"mno_profile_feature"`` and
                ``"adot_profile_feature"`` strings made of
                ``key:value`` segments joined by ``delimiter``.
            delimiter: Separator between segments.

        Returns:
            None.

        NOTE(review): the parsed templates are built but never returned or
        stored. The standalone helpers in this notebook return
        ``dict(<template>)`` -- decide what this method should hand back.
        """
        mno_profile_dict = dict()
        for profile in (data["mno_profile_feature"] or "").split(delimiter):
            if ':' not in profile:
                # Empty feature string or malformed segment: nothing to parse.
                continue
            # maxsplit=1 keeps values that themselves contain ':' intact.
            key, val = (part.strip() for part in profile.split(':', 1))
            null_values = mno_profile_mappings[key]
            if val in mno_select_default_value(field_name=null_values):
                # Placeholder values (none / unknown / empty ...) are dropped.
                continue
            elif val == "있음":
                # A bare "present" flag carries no value, so the key text
                # before the "이력" suffix is stored in its place.
                mno_profile_dict[key] = key.split("이력")[0].strip()
            else:
                mno_profile_dict[key] = val

        mno_template_dict = defaultdict(list)
        for key, val in mno_profile_dict.items():
            mno_template_dict[new_mno_profile_mappings[key]].append(val)

        mno_preference_template = ""
        mno_preferences = mno_template_dict.get('preference', [])
        if mno_preferences:
            mno_preference_dict = defaultdict(set)
            for mno_preference in mno_preferences[0].split(','):
                # partition() always yields a lower category (possibly empty),
                # so an entry without '_' can no longer reuse the previous
                # entry's lower category or hit an unbound name.
                upper_cate, _, lower_cate = mno_preference.partition('_')
                if not upper_cate:
                    continue
                lower_cates = mno_preference_dict[upper_cate]
                if lower_cate:
                    lower_cates.add(lower_cate)

            # sorted() makes the joined text deterministic (sets are unordered).
            mno_preference_template = '\n'.join(
                f"{key}: {','.join(sorted(val))}" if val else key
                for key, val in mno_preference_dict.items()
            )

        mno_template_dict['preference'] = mno_preference_template

        adot_profile_dict = dict()
        for profile in (data["adot_profile_feature"] or "").split(delimiter):
            if ':' not in profile:
                continue
            key, val = (part.strip() for part in profile.split(':', 1))
            null_values = adot_profile_mappings[key]
            if val not in adot_select_default_value(field_name=null_values):
                adot_profile_dict[key] = val

        adot_template_dict = defaultdict(list)
        for new_feature in new_adot_profile_mappings.values():
            adot_template_dict[new_feature] = []

    @classmethod
    def preprocess(cls, item):
        """Build an index document from ``item`` and validate it.

        Returns:
            None.

        NOTE(review): several points need the author's decision:
          * ``is_adot`` is False whenever ``luna_id`` is truthy -- this looks
            inverted.
          * normalisation runs only when validation did NOT succeed -- confirm
            the condition.
          * ``doc`` carries ``mno_profile`` / ``adot_profile`` while
            ``profile_normalize`` reads ``*_profile_feature`` keys.
          * neither the document nor the validation response is returned.
        """
        user_vector = [float(x) for x in item['user_vector']]
        svc_mgmt_num = str(item.get("svc_mgmt_num", "unk"))
        luna_id = item.get("luna_id", "unk")
        is_active = True
        is_adot = False if luna_id else True
        mno_profile = item.get("mno_profile", "")
        adot_profile = item.get("adot_profile", "")
        behavior_profiles = item.get("behavior_profiles", "")
        age = item.get("age", "unk")
        gender = item.get("gender", "unk")
        model_version = item["model_version"]

        doc = {
            "_id": svc_mgmt_num,
            "svc_mgmt_num": svc_mgmt_num,
            "luna_id": item.get("luna_id", "temp"),
            "user_embedding": user_vector,
            "mno_profile": mno_profile,
            "adot_profile": adot_profile,
            "behavior_profile": behavior_profiles,
            "gender": gender,
            "age": age,
            "is_adot": is_adot,
            "is_active": is_active,
            "model_version": model_version
        }

        # The class has no ``doc_validation_check``; the validator defined
        # above is ``_validate_component_inputs``.
        response = cls._validate_component_inputs(doc_body=doc)
        if response["code"] != InternalCodes.SUCCESS:
            cls.profile_normalize(data=doc)

    @classmethod
    def apply_maps(cls, dataset:Dataset, functions_list: List[Tuple[Callable[..., Any], bool]]) -> Dataset:
        """Class method: apply each ``(func, with_indices)`` pair to ``dataset`` in order."""
        for func, with_indices in functions_list:
            dataset = cls.apply_map(dataset=dataset, func=func, with_indices=with_indices)

        return dataset

    @classmethod
    def apply_map(cls, dataset: Dataset, func:Callable, with_indices: bool = True) -> Dataset:
        """Class method: apply a single function through ``dataset.map``."""
        return dataset.map(func, with_indices=with_indices)

In [21]:
data = file_list[:3]

In [23]:
dataset = OpensearchPreprocessor.load(file_path_list=data)

In [24]:
sample_data = next(iter(dataset))

In [25]:
from pydantic import BaseModel, Field, ValidationError
from enum import Enum

In [26]:
class GenderEnum(str, Enum):
    """Closed set of values accepted for ``RawInputSchema.gender``.

    The ``str`` mixin makes each member compare equal to its plain string value.
    """
    male = "male"
    female = "female" 
    unknown = "unknown"

class RawInputSchema(BaseModel):
    """Validation schema for one raw input record.

    Fields not declared here are rejected (``extra = 'forbid'``).
    """
    svc_mgmt_num: str = Field(..., min_length=1)
    luna_id: str
    age: Optional[int] = Field(None, gt=0)
    gender: GenderEnum = Field(..., description="Gender of the person")
    # The three descriptions below were copy-pasted from ``gender``; they now
    # describe the field they are attached to.
    mno_profile_feature: Optional[str] = Field("", description="Delimited key:value MNO profile feature string")
    adot_profile_feature: Optional[str] = Field("", description="Delimited key:value Adot profile feature string")
    behavior_profile: Optional[str] = Field(None, description="Behavior profile text")
    is_adot: Optional[bool] = False
    create_at: str
    user_vector: List[float]

    class Config:
        extra = 'forbid'

In [27]:
%timeit
failed_log = []
data_list = []
for i, data in enumerate(dataset):
    try:
        data_list.append(RawInputSchema(**data))
    except Exception as e: 
        e_json = e.json()
        failed_log.append(e_json)

In [29]:
print(len(failed_log))
print(len(data_list))

64
39662


In [31]:
dd = data_list[2]

In [35]:
dd.mno_profile_feature

'관심사:음식/식사_음식 배달 주문,라이프스타일/취미_소셜 미디어,쇼핑_오픈마켓<|n|>성별:여자<|n|>나이:29세<|n|>서비스 사용 기간:4년<|n|>기변 후 경과일:모름<|n|>요금제 이름:T플랜 스페셜<|n|>요금제 가격:79000원<|n|>3개월 평균 데이터 사용량:122gb<|n|>단말기 가격:고가<|n|>단말기 제조사:apple<|n|>멤버십 등급:vip<|n|>멤버십 사용 이력:있음<|n|>가족 결합 이력:없음<|n|>로밍 사용 이력:없음<|n|>세컨디바이스 보유 여부:없음<|n|>소액 및 DCB 결제 이력:0원'

In [54]:
def normalize_mno_profiels(mno_profile, delimiter ="<|n|>"):
    """Turn an MNO profile feature string into a template dict.

    Args:
        mno_profile: ``key:value`` segments joined by ``delimiter``. ``None``
            or an empty string yields a dict with an empty preference.
        delimiter: Separator between segments.

    Returns:
        Dict keyed by the names in ``new_mno_profile_mappings``. Every value is
        a list of strings except ``'preference'``, which is one string with a
        ``"<upper>: <lower>,<lower>"`` line per upper category.
    """
    mno_profile_dict = dict()
    for profile in (mno_profile or "").split(delimiter):
        if ':' not in profile:
            # Empty input or malformed segment: nothing to parse.
            continue
        # maxsplit=1 keeps values that themselves contain ':' intact.
        key, val = (part.strip() for part in profile.split(':', 1))
        null_values = mno_profile_mappings[key]
        if val in mno_select_default_value(field_name=null_values):
            # Placeholder values (none / unknown / empty ...) are dropped.
            continue
        elif val == "있음":
            # A bare "present" flag carries no value, so the key text before
            # the "이력" suffix is stored in its place.
            mno_profile_dict[key] = key.split("이력")[0].strip()
        else:
            mno_profile_dict[key] = val

    mno_template_dict = defaultdict(list)
    for key, val in mno_profile_dict.items():
        mno_template_dict[new_mno_profile_mappings[key]].append(val)

    mno_preference_template = ""
    mno_preferences = mno_template_dict.get('preference', [])
    if mno_preferences:
        mno_preference_dict = defaultdict(set)
        for mno_preference in mno_preferences[0].split(','):
            # partition() always yields a lower category (possibly empty), so
            # an entry without '_' no longer reuses the previous entry's lower
            # category or hits an unbound name.
            upper_cate, _, lower_cate = mno_preference.partition('_')
            if not upper_cate:
                continue
            lower_cates = mno_preference_dict[upper_cate]
            if lower_cate:
                lower_cates.add(lower_cate)

        # sorted() makes the joined text deterministic (sets are unordered).
        mno_preference_template = '\n'.join(
            f"{key}: {','.join(sorted(val))}" if val else key
            for key, val in mno_preference_dict.items()
        )

    mno_template_dict['preference'] = mno_preference_template
    return dict(mno_template_dict)


In [55]:
from collections import defaultdict
from dags.onemodelV3.opensearch_engine.mapper import (
    MNO_DEFAULT_VALUES, 
    MnoprofileKeys, 
    mno_select_default_value, 
    mno_profile_mappings,
    new_mno_profile_mappings,
    ADOT_DEFAULT_VALUES,
    AdotprofileKeys,
    adot_select_default_value,
    adot_profile_mappings,
    new_adot_profile_mappings
)

In [56]:
normalize_mno_profiels(dd.mno_profile_feature)

{'preference': '음식/식사: 음식 배달 주문\n라이프스타일/취미: 소셜 미디어\n쇼핑: 오픈마켓',
 'gender': ['여자'],
 'age': ['29세'],
 'service_duration': ['4년'],
 'mno_status': ['T플랜 스페셜', '고가', 'apple', 'vip', '멤버십 사용'],
 'plan_price': ['79000원'],
 'avg_data_usage': ['122gb']}

In [72]:
new_adot_profile_mappings

{'선호 도메인': 'preference',
 '성별': 'gender',
 '나이': 'age',
 '선호 아이템': 'preference_item',
 '선호 카테고리': 'preference',
 '인기 컨텐츠 선호도': 'popularity_preference_level',
 '사용성 기준': 'adot_status',
 '활성 상태': 'adot_status',
 '다중 도메인 성향': 'multi_domain_tendency'}

In [69]:
def normalize_adot_profiels(adot_profile, delimiter ="<|n|>"):
    """Turn an Adot profile feature string into a template dict.

    Args:
        adot_profile: ``key : value`` segments joined by ``delimiter``.
        delimiter: Separator between segments.

    Returns:
        Dict with one key per value of ``new_adot_profile_mappings`` (each an
        empty list) plus the strings ``'preference'`` (preferred domain,
        optionally followed by ``,<prefix>:<category detail>``) and
        ``'preference_item'`` (text inside the parentheses of the preferred item).
    """
    def _tighten_colons(text):
        # Drop whitespace on both sides of every ':'.
        text = re.sub(r'\s+(?=:)', '', text)
        return re.sub(r':\s+', ':', text)

    adot_profile_dict = dict()
    for profile in (adot_profile or "").split(delimiter):
        if ':' not in profile:
            # Empty input or malformed segment: nothing to parse.
            continue
        key, val = (part.strip() for part in profile.split(':', 1))
        null_values = adot_profile_mappings[key]
        if val not in adot_select_default_value(field_name=null_values):
            adot_profile_dict[key] = val

    adot_template_dict = defaultdict(list)
    # This loop previously had no body, so the cell did not compile. It seeds
    # every target feature with an empty list, as the sibling implementations do.
    for new_feature in new_adot_profile_mappings.values():
        adot_template_dict[new_feature] = []

    domain = adot_profile_dict.get('선호 도메인', '')
    category = adot_profile_dict.get('선호 카테고리', '')
    item = adot_profile_dict.get('선호 아이템','')

    # "<prefix>(<detail>)": group 1 is the prefix, group 2 the detail.
    pattern = r'^(.*?)\((.*?)\)$'

    # Collect parts and join them, so a missing domain no longer leaves a
    # leading comma in front of the category.
    preference_parts = [f"{domain}"] if domain else []
    if category:
        match = re.search(pattern, category)
        # A category without parentheses used to raise AttributeError here.
        if match and match.group(2):
            preference_parts.append(f"{match.group(1)}:{match.group(2)}")
    preference_template = ','.join(preference_parts)

    preference_item_template = ''
    if item:
        match = re.search(pattern, item)
        if match:
            preference_item_template = f"{match.group(2)}"

    adot_template_dict['preference'] = _tighten_colons(preference_template)
    adot_template_dict['preference_item'] = _tighten_colons(preference_item_template)
    return dict(adot_template_dict)

In [70]:
dd.adot_profile_feature

'선호 도메인 : 전화<|n|>선호 카테고리 : 없음<|n|>선호 아이템 : 없음<|n|>성별 : 여성<|n|>나이 : 29<|n|>활성 상태 : 휴면<|n|>다중 도메인 성향 : 낮음<|n|>인기 컨텐츠 선호도 : 낮음<|n|>사용성 기준 : 일반유저<|n|>헤비유즈 도메인 : 없음'

In [71]:
import re
normalize_adot_profiels(dd.adot_profile_feature)

{'선호 도메인': '전화', '성별': '여성', '나이': '29', '활성 상태': '휴면', '다중 도메인 성향': '낮음', '인기 컨텐츠 선호도': '낮음', '사용성 기준': '일반유저'}


{'preference': '전화',
 'gender': [],
 'age': [],
 'preference_item': '',
 'popularity_preference_level': [],
 'adot_status': [],
 'multi_domain_tendency': []}

In [58]:
#data = [datum next(iter(dataset))

In [21]:
# ``ADOT_DEFAULT_VALUES`` was misspelled as ``ADOT_DEFAULT_VAtLUES``, which
# makes the whole import fail with ImportError.
from dags.onemodelV3.opensearch_engine.mapper import (
    MNO_DEFAULT_VALUES, 
    MnoprofileKeys, 
    mno_select_default_value, 
    mno_profile_mappings,
    new_mno_profile_mappings,
    ADOT_DEFAULT_VALUES,
    AdotprofileKeys,
    adot_select_default_value,
    adot_profile_mappings,
    new_adot_profile_mappings
)

In [22]:
from collections import defaultdict
def profile_normalize(profile:dict, delimiter='<|n|>'):
    """Build the MNO template for one record.

    Args:
        profile: Record mapping holding a ``"mno_profile_feature"`` string of
            ``key:value`` segments joined by ``delimiter``.
        delimiter: Separator between segments.

    Returns:
        ``defaultdict(list)`` with one entry per value of
        ``new_mno_profile_mappings`` (empty list when the record has no such
        feature) and ``'preference'`` rendered as a single string.
    """
    mno_profile = profile["mno_profile_feature"]

    mno_profile_dict = dict()
    # ``segment`` instead of ``profile``: the loop used to overwrite the
    # function argument.
    for segment in (mno_profile or "").split(delimiter):
        if ':' not in segment:
            continue
        key, val = (part.strip() for part in segment.split(':', 1))
        null_values = mno_profile_mappings[key]
        # The helper imported from the mapper module is
        # ``mno_select_default_value``; ``select_default_value`` is undefined.
        if val in mno_select_default_value(field_name=null_values):
            continue
        elif val == "있음":
            # A bare "present" flag carries no value, so the key text before
            # the "이력" suffix is stored in its place.
            mno_profile_dict[key] = key.split("이력")[0].strip()
        else:
            mno_profile_dict[key] = val

    mno_template_dict = defaultdict(list)
    for new_feature in new_mno_profile_mappings.values():
        mno_template_dict[new_feature] = []

    for key, val in mno_profile_dict.items():
        mno_template_dict[new_mno_profile_mappings[key]].append(val)

    mno_preference_template = ''
    mno_preferences = mno_template_dict.get('preference', [])
    if mno_preferences:
        mno_preference_dict = defaultdict(set)
        for mno_preference in mno_preferences[0].split(','):
            # partition() always yields a lower category (possibly empty), so
            # an entry without '_' no longer reuses a stale ``lower_cate``.
            upper_cate, _, lower_cate = mno_preference.partition('_')
            if not upper_cate:
                continue
            lower_cates = mno_preference_dict[upper_cate]
            if lower_cate:
                lower_cates.add(lower_cate)

        # sorted() makes the joined text deterministic (sets are unordered).
        mno_preference_template = '\n'.join(
            f"{key}: {','.join(sorted(val))}" if val else key
            for key, val in mno_preference_dict.items()
        )

    mno_template_dict['preference'] = mno_preference_template
    return mno_template_dict

In [29]:
# Notebook-local override: rebinds the ``new_adot_profile_mappings`` name that
# an earlier cell imports from the mapper module. Keys are the Korean labels
# found in the Adot profile string; values are the template feature each label
# is grouped under. In this variant four labels share the 'preference' bucket.
new_adot_profile_mappings = {
    '선호 도메인': 'preference',
    '성별': 'gender',
    '나이': 'age',
    '선호 아이템': 'preference',
    '선호 카테고리': 'preference',
    '인기 컨텐츠 선호도': 'preference',
    '사용성 기준': 'adot_status',
    '활성 상태':'adot_status',
    '다중 도메인 성향': 'multi_domain_tendency',
}

In [112]:
pattern = r"선호 (?P<entity>\w+)\s?:\s?(?P<item>.*?)\((?P<metadata>.*?)\)"


In [113]:
from collections import defaultdict
import re
def profile_normalize(profile:dict, delimiter='<|n|>'):
    """Group the Adot profile values of one record by template feature.

    Args:
        profile: Record mapping holding an ``"adot_profile_feature"`` string
            of ``key : value`` segments joined by ``delimiter``.
        delimiter: Separator between segments.

    Returns:
        Dict with one list per value of ``new_adot_profile_mappings``. Labels
        missing from either mapping are skipped.
    """
    adot_profile = profile["adot_profile_feature"]

    adot_profile_dict = dict()
    # ``segment`` instead of ``profile``: the loop used to overwrite the
    # function argument. The regex search on the undefined ``row`` was removed;
    # its result was never used.
    for segment in (adot_profile or "").split(delimiter):
        if ':' not in segment:
            continue
        # maxsplit=1: category values such as "음악(장르 : ...)" contain ':'.
        key, val = (part.strip() for part in segment.split(':', 1))
        if key not in adot_profile_mappings:
            # Unknown label. The bare ``except: pass`` used here before left
            # ``null_values`` stale (or unbound on the first segment).
            continue
        null_values = adot_profile_mappings[key]
        if val in adot_select_default_value(field_name=null_values):
            continue
        elif val == "있음":
            adot_profile_dict[key] = key.split("이력")[0].strip()
        else:
            adot_profile_dict[key] = val

    adot_template_dict = defaultdict(list)
    for new_feature in new_adot_profile_mappings.values():
        adot_template_dict[new_feature] = []

    for key, val in adot_profile_dict.items():
        new_feature = new_adot_profile_mappings.get(key)
        if new_feature is not None:
            adot_template_dict[new_feature].append(val)

    return dict(adot_template_dict)

In [154]:
# Notebook-local overrides of two names that an earlier cell imports from the
# mapper module.

# Korean label in the Adot profile string -> English field name. Both
# spellings of the heavy-use label that occur in the data are listed.
adot_profile_mappings = {
    '선호 도메인': 'preferred_domain',
    '성별': 'gender',
    '나이': 'age',
    '활성 상태':'active_status',
    '선호 카테고리': 'preferred_category',
    '선호 아이템': 'preferred_item',
    '다중 도메인 성향': 'multi_domain_tendency',
    '인기 컨텐츠 선호도': 'popular_content_preference',
    '사용성 기준': 'usability_criteria',
    '헤비유즈 도메인': 'heavy_use_domains',
    '헤비유저인 도메인': 'heavy_user_domains'
}


# Korean label -> template feature the value is grouped under. The heavy-use
# labels have no entry here.
new_adot_profile_mappings = {
    '선호 도메인': 'preference',
    '성별': 'gender',
    '나이': 'age',
    '선호 아이템': 'preference',
    '선호 카테고리': 'preference',
    '인기 컨텐츠 선호도': 'preference_level',
    '사용성 기준': 'adot_status',
    '활성 상태':'adot_status',
    '다중 도메인 성향': 'multi_domain_tendency',
}


In [253]:
pattern = r'^(.*?)\((.*?)\)$'
a= '음악(장르 : 국내 알앤비, 국내 발라드, 국내 댄스/일렉 & 가수 : 비비, 아이유, 트와이스)'
match = re.search(pattern, a)

In [255]:
match.group(2)

'장르 : 국내 알앤비, 국내 발라드, 국내 댄스/일렉 & 가수 : 비비, 아이유, 트와이스'

In [262]:
from collections import defaultdict
import re
pattern = r"선호 (?P<entity>\w+)\s?:\s?(?P<item>.*?)\((?P<metadata>.*?)\)"
def profile_normalize(profile:dict, delimiter='<|n|>'):
    """Build the Adot preference template for one record (used with ``dataset.map``).

    Args:
        profile: Record mapping holding an ``"adot_profile_feature"`` string
            of ``key : value`` segments joined by ``delimiter``.
        delimiter: Separator between segments.

    Returns:
        Dict with one key per value of ``new_adot_profile_mappings`` (each an
        empty list) plus the strings ``'preference'`` (preferred domain,
        optionally followed by ``,<prefix>:<category detail>``) and
        ``'preference_item'`` (text inside the parentheses of the preferred item).

    NOTE(review): when this is passed to ``dataset.map`` the empty lists
    replace same-named input columns; the recorded sample output shows
    ``'age': []`` and ``'gender': []``. Confirm whether those features should
    be filled from the parsed profile instead.
    """
    def _tighten_colons(text):
        # Drop whitespace on both sides of every ':'.
        text = re.sub(r'\s+(?=:)', '', text)
        return re.sub(r':\s+', ':', text)

    adot_profile = profile["adot_profile_feature"]

    adot_profile_dict = dict()
    # ``segment`` instead of ``profile``: the loop used to overwrite the
    # function argument.
    for segment in (adot_profile or "").split(delimiter):
        if ':' not in segment:
            # Empty input or malformed segment: nothing to parse.
            continue
        key, val = (part.strip() for part in segment.split(':', 1))
        null_values = adot_profile_mappings[key]
        if val not in adot_select_default_value(field_name=null_values):
            adot_profile_dict[key] = val

    adot_template_dict = defaultdict(list)
    for new_feature in new_adot_profile_mappings.values():
        adot_template_dict[new_feature] = []

    domain = adot_profile_dict.get('선호 도메인', '')
    category = adot_profile_dict.get('선호 카테고리', '')
    item = adot_profile_dict.get('선호 아이템','')

    # "<prefix>(<detail>)": group 1 is the prefix, group 2 the detail.
    # Kept local so it cannot be confused with the module-level ``pattern``.
    pattern = r'^(.*?)\((.*?)\)$'

    # Collect parts and join them, so a missing domain no longer leaves a
    # leading comma in front of the category.
    preference_parts = [f"{domain}"] if domain else []
    if category:
        match = re.search(pattern, category)
        # A category without parentheses used to raise AttributeError here.
        if match and match.group(2):
            preference_parts.append(f"{match.group(1)}:{match.group(2)}")
    preference_template = ','.join(preference_parts)

    preference_item_template = ''
    if item:
        match = re.search(pattern, item)
        if match:
            preference_item_template = f"{match.group(2)}"

    adot_template_dict['preference'] = _tighten_colons(preference_template)
    adot_template_dict['preference_item'] = _tighten_colons(preference_item_template)
    return dict(adot_template_dict)

In [263]:
# Gather normalised records whose preference is set and is not just '전화';
# the loop stops once the counter has passed 30.
samples = []
i = 0
for data in dataset.map(profile_normalize, remove_columns=['user_vector', 'mno_profile_feature']):
    keep = bool(data['preference'] and data['preference'] != '전화')
    if keep:
        samples.append(data)
        i += 1
    if not i <= 30:
        break

In [264]:
samples

[{'svc_mgmt_num': '78edbc72bf50cc62ebef171b4150ae07d5e68b26b59596fc7da784a061389edc',
  'luna_id': 'APL00000DG4BGMB4ENSW',
  'age': [],
  'gender': [],
  'adot_profile_feature': '선호 도메인 : 포토<|n|>선호 카테고리 : 없음<|n|>선호 아이템 : 없음<|n|>성별 : 남성<|n|>나이 : 66<|n|>활성 상태 : 활성<|n|>다중 도메인 성향 : 높음<|n|>인기 컨텐츠 선호도 : 높음<|n|>사용성 기준 : 헤비유저<|n|>헤비유저인 도메인 : 맛집추천, TV, 포토',
  'is_adot': True,
  'create_at': '2024-05-11',
  'preference': '포토',
  'preference_level': [],
  'adot_status': [],
  'multi_domain_tendency': [],
  'preference_item': ''},
 {'svc_mgmt_num': '783211e3944cec4b713411c9d5f5b9680c0d34795e90c5450ba2228c6086a662',
  'luna_id': 'APL00000BP2VY811XON4',
  'age': [],
  'gender': [],
  'adot_profile_feature': '선호 도메인 : 음악<|n|>선호 카테고리 : 음악(장르 : 해외 팝, 해외 락,  & 가수 : 스콜피온스, 산타나, 신디 로퍼) <|n|>선호 아이템 : 음악(Wind Of Change, Love of My Life (Live), Smooth) <|n|>성별 : 여성<|n|>나이 : 52<|n|>활성 상태 : 활성<|n|>다중 도메인 성향 : 보통<|n|>인기 컨텐츠 선호도 : 보통<|n|>사용성 기준 : 일반유저<|n|>헤비유저인 도메인 : TV, 음악, 루틴',
  'is_adot': True,
  'create_at'

In [68]:
# samples = []
# for data in dataset.map(profile_normalize, batched=True, batch_size=100 , remove_columns=['user_vector', 'mno_profile_feature']):
#     samples.append(data)
#     break

In [20]:
from collections import defaultdict
def profile_normalize(profile:str, delimiter='<|n|>'):
        """Split the record's Adot profile string into segments.

        NOTE(review): this definition is unfinished. It builds local variables
        only and implicitly returns None, and it rebinds the
        ``profile_normalize`` name used by other cells. ``profile`` is indexed
        with a string key, so it is presumably a mapping rather than a ``str``
        -- confirm and complete or delete this cell.
        """
        adot_profile = profile["adot_profile_feature"]
        adot_profiles = adot_profile.split(delimiter)
        # Placeholder for the parsed key/value pairs; never filled in here.
        adot_profile_dict = dict()

In [163]:
dict(profile_normalize(next(iter(dataset))))

{'preference': '쇼핑: 오픈마켓,해외직구\n미디어/엔터테인먼트: OTT',
 'gender': ['여자'],
 'age': ['42세'],
 'service_duration': ['11년'],
 'days_after_change': ['1355일'],
 'mno_status': ['베이직플러스', '고가', 'samsung', 'vip', '가족 결합'],
 'plan_price': ['59000원'],
 'avg_data_usage': ['13gb']}

In [40]:
adot_profile = data["adot_profile_feature"]
adot_profiles = adot_profile.split('<|n|>')

In [164]:
data_list[0]

'선호 도메인 : 없음<|n|>선호 카테고리 : 없음<|n|>선호 아이템 : 없음<|n|>성별 : 여성<|n|>나이 : 42<|n|>활성 상태 : 복귀<|n|>다중 도메인 성향 : 없음<|n|>인기 컨텐츠 선호도 : 없음<|n|>사용성 기준 : 라이트유저<|n|>헤비유저인 도메인 : T 서비스'

In [157]:
# Distinct raw values of the first three profile segments across all records.
preferred_category, preferred_item, preferred_domain = set(), set(), set()
for data in data_list:
    # Split once per record instead of once per segment.
    segments = data.split('<|n|>')
    preferred_domain.add(segments[0])
    preferred_category.add(segments[1])
    preferred_item.add(segments[2])
    

In [159]:
preferred_domain

{'선호 도메인 : 게임, 전화',
 '선호 도메인 : 날씨',
 '선호 도메인 : 없음',
 '선호 도메인 : 운세',
 '선호 도메인 : 음악',
 '선호 도메인 : 전화',
 '선호 도메인 : 포토',
 '선호 도메인 : 포토, 게임'}

In [158]:
preferred_category

{'선호 카테고리 : 게임(하이퍼캐쥬얼, 심리테스트) ',
 '선호 카테고리 : 없음',
 '선호 카테고리 : 음악(장르 : 국내 댄스/일렉, OST/BGM, 국내 팝/어쿠스틱 & 가수 : 르세라핌, 악뮤, (여자)아이들) ',
 '선호 카테고리 : 음악(장르 : 국내 알앤비, 국내 발라드, 국내 댄스/일렉 & 가수 : 비비, 아이유, 트와이스) ',
 '선호 카테고리 : 음악(장르 : 해외 팝, 해외 락,  & 가수 : 스콜피온스, 산타나, 신디 로퍼) '}

In [100]:
preferred_item

{'선호 아이템 : 게임(미니펫, 스택폴, 진짜 모습 테스트) ',
 '선호 아이템 : 없음',
 "선호 아이템 : 음악(Perfect Night, Love Lee, Turn Up The Sunshine (PNAU Remix / From 'Minions: The Rise of Gru' Soundtrack)) ",
 '선호 아이템 : 음악(Wind Of Change, Love of My Life (Live), Smooth) ',
 '선호 아이템 : 음악(밤양갱, Discord, Love wins all) '}

In [30]:
mno_profile_mappings

{'관심사': 'interests',
 '성별': 'gender',
 '나이': 'age',
 '서비스 사용 기간': 'service_duration',
 '기변 후 경과일': 'days_after_change',
 '요금제 이름': 'plan_name',
 '요금제 가격': 'plan_price',
 '3개월 평균 데이터 사용량': 'avg_data_usage',
 '단말기 가격': 'device_price',
 '단말기 제조사': 'device_manufacturer',
 '멤버십 등급': 'membership_level',
 '멤버십 사용 이력': 'membership_history',
 '가족 결합 이력': 'family_bundle_history',
 '로밍 사용 이력': 'roaming_history',
 '세컨디바이스 보유 여부': 'second_device',
 '소액 및 DCB 결제 이력': 'micropayment_history'}