In [None]:
import os
import pandas as pd


def save_json(mode, target_dataset, div_loss = 0.25, idorcodebook = 'codebook'):
    """Export one split of POI visit sequences as instruction-tuning JSON (with visit times).

    Reads ``datasets/{target_dataset}/codebooks_{div_loss}.csv`` (columns ``Pid`` and
    ``Codebook``) and ``datasets/{target_dataset}/data/{mode}.csv`` (columns ``Uid``,
    ``Pids``, ``Times``, ``Target_time`` and ``Target``), builds one prompt per row and
    writes the records to ``datasets/{target_dataset}/data/{mode}_{idorcodebook}.json``.

    Args:
        mode: Name of the split; selects the input CSV and prefixes the output file name.
        target_dataset: Dataset directory name under ``datasets/``.
        div_loss: Value interpolated into the codebook file name.
        idorcodebook: ``'codebook'`` renders each POI as ``<a_c0><b_c1>...`` built from its
            codebook entry; ``'id'`` renders it as ``<pid>``.

    Raises:
        ValueError: If ``idorcodebook`` is neither ``'codebook'`` nor ``'id'``.
        KeyError: In ``'codebook'`` mode, if a visited or target POI has no codebook entry.
    """
    # Fail fast: previously a bad mode was only detected once the first row was processed.
    if idorcodebook not in ('codebook', 'id'):
        raise ValueError("Invalid idorcodebook value. Use 'codebook' or 'id'.")

    codebook_df = pd.read_csv(f'datasets/{target_dataset}/codebooks_{div_loss}.csv')
    poi_sequence_df = pd.read_csv(f'datasets/{target_dataset}/data/{mode}.csv')

    # NOTE(security): eval() executes whatever expression is stored in the CSV cells.
    # Only run this on files you produced yourself; if the columns hold plain literals,
    # ast.literal_eval is the safe replacement.
    poi_to_codebook = dict(zip(codebook_df['Pid'], codebook_df['Codebook'].apply(eval)))

    if idorcodebook == 'codebook':
        def render_poi(poi):
            # One token per codebook level: level 0 -> 'a', level 1 -> 'b', ...
            return ''.join(
                f"<{chr(97 + level)}_{code}>"
                for level, code in enumerate(poi_to_codebook[poi])
            )
    else:
        def render_poi(poi):
            return f"<{poi}>"

    # Defined before the loop so that an empty split still produces a valid (empty) file
    # instead of failing on an unassigned local.
    instruction = "Here is a record of a user's POI accesses, your task is based on the history to predict the POI that the user is likely to access at the specified time."

    prompts = []
    targets = []

    for _, row in poi_sequence_df.iterrows():
        uid = row['Uid']
        poi_sequence = eval(row['Pids'])    # see security note above
        time_sequence = eval(row['Times'])  # see security note above
        target_time = row['Target_time']

        # Indexing (rather than zip) keeps the IndexError when Times is shorter than Pids.
        visits = [
            f"{render_poi(poi)} at {time_sequence[i]}"
            for i, poi in enumerate(poi_sequence)
        ]
        # Visits are separated by ", " and the history is closed with a full stop.
        history = (", ".join(visits) + ".") if visits else ""

        prompts.append(
            f"User_{uid} visited: " + history + f" When {target_time} user_{uid} is likely to visit:"
        )
        targets.append(render_poi(row['Target']))

    semitic_df = pd.DataFrame({
        'instruction': [instruction] * len(prompts),
        'input': prompts,
        'output': targets,
    })

    json_data = semitic_df.to_json(orient="records", indent=4)

    with open(f'datasets/{target_dataset}/data/{mode}_{idorcodebook}.json', "w") as file:
        file.write(json_data)


In [None]:
""" 
save_json('train', 'NYC', 0, 'codebook')
save_json('train', 'NYC', 0, 'id')
save_json('val', 'NYC', 0, 'codebook')
save_json('val', 'NYC', 0, 'id')
save_json('test', 'NYC', 0, 'codebook')
save_json('test', 'NYC', 0, 'id')

save_json('test_all', 'NYC', 0, 'codebook')
save_json('test_all', 'NYC', 0, 'id')
 """
# save_json('test', 'NYC', 0.25, 'id')
# save_json('history', 'NYC', 0.25, 'id')

# Export the codebook-token prompts for the NYC 'test' split, then the 'history' split.
save_json(mode='test', target_dataset='NYC', div_loss=0.25, idorcodebook='codebook')
save_json(mode='history', target_dataset='NYC', div_loss=0.25, idorcodebook='codebook')

In [1]:
import os
import pandas as pd
import ast


def safe_eval_string(s):
    """
    安全地解析字符串为Python对象
    """
    if isinstance(s, str):
        try:
            return ast.literal_eval(s)
        except (ValueError, SyntaxError):
            # 如果literal_eval失败，尝试使用eval但限制命名空间
            try:
                # 创建一个安全的命名空间，只包含基本类型
                safe_dict = {"__builtins__": {}, "list": list, "dict": dict, "tuple": tuple, "str": str, "int": int, "float": float}
                return eval(s, safe_dict)
            except:
                # 如果仍然失败，返回原始字符串
                print(f"Warning: Could not parse string: {s}")
                return s
    else:
        # 如果不是字符串，直接返回
        return s


def save_json_notime(mode, target_dataset, div_loss = 0.25, idorcodebook = 'codebook'):
    """Export one split of POI visit sequences as instruction-tuning JSON, without times.

    Reads ``datasets/{target_dataset}/codebooks_{div_loss}.csv`` (columns ``Pid`` and
    ``Codebook``) and ``datasets/{target_dataset}/data/{mode}.csv`` (columns ``Uid``,
    ``Pids`` and ``Target`` are used), builds one prompt per row and writes the records to
    ``datasets/{target_dataset}/data/{mode}_{idorcodebook}_notime.json``.

    In ``'codebook'`` mode a visited POI that has no codebook entry is dropped from the
    history with a printed warning, and a row whose target POI has no codebook entry is
    skipped entirely. ``'id'`` mode performs no such lookup.

    Args:
        mode: Name of the split; selects the input CSV and prefixes the output file name.
        target_dataset: Dataset directory name under ``datasets/``.
        div_loss: Value interpolated into the codebook file name.
        idorcodebook: ``'codebook'`` renders each POI as ``<a_c0><b_c1>...`` built from its
            codebook entry; ``'id'`` renders it as ``<pid>``.

    Raises:
        ValueError: If ``idorcodebook`` is neither ``'codebook'`` nor ``'id'``.
    """
    # Fail fast: previously a bad mode was only detected once the first row was processed.
    if idorcodebook not in ('codebook', 'id'):
        raise ValueError("Invalid idorcodebook value. Use 'codebook' or 'id'.")

    codebook_df = pd.read_csv(f'datasets/{target_dataset}/codebooks_{div_loss}.csv')
    poi_sequence_df = pd.read_csv(f'datasets/{target_dataset}/data/{mode}.csv')

    poi_to_codebook = dict(
        zip(codebook_df['Pid'], codebook_df['Codebook'].apply(safe_eval_string))
    )

    def codebook_token(poi):
        # One token per codebook level: level 0 -> 'a', level 1 -> 'b', ...
        return ''.join(
            f"<{chr(97 + level)}_{code}>"
            for level, code in enumerate(poi_to_codebook[poi])
        )

    # Defined before the loop so that an empty split, or one where every row is skipped,
    # still produces a valid (empty) file instead of failing on an unassigned local.
    instruction = "Here is a record of a user's POI accesses, your task is based on the history to predict the next POI."

    prompts = []
    targets = []

    for _, row in poi_sequence_df.iterrows():
        uid = row['Uid']
        poi_sequence = safe_eval_string(row['Pids'])
        target = row['Target']

        if idorcodebook == 'codebook':
            tokens = []
            for poi in poi_sequence:
                if poi in poi_to_codebook:
                    tokens.append(codebook_token(poi))
                else:
                    # Unknown POI: drop it from the history.
                    print(f"Warning: POI {poi} not found in codebook, skipping...")

            if target not in poi_to_codebook:
                # Without a target token there is nothing to predict: drop the whole row.
                print(f"Warning: Target POI {target} not found in codebook, skipping...")
                continue

            target_embedding = codebook_token(target)
        else:
            tokens = [f"<{poi}>" for poi in poi_sequence]
            target_embedding = f"<{target}>"

        # Joining the surviving tokens avoids the trailing space that appeared when the
        # last POI of a sequence had been skipped.
        prompts.append(f"User_{uid} visited: " + ' '.join(tokens))
        targets.append(target_embedding)

    semitic_df = pd.DataFrame({
        'instruction': [instruction] * len(prompts),
        'input': prompts,
        'output': targets,
    })

    json_data = semitic_df.to_json(orient="records", indent=4)

    with open(f'datasets/{target_dataset}/data/{mode}_{idorcodebook}_notime.json', "w") as file:
        file.write(json_data)

In [2]:
# Export the time-free codebook-token prompts for the NYC 'train' split, then 'test'.
save_json_notime(mode='train', target_dataset='NYC', div_loss=0.25, idorcodebook='codebook')
save_json_notime(mode='test', target_dataset='NYC', div_loss=0.25, idorcodebook='codebook')

