In [None]:
import os
import collections
from tqdm import tqdm
import pandas as pd

In [None]:
!git clone https://github.com/RUCAIBox/RecDatasets
    

!cd RecDatasets/conversion_tools

os.chdir('/kaggle/working/RecDatasets/conversion_tools')

!pip install -r requirements.txt

In [None]:
# Run the conversion script for the yelp dataset. Later cells read
# yelp.item and yelp.inter from the --output_path given here.
!python run.py --dataset yelp \
--input_path /kaggle/input/yelp-dataset --output_path /kaggle/working/yelp \
--convert_inter --convert_item

In [None]:
# The file is read without a header row, so columns are addressed by position
# and the first row (presumably the header line of the file) is dropped below.
df_item=pd.read_csv('/kaggle/working/yelp/yelp.item', sep='\t', header=None)
df_item=df_item[[0, 1,2,3,4,11]]
df_item.columns = ['business_id', 'item_name','address','city','state','categories']
# .copy() gives an independent frame so adding 'item_text' below does not write
# into a slice of df_item.
test_data=df_item[1:].copy()
tqdm.pandas()

# Name-keyed lookups, kept so existing references to these names keep working.
# Businesses that share a name collapse to a single entry here, which is why
# they are NOT used to build item_text any more.
categories_dict = test_data.set_index('item_name')['categories'].to_dict()
state = test_data.set_index('item_name')['state'].to_dict()
city = test_data.set_index('item_name')['city'].to_dict()
address = test_data.set_index('item_name')['address'].to_dict()

# Build the text from each row's own fields so that two businesses with the
# same name keep their own categories and address. Missing fields become ''
# instead of the literal text "nan".
text_fields = test_data[['item_name', 'categories', 'address', 'city', 'state']].fillna('')
test_data['item_text'] = [
    f"Title:{name} Description:{cats} Address:{addr},{cty},{st}"
    for name, cats, addr, cty, st in tqdm(
        zip(
            text_fields['item_name'],
            text_fields['categories'],
            text_fields['address'],
            text_fields['city'],
            text_fields['state'],
        ),
        total=len(text_fields),
    )
]

# business_id -> item text; on duplicate ids the last row wins, as before.
item2title = dict(zip(test_data['business_id'], test_data['item_text']))


In [None]:
# Build the interaction table for the yelp dataset.
# The file is read without a header row, so the four needed columns are picked
# by position; the first row (presumably the header line) is discarded after.
df = pd.read_csv('/kaggle/working/yelp/yelp.inter', header=None, sep='\t')
df2 = df.loc[:, [1, 2, 3, 7]]
df2.columns = ['user_id', 'business_id', 'stars', 'date']
df2 = df2.iloc[1:]

def make_inters_in_order(inters):
    """Group interactions by user, sort them by timestamp and drop repeated items.

    Args:
        inters: iterable of ``(user, item, rating, timestamp)`` tuples.

    Returns:
        A ``collections.defaultdict(list)`` mapping each user to their
        interaction tuples in ascending timestamp order, keeping only the
        earliest interaction with every item.
    """
    grouped = collections.defaultdict(list)
    for user, item, rating, timestamp in inters:
        grouped[user].append((user, item, rating, timestamp))

    deduped = collections.defaultdict(list)
    for user, events in grouped.items():
        # In-place stable sort on the timestamp field.
        events.sort(key=lambda event: event[3])
        seen_items = set()
        for event in events:
            item = event[1]
            if item not in seen_items:
                seen_items.add(item)
                deduped[user].append(event)
    return deduped


df=df2

# Diagnostic report only: the column assignment below still requires exactly
# four columns and raises otherwise.
if len(df.columns) != 4:
    print(f"Warning: Expected 4 columns, but found {len(df.columns)}.")
    print("Columns found:", df.columns)
    print("First few rows of the data:")
    print(df.head())

df.columns = ['user_id', 'item_id','rating','timestamp']
print("Data loaded successfully. Shape:", df.shape)
print("Data types:", df.dtypes)
print("First few rows:")
print(df.head())

# Collect (user_id, item_id, rating, timestamp) tuples. Iterating the four
# columns in lockstep avoids building a Series per row as iterrows() does.
inters = []
column_values = zip(df['user_id'], df['item_id'], df['rating'], df['timestamp'])
for row in tqdm(column_values, total=len(df), desc="Processing rows"):
    try:
        user_id, item_id, rating, raw_timestamp = row
        timestamp = int(raw_timestamp)
        inters.append((user_id, item_id, rating, timestamp))
    except Exception as e:
        # Best effort: report the offending row and keep going.
        print(f"Error processing row: {row}")
        print(f"Error message: {str(e)}")
        continue

# Stop here with a clear message; continuing with no interactions would only
# fail later with an unrelated IndexError when the split timestamps are picked.
if not inters:
    raise ValueError("No valid interactions found. Check your input file format.")


ordered_inters = make_inters_in_order(inters=inters)

# Global chronological split thresholds (adjust the fractions for your data):
# interactions at or after the 90th-percentile timestamp go to test, those at
# or after the 80th-percentile timestamp go to valid, the rest to train.
all_timestamps = sorted(
    inter[3] for user_inters in ordered_inters.values() for inter in user_inters
)
total_interactions = len(all_timestamps)
valid_timestamp = all_timestamps[int(total_interactions * 0.8)]
test_timestamp = all_timestamps[int(total_interactions * 0.9)]

# Ensure output directory exists
output_path='/kaggle/working/data'
os.makedirs(output_path, exist_ok=True)

# For sequential recommendation: every interaction becomes one CSV row whose
# 'history' column lists the user's earlier items (space separated). The row is
# routed to train/valid/test by comparing its timestamp to the split thresholds.
csv_header = 'user_id,parent_asin,rating,timestamp,history\n'

# The context manager closes all three files even if writing fails part-way.
with open(os.path.join(output_path, 'train.csv'), 'w') as train_file, \
     open(os.path.join(output_path, 'valid.csv'), 'w') as valid_file, \
     open(os.path.join(output_path, 'test.csv'), 'w') as test_file:

    for split_file in (train_file, valid_file, test_file):
        split_file.write(csv_header)

    for user in tqdm(ordered_inters, desc='Write seq files'):
        cur_inter = ordered_inters[user]
        # Items seen so far for this user, grown one step at a time instead of
        # re-slicing the interaction list for every row.
        history_items = []
        for user_id, item_id, rating, ts in cur_inter:
            cur_his = ' '.join(history_items)
            if ts >= test_timestamp:
                out_file = test_file
            elif ts >= valid_timestamp:
                out_file = valid_file
            else:
                out_file = train_file
            out_file.write(f'{user_id},{item_id},{rating},{ts},{cur_his}\n')
            history_items.append(item_id)


# Re-read every split, drop rows that have a missing value in any column and
# write the cleaned copy one directory up. (A user's first interaction is
# written with an empty history, so it is presumably removed by this step.)
splits = ['valid', 'test', 'train']
for split in tqdm(splits):
    source_csv = f'/kaggle/working/data/{split}.csv'
    target_csv = f'/kaggle/working/{split}.csv'
    tf = pd.read_csv(source_csv).dropna()
    tf.to_csv(target_csv, index=False)
    

#Final dataset creation for our task 


In [None]:
from datasets import load_dataset, DatasetDict
import os
import re
import html
import json
import argparse
import torch
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer
import warnings 
from transformers import AutoModel, AutoTokenizer
import numpy as np

In [None]:


# Load the cleaned split CSVs. Each file is loaded on its own and read back
# from the 'train' key of the object returned by load_dataset.
train_dataset = load_dataset('csv', data_files='/kaggle/working/train.csv')['train']
valid_dataset = load_dataset('csv', data_files='/kaggle/working/valid.csv')['train']
test_dataset = load_dataset('csv', data_files='/kaggle/working/test.csv')['train']

datasets = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset,
})

# Target subset sizes: valid and test are 2/7 and 1/7 of the train budget.
train_samples = 300000
valid_samples = int((2 / 7) * train_samples)
test_samples = int((1 / 7) * train_samples)

# Never ask for more rows than a split actually has.
train_count = min(train_samples, len(datasets['train']))
valid_count = min(valid_samples, len(datasets['valid']))
test_count = min(test_samples, len(datasets['test']))

# Shuffle with a fixed seed, then keep the first N rows of each split.
train_subset = datasets['train'].shuffle(seed=42).select(range(train_count))
valid_subset = datasets['valid'].shuffle(seed=42).select(range(valid_count))
test_subset = datasets['test'].shuffle(seed=42).select(range(test_count))

datasets['train'] = train_subset
datasets['valid'] = valid_subset
datasets['test'] = test_subset


In [None]:
def check_path(path):
    """Create directory ``path`` (and any missing parents) if it is not there yet.

    Uses ``exist_ok=True`` so there is no window between checking for the
    directory and creating it in which another process could create it first.
    """
    os.makedirs(path, exist_ok=True)
        
def filter_items_wo_metadata(example, item2meta):
    """Remove history items that have no metadata entry.

    Args:
        example: mapping with a ``'parent_asin'`` target id and a
            space-separated ``'history'`` string; modified in place.
        item2meta: container of item ids that have metadata.

    Returns:
        The same ``example``. Its ``'history'`` keeps only ids found in
        ``item2meta``; it becomes ``''`` when the target item itself has no
        metadata.
    """
    if example['parent_asin'] in item2meta:
        kept = [item_id for item_id in example['history'].split(' ') if item_id in item2meta]
    else:
        # Target without metadata: the whole history is discarded.
        kept = []
    example['history'] = ' '.join(kept)
    return example


def truncate_text_history(example, max_his_len,item2meta,tokenizer,max_length=255):
    """Truncate the id history and build its marker-delimited text form.

    Args:
        example: mapping with a space-separated ``'history'`` string; modified
            in place.
        max_his_len: keep at most this many of the most recent history items.
        item2meta: mapping item id -> text inserted after the ``Title:`` prefix.
        tokenizer: object exposing ``tokenize(text) -> list``.
        max_length: token budget for the resulting ``'history_text'``.

    Returns:
        The same ``example`` with ``'history'`` cut to the last ``max_his_len``
        ids and ``'history_text'`` built from those same ids. If the text is
        over the token budget, the oldest items are dropped until it fits.
    """
    # Truncate first so that 'history' and 'history_text' describe the same items.
    history_items = example['history'].split(' ')[-max_his_len:]
    example['history'] = ' '.join(history_items)

    start_part = "<extra_id_0>" #Represent <start>
    items=[f"<extra_id_1>Title:{item2meta[item]}<extra_id_2>" for item in history_items] #id 1 represent item start and id 2 represent item end
    end_part ="<extra_id_3>" #Represent <end>

    # Combine parts
    history_text = start_part + ''.join(items) + end_part
    tokens = tokenizer.tokenize(history_text)

    if len(tokens) <= max_length:
        example['history_text'] = history_text
        return example

    # Add items in reverse order while checking the length constraint.
    # The count starts at 2, presumably for the start and end markers.
    final_items = []
    current_length = 2

    for item in reversed(items):
        new_length = current_length + len(tokenizer.tokenize(item))
        if new_length <= max_length:
            final_items.append(item)
            current_length = new_length
        else:
            break

    # Ensure the items are in the original order in the final result
    final_items.reverse()
    example['history_text'] = start_part + ''.join(final_items) + end_part
    return example


def item_text(example,item2title,domain):
    """Attach the target item's text to ``example``.

    Args:
        example: mapping with a ``'parent_asin'`` target id; modified in place.
        item2title: mapping item id -> item text.
        domain: accepted but not used by this function.

    Returns:
        The same ``example`` with ``'target_item_text'`` set to the looked-up
        text, or ``None`` when the target id is not in ``item2title``.
    """
    target_id = example['parent_asin']
    example['target_item_text'] = item2title[target_id] if target_id in item2title else None
    return example


def item_desc_text(example,item2description):
    """Attach the target item's description to ``example``.

    Args:
        example: mapping with a ``'parent_asin'`` target id; modified in place.
        item2description: mapping item id -> description.

    Returns:
        The same ``example`` with ``'target_description'`` set to the looked-up
        description, or ``None`` when the target id has no entry.
    """
    asin = example['parent_asin']
    found = asin in item2description
    example['target_description'] = item2description[asin] if found else None
    return example

def _register_token(token, token2id, id2token):
    """Give ``token`` the next free integer id unless it already has one."""
    if token not in token2id:
        token2id[token] = len(id2token)
        id2token.append(token)


def remap_id(datasets):
    """Assign contiguous integer ids to all users and items in the splits.

    Args:
        datasets: mapping with ``'train'``, ``'valid'`` and ``'test'`` entries,
            each exposing ``'user_id'``, ``'parent_asin'`` and ``'history'``
            columns (history is a space-separated string of item ids).

    Returns:
        Dict with ``user2id``/``id2user`` and ``item2id``/``id2item``. Id 0 is
        reserved for ``'[PAD]'``; other ids follow first-seen order.
    """
    user2id = {'[PAD]': 0}
    id2user = ['[PAD]']
    item2id = {'[PAD]': 0}
    id2item = ['[PAD]']

    for split in ['train', 'valid', 'test']:
        dataset = datasets[split]
        for user_id, item_id, history in zip(dataset['user_id'], dataset['parent_asin'], dataset['history']):
            _register_token(user_id, user2id, id2user)
            _register_token(item_id, item2id, id2item)
            # An empty or missing history must not register '' as an item.
            for item in (history or '').split(' '):
                if item:
                    _register_token(item, item2id, id2item)

    data_maps = {'user2id': user2id, 'id2user': id2user, 'item2id': item2id, 'id2item': id2item}
    return data_maps

# Text cleaning and preprocessing helpers

def list_to_str(l):
    """Join a list of strings with ``', '``; return any non-list value unchanged."""
    return ', '.join(l) if isinstance(l, list) else l


def clean_text(raw_text):
    """Normalise a string (or list of strings) into plain ASCII text.

    Steps, in order: join lists via ``list_to_str``, unescape HTML entities,
    strip surrounding whitespace, remove ``<...>`` tags, turn newlines/tabs
    into spaces, collapse runs of spaces, and finally replace every non-ASCII
    character with a space.
    """
    text = html.unescape(list_to_str(raw_text)).strip()
    substitutions = (
        (r'<[^>]+>', ''),        # drop tags
        (r'[\n\t]', ' '),        # newlines / tabs -> space
        (r' +', ' '),            # collapse repeated spaces
        (r'[^\x00-\x7F]', ' '),  # non-ASCII -> space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def feature_process(feature):
    """Turn one metadata feature into a sentence fragment ending with a space.

    * float            -> ``str(feature)`` followed by ``'.'``
    * non-empty list   -> cleaned elements joined by ``', '`` followed by ``'.'``
    * anything else    -> ``clean_text(feature)``
    """
    if isinstance(feature, float):
        sentence = str(feature) + '.'
    elif isinstance(feature, list) and len(feature) > 0:
        sentence = ', '.join(clean_text(value) for value in feature) + '.'
    else:
        sentence = clean_text(feature)
    return sentence + ' '


def clean_title(example):
    """Return ``{'parent_asin', 'title'}`` with the title run through ``feature_process``.

    The title is ``None`` when the example has no ``'title'`` key or its value
    is falsy.
    """
    has_title = 'title' in example and bool(example['title'])
    return {
        'parent_asin': example['parent_asin'],
        'title': feature_process(example['title']) if has_title else None,
    }

def clean_description(example):
    """Return ``{'parent_asin', 'description'}`` with the description cleaned.

    The description is passed through ``feature_process``; it is ``None`` when
    the example has no ``'description'`` key or its value is falsy.
    """
    present = 'description' in example and bool(example['description'])
    return {
        'parent_asin': example['parent_asin'],
        'description': feature_process(example['description']) if present else None,
    }



def clean_review(example):
    """Build ``example['cleaned_review']`` from the review title and text.

    Each of ``'title'`` and ``'text'`` that is not ``None`` contributes
    ``'Review <field>:'`` + ``feature_process(value)`` + ``' '``. The result is
    ``None`` when both fields are ``None``. ``example`` is modified in place
    and returned.
    """
    pieces = [
        f'Review {feature}:' + feature_process(example[feature]) + ' '
        for feature in ('title', 'text')
        if example[feature] is not None
    ]
    example['cleaned_review'] = ''.join(pieces) or None
    return example

def _truncate_last_title(sentence, title_start, title_end, max_length):
    """Shorten the title at ``sentence[title_start:title_end]`` until the sentence fits.

    Tokens are removed from the end of the title one at a time; the markers
    around the title are left untouched.
    """
    prefix, suffix = sentence[:title_start], sentence[title_end:]
    title_token_ids = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize(sentence[title_start:title_end])
    )
    while title_token_ids and len(tokenizer.tokenize(sentence)) > max_length:
        title_token_ids.pop()
        sentence = prefix + tokenizer.decode(title_token_ids, skip_special_tokens=True) + suffix
    return sentence


def process_sentence(sentence, max_length=255):
    """Shrink a marker-delimited history sentence to at most ``max_length`` tokens.

    The sentence is expected to contain items wrapped as
    ``<extra_id_1>...<extra_id_2>``. Items are removed from the front until the
    sentence fits. When only one item is left and it is still too long, its
    text is truncated token by token instead of being removed, so the result
    never loses its last item. Uses the module-level ``tokenizer``.

    Args:
        sentence: text to shrink.
        max_length: token budget.

    Returns:
        The (possibly shortened) sentence.

    Raises:
        AssertionError: the sentence is too long but contains no complete
            ``<extra_id_1>...<extra_id_2>`` item to drop or truncate.
    """
    item_open, item_close = "<extra_id_1>", "<extra_id_2>"

    tokens = tokenizer.tokenize(sentence)
    while len(tokens) > max_length:
        start_index = sentence.find(item_open)
        # Look for the closing marker only when an opening marker exists, and
        # test for -1 before adding the marker length.
        close_index = sentence.find(item_close, start_index) if start_index != -1 else -1
        if start_index == -1 or close_index == -1:
            raise AssertionError('Something wrong')
        end_index = close_index + len(item_close)

        # No further item after this one: truncate its text rather than drop it.
        if sentence.find(item_open, end_index) == -1:
            return _truncate_last_title(
                sentence, start_index + len(item_open), close_index, max_length
            )

        # Remove the first (oldest) item.
        sentence = sentence[:start_index] + sentence[end_index:]
        tokens = tokenizer.tokenize(sentence)

    return sentence


# Pipeline settings.
max_his_len = 50
n_workers = 64
domain = 'yelp'

# NOTE(review): the second path component is the literal string 'domain', not
# the `domain` variable, so outputs land in /kaggle/working/domain - confirm
# this is intended.
output_dir = os.path.join('/kaggle/working/', 'domain')
check_path(output_dir)

# Prefer the GPU when one is visible to torch.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

# Pre-trained Sentence Transformer; its tokenizer is reused for token counting.
model = SentenceTransformer('sentence-transformers/sentence-t5-base', device=device)
tokenizer = model.tokenizer

# split name -> processed dataset, filled by the processing loop.
truncated_datasets = dict()


print("Processing interaction data...")
for split in ['train','valid','test']:

    print(f"Processing {split} split...")
    # Keep only history items that have metadata; rows whose target item has
    # no metadata end up with an empty history and are filtered out next.
    filtered_dataset = datasets[split].map(
        lambda t: filter_items_wo_metadata(t, item2title),
        num_proc=n_workers
    )
    filtered_dataset = filtered_dataset.filter(lambda t: len(t['history']) > 0)

    # Cut the history to max_his_len items and add its text form.
    truncated_dataset = filtered_dataset.map(
        lambda t: truncate_text_history(t, max_his_len, item2title,tokenizer),
        num_proc=n_workers
    )
    # Add the target item's text.
    truncated_dataset = truncated_dataset.map(
        lambda t: item_text(t,item2title,domain),
        num_proc=n_workers
    )

    truncated_datasets[split] = truncated_dataset

    # to_pandas() converts the dataset in one step instead of materialising it
    # row by row through the DataFrame constructor.
    split_df = truncated_dataset.to_pandas()
    split_df.to_csv(os.path.join(output_dir, f'{domain}.{split}.csv'), index=False)

    #For unisrec: tab-separated .inter file with typed column names
    unisrec_df = split_df[['user_id','history','parent_asin','timestamp']].rename(
        columns={
            'user_id':'user_id:token',
            'history':'item_id_list:token_seq',
            'parent_asin':'item_id:token',
            'timestamp':'timestamp:float',
        }
    )
    unisrec_df.to_csv(os.path.join(output_dir, f'{domain}.{split}.inter'), sep='\t', index=False)



print("Remapping IDs...")
data_maps = remap_id(truncated_datasets)

# Integer item id -> item text, for every item that received an id. Id 0 is
# the padding entry.
id2meta = {
    0: '[PAD]',
    **{
        data_maps['item2id'][item]: title
        for item, title in item2title.items()
        if item in data_maps['item2id']
    },
}
data_maps['id2meta'] = id2meta

output_path = os.path.join(output_dir, f'{domain}.data_maps')
with open(output_path, 'w') as f:
    json.dump(data_maps, f)



print("Item Title")
# 1-base, sorted_text[0] -> item_id=1 (id 0 is the '[PAD]' entry).
# Built with a comprehension so that no module-level variable named
# `item_text` is created, which would overwrite the item_text() function.
sorted_text = [
    f"{data_maps['id2meta'][i]}"
    for i in range(1, len(data_maps['item2id']))
]

# One item text per line, in item-id order.
with open(os.path.join(output_dir, 'item_profile.txt'), 'w') as f:
    f.writelines(f"{line}\n" for line in sorted_text)


# Generate item features for Unisrec with a RoBERTa-based encoder.
print("Generating item features for Unisrec...")
plm = 'hyp1231/blair-roberta-base'
# A separate name keeps the module-level `tokenizer` (used by
# process_sentence) pointing at the sentence-T5 tokenizer.
plm_tokenizer = AutoTokenizer.from_pretrained(plm)
model_plm = AutoModel.from_pretrained(plm).to(device)

batch_size=32
all_embeddings = []
for pr in tqdm(range(0, len(sorted_text), batch_size)):
    batch = sorted_text[pr:pr + batch_size]
    inputs = plm_tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model_plm(**inputs)
    # Hidden state of the first token of every sequence is used as the item embedding.
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    all_embeddings.append(embeddings)

# Stack the batches and dump the raw array bytes (no shape/dtype header).
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_embeddings.tofile(os.path.join(output_dir, f'{domain}.{plm.split("/")[-1]}.feature'))

print("Item features generated for Unisrec.")
