In [1]:
#Change the current path of the execution
import sys
import os
cwd = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(cwd)
os.chdir(cwd)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import requests
from tqdm import tqdm
import random
import json
import csv

### Downloading Book-Crossing dataset

In [4]:
data_dir = 'datasets/book_crossing'
os.makedirs(data_dir, exist_ok=True)

# Function to convert GitHub URL to raw URL
def get_raw_url(github_url):
    return github_url.replace('github.com', 'raw.githubusercontent.com').replace('/blob/', '/')

# GitHub URLs for the dataset files
github_urls = [
    'https://github.com/rochitasundar/Collaborative-Filtering-Book-Recommendation-System/blob/master/BX-Book-Ratings.csv',
    'https://github.com/rochitasundar/Collaborative-Filtering-Book-Recommendation-System/blob/master/BX-Books.csv',
    'https://github.com/rochitasundar/Collaborative-Filtering-Book-Recommendation-System/blob/master/BX-Users.csv'
]

# Download all files
for url in github_urls:
    raw_url = get_raw_url(url)
    filename = url.split('/')[-1]
    local_path = os.path.join(data_dir, filename)
    
    print(f"Downloading {filename} from {raw_url}")
    response = requests.get(raw_url)
    
    if response.status_code == 200:
        with open(local_path, 'wb') as f:
            f.write(response.content)
        print(f"Successfully saved to {local_path}")
    else:
        print(f"Failed to download {filename}. Status code: {response.status_code}")

Downloading BX-Book-Ratings.csv from https://raw.githubusercontent.com/rochitasundar/Collaborative-Filtering-Book-Recommendation-System/master/BX-Book-Ratings.csv
Successfully saved to datasets/book_crossing/BX-Book-Ratings.csv
Downloading BX-Books.csv from https://raw.githubusercontent.com/rochitasundar/Collaborative-Filtering-Book-Recommendation-System/master/BX-Books.csv
Successfully saved to datasets/book_crossing/BX-Books.csv
Downloading BX-Users.csv from https://raw.githubusercontent.com/rochitasundar/Collaborative-Filtering-Book-Recommendation-System/master/BX-Users.csv
Successfully saved to datasets/book_crossing/BX-Users.csv


### Reading the data

In [5]:
# Now read the files using your original format
print("\nReading downloaded files...")

# Read the files using the format you specified
try:
    rating = pd.read_csv(os.path.join(data_dir, "BX-Book-Ratings.csv"), sep=';', encoding="latin-1")
    users = pd.read_csv(os.path.join(data_dir, "BX-Users.csv"), sep=';', encoding="latin-1")
    
    # Note: error_bad_lines is deprecated in newer pandas versions
    # Use on_bad_lines='skip' instead for newer pandas versions
    try:
        books = pd.read_csv(os.path.join(data_dir, "BX-Books.csv"), sep=';', encoding="latin-1", error_bad_lines=False)
    except TypeError:
        books = pd.read_csv(os.path.join(data_dir, "BX-Books.csv"), sep=';', encoding="latin-1", on_bad_lines='skip')
    
    # Merge rating with books on ISBN
    rating = pd.merge(rating, books, on='ISBN', how='inner')
    
    # Save books dataframe to a new CSV file
    books.to_csv(os.path.join(data_dir, 'book_item_mapping.csv'), index=True)
    
    # Print information about the dataframes
    print(f"Rating shape: {rating.shape}")
    print(f"Users shape: {users.shape}")
    print(f"Books shape: {books.shape}")
    print(f"Created book_item_mapping.csv")
    
except Exception as e:
    print(f"Error processing data: {e}")


Reading downloaded files...




  books = pd.read_csv(os.path.join(data_dir, "BX-Books.csv"), sep=';', encoding="latin-1", error_bad_lines=False)
b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping

Rating shape: (1031136, 10)
Users shape: (278858, 3)
Books shape: (271360, 8)
Created book_item_mapping.csv


In [6]:
rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [7]:
# Initialize variables
user_dict = {}
item_id = {}
mx = 0  # Initialize mx before using it

# Create mapping from ISBN to index
for index, row in tqdm(books.iterrows()):
    item_id[row['ISBN']] = index

# Process ratings
for index, row in tqdm(rating.iterrows()):
    userid = row['User-ID']
    
    # Check if ISBN exists in item_id to avoid KeyError
    if row['ISBN'] not in item_id:
        continue
        
    # More Pythonic way to check if key exists
    if userid not in user_dict:
        user_dict[userid] = {
            'ISBN': [],
            'Book-Rating': [],
            'Book-Title': [],
            'Book-Author': [],
            'Year-Of-Publication': [],
        }
    
    # Add data to user dictionary
    user_dict[userid]['ISBN'].append(item_id[row['ISBN']])
    user_dict[userid]['Book-Rating'].append(float(row['Book-Rating']))
    user_dict[userid]['Book-Title'].append(row['Book-Title'])
    user_dict[userid]['Book-Author'].append(row['Book-Author'])
    user_dict[userid]['Year-Of-Publication'].append(row['Year-Of-Publication'])

# Filter users with more than 3 ratings
new_user_dict = {}
for key in user_dict.keys():
    # Update max length
    mx = max(mx, len(user_dict[key]['ISBN']))
    
    # Filter users with more than 3 ratings
    if len(user_dict[key]['ISBN']) > 3:  # Changed <= to > to match your intent
        new_user_dict[key] = user_dict[key]

print(f"Maximum number of books rated by a user: {mx}")
print(f"Number of users with more than 3 ratings: {len(new_user_dict)}")

271360it [00:08, 32982.30it/s]
1031136it [00:49, 20883.05it/s]


Maximum number of books rated by a user: 11144
Number of users with more than 3 ratings: 24268


In [8]:
# Split users into train/valid/test sets
user_list = list(new_user_dict.keys())
random.seed(42)  # Set seed once at the beginning
random.shuffle(user_list)

train_user = user_list[:int(len(user_list) * 0.8)]
valid_user = user_list[int(len(user_list) * 0.8):int(len(user_list) * 0.9)]
test_user = user_list[int(len(user_list) * 0.9):]

In [9]:
def generate_csv(user_list, output_csv, output_json, user_dict):
    nrows = []
    for user in user_list:
        item_id = user_dict[user]['ISBN'].copy()  # Create copies to avoid modifying original data
        rating = [int(_ > 5) for _ in user_dict[user]['Book-Rating']]
        
        # Use a different seed for each user to ensure diversity
        random_seed = hash(user) % 10000
        random.seed(random_seed)
        
        # Shuffle both lists with the same seed
        combined = list(zip(item_id, rating))
        random.shuffle(combined)
        item_id, rating = zip(*combined)  # Unzip
        
        # Convert back to lists
        item_id = list(item_id)
        rating = list(rating)
        
        nrows.append([user, item_id[:-1][:10], rating[:-1][:10], item_id[-1], rating[-1]])
    
    with open(output_csv, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['user', 'history_item_id', 'history_rating', 'item_id', 'rating'])
        writer.writerows(nrows)
    
    Prompt_json = []
    for user in user_list:
        # Create copies of all lists
        item_id = user_dict[user]['ISBN'].copy()
        rating = [int(_ > 5) for _ in user_dict[user]['Book-Rating']]
        book_title = user_dict[user]['Book-Title'].copy()
        book_author = user_dict[user]['Book-Author'].copy()
        
        # Use a consistent seed for this user
        random_seed = hash(user) % 10000
        random.seed(random_seed)
        
        # Shuffle all lists together to maintain correspondence
        combined = list(zip(item_id, rating, book_title, book_author))
        random.shuffle(combined)
        item_id, rating, book_title, book_author = zip(*combined)
        
        # Convert back to lists
        item_id = list(item_id)
        rating = list(rating)
        book_title = list(book_title)
        book_author = list(book_author)
        
        preference = []
        unpreference = []
        for i in range(min(len(item_id) - 1, 10)):
            if rating[i] == 1:
                preference.append(f'"{book_title[i]}" written by {book_author[i]}')
            else:
                unpreference.append(f'"{book_title[i]}" written by {book_author[i]}')
        
        preference_str = ", ".join(preference)
        unpreference_str = ", ".join(unpreference)
        
        target_preference_str = "Yes." if rating[-1] == 1 else "No."
        target_book_str = f'"{book_title[-1]}" written by {book_author[-1]}'
        
        Prompt_json.append({
            "instruction": "Given the user's preference and unpreference, identify whether the user will like the target book by answering \"Yes.\" or \"No.\".",
            "input": f"User Preference: {preference_str}\nUser Unpreference: {unpreference_str}\nWhether the user will like the target book {target_book_str}?",
            "output": target_preference_str,
        })
    
    with open(output_json, 'w') as f:
        json.dump(Prompt_json, f, indent=4)

In [10]:
generate_csv(train_user, os.path.join(data_dir, 'train.csv'), os.path.join(data_dir, 'train.json'), new_user_dict)
generate_csv(valid_user, os.path.join(data_dir, 'valid.csv'), os.path.join(data_dir, 'valid.json'), new_user_dict)
generate_csv(test_user, os.path.join(data_dir, 'test.csv'), os.path.join(data_dir, 'test.json'), new_user_dict)

### Reading a sample (Task Instruction + Task Input, Task Output)

In [11]:
with open(os.path.join(data_dir, "train.json"), 'r') as lst:
    b = json.load(lst)
    
for line in b:
    print(line["instruction"])
    print(line["input"])
    print(line["output"])
    print()
    break

Given the user's preference and unpreference, identify whether the user will like the target book by answering "Yes." or "No.".
User Preference: "The End of Enemies (Briggs Tanner Novels)" written by Grant Blackwood, "Q Is for Quarry" written by Sue Grafton
User Unpreference: "ICEFIRE" written by Judith Reeves-Stevens
Whether the user will like the target book "Specter of the Past: Star Wars (Star Wars (Bantam Books (Firm) : Unnumbered).)" written by Timothy Zahn?
Yes.



### Cleaning up the data within datasets once done with work

In [12]:
# from src.common import cleanup
# data_dir = "datasets"
# cleanup(data_dir)