In [1]:
#Reference -- https://github.com/SAI990323/TALLRec/blob/main/preprocess_movie.py

In [2]:
#Change the current path of the execution
import sys
import os
cwd = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(cwd)
os.chdir(cwd)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import requests
import json
import csv
import numpy as np
import pandas as pd
import zipfile
import shutil

### Downloading the data

In [5]:
data_dir = 'datasets'
os.makedirs(data_dir, exist_ok=True)

# 1. Download the file
file_path = os.path.join(data_dir, 'ml-100k.zip')
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'

print(f"Downloading from {url}...")
response = requests.get(url)
with open(file_path, 'wb') as f:
    f.write(response.content)
print(f"Download complete: {file_path}")

# 2. Extract the zip file
print("Extracting data...")
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)
print("Extraction complete")

Downloading from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Download complete: datasets/ml-100k.zip
Extracting data...
Extraction complete


### Reading the data

In [6]:
f = open(os.path.join("datasets", "ml-100k", 'u.data'), 'r')
data = f.readlines()
f = open(os.path.join("datasets", "ml-100k", 'u.item'), 'r', encoding='ISO-8859-1')
movies = f.readlines()
f = open(os.path.join("datasets", "ml-100k", "u.user"), 'r')
users = f.readlines()

movie_names = [_.split('|')[1] for _ in movies] # movie_names[0] = 'Toy Story (1995)'
user_ids = [_.split('|')[0] for _ in users] # user_ids[0] = '1'
movie_ids = [_.split('|')[0] for _ in movies] # movie_ids[0] = '1'

In [7]:
data

['196\t242\t3\t881250949\n',
 '186\t302\t3\t891717742\n',
 '22\t377\t1\t878887116\n',
 '244\t51\t2\t880606923\n',
 '166\t346\t1\t886397596\n',
 '298\t474\t4\t884182806\n',
 '115\t265\t2\t881171488\n',
 '253\t465\t5\t891628467\n',
 '305\t451\t3\t886324817\n',
 '6\t86\t3\t883603013\n',
 '62\t257\t2\t879372434\n',
 '286\t1014\t5\t879781125\n',
 '200\t222\t5\t876042340\n',
 '210\t40\t3\t891035994\n',
 '224\t29\t3\t888104457\n',
 '303\t785\t3\t879485318\n',
 '122\t387\t5\t879270459\n',
 '194\t274\t2\t879539794\n',
 '291\t1042\t4\t874834944\n',
 '234\t1184\t2\t892079237\n',
 '119\t392\t4\t886176814\n',
 '167\t486\t4\t892738452\n',
 '299\t144\t4\t877881320\n',
 '291\t118\t2\t874833878\n',
 '308\t1\t4\t887736532\n',
 '95\t546\t2\t879196566\n',
 '38\t95\t5\t892430094\n',
 '102\t768\t2\t883748450\n',
 '63\t277\t4\t875747401\n',
 '160\t234\t5\t876861185\n',
 '50\t246\t3\t877052329\n',
 '301\t98\t4\t882075827\n',
 '225\t193\t4\t879539727\n',
 '290\t88\t4\t880731963\n',
 '97\t194\t3\t884238860\n',


In [8]:
#Converting data from csv to dictionary
#{'user_id': {'Movie_id': [id1, id2, .....]}, 'rating': [r1, r2, ...], 'timestamp': [t1, t2, ...]}
interaction_dicts = dict()  
for line in data:
    user_id, movie_id, rating, timestamp = line.split('\t')
    if user_id not in interaction_dicts:
        interaction_dicts[user_id] = {
            'movie_id': [],
            'rating': [],
            'timestamp': [],
        }
    interaction_dicts[user_id]['movie_id'].append(movie_id)
    interaction_dicts[user_id]['rating'].append(int(int(rating) > 3))
    interaction_dicts[user_id]['timestamp'].append(timestamp)

In [9]:
#Creating movie_id and movie_name in a csv file
with open('datasets/item_mapping.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['movie_id', 'movie_name'])
    for i, name in enumerate(movie_names):
        writer.writerow([i + 1, name])

In [10]:
#Creating list of lists where each sublist consists of [user_id, movie_ids, ratings, target_movie_id, timestamp]
sequential_interaction_list = []
seq_len = 10
for user_id in interaction_dicts:
    temp = zip(interaction_dicts[user_id]['movie_id'], interaction_dicts[user_id]['rating'], interaction_dicts[user_id]['timestamp'])
    temp = sorted(temp, key=lambda x: x[2]) #-> sorting the map by timestamp
    result = zip(*temp)
    interaction_dicts[user_id]['movie_id'], interaction_dicts[user_id]['rating'], interaction_dicts[user_id]['timestamp'] = [list(_) for _ in result]
    for i in range(10, len(interaction_dicts[user_id]['movie_id'])):
        sequential_interaction_list.append(
            [user_id, 
             interaction_dicts[user_id]['movie_id'][i-seq_len:i], 
             interaction_dicts[user_id]['rating'][i-seq_len:i], 
             interaction_dicts[user_id]['movie_id'][i], 
             interaction_dicts[user_id]['rating'][i], 
             interaction_dicts[user_id]['timestamp'][i].strip('\n')
            ]
        )

In [11]:
sequential_interaction_list

[['196',
  ['242', '286', '269', '306', '340', '1022', '251', '257', '1007', '1241'],
  [0, 1, 0, 1, 0, 1, 0, 0, 1, 0],
  '428',
  1,
  '881251702'],
 ['196',
  ['286', '269', '306', '340', '1022', '251', '257', '1007', '1241', '428'],
  [1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
  '381',
  1,
  '881251728'],
 ['196',
  ['269', '306', '340', '1022', '251', '257', '1007', '1241', '428', '381'],
  [0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
  '202',
  0,
  '881251728'],
 ['196',
  ['306', '340', '1022', '251', '257', '1007', '1241', '428', '381', '202'],
  [1, 0, 1, 0, 0, 1, 0, 1, 1, 0],
  '8',
  1,
  '881251753'],
 ['196',
  ['340', '1022', '251', '257', '1007', '1241', '428', '381', '202', '8'],
  [0, 1, 0, 0, 1, 0, 1, 1, 0, 1],
  '116',
  0,
  '881251753'],
 ['196',
  ['1022', '251', '257', '1007', '1241', '428', '381', '202', '8', '116'],
  [1, 0, 0, 1, 0, 1, 1, 0, 1, 0],
  '285',
  1,
  '881251753'],
 ['196',
  ['251', '257', '1007', '1241', '428', '381', '202', '8', '116', '285'],
  [0, 0, 1, 0, 1, 1, 0,

In [12]:
#Getting the most recent 10000 sequence interactions from the sequential interactions list
sequential_interaction_list = sequential_interaction_list[-10000:] # 10000 
sequential_interaction_list

[['843',
  ['419', '197', '275', '23', '208', '95', '191', '179', '209', '216'],
  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  '196',
  0,
  '879446806'],
 ['843',
  ['197', '275', '23', '208', '95', '191', '179', '209', '216', '196'],
  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  '193',
  0,
  '879446863'],
 ['843',
  ['275', '23', '208', '95', '191', '179', '209', '216', '196', '193'],
  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
  '170',
  0,
  '879446863'],
 ['843',
  ['23', '208', '95', '191', '179', '209', '216', '196', '193', '170'],
  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
  '205',
  1,
  '879446888'],
 ['843',
  ['208', '95', '191', '179', '209', '216', '196', '193', '170', '205'],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
  '671',
  0,
  '879446889'],
 ['843',
  ['95', '191', '179', '209', '216', '196', '193', '170', '205', '671'],
  [0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
  '175',
  1,
  '879446911'],
 ['843',
  ['191', '179', '209', '216', '196', '193', '170', '205', '671', '175'],
  [0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
  '504',
  

## Creating train, test and validation csv data

In [13]:
# save the csv file for baselines
with open('datasets/train.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['user_id', 'history_movie_id', 'history_rating', 'movie_id', 'rating', 'timestamp'])
    writer.writerows(sequential_interaction_list[:int(len(sequential_interaction_list)*0.8)])
with open('datasets/valid.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['user_id', 'history_movie_id', 'history_rating', 'movie_id', 'rating', 'timestamp'])
    writer.writerows(sequential_interaction_list[int(len(sequential_interaction_list)*0.8):int(len(sequential_interaction_list)*0.9)])
with open('datasets/test.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['user_id', 'history_movie_id', 'history_rating', 'movie_id', 'rating', 'timestamp'])
    writer.writerows(sequential_interaction_list[int(len(sequential_interaction_list)*0.9):])

### Converting csv to json

In [14]:
def csv_to_json(input_path, output_path):
    data = pd.read_csv(input_path)
    json_list = []
    for index, row in data.iterrows():
        row['history_movie_id'] = eval(row['history_movie_id'])
        row['history_rating'] = eval(row['history_rating'])
        L = len(row['history_movie_id'])
        preference = []
        unpreference = []
        for i in range(L):
            if int(row['history_rating'][i]) == 1:
                preference.append(movie_names[int(row['history_movie_id'][i]) - 1])
            else:
                unpreference.append(movie_names[int(row['history_movie_id'][i]) - 1])
        target_movie = movie_names[int(row['movie_id']) - 1]
        preference_str = ""
        unpreference_str = ""
        for i in range(len(preference)):
            if i == 0:
                preference_str += "\"" + preference[i] + "\""
            else:
                preference_str += ", \"" + preference[i] + "\""
        for i in range(len(unpreference)):
            if i == 0:
                unpreference_str += "\"" + unpreference[i] + "\""
            else:
                unpreference_str += ", \"" + unpreference[i] + "\""
        target_preference = int(row['rating'])
        target_movie_str = "\"" + target_movie + "\""
        target_preference_str = "Yes." if target_preference == 1 else "No."
        json_list.append({
            "instruction": "Given the user's preference and unpreference, identify whether the user will like the target movie by answering \"Yes.\" or \"No.\".",
            "input": f"User Preference: {preference_str}\nUser Unpreference: {unpreference_str}\nWhether the user will like the target movie {target_movie_str}?",
            "output": target_preference_str,
        })
        
    with open(output_path, 'w') as f:
        json.dump(json_list, f, indent=4)

In [15]:
# generate the json file for the TALLRec
csv_to_json('datasets/train.csv', 'datasets/train.json')
csv_to_json('datasets/valid.csv', 'datasets/valid.json')
csv_to_json('datasets/test.csv', 'datasets/test.json')

### Reading a sample (Task Instruction + Task Input, Task Output)

In [16]:
with open('datasets/train.json', 'r') as lst:
    b = json.load(lst)
    
for line in b:
    print(line["instruction"])
    print(line["input"])
    print(line["output"])
    print()
    break

Given the user's preference and unpreference, identify whether the user will like the target movie by answering "Yes." or "No.".
User Preference: "Clockwork Orange, A (1971)"
User Unpreference: "Mary Poppins (1964)", "Graduate, The (1967)", "Sense and Sensibility (1995)", "Taxi Driver (1976)", "Young Frankenstein (1974)", "Aladdin (1992)", "Amadeus (1984)", "This Is Spinal Tap (1984)", "When Harry Met Sally... (1989)"
Whether the user will like the target movie "Dead Poets Society (1989)"?
No.



### Cleaning up the data within datasets

In [18]:
from src.common import cleanup
cleanup(data_dir)