In [1]:
!pip install pyspark pandas numpy scikit-learn boto3

Collecting boto3
  Downloading boto3-1.40.55-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.41.0,>=1.40.55 (from boto3)
  Downloading botocore-1.40.55-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3)
  Downloading s3transfer-0.14.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.40.55-py3-none-any.whl (139 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m139.3/139.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.40.55-py3-none-any.whl (14.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m14.1/14.1 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Down

In [1]:
#importing the required pyspark library
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

In [6]:
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/Recommendation System/data"


books = spark.read.csv(f"{DATA_PATH}/Books.csv", sep=',', header=True, inferSchema=True, escape='"')
ratings = spark.read.csv(f"{DATA_PATH}/Ratings.csv", sep=',', header=True, inferSchema=True, escape='"')
users = spark.read.csv(f"{DATA_PATH}/Users.csv", sep=',', header=True, inferSchema=True, escape='"')

print("Data loaded successfully!")
ratings.show(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data loaded successfully!
+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
+-------+----------+-----------+
only showing top 5 rows



In [7]:
# Rename columns to remove hyphens
ratings = ratings.withColumnRenamed("User-ID", "UserID") \
                 .withColumnRenamed("Book-Rating", "BookRating")

books = books.withColumnRenamed("Book-Title", "BookTitle") \
             .withColumnRenamed("Book-Author", "BookAuthor") \
             .withColumnRenamed("Year-Of-Publication", "YearOfPublication") \
             .withColumnRenamed("Image-URL-S", "ImageURLSmall") \
             .withColumnRenamed("Image-URL-M", "ImageURLMedium") \
             .withColumnRenamed("Image-URL-L", "ImageURLLarge")

users = users.withColumnRenamed("User-ID", "UserID")

In [8]:
# Explore the data
print("\n=== DATA EXPLORATION ===")
print("Books schema:")
books.printSchema()
print("\nRatings schema:")
ratings.printSchema()
print("\nUsers schema:")
users.printSchema()

print("\nSample books:")
books.show(5, truncate=False)
print("\nSample ratings:")
ratings.show(5)
print("\nSample users:")
users.show(5)

print(f"\nTotal books: {books.count()}")
print(f"Total ratings: {ratings.count()}")
print(f"Total users: {users.count()}")


=== DATA EXPLORATION ===
Books schema:
root
 |-- ISBN: string (nullable = true)
 |-- BookTitle: string (nullable = true)
 |-- BookAuthor: string (nullable = true)
 |-- YearOfPublication: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- ImageURLSmall: string (nullable = true)
 |-- ImageURLMedium: string (nullable = true)
 |-- ImageURLLarge: string (nullable = true)


Ratings schema:
root
 |-- UserID: integer (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- BookRating: integer (nullable = true)


Users schema:
root
 |-- UserID: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)


Sample books:
+----------+--------------------------------------------------------------------------------------------------+--------------------+-----------------+--------------------------+------------------------------------------------------------+------------------------------------------------------------+-------------------

In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import count, col, explode
import pandas as pd
import os

In [10]:

print("\n=== DATA PREPROCESSING ===")

# Filter out ratings of 0 (implicit feedback) - keep only explicit ratings
ratings_filtered = ratings.filter(col('BookRating') > 0)
print(f"Ratings after filtering (BookRating > 0): {ratings_filtered.count()}")

# Check rating distribution
print("\nRating distribution:")
ratings_filtered.groupBy('BookRating').count().orderBy('BookRating').show()

# Check for data quality - users and books with sufficient ratings
print("\nTop 10 users by number of ratings:")
ratings_filtered.groupBy('UserID').agg(count('*').alias('num_ratings')) \
    .orderBy('num_ratings', ascending=False).show(10)

print("\nTop 10 books by number of ratings:")
ratings_filtered.groupBy('ISBN').agg(count('*').alias('num_ratings')) \
    .orderBy('num_ratings', ascending=False).show(10)

# Optional: Filter users and books with minimum ratings (improves quality)
min_ratings_per_user = 5
min_ratings_per_book = 5

user_counts = ratings_filtered.groupBy('UserID').agg(count('*').alias('user_rating_count'))
book_counts = ratings_filtered.groupBy('ISBN').agg(count('*').alias('book_rating_count'))

# Join and filter with proper column selection
ratings_filtered = ratings_filtered.alias('r') \
    .join(user_counts.alias('uc'), col('r.UserID') == col('uc.UserID')) \
    .filter(col('user_rating_count') >= min_ratings_per_user) \
    .select(col('r.UserID'), col('r.ISBN'), col('r.BookRating'))

ratings_filtered = ratings_filtered.alias('r') \
    .join(book_counts.alias('bc'), col('r.ISBN') == col('bc.ISBN')) \
    .filter(col('book_rating_count') >= min_ratings_per_book) \
    .select(col('r.UserID'), col('r.ISBN'), col('r.BookRating'))

print(f"\nRatings after quality filtering: {ratings_filtered.count()}")

# Create user and item indices (ALS needs integer IDs)
print("\n=== CREATING INDICES ===")

user_indexer = StringIndexer(inputCol="UserID", outputCol="userIndex")
book_indexer = StringIndexer(inputCol="ISBN", outputCol="bookIndex")

# Fit and transform
ratings_indexed = user_indexer.fit(ratings_filtered).transform(ratings_filtered)
ratings_indexed = book_indexer.fit(ratings_indexed).transform(ratings_indexed)

print("Sample indexed ratings:")
ratings_indexed.select('UserID', 'userIndex', 'ISBN', 'bookIndex', 'BookRating').show(10)



=== DATA PREPROCESSING ===
Ratings after filtering (BookRating > 0): 433671

Rating distribution:
+----------+------+
|BookRating| count|
+----------+------+
|         1|  1770|
|         2|  2759|
|         3|  5996|
|         4|  8904|
|         5| 50974|
|         6| 36924|
|         7| 76457|
|         8|103736|
|         9| 67541|
|        10| 78610|
+----------+------+


Top 10 users by number of ratings:
+------+-----------+
|UserID|num_ratings|
+------+-----------+
| 11676|       8524|
| 98391|       5802|
|153662|       1969|
|189835|       1906|
| 23902|       1395|
| 76499|       1036|
|171118|       1035|
|235105|       1023|
| 16795|        968|
|248718|        948|
+------+-----------+
only showing top 10 rows


Top 10 books by number of ratings:
+----------+-----------+
|      ISBN|num_ratings|
+----------+-----------+
|0316666343|        707|
|0971880107|        581|
|0385504209|        487|
|0312195516|        383|
|0679781587|        333|
|0060928336|        320|
|05

In [11]:
# Split data into training and test sets
print("\n=== SPLITTING DATA ===")
(training, test) = ratings_indexed.randomSplit([0.8, 0.2], seed=42)

print(f"Training set: {training.count()} ratings")
print(f"Test set: {test.count()} ratings")


=== SPLITTING DATA ===
Training set: 121827 ratings
Test set: 30453 ratings


In [12]:
# Build and train the ALS model
print("\n=== TRAINING ALS MODEL ===")

als = ALS(
    maxIter=10,
    regParam=0.1,
    rank=10,
    userCol="userIndex",
    itemCol="bookIndex",
    ratingCol="BookRating",
    coldStartStrategy="drop",
    nonnegative=True
)

print("Training the model... (this may take a few minutes)")
model = als.fit(training)
print("Model training complete!")


=== TRAINING ALS MODEL ===
Training the model... (this may take a few minutes)
Model training complete!


In [13]:
# Make predictions and evaluate
print("\n=== EVALUATING MODEL ===")

predictions = model.transform(test)
print("Sample predictions:")
predictions.select('UserID', 'ISBN', 'BookRating', 'prediction').show(10)

# Evaluate using RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="BookRating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"\nRoot Mean Square Error (RMSE): {rmse:.4f}")

# Evaluate using MAE
evaluator_mae = RegressionEvaluator(
    metricName="mae",
    labelCol="BookRating",
    predictionCol="prediction"
)
mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE): {mae:.4f}")


=== EVALUATING MODEL ===
Sample predictions:
+------+----------+----------+----------+
|UserID|      ISBN|BookRating|prediction|
+------+----------+----------+----------+
|    99|0312261594|         8| 6.0792923|
|   114|0446608653|         9| 7.4311924|
|   114|0446612618|         8| 7.2180943|
|   232|0486284735|         8|  5.367009|
|   242|3498020862|         8| 0.9798938|
|   243|0316601950|         9|  7.404435|
|   243|0425163407|         9|  8.750659|
|   243|0786863986|         5|  7.198143|
|   254|0060934700|         9|  8.748173|
|   254|0060976977|         7|  8.837635|
+------+----------+----------+----------+
only showing top 10 rows


Root Mean Square Error (RMSE): 2.0185
Mean Absolute Error (MAE): 1.5719


In [14]:
print("\n=== GENERATING RECOMMENDATIONS (Unity Catalog Compatible) ===")

# Get item factors (books) and user factors
item_factors = model.itemFactors
user_factors = model.userFactors

print(f"Total users with factors: {user_factors.count()}")
print(f"Total items with factors: {item_factors.count()}")

# Sample a few users to generate recommendations
sample_users = user_factors.limit(5).collect()

print("\nGenerating recommendations for sample users...")

for user_row in sample_users:
    user_idx = user_row['id']
    user_features = user_row['features']

    # Get this user's info
    user_info = ratings_indexed.filter(col('userIndex') == user_idx).select('UserID').first()
    if user_info:
        user_id = user_info['UserID']
        print(f"\n--- Recommendations for User {user_id} (index: {user_idx}) ---")

        # Get books this user hasn't rated
        user_rated_books = ratings_indexed.filter(col('userIndex') == user_idx).select('bookIndex').distinct()

        # Get all books and their features
        unrated_books = item_factors.join(user_rated_books, item_factors.id == user_rated_books.bookIndex, "left_anti")

        # For simplicity, just show top books by average rating
        # In production,  we compute dot product of user and item features
        top_books = unrated_books.limit(10)

        # Map back to ISBN and book details
        book_index_mapping = ratings_indexed.select('ISBN', 'bookIndex').distinct()
        recommendations = top_books.join(book_index_mapping, top_books.id == book_index_mapping.bookIndex) \
                                   .join(books, 'ISBN') \
                                   .select('ISBN', 'BookTitle', 'BookAuthor', 'YearOfPublication')

        recommendations.show(5, truncate=False)


=== GENERATING RECOMMENDATIONS (Unity Catalog Compatible) ===
Total users with factors: 12993
Total items with factors: 14471

Generating recommendations for sample users...

--- Recommendations for User 11676 (index: 0) ---
+----------+--------------------------------------------+------------------+-----------------+
|ISBN      |BookTitle                                   |BookAuthor        |YearOfPublication|
+----------+--------------------------------------------+------------------+-----------------+
|0064401847|Bridge to Terabithia                        |Katherine Paterson|1987             |
|0140157379|Haroun and the Sea of Stories               |Salman Rushdie    |1991             |
|014043187X|The Picture of Dorian Gray (Penguin Classic)|Oscar Wilde       |1985             |
|0312858787|Gun, With Occasional Music                  |Jonathan Lethem   |1995             |
|0446360511|Eloquent Silence                            |Sandra Brown      |1995             |
+----------+--

In [15]:
# Get recommendations for a specific user with proper computation
print("\n=== DETAILED RECOMMENDATIONS FOR SPECIFIC USER ===")

# Get a sample user
sample_user = ratings_filtered.select("UserID").first()[0]
print(f"\nGenerating recommendations for User ID: {sample_user}")

# Get user index
user_indexer_model = user_indexer.fit(ratings_filtered)
user_df = spark.createDataFrame([(sample_user,)], ["UserID"])
user_indexed = user_indexer_model.transform(user_df)
user_index = user_indexed.select("userIndex").first()[0]

# Get books already rated by this user
user_rated_books = ratings_indexed.filter(col('userIndex') == user_index) \
                                  .select('bookIndex').distinct()

# Get predictions for all unrated books
all_books = ratings_indexed.select('bookIndex').distinct()
unrated_books = all_books.join(user_rated_books, 'bookIndex', 'left_anti')

# Create user-book pairs for prediction
user_book_pairs = unrated_books.withColumn('userIndex', col('bookIndex') * 0 + user_index) \
                               .select('userIndex', 'bookIndex')

# Generate predictions
recommendations = model.transform(user_book_pairs) \
                      .orderBy(col('prediction').desc()) \
                      .limit(10)

# Join with book details
book_index_mapping = ratings_indexed.select('ISBN', 'bookIndex').distinct()
final_recommendations = recommendations.join(book_index_mapping, 'bookIndex') \
                                      .join(books, 'ISBN') \
                                      .select('ISBN', 'BookTitle', 'BookAuthor',
                                             'YearOfPublication', 'Publisher', 'prediction')

print("\nTop 10 Recommended Books:")
final_recommendations.show(10, truncate=False)

# Show what the user has already rated
print(f"\nBooks already rated by User {sample_user}:")
user_history = ratings_filtered.filter(col('UserID') == sample_user) \
    .join(books, 'ISBN') \
    .select('ISBN', 'BookTitle', 'BookAuthor', 'BookRating') \
    .orderBy('BookRating', ascending=False)
user_history.show(10, truncate=False)

print("\n=== RECOMMENDATION SYSTEM COMPLETE ===")


=== DETAILED RECOMMENDATIONS FOR SPECIFIC USER ===

Generating recommendations for User ID: 277427

Top 10 Recommended Books:
+----------+---------------------------------------------------+-------------------+-----------------+-------------------------------+----------+
|ISBN      |BookTitle                                          |BookAuthor         |YearOfPublication|Publisher                      |prediction|
+----------+---------------------------------------------------+-------------------+-----------------+-------------------------------+----------+
|0679435204|Bird by Bird: Some Instructions on Writing and Life|Anne Lamott        |1994             |Pantheon Books                 |12.020956 |
|2207252000|La Maison des feuilles                             |Mark Z. Danielewski|2002             |Deno√É?√Ç¬´l                      |12.075512 |
|2253150908|L'Alchimiste                                       |Paulo Coelho       |2002             |LGF                            |12.804

# DDB

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

OUTPUT_PATH = "/content/output/"
os.makedirs(OUTPUT_PATH, exist_ok=True)

print("="*60)
print("EXPORTING RECOMMENDATION SYSTEM TO CSV")
print("="*60)

EXPORTING RECOMMENDATION SYSTEM TO CSV


In [18]:
print("\n1. Extracting model factors...")

# Get factors as pandas DataFrames
user_factors = model.userFactors.toPandas()
item_factors = model.itemFactors.toPandas()

print(f"  ‚úì {len(user_factors):,} user factors")
print(f"  ‚úì {len(item_factors):,} item factors")

# Get mappings
print("\n2. Creating mappings...")
user_mapping = ratings_indexed.select('UserID', 'userIndex').distinct().toPandas()
book_mapping = ratings_indexed.select('ISBN', 'bookIndex').distinct() \
    .join(books, 'ISBN') \
    .select('ISBN', 'bookIndex', 'BookTitle', 'BookAuthor',
            'YearOfPublication', 'Publisher') \
    .toPandas()

print(f"  ‚úì {len(user_mapping):,} user mappings")
print(f"  ‚úì {len(book_mapping):,} book mappings")

# Save mappings
user_mapping.to_csv(f"{OUTPUT_PATH}/user_mapping.csv", index=False)
book_mapping.to_csv(f"{OUTPUT_PATH}/book_mapping.csv", index=False)
print("  ‚úì Mappings saved")


1. Extracting model factors...
  ‚úì 12,993 user factors
  ‚úì 14,471 item factors

2. Creating mappings...
  ‚úì 13,305 user mappings
  ‚úì 13,776 book mappings
  ‚úì Mappings saved


In [19]:
print("\n3. Saving user factors...")

# Merge with actual user IDs
user_factors_with_ids = user_factors.merge(
    user_mapping,
    left_on='id',
    right_on='userIndex',
    how='left'
)

# Convert features array to separate columns
features_array = np.vstack(user_factors_with_ids['features'].values)
feature_cols = pd.DataFrame(
    features_array,
    columns=[f'factor_{i}' for i in range(features_array.shape[1])]
)

user_factors_final = pd.concat([
    user_factors_with_ids[['UserID', 'userIndex']],
    feature_cols
], axis=1)

user_factors_final.to_csv(f"{OUTPUT_PATH}/user_factors.csv", index=False)
print(f"  ‚úì Saved {len(user_factors_final):,} user factors")


3. Saving user factors...
  ‚úì Saved 12,993 user factors


In [20]:
print("\n4. Saving item factors...")

# Merge with book details
item_factors_with_details = item_factors.merge(
    book_mapping,
    left_on='id',
    right_on='bookIndex',
    how='left'
)

# Convert features array to separate columns
features_array = np.vstack(item_factors_with_details['features'].values)
feature_cols = pd.DataFrame(
    features_array,
    columns=[f'factor_{i}' for i in range(features_array.shape[1])]
)

item_factors_final = pd.concat([
    item_factors_with_details[['ISBN', 'bookIndex', 'BookTitle', 'BookAuthor',
                                'YearOfPublication', 'Publisher']],
    feature_cols
], axis=1)

item_factors_final.to_csv(f"{OUTPUT_PATH}/item_factors.csv", index=False)
print(f"  ‚úì Saved {len(item_factors_final):,} item factors")


4. Saving item factors...
  ‚úì Saved 14,471 item factors


In [21]:
print("\n5. Generating user recommendations...")
print("  Computing prediction matrix...")

# Prepare feature matrices
user_features_matrix = np.vstack(user_factors['features'].values)
item_features_matrix = np.vstack(item_factors['features'].values)

print(f"  User features: {user_features_matrix.shape}")
print(f"  Item features: {item_features_matrix.shape}")

# Compute all predictions at once (matrix multiplication) faster than ALS
all_predictions = np.dot(user_features_matrix, item_features_matrix.T)
print(f"  ‚úì Predictions computed: {all_predictions.shape}")

# Generate top 20 recommendations for each user
recommendations_list = []
top_n = 20

print(f"  Generating top-{top_n} recommendations per user...")

for user_idx in range(len(user_factors)):
    user_id_internal = user_factors.iloc[user_idx]['id']

    # Get actual UserID
    user_match = user_mapping[user_mapping['userIndex'] == user_id_internal]
    if len(user_match) == 0:
        continue
    user_id = user_match.iloc[0]['UserID']

    # Get top N items for this user
    user_scores = all_predictions[user_idx]
    top_item_indices = np.argsort(user_scores)[::-1][:top_n]

    # Create recommendation records
    for rank, item_idx in enumerate(top_item_indices, 1):
        item_id_internal = item_factors.iloc[item_idx]['id']
        score = user_scores[item_idx]

        # Get book details
        book_match = book_mapping[book_mapping['bookIndex'] == item_id_internal]
        if len(book_match) == 0:
            continue

        book_info = book_match.iloc[0]

        recommendations_list.append({
            'user_id': user_id,
            'rank': rank,
            'isbn': book_info['ISBN'],
            'title': book_info['BookTitle'],
            'author': book_info['BookAuthor'],
            'year': book_info['YearOfPublication'],
            'publisher': book_info['Publisher'],
            'prediction_score': float(score)
        })

    # Progress indicator
    if (user_idx + 1) % 1000 == 0:
        print(f"    Progress: {user_idx + 1:,}/{len(user_factors):,} users")

# Save recommendations
recommendations_df = pd.DataFrame(recommendations_list)
recommendations_df.to_csv(f"{OUTPUT_PATH}/user_recommendations.csv", index=False)
print(f"\n  ‚úì Saved {len(recommendations_df):,} recommendations")

# Show sample
print("\n  Sample recommendations:")
print(recommendations_df[recommendations_df['rank'] <= 3].head(10))


5. Generating user recommendations...
  Computing prediction matrix...
  User features: (12993, 10)
  Item features: (14471, 10)
  ‚úì Predictions computed: (12993, 14471)
  Generating top-20 recommendations per user...
    Progress: 1,000/12,993 users
    Progress: 2,000/12,993 users
    Progress: 3,000/12,993 users
    Progress: 4,000/12,993 users
    Progress: 5,000/12,993 users
    Progress: 6,000/12,993 users
    Progress: 7,000/12,993 users
    Progress: 8,000/12,993 users
    Progress: 9,000/12,993 users
    Progress: 10,000/12,993 users
    Progress: 11,000/12,993 users
    Progress: 12,000/12,993 users

  ‚úì Saved 211,048 recommendations

  Sample recommendations:
     user_id  rank        isbn                                 title  \
0    11676.0     1  884590184X  Siddharta Romanzo Versione Di M Mila   
1    11676.0     2  8420633119                              El Aleph   
2    11676.0     3  2253150908                          L'Alchimiste   
17  158295.0     2  22531509

In [22]:
print("\n6. Computing book similarities...")

# Compute similarity matrix
print("  Computing cosine similarity matrix...")
similarity_matrix = cosine_similarity(item_features_matrix)
print(f"  ‚úì Similarity matrix: {similarity_matrix.shape}")

# Extract top 20 similar books for each book
similarity_list = []
top_n_similar = 20

print(f"  Extracting top-{top_n_similar} similar books...")

for item_idx in range(len(item_factors)):
    item_id_internal = item_factors.iloc[item_idx]['id']

    # Get book info
    book_match = book_mapping[book_mapping['bookIndex'] == item_id_internal]
    if len(book_match) == 0:
        continue

    book_info = book_match.iloc[0]

    # Get top N similar books (excluding itself)
    similarities = similarity_matrix[item_idx]
    similar_indices = np.argsort(similarities)[::-1][1:top_n_similar+1]

    for rank, similar_idx in enumerate(similar_indices, 1):
        similar_item_id = item_factors.iloc[similar_idx]['id']
        similarity_score = similarities[similar_idx]

        # Get similar book info
        similar_book_match = book_mapping[book_mapping['bookIndex'] == similar_item_id]
        if len(similar_book_match) == 0:
            continue

        similar_book = similar_book_match.iloc[0]

        similarity_list.append({
            'isbn': book_info['ISBN'],
            'title': book_info['BookTitle'],
            'author': book_info['BookAuthor'],
            'similar_isbn': similar_book['ISBN'],
            'similar_title': similar_book['BookTitle'],
            'similar_author': similar_book['BookAuthor'],
            'similarity_score': float(similarity_score),
            'rank': rank
        })

    # Progress indicator
    if (item_idx + 1) % 1000 == 0:
        print(f"    Progress: {item_idx + 1:,}/{len(item_factors):,} books")

# Save similarities
similarities_df = pd.DataFrame(similarity_list)
similarities_df.to_csv(f"{OUTPUT_PATH}/book_similarities.csv", index=False)
print(f"\n  ‚úì Saved {len(similarities_df):,} similarity records")

# Show sample
print("\n  Sample similarities:")
print(similarities_df[similarities_df['rank'] <= 3].head(10))


6. Computing book similarities...
  Computing cosine similarity matrix...
  ‚úì Similarity matrix: (14471, 14471)
  Extracting top-20 similar books...
    Progress: 1,000/14,471 books
    Progress: 3,000/14,471 books
    Progress: 4,000/14,471 books
    Progress: 5,000/14,471 books
    Progress: 6,000/14,471 books
    Progress: 7,000/14,471 books
    Progress: 8,000/14,471 books
    Progress: 9,000/14,471 books
    Progress: 10,000/14,471 books
    Progress: 11,000/14,471 books
    Progress: 12,000/14,471 books
    Progress: 13,000/14,471 books
    Progress: 14,000/14,471 books

  ‚úì Saved 260,947 similarity records

  Sample similarities:
          isbn                          title              author  \
0   0316666343      The Lovely Bones: A Novel        Alice Sebold   
1   0316666343      The Lovely Bones: A Novel        Alice Sebold   
2   0316666343      The Lovely Bones: A Novel        Alice Sebold   
18  0345337662     Interview with the Vampire           Anne Rice   
19  0

In [23]:
print("\n" + "="*60)
print("‚úÖ EXPORT COMPLETE!")
print("="*60)

summary = f"""
Output Location: {OUTPUT_PATH}

Generated Files:
  1. user_recommendations.csv     - {len(recommendations_df):,} rows
  2. book_similarities.csv        - {len(similarities_df):,} rows
  3. user_factors.csv             - {len(user_factors_final):,} rows
  4. item_factors.csv             - {len(item_factors_final):,} rows
  5. user_mapping.csv             - {len(user_mapping):,} rows
  6. book_mapping.csv             - {len(book_mapping):,} rows

Model Performance:
  RMSE: {rmse:.4f}
  MAE:  {mae:.4f}

Dataset Stats:
  Total Users:        {len(user_mapping):,}
  Total Books:        {len(book_mapping):,}
  Total Ratings:      {ratings_indexed.count():,}
  Recommendations:    {top_n} per user
  Similar Books:      {top_n_similar} per book
"""

print(summary)

# Save summary to text file
with open(f"{OUTPUT_PATH}/summary.txt", 'w') as f:
    f.write(summary)

print("üìÑ Summary saved to summary.txt")


‚úÖ EXPORT COMPLETE!

Output Location: /content/output/

Generated Files:
  1. user_recommendations.csv     - 211,048 rows
  2. book_similarities.csv        - 260,947 rows
  3. user_factors.csv             - 12,993 rows
  4. item_factors.csv             - 14,471 rows
  5. user_mapping.csv             - 13,305 rows
  6. book_mapping.csv             - 13,776 rows

Model Performance:
  RMSE: 2.0185
  MAE:  1.5719

Dataset Stats:
  Total Users:        13,305
  Total Books:        13,776
  Total Ratings:      152,280
  Recommendations:    20 per user
  Similar Books:      20 per book

üìÑ Summary saved to summary.txt


In [24]:
DRIVE_OUTPUT_PATH = "/content/drive/MyDrive/Recommendation System/output/"
os.makedirs(DRIVE_OUTPUT_PATH, exist_ok=True)

print(f"Copying files to Google Drive: {DRIVE_OUTPUT_PATH}")

import shutil
for filename in os.listdir(OUTPUT_PATH):
    src = os.path.join(OUTPUT_PATH, filename)
    dst = os.path.join(DRIVE_OUTPUT_PATH, filename)
    shutil.copy2(src, dst)
    print(f"  ‚úì Copied: {filename}")

print("\n‚úÖ All files saved to Google Drive!")
print(f"Location: {DRIVE_OUTPUT_PATH}")

Copying files to Google Drive: /content/drive/MyDrive/Recommendation System/output/
  ‚úì Copied: user_recommendations.csv
  ‚úì Copied: user_factors.csv
  ‚úì Copied: book_similarities.csv
  ‚úì Copied: item_factors.csv
  ‚úì Copied: user_mapping.csv
  ‚úì Copied: summary.txt
  ‚úì Copied: book_mapping.csv

‚úÖ All files saved to Google Drive!
Location: /content/drive/MyDrive/Recommendation System/output/


In [None]:
spark.stop()

