<a href="https://colab.research.google.com/github/rifhatania/UAS_Kelompok4/blob/main/UAS_Kelompok4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from scipy.sparse import csr_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 2. Input Dataset
## a. Books Dataset
print('Books Dataset')
books = pd.read_csv('Books.csv', low_memory=False)
print("Books:", books.shape)
print(books.head())

## b. Users Dataset
print('\nUsers Dataset')
users = pd.read_csv('Users.csv')
print("Users:", users.shape)
print(users.head())

## c. Books-Ratings Dataset
print('\nBooks-Ratings Dataset')
ratings = pd.read_csv('Books-Ratings.csv')
print("Ratings:", ratings.shape)
print(ratings)




Books Dataset
Books: (271360, 8)
         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0..

In [None]:
# === 1. Cek Missing Value & Outlier ===
print("=== Missing Value pada Books ===")
print(books[['Book-Title', 'Book-Author', 'Publisher']].isnull().sum())

# Hapus kolom gambar yang tidak digunakan
books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], inplace=True)

# Hapus baris buku dengan informasi penting yang kosong
books.dropna(subset=['Book-Title', 'Book-Author', 'Publisher'], inplace=True)

# Bersihkan umur users
users['Age'] = pd.to_numeric(users['Age'], errors='coerce')
users['Age'] = users['Age'].apply(lambda x: np.nan if x < 5 or x > 100 else x)
print("\nJumlah NULL kolom 'Age' setelah dibersihkan:", users['Age'].isna().sum())

# Bersihkan ratings: hanya ambil rating > 0
ratings = ratings[ratings['Book-Rating'] > 0]
print("\nJumlah rating setelah buang rating 0:", ratings.shape[0])

# === 2. Merge ===
# Merge ratings dengan users (biar ada info user kayak Age)
ratings_users = pd.merge(ratings, users, on='User-ID', how='inner')

# Merge ratings+users dengan books
ratings_full = pd.merge(ratings_users, books, on='ISBN', how='inner')

print("\nKolom hasil merge:")
print(ratings_full.columns)

# === 3. Encoding ===
le_author = LabelEncoder()
le_publisher = LabelEncoder()
ratings_full['Author_Encoded'] = le_author.fit_transform(ratings_full['Book-Author'].astype(str))
ratings_full['Publisher_Encoded'] = le_publisher.fit_transform(ratings_full['Publisher'].astype(str))

print("\nContoh hasil encoding:")
print(ratings_full[['Book-Author', 'Author_Encoded', 'Publisher', 'Publisher_Encoded']].head())

# === 4. Normalisasi ===
ratings_full['Year-Of-Publication'] = pd.to_numeric(ratings_full['Year-Of-Publication'], errors='coerce')
ratings_full['Year-Of-Publication'].fillna(ratings_full['Year-Of-Publication'].median(), inplace=True)

# Hitung rata-rata rating per ISBN → lalu merge ke data
avg_rating = ratings_full.groupby('ISBN')['Book-Rating'].mean().reset_index()
avg_rating.columns = ['ISBN', 'AvgRating']
ratings_full = pd.merge(ratings_full, avg_rating, on='ISBN', how='left')

ratings_full['AvgRating'] = ratings_full['AvgRating'].fillna(0)

# Normalisasi numerik
scaler = MinMaxScaler()
ratings_full[['Year_norm', 'AvgRating_norm']] = scaler.fit_transform(
    ratings_full[['Year-Of-Publication', 'AvgRating']]
)

# === 5. Final Fitur Gabungan untuk Content-Based Filtering ===
book_features = ratings_full[['ISBN', 'Book-Title', 'Year_norm', 'AvgRating_norm', 'Author_Encoded', 'Publisher_Encoded']].drop_duplicates()

print("\n=== Contoh Gabungan Fitur Buku ===")
print(book_features.head(10))

# Simpan juga data yang masih punya User-ID untuk evaluasi
ratings_with_features = pd.merge(ratings, book_features, on='ISBN', how='inner')

print("\nCek kolom final:")
print(ratings_with_features[['User-ID', 'ISBN', 'Book-Rating', 'Book-Title']].head())


=== Missing Value pada Books ===
Book-Title     0
Book-Author    2
Publisher      2
dtype: int64

Jumlah NULL kolom 'Age' setelah dibersihkan: 112010

Jumlah rating setelah buang rating 0: 433671

Kolom hasil merge:
Index(['User-ID', 'ISBN', 'Book-Rating', 'Location', 'Age', 'Book-Title',
       'Book-Author', 'Year-Of-Publication', 'Publisher'],
      dtype='object')

Contoh hasil encoding:
     Book-Author  Author_Encoded                   Publisher  \
0     Judith Rae           31469                      Heinle   
1  Philip Prowse           47691  Cambridge University Press   
2    Sue Leather           56130  Cambridge University Press   
3   JOHN GRISHAM           24910                   Doubleday   
4  Rebecca Wells           49197                 HarperTorch   

   Publisher_Encoded  
0               4755  
1               1791  
2               1791  
3               2934  
4               4607  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ratings_full['Year-Of-Publication'].fillna(ratings_full['Year-Of-Publication'].median(), inplace=True)



=== Contoh Gabungan Fitur Buku ===
         ISBN                                         Book-Title  Year_norm  \
0  0155061224                                   Rites of Passage   0.976098   
1  052165615X                                     Help!: Level 1   0.975122   
2  0521795028  The Amsterdam Connection : Level 4 (Cambridge ...   0.976098   
3  038550120X                                    A Painted House   0.976098   
4  0060517794                           Little Altars Everywhere   0.977073   
5  0671537458                                  Waiting to Exhale   0.973171   
6  0679776818                  Birdsong: A Novel of Love and War   0.974146   
7  0943066433                  How to Deal With Difficult People   0.973171   
8  1885408226                      The Golden Rule of Schmoozing   0.974634   
9  0747558167        Apricots on the Nile: A Memoir with Recipes   0.976585   

   AvgRating_norm  Author_Encoded  Publisher_Encoded  
0        0.444444           31469      

In [None]:
# === FILTER DATA UNTUK EFISIENSI ===
# Ambil user yang kasih rating minimal 10 buku
user_rating_count = ratings['User-ID'].value_counts()
active_users = user_rating_count[user_rating_count >= 1].index
ratings_filtered = ratings[ratings['User-ID'].isin(active_users)]

# Ambil buku yang dirating minimal 50 user
book_rating_count = ratings_filtered['ISBN'].value_counts()
popular_books = book_rating_count[book_rating_count >= 100].index
ratings_filtered = ratings_filtered[ratings_filtered['ISBN'].isin(popular_books)]

print("Jumlah data setelah filter:", ratings_filtered.shape)

# === MATRIX RATING USER x ITEM ===
rating_matrix = ratings_filtered.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)

# Konversi ke sparse matrix
from scipy.sparse import csr_matrix
rating_sparse = csr_matrix(rating_matrix.values)

# === 3a. Euclidean Similarity antar user ===
from sklearn.metrics.pairwise import euclidean_distances
euclidean_sim = euclidean_distances(rating_sparse)

# === 3b. Cosine Similarity antar user ===
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(rating_sparse)

# === 3c. Pearson Correlation antar item ===
pearson_sim = rating_matrix.T.corr(method='pearson', min_periods=10)

print("\nSimilarity calculation selesai.")


Jumlah data setelah filter: (23501, 3)

Similarity calculation selesai.


In [None]:
# === PERBAIKI BOOK FEATURES AGAR LEBIH RINGAN ===
# Filter book_features hanya yang masuk popular_books
book_features_filtered = book_features[book_features['ISBN'].isin(popular_books)].reset_index(drop=True)

# Ambil hanya fitur numerik dari book_features_filtered
book_numerical_features = book_features_filtered[['Year_norm', 'AvgRating_norm', 'Author_Encoded', 'Publisher_Encoded']]

# Hitung cosine similarity antar buku (Content-Based Filtering)
book_content_sim = cosine_similarity(book_numerical_features.values)

# Contoh: rekomendasikan buku mirip dengan buku ke-0
idx_target = 0  # index buku yang ingin dicari rekomendasinya
target_title = book_features_filtered.iloc[idx_target]['Book-Title']

# Ambil indeks buku dengan similarity tertinggi (kecuali dirinya sendiri)
similar_books_idx = book_content_sim[idx_target].argsort()[::-1][1:6]

# Tampilkan hasil rekomendasi
print(f"\nRekomendasi untuk buku: {target_title}")
for idx in similar_books_idx:
    print("-", book_features_filtered.iloc[idx]['Book-Title'])



Rekomendasi untuk buku: Harry Potter and the Order of the Phoenix (Book 5)
- Harry Potter and the Prisoner of Azkaban (Book 3)
- Harry Potter and the Prisoner of Azkaban (Book 3)
- Harry Potter and the Sorcerer's Stone (Book 1)
- Harry Potter and the Chamber of Secrets (Book 2)
- Harry Potter and the Goblet of Fire (Book 4)


In [None]:
import random

# Ambil 20 user aktif secara acak
sample_users = random.sample(list(ratings_filtered['User-ID'].unique()), 20)

# Evaluasi: hitung berapa user yang mendapat rekomendasi yang 'benar'
benar_count = 0

for user_id in sample_users:
    # Ambil buku dengan rating tinggi dari user ini
    user_data = ratings_filtered[ratings_filtered['User-ID'] == user_id]
    user_high_rated = user_data[user_data['Book-Rating'] >= 8]

    if user_high_rated.empty:
        continue  # skip user yang tidak punya rating tinggi

    # Ambil 1 buku favorit dari user
    user_fav_book = user_high_rated.sample(1, random_state=42).iloc[0]
    isbn_fav = user_fav_book['ISBN']

    # Cari indeks buku ini di book_features_filtered
    try:
        target_idx = book_features_filtered[book_features_filtered['ISBN'] == isbn_fav].index[0]
    except IndexError:
        continue  # buku tidak ditemukan dalam fitur, skip

    # Cari rekomendasi dari buku favorit user
    similar_idx = book_content_sim[target_idx].argsort()[::-1][1:6]
    recommended_isbns = book_features_filtered.iloc[similar_idx]['ISBN'].values

    # Cek apakah user pernah kasih rating tinggi ke salah satu rekomendasi
    user_read_isbns = user_data[user_data['Book-Rating'] >= 8]['ISBN'].values

    if any(isbn in user_read_isbns for isbn in recommended_isbns):
        benar_count += 1

# Hasil evaluasi
print(f"\nEvaluasi Akhir:")
print(f"Jumlah user yang dapat rekomendasi benar dari 20: {benar_count}/20")



Evaluasi Akhir:
Jumlah user yang dapat rekomendasi benar dari 20: 1/20


In [None]:
# 4. Similarity

# Merge data
data = ratings.merge(books_filtered, on='ISBN').merge(users_filtered, on='User-ID')

# --- Add Filtering to reduce pivot table size ---
# Count how many ratings each user and book have
user_counts = data['User-ID'].value_counts()
book_counts = data['Book-Title'].value_counts()

# Define thresholds (adjust these based on your RAM and dataset analysis)
# For example, keep users who rated at least 10 books and books rated by at least 5 users
user_threshold = 10
book_threshold = 5

# Get lists of users and books that meet the criteria
users_to_keep = user_counts[user_counts >= user_threshold].index
books_to_keep = book_counts[book_counts >= book_threshold].index

# Filter the merged data
filtered_data = data[data['User-ID'].isin(users_to_keep) & data['Book-Title'].isin(books_to_keep)]

print(f"\nOriginal data shape: {data.shape}")
print(f"Filtered data shape: {filtered_data.shape}")

# Create the user-item pivot table from the filtered data
# This is the step that was likely causing high RAM usage
# Using filtered_data instead of the full 'data'
user_item = filtered_data.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)

print(f"\nUser-Item matrix shape: {user_item.shape}")
print(f"Memory usage of user_item (MB): {user_item.memory_usage(deep=True).sum() / (1024**2):.2f}")


## a. Euclidean
# Check if user_item is not empty before calculating similarity
if not user_item.empty:
    euclidean_sim = 1 / (1 + euclidean_distances(user_item))
    euclidean_sim_df = pd.DataFrame(euclidean_sim, index=user_item.index, columns=user_item.index)
    print("\nEuclidean Similarity Matrix calculated.")
else:
    print("\nUser-Item matrix is empty after filtering, skipping Euclidean Similarity.")


## b. Consine Similarity
if not user_item.empty:
    cosine_sim = cosine_similarity(user_item)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=user_item.index, columns=user_item.index)
    print("Cosine Similarity Matrix calculated.")
else:
     print("User-Item matrix is empty after filtering, skipping Cosine Similarity.")


## c. Metode Lain: Pearson Correlation
if not user_item.empty:
    # Pearson correlation works on the transpose of the user-item matrix
    # if calculating similarity between users based on item ratings.
    # If calculating item similarity, you'd use user_item.corr()
    # Assuming you want user similarity based on ratings:
    pearson_sim = user_item.T.corr(method='pearson')
    print("Pearson Correlation Matrix calculated.")
    print(pearson_sim)
else:
    print("User-Item matrix is empty after filtering, skipping Pearson Correlation.")




Original data shape: (1031134, 6)
Filtered data shape: (592850, 6)

User-Item matrix shape: (11710, 39181)
Memory usage of user_item (MB): 3500.53

Euclidean Similarity Matrix calculated.
Cosine Similarity Matrix calculated.


In [None]:
# 5. Content Based Filtering

In [None]:
# 6. Evaluasi

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Ambil fitur numerik saja
numeric_features = book_features[['Year_norm', 'AvgRating_norm', 'Author_Encoded', 'Publisher_Encoded']]

# Cosine Similarity
cos_sim = cosine_similarity(numeric_features)

# Euclidean Distance
euc_dist = euclidean_distances(numeric_features)

print("\n=== Ukuran Matriks Similarity ===")
print("Cosine:", cos_sim.shape)
print("Euclidean:", euc_dist.shape)