In [2]:
pip install surprise

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install docx

In [None]:
from docx import Document

In [5]:
import pandas as pd


In [6]:
df_final = pd.read_csv("../data/df_final.csv") 
df_240 = pd.read_csv("../data/df_240.csv") 
df_528 = pd.read_csv("../data/df_528.csv")
df_1056 = pd.read_csv("../data/df_1056.csv")

In [3]:
df_final.head()

Unnamed: 0,user_id,app_id,category,rating
0,1,1,Strategy,4
1,2,2,Puzzle,3
2,3,3,Business,2
3,4,4,Simulation,1
4,5,5,Books & Reference,3


## Generating top n recommandation to users based on 4 diffrent datasets(original+ 3 biased)

In [9]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Load datasets
df_final = pd.read_csv("../data/df_final.csv")
df_240 = pd.read_csv("../data/df_240.csv")
df_528 = pd.read_csv("../data/df_528.csv")
df_1056 = pd.read_csv("../data/df_1056.csv")

# Define datasets
datasets = {
    "final": df_final,
    "240": df_240,
    "528": df_528,
    "1056": df_1056,
}

def train_and_recommend(train_df, test_users_df, k=50, top_k=25):
    # Create training user-app matrix
    train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)
    user_ids = train_matrix.index
    app_ids = train_matrix.columns

    # SVD
    train_sparse = csr_matrix(train_matrix.values)
    U, sigma, Vt = svds(train_sparse, k=k)
    sigma = np.diag(sigma)
    pred_matrix = np.dot(np.dot(U, sigma), Vt)
    pred_df = pd.DataFrame(pred_matrix, index=user_ids, columns=app_ids)

    # Already rated map
    train_rated = train_df.groupby('user_id')['app_id'].apply(set).to_dict()

    # Filter users and items present in both train and test
    test_users_filtered = test_users_df[test_users_df['user_id'].isin(user_ids)]

    # Generate top-K recommendations
    top_recs = []
    for user in test_users_filtered['user_id'].unique():
        if user not in pred_df.index:
            continue
        user_pred = pred_df.loc[user]
        rated_apps = train_rated.get(user, set())
        recommendations = user_pred.drop(index=rated_apps, errors='ignore').sort_values(ascending=False).head(top_k)
        for app_id, score in recommendations.items():
            top_recs.append({'user_id': user, 'app_id': app_id, 'predicted_rating': score})

    return pd.DataFrame(top_recs)

def merge_with_category_on_app_id(recommendations_df, df_final):
    df_final_subset = df_final[['app_id', 'category']].drop_duplicates()
    merged_df = pd.merge(recommendations_df, df_final_subset, on='app_id', how='left')
    return merged_df

# === Run SVD on each dataset separately for each top-k ===
for k in [15, 25, 35]:
    print(f"🚀 Starting run for top-{k} recommendations...")
    for name, train_df in datasets.items():
        print(f"🔁 Training on {name}, generating top-{k} recommendations for df_final users...")
        recs = train_and_recommend(train_df, df_final, k=50, top_k=k)
        recs_with_category = merge_with_category_on_app_id(recs, df_final)
        print(f"🔎 Number of recommendations for {name} top-{k}: {len(recs_with_category)}")

        filename = f"../result/rec/top_re/top{k}_{name}_with_category.csv"
        recs_with_category.to_csv(filename, index=False)
        print(f"✅ Saved: {filename}")


🚀 Starting run for top-15 recommendations...
🔁 Training on final, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for final top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_final_with_category.csv
🔁 Training on 240, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 240 top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_240_with_category.csv
🔁 Training on 528, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 528 top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_528_with_category.csv
🔁 Training on 1056, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 1056 top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_1056_with_category.csv
🚀 Starting run for top-25 recommendations...
🔁 Training on final, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for final top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_final_with_category.csv
🔁 Training on 240, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 240 top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_240_with_category.csv
🔁 Training on 528, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 528 top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_528_with_category.csv
🔁 Training on 1056, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 1056 top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_1056_with_category.csv
🚀 Starting run for top-35 recommendations...
🔁 Training on final, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for final top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_final_with_category.csv
🔁 Training on 240, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 240 top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_240_with_category.csv
🔁 Training on 528, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 528 top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_528_with_category.csv
🔁 Training on 1056, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 1056 top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_1056_with_category.csv


In [12]:
import pandas as pd

# Define datasets to analyze (new names for new data)
datasets = ["final", "240", "528", "1056"]

# Define top-k recommendation counts to process
top_ks = [15, 25, 35]

# Load user cluster assignments once (avoids reloading each time)
df_cluster = pd.read_csv("../preprocessing/df_cluster.csv")

# === Analysis 1: Cluster-Category Distribution ===

output_filename_cluster = "../result/rec/top_re/analysis/analysis_summary.txt"
with open(output_filename_cluster, "w") as f:
    for dataset_name in datasets:
        for top_k in top_ks:
            filename = f"../result/rec/top_re/top{top_k}_{dataset_name}_with_category.csv"
            print(f"\n📂 Processing recommendations from: {filename}")
            f.write(f"\n📂 Processing recommendations from: {filename}\n")

            try:
                recs = pd.read_csv(filename)
            except FileNotFoundError:
                msg = f"❌ File not found: {filename}"
                print(msg)
                f.write(msg + "\n")
                continue

            # Merge recommendations with user clusters on user_id
            merged = pd.merge(recs, df_cluster[['user_id', 'cluster']], on='user_id', how='left')

            # Group by cluster and category, count recommendations
            cluster_category_distribution = merged.groupby(['cluster', 'category']).size().reset_index(name='count')

            # Write analysis results for each cluster
            header = f"\n📂 Analysis for dataset {dataset_name}, top-{top_k}\n\n"
            print(header.strip())
            f.write(header)

            for cluster_num in sorted(cluster_category_distribution['cluster'].dropna().unique()):
                cluster_header = f"=== Dataset {dataset_name}, Top-{top_k}, Cluster {int(cluster_num)} ==="
                print(cluster_header)
                f.write(cluster_header + "\n")

                cluster_data = cluster_category_distribution[cluster_category_distribution['cluster'] == cluster_num]
                total_recs = cluster_data['count'].sum()
                
                for _, row in cluster_data.sort_values('category').iterrows():
                    category = row['category']
                    count = row['count']
                    percent = (count / total_recs) * 100
                    line = f"Category {category}: {count} recommendations ({percent:.2f}%)"
                    print(line)
                    f.write(line + "\n")

    print(f"\n✅ Saved combined cluster-category analysis: {output_filename_cluster}")

# === Analysis 2: Overall Category Distribution ===

output_filename_category = "../result/rec/top_re/analysis/category_summary_all.txt"
with open(output_filename_category, "w") as f:
    for dataset_name in datasets:
        for top_k in top_ks:
            filename = f"../result/rec/top_re/top{top_k}_{dataset_name}_with_category.csv"
            print(f"\n📂 Processing: {filename}")
            f.write(f"\n📂 Processing: {filename}\n")

            try:
                recs = pd.read_csv(filename)
            except FileNotFoundError:
                msg = f"❌ File not found: {filename}"
                print(msg)
                f.write(msg + "\n")
                continue

            # Group recommendations by category and count
            category_distribution = recs['category'].value_counts().sort_index()

            # Write absolute counts
            header = f"=== Dataset {dataset_name}, Top-{top_k}: Recommendation Count per Category ==="
            print(header)
            f.write(header + "\n")
            for category, count in category_distribution.items():
                line = f"Category {category}: {count} recommendations"
                print(line)
                f.write(line + "\n")

            # Write percentage distribution
            total_recs = category_distribution.sum()
            percent_header = f"\n=== Dataset {dataset_name}, Top-{top_k}: Percent Distribution ==="
            print(percent_header)
            f.write(percent_header + "\n")
            for category, count in category_distribution.items():
                percent = (count / total_recs) * 100
                line = f"Category {category}: {percent:.2f}%"
                print(line)
                f.write(line + "\n")

    print(f"\n✅ Saved combined category analysis: {output_filename_category}")



📂 Processing recommendations from: ../result/rec/top_re/top15_final_with_category.csv
📂 Analysis for dataset final, top-15
=== Dataset final, Top-15, Cluster 0 ===
Category Action: 295433 recommendations (8.44%)
Category Adventure: 61312 recommendations (1.75%)
Category Arcade: 28473 recommendations (0.81%)
Category Art & Design: 774 recommendations (0.02%)
Category Auto & Vehicles: 64294 recommendations (1.84%)
Category Beauty: 223 recommendations (0.01%)
Category Board: 59707 recommendations (1.71%)
Category Books & Reference: 112190 recommendations (3.20%)
Category Business: 40090 recommendations (1.15%)
Category Card: 69927 recommendations (2.00%)
Category Casino: 1120 recommendations (0.03%)
Category Casual: 215004 recommendations (6.14%)
Category Comics: 870 recommendations (0.02%)
Category Communication: 241326 recommendations (6.89%)
Category Dating: 20537 recommendations (0.59%)
Category Education: 28096 recommendations (0.80%)
Category Educational: 1 recommendations (0.00%)


KeyboardInterrupt: 

### Computing recommendation entropy per user, per dataset(all 6)

In [13]:
import pandas as pd
from scipy.stats import entropy
import os

# === Compute entropy of recommendations per user based on category distribution ===
def compute_recommendation_entropy(df):
    # Count recommendations per user per category
    user_category_counts = df.groupby(['user_id', 'category']).size().unstack(fill_value=0)
    # Normalize counts to probabilities
    user_category_dist = user_category_counts.div(user_category_counts.sum(axis=1), axis=0)
    # Compute entropy
    user_entropy = user_category_dist.apply(lambda row: entropy(row), axis=1).reset_index()
    user_entropy.columns = ['user_id', 'recommendation_entropy']
    return user_entropy

# Current datasets to process
datasets = ['final', '240', '528', '1056']

# Top-Ks to process
top_ks = [15, 25, 35]

# Directories
input_dir = '../result/rec/top_re'
output_dir = '../result/rec/top_re/entropy'
os.makedirs(output_dir, exist_ok=True)
cluster_file = '../preprocessing/df_cluster.csv'

# Load cluster assignments once
df_cluster = pd.read_csv(cluster_file)

# Dictionary to store resulting entropy DataFrames
entropy_datasets = {}

# Loop through each top-K and each dataset
for top_k in top_ks:
    for dataset_name in datasets:
        suffix = f"top{top_k}_{dataset_name}"
        input_file = os.path.join(input_dir, f'{suffix}_with_category.csv')
        output_file = os.path.join(output_dir, f'entropy_{suffix}.csv')
        
        print(f"\n📂 Processing: {input_file}")
        if not os.path.exists(input_file):
            print(f"❌ File not found: {input_file}")
            continue

        # Load recommendations for current dataset and top-K
        df = pd.read_csv(input_file)

        # Compute entropy per user
        entropy_df = compute_recommendation_entropy(df)
        entropy_df.to_csv(output_file, index=False)
        print(f"✅ Saved entropy data: {output_file}")

        # Merge entropy with cluster assignments
        merged = pd.merge(df_cluster, entropy_df, on='user_id', how='inner')
        merged = merged[['user_id', 'cluster', 'recommendation_entropy']]

        # Save merged DataFrame to globals and dictionary
        globals()[f'RecEnt_{suffix}'] = merged
        entropy_datasets[f'RecEnt_{suffix}'] = merged

print("\n✅ All datasets and top-Ks processed successfully.")



📂 Processing: ../result/rec/top_re/top15_final_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top15_final.csv

📂 Processing: ../result/rec/top_re/top15_240_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top15_240.csv

📂 Processing: ../result/rec/top_re/top15_528_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top15_528.csv

📂 Processing: ../result/rec/top_re/top15_1056_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top15_1056.csv

📂 Processing: ../result/rec/top_re/top25_final_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top25_final.csv

📂 Processing: ../result/rec/top_re/top25_240_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top25_240.csv

📂 Processing: ../result/rec/top_re/top25_528_with_category.csv
✅ Saved entropy data: ../result/rec/top_re/entropy/entropy_top25_528.csv

📂 Processing: ../result/rec/to

In [19]:
import pandas as pd
import os

# Datasets and Top-Ks you processed
datasets = ['final', '240', '528', '1056']
top_ks = [15, 25, 35]

# Paths
entropy_dir = '../result/rec/top_re/entropy'
cluster_file = '../preprocessing/df_cluster.csv'

# Load cluster assignments (fix: use variable, not string literal)
print(f"📥 Loading cluster assignments from: {cluster_file}")
df_cluster = pd.read_csv(cluster_file)

for top_k in top_ks:
    print(f"\n🚀 Merging entropy files for top-{top_k} recommendations...")

    merged_df = None

    for dataset_name in datasets:
        suffix = f"top{top_k}_{dataset_name}"
        entropy_file = os.path.join(entropy_dir, f'entropy_{suffix}.csv')

        print(f"🔄 Processing entropy file: {entropy_file}")
        if not os.path.exists(entropy_file):
            print(f"❌ Entropy file not found: {entropy_file}")
            continue

        # Load entropy DataFrame
        entropy_df = pd.read_csv(entropy_file)

        # Rename entropy column to avoid collisions
        entropy_df = entropy_df.rename(columns={"recommendation_entropy": f"rec_entropy_{suffix}"})

        # Merge with combined DataFrame
        if merged_df is None:
            merged_df = entropy_df
        else:
            merged_df = pd.merge(merged_df, entropy_df, on='user_id', how='outer')

    if merged_df is not None:
        # Merge with cluster assignments (this includes the 'entropy' column if it exists)
        merged_df = pd.merge(df_cluster, merged_df, on='user_id', how='inner')

        # Build ordered columns: user_id, cluster, entropy (if it exists), and rec_entropy columns
        ordered_cols = (
            ['user_id', 'cluster']
            + (['entropy'] if 'entropy' in merged_df.columns else [])
            + [col for col in merged_df.columns if col.startswith('rec_entropy_')]
        )
        merged_df = merged_df[ordered_cols]

        # Save final merged file for this top-K
        output_file = os.path.join(entropy_dir, f'merged_entropy_top{top_k}.csv')
        merged_df.to_csv(output_file, index=False)
        print(f"✅ Saved merged file: {output_file}")
    else:
        print(f"⚠️ No data available to merge for top-{top_k}.")

print("\n🎉 All top-K merges completed and saved successfully!")


📥 Loading cluster assignments from: ../preprocessing/df_cluster.csv

🚀 Merging entropy files for top-15 recommendations...
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top15_final.csv
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top15_240.csv
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top15_528.csv
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top15_1056.csv
✅ Saved merged file: ../result/rec/top_re/entropy/merged_entropy_top15.csv

🚀 Merging entropy files for top-25 recommendations...
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top25_final.csv
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top25_240.csv
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top25_528.csv
🔄 Processing entropy file: ../result/rec/top_re/entropy/entropy_top25_1056.csv
✅ Saved merged file: ../result/rec/top_re/entropy/merged_entropy_top25.csv

🚀 Merging entropy files for top-35 recomm

In [None]:
entropy_dir = '../result/rec/top_re/entropy'
top_ks = [15, 25, 35]

for top_k in top_ks:
    file_path = f"{entropy_dir}/merged_entropy_top{top_k}.csv"
    print(f"\n📂 Reading: {file_path}")
    
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded file with shape: {df.shape}")
        print(df.head())
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")



📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top15.csv
✅ Loaded file with shape: (700111, 7)
   user_id  cluster   entropy  rec_entropy_top15_final  rec_entropy_top15_240  \
0   686826        0  0.450561                 2.303488               0.485094   
1   536830        0  0.450561                 2.523211               0.720125   
2   433075        0  0.450561                 2.303488               0.000000   
3    55674        0  0.500402                 2.245952               0.000000   
4   620088        0  0.500402                 2.523211               0.000000   

   rec_entropy_top15_528  rec_entropy_top15_1056  
0               0.244930                0.244930  
1               1.169993                0.627705  
2               0.244930                0.000000  
3               0.000000                0.636514  
4               0.244930                0.244930  

📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top25.csv
✅ Loaded file with shape: (700111, 7)

## sanity check 1.1

In [22]:
import pandas as pd
from scipy.stats import entropy
import numpy as np

# User ID to check
target_user_id = 686826

# Merged file path
merged_file = "../result/rec/top_re/entropy/merged_entropy_top15.csv"

# Paths to original entropy datasets
original_files = {
    "rec_entropy_top15_final": "/home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/top_re/top15_final_with_category.csv",
    "rec_entropy_top15_240": "/home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/top_re/top15_240_with_category.csv",
    "rec_entropy_top15_528": "/home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/top_re/top15_528_with_category.csv",
    "rec_entropy_top15_1056": "/home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/top_re/top15_1056_with_category.csv",
}

# Cluster file path
cluster_file = "../preprocessing/df_cluster.csv"

# === 1) Read merged file
merged_df = pd.read_csv(merged_file)
merged_row = merged_df[merged_df["user_id"] == target_user_id]

if merged_row.empty:
    print(f"❌ User ID {target_user_id} not found in merged file.")
    exit()

print("\n✅ Merged row:")
print(merged_row.T)

# Collect expected values from merged row
expected = {col: merged_row.iloc[0][col] for col in merged_row.columns if col.startswith("rec_entropy_")}
expected['entropy'] = merged_row.iloc[0]['entropy']

# === 2) Read each original dataset and show entropy for user
validated = True
for entropy_col, file_path in original_files.items():
    try:
        df = pd.read_csv(file_path)
        user_df = df[df["user_id"] == target_user_id]
        if user_df.empty:
            print(f"\n❌ User ID {target_user_id} not found in original file: {file_path}")
            validated = False
            continue

        # Recompute entropy
        user_cat_counts = user_df.groupby("category").size()
        user_cat_dist = user_cat_counts / user_cat_counts.sum()
        calculated_entropy = entropy(user_cat_dist)

        print(f"\n✅ Original file: {entropy_col}")
        print(f"Calculated entropy for user {target_user_id}: {calculated_entropy:.6f}")

        # Compare to merged value
        merged_value = expected[entropy_col]
        if not np.isclose(calculated_entropy, merged_value, rtol=1e-6):
            print(f"❌ Mismatch! Merged value: {merged_value:.6f}, calculated: {calculated_entropy:.6f}")
            validated = False

    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        validated = False

# === 3) Read cluster file
df_cluster = pd.read_csv(cluster_file)
cluster_row = df_cluster[df_cluster["user_id"] == target_user_id]

if cluster_row.empty:
    print(f"\n❌ User ID {target_user_id} not found in cluster file.")
    validated = False
else:
    cluster_entropy = cluster_row.iloc[0]['entropy']
    print("\n✅ Cluster file row:")
    print(cluster_row.T)

    # Compare cluster entropy with merged entropy
    if not np.isclose(cluster_entropy, expected['entropy'], rtol=1e-6):
        print(f"❌ Mismatch in cluster entropy! Merged: {expected['entropy']:.6f}, Cluster file: {cluster_entropy:.6f}")
        validated = False

# === Final validation message
if validated:
    print("\n🎉✅ All entropies match — everything is validated!")
else:
    print("\n⚠️ Validation failed — some entropies do not match.")



✅ Merged row:
                                     0
user_id                  686826.000000
cluster                       0.000000
entropy                       0.450561
rec_entropy_top15_final       2.303488
rec_entropy_top15_240         0.485094
rec_entropy_top15_528         0.244930
rec_entropy_top15_1056        0.244930

✅ Original file: rec_entropy_top15_final
Calculated entropy for user 686826: 2.303488

✅ Original file: rec_entropy_top15_240
Calculated entropy for user 686826: 0.485094

✅ Original file: rec_entropy_top15_528
Calculated entropy for user 686826: 0.244930

✅ Original file: rec_entropy_top15_1056
Calculated entropy for user 686826: 0.244930

✅ Cluster file row:
                     0
user_id  686826.000000
cluster       0.000000
entropy       0.450561

🎉✅ All entropies match — everything is validated!


#### Compute number of users per cluster from the original dataset 

In [14]:

globals()[f'RecEnt_{suffix}'] = merged
print(f"✅ Saved merged entropy as RecEnt_{suffix}")


cluster_counts = merged["cluster"].value_counts(dropna=True).sort_index()


# ===== Display the result: number of users per cluster =====
print("=== Number of Users per Cluster ===")
for cluster_id, count in cluster_counts.items():
    print(f"Cluster {int(cluster_id)}: {count:,} users")

# ===== Compute users per bin (for 10 bins per cluster) =====
num_bins = 10
users_per_bin = (cluster_counts // num_bins).rename("users_per_bin")

# ===== Display the result: number of users per bin =====
print(f"\n=== Users per Bin (10-bin setup) ===")
for cluster_id, count in users_per_bin.items():
    print(f"Cluster {int(cluster_id)}: {count:,} users per bin")


✅ Saved merged entropy as RecEnt_top35_1056
=== Number of Users per Cluster ===
Cluster 0: 233,371 users
Cluster 1: 233,370 users
Cluster 2: 233,370 users

=== Users per Bin (10-bin setup) ===
Cluster 0: 23,337 users per bin
Cluster 1: 23,337 users per bin
Cluster 2: 23,337 users per bin


## sanity check 1.2

In [25]:
import pandas as pd

entropy_dir = "../result/rec/top_re/entropy"
top_ks = [15, 25, 35]

for top_k in top_ks:
    file_path = f"{entropy_dir}/merged_entropy_top{top_k}.csv"
    print(f"\n📂 Reading: {file_path}")
    
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded merged dataset with shape: {df.shape}")
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        continue

    clusters = sorted(df["cluster"].dropna().astype(int).unique())
    print(f"\nIn dataset with {top_k} recommendations, there are {len(clusters)} clusters:")

    for cluster_id in clusters:
        cluster_df = df[df["cluster"] == cluster_id]
        n_users = len(cluster_df)
        min_entropy = cluster_df["entropy"].min()
        max_entropy = cluster_df["entropy"].max()

        print(f"  - Cluster {cluster_id}: {n_users} users")
        print(f"    ➤ Minimum entropy for cluster {cluster_id} in dataset {top_k} is: {min_entropy:.6f}")
        print(f"    ➤ Maximum entropy for cluster {cluster_id} in dataset {top_k} is: {max_entropy:.6f}")



📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top15.csv
✅ Loaded merged dataset with shape: (700111, 7)

In dataset with 15 recommendations, there are 3 clusters:
  - Cluster 0: 233371 users
    ➤ Minimum entropy for cluster 0 in dataset 15 is: 0.450561
    ➤ Maximum entropy for cluster 0 in dataset 15 is: 1.791759
  - Cluster 1: 233370 users
    ➤ Minimum entropy for cluster 1 in dataset 15 is: 1.791759
    ➤ Maximum entropy for cluster 1 in dataset 15 is: 2.351257
  - Cluster 2: 233370 users
    ➤ Minimum entropy for cluster 2 in dataset 15 is: 2.351257
    ➤ Maximum entropy for cluster 2 in dataset 15 is: 3.651480

📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top25.csv
✅ Loaded merged dataset with shape: (700111, 7)

In dataset with 25 recommendations, there are 3 clusters:
  - Cluster 0: 233371 users
    ➤ Minimum entropy for cluster 0 in dataset 25 is: 0.450561
    ➤ Maximum entropy for cluster 0 in dataset 25 is: 1.791759
  - Cluster 1: 233370 users
    ➤ Mini

In [24]:
import pandas as pd

entropy_dir = '../result/rec/top_re/entropy'
top_ks = [15, 25, 35]

for top_k in top_ks:
    file_path = f"{entropy_dir}/merged_entropy_top{top_k}.csv"
    print(f"\n📂 Reading: {file_path}")
    
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded file with shape: {df.shape}")
        print(df.head())
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")



📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top15.csv
✅ Loaded file with shape: (700111, 7)
   user_id  cluster   entropy  rec_entropy_top15_final  rec_entropy_top15_240  \
0   686826        0  0.450561                 2.303488               0.485094   
1   536830        0  0.450561                 2.523211               0.720125   
2   433075        0  0.450561                 2.303488               0.000000   
3    55674        0  0.500402                 2.245952               0.000000   
4   620088        0  0.500402                 2.523211               0.000000   

   rec_entropy_top15_528  rec_entropy_top15_1056  
0               0.244930                0.244930  
1               1.169993                0.627705  
2               0.244930                0.000000  
3               0.000000                0.636514  
4               0.244930                0.244930  

📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top25.csv
✅ Loaded file with shape: (700111, 7)

# Cluster histograms

#### Compute number of users per cluster from the original dataset 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

entropy_dir = '../result/rec/top_re/entropy'
output_folder = '../result/rec/top_re/entropy/cluster_plots'
os.makedirs(output_folder, exist_ok=True)
num_bins = 10

top_ks = [15, 25, 35]

for top_k in top_ks:
    file_path = f"{entropy_dir}/merged_entropy_top{top_k}.csv"
    print(f"\n📂 Reading: {file_path}")
    
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded file with shape: {df.shape}")
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        continue

    # Identify entropy columns (columns that start with rec_entropy_topXX_)
    entropy_cols = [col for col in df.columns if col.startswith(f"rec_entropy_top{top_k}_")]
    print(f"🔎 Entropy columns: {entropy_cols}")

    # Process each cluster
    for cluster_id in sorted(df["cluster"].dropna().astype(int).unique()):
        cluster_df = df[df["cluster"] == cluster_id].reset_index(drop=True)
        n_users_cluster = len(cluster_df)
        print(f"\n📊 Cluster {cluster_id}: {n_users_cluster} users")

        if n_users_cluster == 0:
            print(f"⚠️ Skipping Cluster {cluster_id} (no users).")
            continue

        users_per_bin = max(1, n_users_cluster // num_bins)
        counts = []
        averages = {col: [] for col in entropy_cols}

        for i in range(num_bins):
            start = i * users_per_bin
            end = (i + 1) * users_per_bin if i < num_bins - 1 else n_users_cluster
            bin_df = cluster_df.iloc[start:end]
            counts.append(len(bin_df))
            for col in entropy_cols:
                averages[col].append(bin_df[col].mean())

        # Print bin user counts
        print(f"📌 Bin user counts for Cluster {cluster_id}: {counts}")

        # Plot
        x = np.arange(num_bins)
        width = 0.15

        plt.figure(figsize=(16, 6))
        for idx, col in enumerate(entropy_cols):
            plt.bar(x + (idx - len(entropy_cols)/2) * width, averages[col], width=width, label=col)

        plt.xlabel('User Bins (equal slices)', fontsize=11)
        plt.ylabel('Average Recommendation Entropy', fontsize=11)
        plt.title(f'Cluster {cluster_id} - Top-{top_k} Entropy per Bin', fontsize=13)
        plt.xticks(x, [f'Bin {i+1}\n({counts[i]})' for i in x], fontsize=10)
        plt.yticks(fontsize=10)
        plt.legend(fontsize=9)
        plt.tight_layout()

        # Save plot
        filename = os.path.join(output_folder, f"cluster_{cluster_id}_top{top_k}_entropy.png")
        plt.savefig(filename)
        plt.close()
        print(f"✅ Saved plot: {filename}")

print("\n🎉 All plots successfully saved in:", output_folder)



📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top15.csv
✅ Loaded file with shape: (700111, 6)
🔎 Entropy columns: ['rec_entropy_top15_final', 'rec_entropy_top15_240', 'rec_entropy_top15_528', 'rec_entropy_top15_1056']

📊 Cluster 0: 233371 users
📌 Bin user counts for Cluster 0: [23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337, 23338]
✅ Saved plot: ../result/rec/top_re/entropy/cluster_plots_simple/cluster_0_top15_entropy.png

📊 Cluster 1: 233370 users
📌 Bin user counts for Cluster 1: [23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337]
✅ Saved plot: ../result/rec/top_re/entropy/cluster_plots_simple/cluster_1_top15_entropy.png

📊 Cluster 2: 233370 users
📌 Bin user counts for Cluster 2: [23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337, 23337]
✅ Saved plot: ../result/rec/top_re/entropy/cluster_plots_simple/cluster_2_top15_entropy.png

📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top25.csv
✅ Loaded file with shape: (700111, 6)


## histogram

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

entropy_dir = '../result/rec/top_re/entropy'
output_folder = '../result/rec/top_re/entropy/cluster_plots'
os.makedirs(output_folder, exist_ok=True)
num_bins = 10
for top_k in top_ks:
    file_path = f"{entropy_dir}/merged_entropy_top{top_k}.csv"
    print(f"\n📂 Reading: {file_path}")
    
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded file with shape: {df.shape}")
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        continue

    entropy_columns = [col for col in df.columns if col.startswith(f"rec_entropy_top{top_k}_")]
    if not entropy_columns:
        print(f"❌ No entropy columns found for top-{top_k}!")
        continue

    for cluster_id in sorted(df["cluster"].dropna().astype(int).unique()):
        cluster_df = df[df["cluster"] == cluster_id].reset_index(drop=True)

        # Sort by clustering entropy
        if 'entropy' not in cluster_df.columns:
            print(f"❌ Missing 'entropy' column in dataset {file_path}")
            continue
        cluster_df = cluster_df.sort_values("entropy").reset_index(drop=True)

        n_users = len(cluster_df)
        users_per_bin = max(1, n_users // num_bins)
        counts = []
        averages = {col: [] for col in entropy_columns}

        for i in range(num_bins):
            start = i * users_per_bin
            end = (i + 1) * users_per_bin if i < num_bins - 1 else n_users
            bin_df = cluster_df.iloc[start:end]
            counts.append(len(bin_df))
            for col in entropy_columns:
                averages[col].append(bin_df[col].mean())

        print(f"\n📊 Entropy averages per bin for Cluster {cluster_id} in Top-{top_k}:")
        for i in range(num_bins):
            print(f"  Bin {i+1} (users: {counts[i]}): " +
                  ", ".join(f"{col}: {averages[col][i]:.4f}" for col in entropy_columns))

        x = np.arange(num_bins)
        width = 0.15

        plt.figure(figsize=(16, 6))
        for idx, col in enumerate(entropy_columns):
            bar_positions = x + (idx - len(entropy_columns)/2) * width
            bar_heights = averages[col]

            bars = plt.bar(bar_positions, bar_heights, width=width, label=col)

            # Zigzag label placement
            for i, (pos, height) in enumerate(zip(bar_positions, bar_heights)):
                offset = 0.01 if i % 2 == 0 else 0.03
                plt.text(pos, height + offset, f"{height:.2f}", ha='center', va='bottom', fontsize=8, rotation=0)

        plt.xlabel('User Bins (sorted by entropy)', fontsize=11)
        plt.ylabel('Average Recommendation Entropy', fontsize=11)
        plt.title(f'Cluster {cluster_id} - Top-{top_k} Entropy per Bin', fontsize=13)
        plt.xticks(x, [f'Bin {i+1}\n({counts[i]})' for i in x], fontsize=10)
        plt.yticks(fontsize=10)
        plt.legend(fontsize=9)
        plt.tight_layout()

        filename = os.path.join(output_folder, f"cluster_{cluster_id}_top{top_k}_entropy.png")
        plt.savefig(filename)
        plt.close()
        print(f"✅ Saved plot: {filename}")

        text_output_path = os.path.join(output_folder, f"cluster_{cluster_id}_top{top_k}_entropy.txt")
        with open(text_output_path, 'w') as f:
            f.write(f"Cluster {cluster_id} - Top-{top_k} Entropy per Bin\n")
            f.write("Bin\tUserCount\t" + "\t".join(entropy_columns) + "\n")
            for i in range(num_bins):
                f.write(f"{i+1}\t{counts[i]}\t" +
                        "\t".join(f"{averages[col][i]:.6f}" for col in entropy_columns) + "\n")
        print(f"📝 Saved bin details to: {text_output_path}")
print(f"📝 Saved bin details to: {text_output_path}")


📂 Reading: ../result/rec/top_re/entropy/merged_entropy_top15.csv
✅ Loaded file with shape: (700111, 7)

📊 Entropy averages per bin for Cluster 0 in Top-15:
  Bin 1 (users: 23337): rec_entropy_top15_final: 2.3750, rec_entropy_top15_240: 1.5260, rec_entropy_top15_528: 0.8786, rec_entropy_top15_1056: 0.3915
  Bin 2 (users: 23337): rec_entropy_top15_final: 2.3748, rec_entropy_top15_240: 1.6534, rec_entropy_top15_528: 0.9259, rec_entropy_top15_1056: 0.4036
  Bin 3 (users: 23337): rec_entropy_top15_final: 2.3756, rec_entropy_top15_240: 1.7209, rec_entropy_top15_528: 1.0032, rec_entropy_top15_1056: 0.4446
  Bin 4 (users: 23337): rec_entropy_top15_final: 2.3760, rec_entropy_top15_240: 1.7828, rec_entropy_top15_528: 1.0073, rec_entropy_top15_1056: 0.4427
  Bin 5 (users: 23337): rec_entropy_top15_final: 2.3737, rec_entropy_top15_240: 1.9951, rec_entropy_top15_528: 1.0901, rec_entropy_top15_1056: 0.4717
  Bin 6 (users: 23337): rec_entropy_top15_final: 2.3733, rec_entropy_top15_240: 1.9863, rec_e

In [None]:
import pandas as pd

entropy_dir = '../result/rec/top_re/entropy'
top_ks = [15, 25, 35]

for top_k in top_ks:
    file_path = f"{entropy_dir}/merged_entropy_top{top_k}.csv"
    print(f"\n📂 Reading: {file_path}")
    
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded file with shape: {df.shape}")

        # Check required column
        if 'entropy' not in df.columns:
            print(f"❗ Skipping file: no 'entropy' column found.")
            continue

        # Sanity check per cluster
        print(f"=== Sanity check for top-{top_k} ===")
        for cluster_id in sorted(df["cluster"].dropna().astype(int).unique()):
            cluster_df = df[df["cluster"] == cluster_id]
            count = len(cluster_df)
            min_entropy = cluster_df["entropy"].min()
            max_entropy = cluster_df["entropy"].max()

            print(f"Cluster {cluster_id}: {count} points — Min entropy: {min_entropy:.4f}, Max entropy: {max_entropy:.4f}")

    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")


In [30]:
import pandas as pd

# Load the dataset
df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')

# Group by cluster and compute min and max entropy
cluster_entropy_stats = df.groupby('cluster')['entropy'].agg(['min', 'max']).reset_index()

# Print results
for _, row in cluster_entropy_stats.iterrows():
    print(f"Cluster {int(row['cluster'])} → 🔻 Min entropy: {row['min']:.4f}, 🔺 Max entropy: {row['max']:.4f}")


  df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')


Cluster 0 → 🔻 Min entropy: 0.4506, 🔺 Max entropy: 1.7918
Cluster 1 → 🔻 Min entropy: 1.7918, 🔺 Max entropy: 2.3513
Cluster 2 → 🔻 Min entropy: 2.3513, 🔺 Max entropy: 3.6515


In [None]:
import pandas as pd

# Load the main dataset
df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')

# Ensure cluster column is integer type
df['cluster'] = df['cluster'].astype(int)

# Split into 3 cluster-specific datasets
df_cluster_0 = df[df['cluster'] == 0].copy()
df_cluster_1 = df[df['cluster'] == 1].copy()
df_cluster_2 = df[df['cluster'] == 2].copy()

# Define the entropy columns
entropy_columns = ["RecEnt_final", "RecEnt_240", "RecEnt_528", "RecEnt_1056"]

# Organize clusters
clusters = {
    0: df[df['cluster'] == 0],
    1: df[df['cluster'] == 1],
    2: df[df['cluster'] == 2]
}

# Compute and print min/max for each cluster and column
for cluster_id, cluster_df in clusters.items():
    print(f"\n🧠 Cluster {cluster_id}:")
    for col in entropy_columns:
        col_min = cluster_df[col].min()
        col_max = cluster_df[col].max()
        print(f"   {col} → 🔻 Min: {col_min:.4f}, 🔺 Max: {col_max:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

# Load the updated dataset
df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')

# Output folder for base entropy visualizations
output_folder_base = '../result/rec/cluster_histograms/base'
os.makedirs(output_folder_base, exist_ok=True)

num_bins = 10

# Base entropy set
set_name = "base"
entropy_columns = ["RecEnt_final", "RecEnt_240", "RecEnt_528", "RecEnt_1056"]

for cluster_id in sorted(df["cluster"].dropna().astype(int).unique()):
    cluster_df = df[df["cluster"] == cluster_id].sort_values("entropy").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df) // num_bins)

    averages = {col: [] for col in entropy_columns}
    counts = []
    avg_entropy_per_bin = []

    for i in range(num_bins):
        start = i * users_per_bin
        end = (i + 1) * users_per_bin if i < num_bins - 1 else len(cluster_df)
        bin_df = cluster_df.iloc[start:end]
        counts.append(len(bin_df))
        avg_entropy_per_bin.append(bin_df["entropy"].mean())
        for col in entropy_columns:
            averages[col].append(bin_df[col].mean())

    # Plot
    x = np.arange(num_bins)
    width = 0.15

    plt.figure(figsize=(16, 6))
    for idx, col in enumerate(entropy_columns):
        plt.bar(x + (idx - len(entropy_columns)/2) * width, averages[col], width=width, label=col)

    plt.xlabel('User Bins (sorted by entropy)', fontsize=11)
    plt.ylabel('Average Recommendation Entropy', fontsize=11)
    plt.title(f'Cluster {cluster_id} - Entropy Across Bins ({set_name})', fontsize=13)
    plt.xticks(x, [f'Bin {i+1}\n({counts[i]})' for i in x], fontsize=10)
    plt.yticks(fontsize=10)
    plt.legend(fontsize=10)
    plt.tight_layout()

    # Save plot
    filename = os.path.join(output_folder_base, f"cluster_{cluster_id}_entropy_{set_name}.png")
    plt.savefig(filename)
    plt.close()

    # Save text file
    text_output_path = os.path.join(output_folder_base, f"cluster_{cluster_id}_entropy_{set_name}.txt")
    with open(text_output_path, 'w') as f:
        f.write(f"Cluster {cluster_id} - Average Entropy per Bin ({set_name})\n")
        f.write("Bin\tUserCount\tRawEntropy\t" + "\t".join(entropy_columns) + "\n")
        for i in range(num_bins):
            f.write(f"{i+1}\t{counts[i]}\t{avg_entropy_per_bin[i]:.4f}\t" +
                    "\t".join(f"{averages[col][i]:.4f}" for col in entropy_columns) + "\n")

print("\n✅ All base entropy plots and text files saved successfully.")
