In [2]:
pip install surprise

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install docx

In [None]:
from docx import Document

In [5]:
import pandas as pd


In [6]:
df_final = pd.read_csv("../data/df_final.csv") 
df_240 = pd.read_csv("../data/df_240.csv") 
df_528 = pd.read_csv("../data/df_528.csv")
df_1056 = pd.read_csv("../data/df_1056.csv")

In [3]:
df_final.head()

Unnamed: 0,user_id,app_id,category,rating
0,1,1,Strategy,4
1,2,2,Puzzle,3
2,3,3,Business,2
3,4,4,Simulation,1
4,5,5,Books & Reference,3


## Generating top n recommandation to users based on 4 diffrent datasets(original+ 3 biased)

In [9]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Load datasets
df_final = pd.read_csv("../data/df_final.csv")
df_240 = pd.read_csv("../data/df_240.csv")
df_528 = pd.read_csv("../data/df_528.csv")
df_1056 = pd.read_csv("../data/df_1056.csv")

# Define datasets
datasets = {
    "final": df_final,
    "240": df_240,
    "528": df_528,
    "1056": df_1056,
}

def train_and_recommend(train_df, test_users_df, k=50, top_k=25):
    # Create training user-app matrix
    train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)
    user_ids = train_matrix.index
    app_ids = train_matrix.columns

    # SVD
    train_sparse = csr_matrix(train_matrix.values)
    U, sigma, Vt = svds(train_sparse, k=k)
    sigma = np.diag(sigma)
    pred_matrix = np.dot(np.dot(U, sigma), Vt)
    pred_df = pd.DataFrame(pred_matrix, index=user_ids, columns=app_ids)

    # Already rated map
    train_rated = train_df.groupby('user_id')['app_id'].apply(set).to_dict()

    # Filter users and items present in both train and test
    test_users_filtered = test_users_df[test_users_df['user_id'].isin(user_ids)]

    # Generate top-K recommendations
    top_recs = []
    for user in test_users_filtered['user_id'].unique():
        if user not in pred_df.index:
            continue
        user_pred = pred_df.loc[user]
        rated_apps = train_rated.get(user, set())
        recommendations = user_pred.drop(index=rated_apps, errors='ignore').sort_values(ascending=False).head(top_k)
        for app_id, score in recommendations.items():
            top_recs.append({'user_id': user, 'app_id': app_id, 'predicted_rating': score})

    return pd.DataFrame(top_recs)

def merge_with_category_on_app_id(recommendations_df, df_final):
    df_final_subset = df_final[['app_id', 'category']].drop_duplicates()
    merged_df = pd.merge(recommendations_df, df_final_subset, on='app_id', how='left')
    return merged_df

# === Run SVD on each dataset separately for each top-k ===
for k in [15, 25, 35]:
    print(f"🚀 Starting run for top-{k} recommendations...")
    for name, train_df in datasets.items():
        print(f"🔁 Training on {name}, generating top-{k} recommendations for df_final users...")
        recs = train_and_recommend(train_df, df_final, k=50, top_k=k)
        recs_with_category = merge_with_category_on_app_id(recs, df_final)
        print(f"🔎 Number of recommendations for {name} top-{k}: {len(recs_with_category)}")

        filename = f"../result/rec/top_re/top{k}_{name}_with_category.csv"
        recs_with_category.to_csv(filename, index=False)
        print(f"✅ Saved: {filename}")


🚀 Starting run for top-15 recommendations...
🔁 Training on final, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for final top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_final_with_category.csv
🔁 Training on 240, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 240 top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_240_with_category.csv
🔁 Training on 528, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 528 top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_528_with_category.csv
🔁 Training on 1056, generating top-15 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 1056 top-15: 10501665
✅ Saved: ../result/rec/top_re/top15_1056_with_category.csv
🚀 Starting run for top-25 recommendations...
🔁 Training on final, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for final top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_final_with_category.csv
🔁 Training on 240, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 240 top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_240_with_category.csv
🔁 Training on 528, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 528 top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_528_with_category.csv
🔁 Training on 1056, generating top-25 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 1056 top-25: 17502775
✅ Saved: ../result/rec/top_re/top25_1056_with_category.csv
🚀 Starting run for top-35 recommendations...
🔁 Training on final, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for final top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_final_with_category.csv
🔁 Training on 240, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 240 top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_240_with_category.csv
🔁 Training on 528, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 528 top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_528_with_category.csv
🔁 Training on 1056, generating top-35 recommendations for df_final users...


  train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)


🔎 Number of recommendations for 1056 top-35: 24503885
✅ Saved: ../result/rec/top_re/top35_1056_with_category.csv


In [None]:
import pandas as pd

# Define datasets to analyze (new names for new data)
datasets = ["final", "240", "528", "1056"]

# Define top-k recommendation counts to process
top_ks = [15, 25, 35]

# Load user cluster assignments once (avoids reloading each time)
df_cluster = pd.read_csv("../preprocessing/df_cluster.csv")

# === Analysis 1: Cluster-Category Distribution ===

output_filename_cluster = "../result/rec/top_re/analysis/analysis_summary.txt"
with open(output_filename_cluster, "w") as f:
    for dataset_name in datasets:
        for top_k in top_ks:
            filename = f"../result/rec/top_re/top{top_k}_{dataset_name}_with_category.csv"
            print(f"\n📂 Processing recommendations from: {filename}")
            f.write(f"\n📂 Processing recommendations from: {filename}\n")

            try:
                recs = pd.read_csv(filename)
            except FileNotFoundError:
                msg = f"❌ File not found: {filename}"
                print(msg)
                f.write(msg + "\n")
                continue

            # Merge recommendations with user clusters on user_id
            merged = pd.merge(recs, df_cluster[['user_id', 'cluster']], on='user_id', how='left')

            # Group by cluster and category, count recommendations
            cluster_category_distribution = merged.groupby(['cluster', 'category']).size().reset_index(name='count')

            # Write analysis results for each cluster
            header = f"\n📂 Analysis for dataset {dataset_name}, top-{top_k}\n\n"
            print(header.strip())
            f.write(header)

            for cluster_num in sorted(cluster_category_distribution['cluster'].dropna().unique()):
                cluster_header = f"=== Dataset {dataset_name}, Top-{top_k}, Cluster {int(cluster_num)} ==="
                print(cluster_header)
                f.write(cluster_header + "\n")

                cluster_data = cluster_category_distribution[cluster_category_distribution['cluster'] == cluster_num]
                total_recs = cluster_data['count'].sum()
                
                for _, row in cluster_data.sort_values('category').iterrows():
                    category = row['category']
                    count = row['count']
                    percent = (count / total_recs) * 100
                    line = f"Category {category}: {count} recommendations ({percent:.2f}%)"
                    print(line)
                    f.write(line + "\n")

    print(f"\n✅ Saved combined cluster-category analysis: {output_filename_cluster}")

# === Analysis 2: Overall Category Distribution ===

output_filename_category = "../result/rec/top_re/analysis/category_summary_all.txt"
with open(output_filename_category, "w") as f:
    for dataset_name in datasets:
        for top_k in top_ks:
            filename = f"../result/rec/top_re/top{top_k}_{dataset_name}_with_category.csv"
            print(f"\n📂 Processing: {filename}")
            f.write(f"\n📂 Processing: {filename}\n")

            try:
                recs = pd.read_csv(filename)
            except FileNotFoundError:
                msg = f"❌ File not found: {filename}"
                print(msg)
                f.write(msg + "\n")
                continue

            # Group recommendations by category and count
            category_distribution = recs['category'].value_counts().sort_index()

            # Write absolute counts
            header = f"=== Dataset {dataset_name}, Top-{top_k}: Recommendation Count per Category ==="
            print(header)
            f.write(header + "\n")
            for category, count in category_distribution.items():
                line = f"Category {category}: {count} recommendations"
                print(line)
                f.write(line + "\n")

            # Write percentage distribution
            total_recs = category_distribution.sum()
            percent_header = f"\n=== Dataset {dataset_name}, Top-{top_k}: Percent Distribution ==="
            print(percent_header)
            f.write(percent_header + "\n")
            for category, count in category_distribution.items():
                percent = (count / total_recs) * 100
                line = f"Category {category}: {percent:.2f}%"
                print(line)
                f.write(line + "\n")

    print(f"\n✅ Saved combined category analysis: {output_filename_category}")



📂 Processing recommendations from: ../result/rec/top_re/top15_final_with_category.csv
📂 Analysis for dataset final, top-15
=== Dataset final, Top-15, Cluster 0 ===
Category Action: 295433 recommendations (8.44%)
Category Adventure: 61312 recommendations (1.75%)
Category Arcade: 28473 recommendations (0.81%)
Category Art & Design: 774 recommendations (0.02%)
Category Auto & Vehicles: 64294 recommendations (1.84%)
Category Beauty: 223 recommendations (0.01%)
Category Board: 59707 recommendations (1.71%)
Category Books & Reference: 112190 recommendations (3.20%)
Category Business: 40090 recommendations (1.15%)
Category Card: 69927 recommendations (2.00%)
Category Casino: 1120 recommendations (0.03%)
Category Casual: 215004 recommendations (6.14%)
Category Comics: 870 recommendations (0.02%)
Category Communication: 241326 recommendations (6.89%)
Category Dating: 20537 recommendations (0.59%)
Category Education: 28096 recommendations (0.80%)
Category Educational: 1 recommendations (0.00%)


### Computing recommendation entropy per user, per dataset(all 6)

In [None]:
# import pandas as pd
# import os

# # Paths
# input_dir = '../result/rec'
# cluster_file = '../preprocessing/df_cluster.csv'
# df_cluster = pd.read_csv(cluster_file)

# # Recommendation suffixes
# suffixes = ['final', '240', '528', '1056']

# # Load and prepare entropy data
# entropy_dfs = {}
# for suffix in suffixes:
#     df = globals()[f'RecEnt_df_{suffix}'].rename(columns={"recommendation_entropy": f"RecEnt_{suffix}"})
#     entropy_dfs[suffix] = df[['user_id', f"RecEnt_{suffix}"]]

# # Merge all entropy values into one user-level dataframe
# merged_user_entropy = df_cluster.copy()
# for suffix in suffixes:
#     merged_user_entropy = merged_user_entropy.merge(entropy_dfs[suffix], on="user_id", how="left")

# # Save this clean user-level dataframe
# merged_user_entropy.to_csv('../result/rec/user_level_entropy_with_cluster.csv', index=False)

# # Now attach user-level info to each recommendation file
# merged_recs = []

# for suffix in suffixes:
#     rec_file = os.path.join(input_dir, f'top25_df_{suffix}_with_category.csv')
#     rec_df = pd.read_csv(rec_file)
    
#     # Merge user-level info into recommendation-level file
#     rec_df = rec_df.merge(merged_user_entropy, on="user_id", how="left")
#     rec_df['source'] = suffix  # optional: track which recommendation version
#     merged_recs.append(rec_df)

# # Concatenate all into one final DataFrame
# full_merged_df = pd.concat(merged_recs, ignore_index=True)

# # Optional: reorder columns
# cols = ['user_id', 'item_id', 'category', 'cluster', 'entropy',
#         'RecEnt_final', 'RecEnt_240', 'RecEnt_528', 'RecEnt_1056', 'source']
# full_merged_df = full_merged_df[[c for c in cols if c in full_merged_df.columns]]

# # Save recommendation-level file (contains repetition — intended)
# full_merged_df.to_csv('../result/rec/merged_entropy_with_cluster.csv', index=False)

# # ✅ If needed: Deduplicated version of user-level info from rec data
# user_level_df = full_merged_df[[
#     'user_id', 'cluster', 'entropy',
#     'RecEnt_final', 'RecEnt_240', 'RecEnt_528', 'RecEnt_1056'
# ]].drop_duplicates()

# user_level_df.to_csv('../result/rec/merged_entropy_with_cluster.csv', index=False)

# # Preview
# print(user_level_df.head())


     user_id  cluster   entropy  RecEnt_final  RecEnt_240  RecEnt_528  \
0          1        2  2.809783      2.677950    2.619729    1.811208   
25         2        2  3.066907      2.553455    0.661477    0.899564   
50         3        2  3.063229      2.754332    2.243999    2.142310   
75         4        2  3.409559      2.643428    2.456143    1.873357   
100        5        2  3.427526      2.809783    2.698880    2.397922   

     RecEnt_1056  
0       0.958945  
25      0.690457  
50      1.356440  
75      0.849269  
100     1.241938  


In [None]:
import pandas as pd
from scipy.stats import entropy
import os

# === Compute entropy of recommendations per user based on category distribution ===
def compute_recommendation_entropy(df):
    # Count recommendations per user per category
    user_category_counts = df.groupby(['user_id', 'category']).size().unstack(fill_value=0)
    # Normalize counts to probabilities
    user_category_dist = user_category_counts.div(user_category_counts.sum(axis=1), axis=0)
    # Compute entropy
    user_entropy = user_category_dist.apply(lambda row: entropy(row), axis=1).reset_index()
    user_entropy.columns = ['user_id', 'recommendation_entropy']
    return user_entropy

# Current datasets to process
datasets = ['final', '240', '528', '1056']

# Top-Ks to process
top_ks = [15, 25, 35]

# Directories
input_dir = '../result/rec/top_re'
output_dir = '../result/rec/top_re/entropy'
os.makedirs(output_dir, exist_ok=True)
cluster_file = '../preprocessing/df_cluster.csv'

# Load cluster assignments once
df_cluster = pd.read_csv(cluster_file)

# Dictionary to store resulting entropy DataFrames
entropy_datasets = {}

# Loop through each top-K and each dataset
for top_k in top_ks:
    for dataset_name in datasets:
        suffix = f"top{top_k}_{dataset_name}"
        input_file = os.path.join(input_dir, f'{suffix}_with_category.csv')
        output_file = os.path.join(output_dir, f'entropy_{suffix}.csv')
        
        print(f"\n📂 Processing: {input_file}")
        if not os.path.exists(input_file):
            print(f"❌ File not found: {input_file}")
            continue

        # Load recommendations for current dataset and top-K
        df = pd.read_csv(input_file)

        # Compute entropy per user
        entropy_df = compute_recommendation_entropy(df)
        entropy_df.to_csv(output_file, index=False)
        print(f"✅ Saved entropy data: {output_file}")

        # Merge entropy with cluster assignments
        merged = pd.merge(df_cluster, entropy_df, on='user_id', how='inner')
        merged = merged[['user_id', 'cluster', 'recommendation_entropy']]

        # Save merged DataFrame to globals and dictionary
        globals()[f'RecEnt_{suffix}'] = merged
        entropy_datasets[f'RecEnt_{suffix}'] = merged

print("\n✅ All datasets and top-Ks processed successfully.")


In [19]:
RecEnt_df_1056.count()

user_id                   700111
cluster                   700111
entropy                   700111
recommendation_entropy    700111
dtype: int64

In [24]:
# Count users per cluster
cluster_counts = user_level_df['cluster'].value_counts().sort_index()

# Print nicely
print("👥 Number of users per cluster:")
for cluster_id, count in cluster_counts.items():
    print(f"Cluster {cluster_id}: {count} users")


👥 Number of users per cluster:
Cluster 0: 233371 users
Cluster 1: 233370 users
Cluster 2: 233370 users


#### Compute number of users per cluster from the original dataset 

## histogram

In [30]:
import pandas as pd

# Load the dataset
df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')

# Group by cluster and compute min and max entropy
cluster_entropy_stats = df.groupby('cluster')['entropy'].agg(['min', 'max']).reset_index()

# Print results
for _, row in cluster_entropy_stats.iterrows():
    print(f"Cluster {int(row['cluster'])} → 🔻 Min entropy: {row['min']:.4f}, 🔺 Max entropy: {row['max']:.4f}")


  df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')


Cluster 0 → 🔻 Min entropy: 0.4506, 🔺 Max entropy: 1.7918
Cluster 1 → 🔻 Min entropy: 1.7918, 🔺 Max entropy: 2.3513
Cluster 2 → 🔻 Min entropy: 2.3513, 🔺 Max entropy: 3.6515


In [None]:
import pandas as pd

# Load the main dataset
df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')

# Ensure cluster column is integer type
df['cluster'] = df['cluster'].astype(int)

# Split into 3 cluster-specific datasets
df_cluster_0 = df[df['cluster'] == 0].copy()
df_cluster_1 = df[df['cluster'] == 1].copy()
df_cluster_2 = df[df['cluster'] == 2].copy()

# Define the entropy columns
entropy_columns = ["RecEnt_final", "RecEnt_240", "RecEnt_528", "RecEnt_1056"]

# Organize clusters
clusters = {
    0: df[df['cluster'] == 0],
    1: df[df['cluster'] == 1],
    2: df[df['cluster'] == 2]
}

# Compute and print min/max for each cluster and column
for cluster_id, cluster_df in clusters.items():
    print(f"\n🧠 Cluster {cluster_id}:")
    for col in entropy_columns:
        col_min = cluster_df[col].min()
        col_max = cluster_df[col].max()
        print(f"   {col} → 🔻 Min: {col_min:.4f}, 🔺 Max: {col_max:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

# Load the updated dataset
df = pd.read_csv('../result/rec/merged_entropy_with_cluster.csv')

# Output folder for base entropy visualizations
output_folder_base = '../result/rec/cluster_histograms/base'
os.makedirs(output_folder_base, exist_ok=True)

num_bins = 10

# Base entropy set
set_name = "base"
entropy_columns = ["RecEnt_final", "RecEnt_240", "RecEnt_528", "RecEnt_1056"]

for cluster_id in sorted(df["cluster"].dropna().astype(int).unique()):
    cluster_df = df[df["cluster"] == cluster_id].sort_values("entropy").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df) // num_bins)

    averages = {col: [] for col in entropy_columns}
    counts = []
    avg_entropy_per_bin = []

    for i in range(num_bins):
        start = i * users_per_bin
        end = (i + 1) * users_per_bin if i < num_bins - 1 else len(cluster_df)
        bin_df = cluster_df.iloc[start:end]
        counts.append(len(bin_df))
        avg_entropy_per_bin.append(bin_df["entropy"].mean())
        for col in entropy_columns:
            averages[col].append(bin_df[col].mean())

    # Plot
    x = np.arange(num_bins)
    width = 0.15

    plt.figure(figsize=(16, 6))
    for idx, col in enumerate(entropy_columns):
        plt.bar(x + (idx - len(entropy_columns)/2) * width, averages[col], width=width, label=col)

    plt.xlabel('User Bins (sorted by entropy)', fontsize=11)
    plt.ylabel('Average Recommendation Entropy', fontsize=11)
    plt.title(f'Cluster {cluster_id} - Entropy Across Bins ({set_name})', fontsize=13)
    plt.xticks(x, [f'Bin {i+1}\n({counts[i]})' for i in x], fontsize=10)
    plt.yticks(fontsize=10)
    plt.legend(fontsize=10)
    plt.tight_layout()

    # Save plot
    filename = os.path.join(output_folder_base, f"cluster_{cluster_id}_entropy_{set_name}.png")
    plt.savefig(filename)
    plt.close()

    # Save text file
    text_output_path = os.path.join(output_folder_base, f"cluster_{cluster_id}_entropy_{set_name}.txt")
    with open(text_output_path, 'w') as f:
        f.write(f"Cluster {cluster_id} - Average Entropy per Bin ({set_name})\n")
        f.write("Bin\tUserCount\tRawEntropy\t" + "\t".join(entropy_columns) + "\n")
        for i in range(num_bins):
            f.write(f"{i+1}\t{counts[i]}\t{avg_entropy_per_bin[i]:.4f}\t" +
                    "\t".join(f"{averages[col][i]:.4f}" for col in entropy_columns) + "\n")

print("\n✅ All base entropy plots and text files saved successfully.")
