In [1]:
pip install surprise

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from docx import Document

In [2]:
# Read the original ratings table and the three derived variants, in that order.
df_final, df_240, df_528, df_1056 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/df_final.csv",
        "../data/df_240.csv",
        "../data/df_528.csv",
        "../data/df_1056.csv",
    )
)

In [3]:
# Peek at the first five rows of the original ratings table.
df_final.head(n=5)

Unnamed: 0,user_id,app_id,category,rating
0,1,1,Strategy,4
1,2,2,Puzzle,3
2,3,3,Business,2
3,4,4,Simulation,1
4,5,5,Books & Reference,3


## Generating top-n recommendations for users based on 4 different datasets (original + 3 biased)

In [None]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Load every training variant once, keyed by dataset name.
datasets = {
    "df_final": pd.read_csv("../data/df_final.csv"),
    "df_240": pd.read_csv("../data/df_240.csv"),
    "df_528": pd.read_csv("../data/df_528.csv"),
    "df_1056": pd.read_csv("../data/df_1056.csv"),
}

# Keep the individual frames reachable under their usual module-level names.
df_final, df_240, df_528, df_1056 = datasets.values()

def train_and_recommend(train_df, test_users_df, k=50, top_k=25):
    """Fit a truncated SVD on ``train_df`` and return top-``top_k`` unseen apps per user.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``user_id``, ``app_id`` and ``rating`` columns. Each
        (user_id, app_id) pair may appear at most once (``DataFrame.pivot``
        raises ``ValueError`` on duplicates).
    test_users_df : pandas.DataFrame
        Must contain a ``user_id`` column. Only users that also occur in
        ``train_df`` receive recommendations; the others are skipped.
    k : int, default 50
        Requested number of latent factors. It is reduced to
        ``min(n_users, n_apps) - 1`` when the rating matrix is too small,
        because ``svds`` cannot compute a full-rank decomposition.
    top_k : int, default 25
        Maximum number of recommendations returned per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``app_id``, ``predicted_rating``. The columns are
        present even when no recommendation could be produced.

    Raises
    ------
    ValueError
        If the rating matrix has fewer than two users or two apps.
    """
    rec_columns = ['user_id', 'app_id', 'predicted_rating']

    # Dense user x app rating matrix; unrated cells are treated as 0.
    train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)
    user_ids = train_matrix.index
    app_ids = train_matrix.columns

    # svds needs 1 <= k < min(shape); clamp instead of failing on small data.
    max_factors = min(train_matrix.shape) - 1
    if max_factors < 1:
        raise ValueError(
            "train_df must contain at least 2 distinct users and 2 distinct apps "
            f"for SVD; got a {train_matrix.shape[0]} x {train_matrix.shape[1]} matrix"
        )
    n_factors = min(k, max_factors)

    # svds rejects integer matrices, which happens when no cell was NaN above.
    train_sparse = csr_matrix(train_matrix.to_numpy(dtype=float))
    U, sigma, Vt = svds(train_sparse, k=n_factors)

    # Already rated map
    train_rated = train_df.groupby('user_id')['app_id'].apply(set).to_dict()

    # Row position of every trained user inside U.
    row_of_user = {uid: pos for pos, uid in enumerate(user_ids)}

    # Generate top-K recommendations. Scores are reconstructed one user at a
    # time so the full users x apps prediction matrix is never materialised.
    top_recs = []
    for user in test_users_df['user_id'].unique():
        pos = row_of_user.get(user)
        if pos is None:
            # User has no training interactions, so there is no latent vector.
            continue
        user_pred = pd.Series((U[pos] * sigma) @ Vt, index=app_ids)
        rated_apps = list(train_rated.get(user, ()))
        recommendations = (
            user_pred.drop(index=rated_apps, errors='ignore')
            .sort_values(ascending=False)
            .head(top_k)
        )
        for app_id, score in recommendations.items():
            top_recs.append({'user_id': user, 'app_id': app_id, 'predicted_rating': score})

    return pd.DataFrame(top_recs, columns=rec_columns)

def merge_with_category_on_app_id(recommendations_df, df_final):
    """Left-join each recommendation with the category of its ``app_id``.

    The lookup is the set of distinct (app_id, category) pairs found in
    ``df_final``. Recommendations whose app has no entry keep a missing
    category.
    """
    category_lookup = df_final.loc[:, ['app_id', 'category']].drop_duplicates()
    return recommendations_df.merge(category_lookup, how='left', on='app_id')

# Run SVD on each dataset, always test on df_final users
output_dir = "../result/rec"
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before the (expensive) training loop starts.
os.makedirs(output_dir, exist_ok=True)

for name, train_df in datasets.items():
    print(f"Training on {name}, generating top-25 recommendations for df_final users...")
    recs = train_and_recommend(train_df, df_final, k=50, top_k=25)
    recs_with_category = merge_with_category_on_app_id(recs, df_final)

    # One CSV per training dataset, named after the dataset key
    filename = f"{output_dir}/top25_{name}_with_category.csv"
    recs_with_category.to_csv(filename, index=False)
    print(f"Saved: {filename}")

Training on df_1056, generating top-25 recommendations for df_final users...




Saved: ../result/rec/top25_df_1056_with_category.csv


### Computing recommendation entropy per user, per dataset (all 6)

In [None]:
import pandas as pd
import os

# Paths
input_dir = '../result/rec'
cluster_file = '../preprocessing/df_cluster.csv'
df_cluster = pd.read_csv(cluster_file)

# Recommendation suffixes
suffixes = ['final', '240', '528', '1056']

# Load and prepare entropy data
# NOTE(review): the RecEnt_df_<suffix> frames are looked up in the kernel
# namespace and are not created in this cell; they must already exist and
# expose 'user_id' and 'recommendation_entropy' columns, or this raises KeyError.
entropy_dfs = {}
for suffix in suffixes:
    df = globals()[f'RecEnt_df_{suffix}'].rename(columns={"recommendation_entropy": f"RecEnt_{suffix}"})
    entropy_dfs[suffix] = df[['user_id', f"RecEnt_{suffix}"]]

# Merge all entropy values into one user-level dataframe.
# Restrict to the user-level columns and de-duplicate (as the histogram cells
# do) so repeated rows in df_cluster cannot multiply recommendation rows below.
merged_user_entropy = df_cluster[['user_id', 'cluster', 'entropy']].drop_duplicates()
for suffix in suffixes:
    merged_user_entropy = merged_user_entropy.merge(entropy_dfs[suffix], on="user_id", how="left")

# Preview:
# ['user_id', 'cluster', 'entropy', 'RecEnt_final', 'RecEnt_240', 'RecEnt_528', 'RecEnt_1056']

# Now attach to each recommendation file
merged_recs = []

for suffix in suffixes:
    rec_file = os.path.join(input_dir, f'top25_df_{suffix}_with_category.csv')
    rec_df = pd.read_csv(rec_file)

    rec_df = rec_df.merge(merged_user_entropy, on="user_id", how="left")
    rec_df['source'] = suffix  # optional: to track which set the record came from
    merged_recs.append(rec_df)

# Concatenate all into one final DataFrame
full_merged_df = pd.concat(merged_recs, ignore_index=True)

# Optional: reorder columns.
# The recommendation files identify apps with 'app_id'; listing 'item_id' here
# silently removed the app identifier from the saved output.
cols = ['user_id', 'app_id', 'category', 'cluster', 'entropy',
        'RecEnt_final', 'RecEnt_240', 'RecEnt_528', 'RecEnt_1056', 'source']
full_merged_df = full_merged_df[[c for c in cols if c in full_merged_df.columns]]

# Save result
full_merged_df.to_csv('../result/rec/merged_entropy_with_cluster.csv', index=False)

# Preview
print(full_merged_df.head())


In [10]:
# Peek at the first five rows of the 1056-variant entropy table.
RecEnt_df_1056.head(n=5)

Unnamed: 0,user_id,cluster,entropy,Rec_1056
0,686826,0,0.450561,0.167944
1,536830,0,0.450561,0.81997
2,433075,0,0.450561,0.0
3,55674,0,0.500402,0.592953
4,620088,0,0.500402,0.167944


In [16]:
# Tally rows per cluster label, ordered by label
cluster_counts = merged_entropy_df['cluster'].value_counts().sort_index()

# Report one line per cluster
print("👥 Number of users per cluster:")
for label, n_users in zip(cluster_counts.index, cluster_counts):
    print(f"Cluster {label}: {n_users} users")


👥 Number of users per cluster:
Cluster 0: 233371 users
Cluster 1: 233370 users
Cluster 2: 233370 users


In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# === Merge cluster info with RecEnt_df_final ===
# NOTE(review): df_cluster and RecEnt_df_final must already exist in the kernel.
user_cluster_df = df_cluster[['user_id', 'cluster', 'entropy']].drop_duplicates()
merged_df = pd.merge(RecEnt_df_final[['user_id', 'recommendation_entropy']], user_cluster_df, on='user_id', how='inner')

# === Paths ===
# Relative to the notebook, like every other path in this file, instead of a
# machine-specific absolute /home/... location.
# NOTE(review): presumably resolves to the same result/rec/cluster_histograms/base
# folder as before when run from the notebook's directory - confirm.
output_folder = '../result/rec/cluster_histograms/base/'
os.makedirs(output_folder, exist_ok=True)

# === Parameters ===
num_bins = 10
entropy_column = 'recommendation_entropy'

# === Loop per cluster ===
for cluster_id in sorted(merged_df["cluster"].dropna().astype(int).unique()):
    # Order the cluster's users by their base entropy before slicing into bins
    cluster_df = merged_df[merged_df["cluster"] == cluster_id].sort_values("entropy").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df) // num_bins)

    avg_entropy_bins = []
    user_counts = []

    for i in range(num_bins):
        start = i * users_per_bin
        # The last bin absorbs whatever remains after the integer division
        end = (i + 1) * users_per_bin if i < num_bins - 1 else len(cluster_df)
        bin_df = cluster_df.iloc[start:end]
        # An empty bin (cluster smaller than num_bins) yields NaN here
        avg_entropy_bins.append(bin_df[entropy_column].mean())
        user_counts.append(len(bin_df))

    # === Plot ===
    x = np.arange(num_bins)
    width = 0.6

    plt.figure(figsize=(12, 5))
    plt.bar(x, avg_entropy_bins, width=width, color='steelblue')
    plt.xlabel('User Bins (sorted by base entropy)', fontsize=11)
    plt.ylabel('Avg. Recommendation Entropy', fontsize=11)
    plt.title(f'Cluster {cluster_id} - Avg. Recommendation Entropy per Bin', fontsize=13)
    plt.xticks(x, [f'Bin {i+1}\n({user_counts[i]})' for i in range(num_bins)], fontsize=10)
    plt.yticks(fontsize=10)
    plt.tight_layout()

    # Save plot; close the figure so figures do not accumulate across clusters
    plot_filename = os.path.join(output_folder, f"cluster_{cluster_id}_rec_entropy.png")
    plt.savefig(plot_filename)
    plt.close()

    # === Save text output ===
    text_output_path = os.path.join(output_folder, f"cluster_{cluster_id}_rec_entropy.txt")
    with open(text_output_path, 'w') as f:
        f.write(f"Cluster {cluster_id} - Avg. Recommendation Entropy per Bin\n")
        f.write("Bin\tUserCount\tAvgRecEntropy\n")
        for i in range(num_bins):
            f.write(f"{i+1}\t{user_counts[i]}\t{avg_entropy_bins[i]:.4f}\n")

print("\n✅ All cluster plots + text files saved to:", output_folder)



✅ All cluster plots + text files saved to: /home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/cluster_histograms/base/


## Histogram

In [25]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# === Load merged mobile dataset (with all entropy columns) ===
merged_df = pd.read_csv('../preprocessing/merged_df_mobiles.csv')

# === Output folder ===
output_folder = '../result/rec/cluster_entropy_bins_mobile_grouped'
os.makedirs(output_folder, exist_ok=True)

# === Parameters ===
num_bins = 10
rec_entropy_cols = ['rec_entropy_final', 'rec_entropy_240', 'rec_entropy_528', 'rec_entropy_1056']

# === Loop through each cluster ===
for cluster_id in sorted(merged_df["cluster"].dropna().astype(int).unique()):
    cluster_df = merged_df[merged_df["cluster"] == cluster_id]

    # Sort users by original entropy
    cluster_df_sorted = cluster_df.sort_values("entropy").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df_sorted) // num_bins)

    cluster_entropy_min = cluster_df["entropy"].min()
    cluster_entropy_max = cluster_df["entropy"].max()

    # === Initialize bin-level results ===
    bin_results = {col: [] for col in rec_entropy_cols}
    user_counts = []

    for i in range(num_bins):
        start = i * users_per_bin
        # The last bin absorbs whatever remains after the integer division
        end = (i + 1) * users_per_bin if i < num_bins - 1 else len(cluster_df_sorted)
        bin_df = cluster_df_sorted.iloc[start:end]
        user_counts.append(len(bin_df))
        for col in rec_entropy_cols:
            bin_results[col].append(bin_df[col].mean())

    # === Print summary ===
    print(f"📊 Cluster {cluster_id} | Base Entropy Range: {cluster_entropy_min:.4f} → {cluster_entropy_max:.4f}")

    # === Plot grouped bar chart ===
    x = np.arange(num_bins)
    width = 0.18  # Width of each bar
    # Centre the group of bars on each tick for any number of series; for the
    # four columns above this gives -1.5w, -0.5w, 0.5w, 1.5w.
    n_series = len(rec_entropy_cols)
    offsets = (np.arange(n_series) - (n_series - 1) / 2) * width

    plt.figure(figsize=(14, 6))
    for i, col in enumerate(rec_entropy_cols):
        plt.bar(x + offsets[i], bin_results[col], width=width, label=col)

    plt.xlabel('User Bins (sorted by base entropy)', fontsize=11)
    plt.ylabel('Average Recommendation Entropy', fontsize=11)
    plt.title(f'Mobile Dataset - Cluster {cluster_id}: Entropy Comparison per Bin', fontsize=13)
    plt.xticks(x, [f'Bin {i+1}\n({user_counts[i]})' for i in range(num_bins)], fontsize=10)
    plt.yticks(fontsize=10)
    plt.legend(fontsize=10)
    plt.tight_layout()

    # === Save plot ===
    plot_filename = os.path.join(output_folder, f"cluster_{cluster_id}_grouped_entropy.png")
    plt.savefig(plot_filename)
    plt.close()

    # === Save text summary ===
    text_output_path = os.path.join(output_folder, f"cluster_{cluster_id}_grouped_entropy.txt")
    # The summary contains a non-ASCII arrow; pin UTF-8 so the write does not
    # depend on (or fail under) the platform's default text encoding.
    with open(text_output_path, 'w', encoding='utf-8') as f:
        f.write(f"Cluster {cluster_id} - All Entropy Columns per Bin\n")
        f.write(f"Base Entropy Range: {cluster_entropy_min:.4f} → {cluster_entropy_max:.4f}\n\n")
        f.write("Bin\tUserCount\t" + "\t".join(rec_entropy_cols) + "\n")
        for i in range(num_bins):
            line = f"{i+1}\t{user_counts[i]}\t" + "\t".join(f"{bin_results[col][i]:.4f}" for col in rec_entropy_cols)
            f.write(line + "\n")

print("\n✅ Grouped mobile entropy plots and summaries saved to:", output_folder)


📊 Cluster 0 | Base Entropy Range: 0.4506 → 1.7918
📊 Cluster 1 | Base Entropy Range: 1.7918 → 2.3513
📊 Cluster 2 | Base Entropy Range: 2.3513 → 3.6515

✅ Grouped mobile entropy plots and summaries saved to: ../result/rec/cluster_entropy_bins_mobile_grouped


In [26]:
import pandas as pd

# === Read the merged mobile dataset ===
merged_df = pd.read_csv('../preprocessing/merged_df_mobiles.csv')

# === Columns to report: base entropy first, then each recommendation entropy ===
rec_entropy_cols = ['rec_entropy_final', 'rec_entropy_240', 'rec_entropy_528', 'rec_entropy_1056']
summary_cols = ['entropy', *rec_entropy_cols]

# === Report the per-cluster mean of every column ===
print("\n📊 Average Entropy Values per Cluster:\n")

# Cluster labels do not change between columns, so resolve them once
cluster_ids = sorted(merged_df['cluster'].dropna().astype(int).unique())

for col in summary_cols:
    print(f"\n➡️ {col} (Mean per Cluster):")
    for cluster_id in cluster_ids:
        in_cluster = merged_df['cluster'] == cluster_id
        mean_val = merged_df.loc[in_cluster, col].mean()
        print(f"  Cluster {cluster_id}: {mean_val:.4f}")



📊 Average Entropy Values per Cluster:


➡️ entropy (Mean per Cluster):
  Cluster 0: 1.5666
  Cluster 1: 1.9818
  Cluster 2: 3.0335

➡️ rec_entropy_final (Mean per Cluster):
  Cluster 0: 2.6807
  Cluster 1: 2.6829
  Cluster 2: 2.6928

➡️ rec_entropy_240 (Mean per Cluster):
  Cluster 0: 1.8479
  Cluster 1: 2.0584
  Cluster 2: 2.2488

➡️ rec_entropy_528 (Mean per Cluster):
  Cluster 0: 0.9822
  Cluster 1: 1.1587
  Cluster 2: 1.5431

➡️ rec_entropy_1056 (Mean per Cluster):
  Cluster 0: 0.4456
  Cluster 1: 0.5911
  Cluster 2: 1.0723


In [28]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# === Join per-user recommendation entropy with cluster membership ===
user_cluster_df = df_cluster[['user_id', 'cluster', 'entropy']].drop_duplicates()
merged_df = RecEnt_df_final[['user_id', 'recommendation_entropy']].merge(
    user_cluster_df, on='user_id', how='inner'
)

# === Output location ===
output_folder = '../result/rec/cluster_histograms/recentropy'
os.makedirs(output_folder, exist_ok=True)

# === Settings ===
num_bins = 10
entropy_column = 'recommendation_entropy'

# === One bar chart and one text table per cluster ===
cluster_ids = sorted(merged_df["cluster"].dropna().astype(int).unique())
for cluster_id in cluster_ids:
    members = merged_df.loc[merged_df["cluster"] == cluster_id]
    cluster_df = members.sort_values("entropy").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df) // num_bins)

    # Equal-width slices over the sorted users; the final slice runs to the end
    bin_starts = [b * users_per_bin for b in range(num_bins)]
    bin_ends = bin_starts[1:] + [len(cluster_df)]
    bin_frames = [cluster_df.iloc[lo:hi] for lo, hi in zip(bin_starts, bin_ends)]

    avg_entropy_bins = [frame[entropy_column].mean() for frame in bin_frames]
    user_counts = [len(frame) for frame in bin_frames]

    # --- Figure ---
    positions = np.arange(num_bins)
    tick_labels = [f'Bin {b + 1}\n({n_users})' for b, n_users in enumerate(user_counts)]

    plt.figure(figsize=(12, 5))
    plt.bar(positions, avg_entropy_bins, width=0.6, color='steelblue')
    plt.xlabel('User Bins (sorted by base entropy)', fontsize=11)
    plt.ylabel('Avg. Recommendation Entropy', fontsize=11)
    plt.title(f'Cluster {cluster_id} - Avg. Recommendation Entropy per Bin', fontsize=13)
    plt.xticks(positions, tick_labels, fontsize=10)
    plt.yticks(fontsize=10)
    plt.tight_layout()

    plot_filename = os.path.join(output_folder, f"cluster_{cluster_id}_rec_entropy.png")
    plt.savefig(plot_filename)
    plt.close()

    # --- Tab-separated table ---
    report_lines = [
        f"Cluster {cluster_id} - Avg. Recommendation Entropy per Bin\n",
        "Bin\tUserCount\tAvgRecEntropy\n",
    ]
    report_lines.extend(
        f"{b + 1}\t{n_users}\t{avg:.4f}\n"
        for b, (n_users, avg) in enumerate(zip(user_counts, avg_entropy_bins))
    )
    text_output_path = os.path.join(output_folder, f"cluster_{cluster_id}_rec_entropy.txt")
    with open(text_output_path, 'w') as report:
        report.writelines(report_lines)

print("\n✅ All cluster plots + text files saved to:", output_folder)



✅ All cluster plots + text files saved to: ../result/rec/cluster_histograms/recentropy
