In [1]:
pip install surprise

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from docx import Document

In [3]:
df_final = pd.read_csv("../data/df_final.csv") 
df_240 = pd.read_csv("../data/df_240.csv") 
df_528 = pd.read_csv("../data/df_528.csv")
df_1056 = pd.read_csv("../data/df_1056.csv")

In [4]:
df_final.head()

Unnamed: 0,user_id,app_id,category,rating
0,1,1,Strategy,4
1,2,2,Puzzle,3
2,3,3,Business,2
3,4,4,Simulation,1
4,5,5,Books & Reference,3


## Generating top n recommandation to users based on 4 diffrent datasets(original+ 3 biased)

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import numpy as np

# Load datasets
df_final = pd.read_csv("../data/df_final.csv")
df_240 = pd.read_csv("../data/df_240.csv")
df_528 = pd.read_csv("../data/df_528.csv")
df_1056 = pd.read_csv("../data/df_1056.csv")

# Define datasets
datasets = {
    "df_final": df_final,
    "df_240": df_240,
    "df_528": df_528,
    'df_1056':df_1056
}

def train_and_recommend(train_df, test_users_df, k=50, top_k=25):
    # Create training user-item matrix
    train_matrix = train_df.pivot(index='user_id', columns='app_id', values='rating').fillna(0)
    user_ids = train_matrix.index
    app_ids = train_matrix.columns

    # SVD
    train_sparse = csr_matrix(train_matrix.values)
    U, sigma, Vt = svds(train_sparse, k=k)
    sigma = np.diag(sigma)
    pred_matrix = np.dot(np.dot(U, sigma), Vt)
    pred_df = pd.DataFrame(pred_matrix, index=user_ids, columns=app_ids)

    # Already rated map
    train_rated = train_df.groupby('user_id')['app_id'].apply(set).to_dict()

    # Filter users and items present in both train and test
    test_users_filtered = test_users_df[test_users_df['user_id'].isin(user_ids)]

    # Generate top-K recommendations
    top_recs = []
    for user in test_users_filtered['user_id'].unique():
        if user not in pred_df.index:
            continue
        user_pred = pred_df.loc[user]
        rated_apps = train_rated.get(user, set())
        recommendations = user_pred.drop(index=rated_apps, errors='ignore').sort_values(ascending=False).head(top_k)
        for app_id, score in recommendations.items():
            top_recs.append({'user_id': user, 'app_id': app_id, 'predicted_rating': score})

    return pd.DataFrame(top_recs)

def merge_with_category_on_app_id(recommendations_df, df_final):
    df_final_subset = df_final[['app_id', 'category']].drop_duplicates()
    merged_df = pd.merge(recommendations_df, df_final_subset, on='app_id', how='left')
    return merged_df

# Run SVD on each dataset, always test on df_final users
for name, train_df in datasets.items():
    print(f"Training on {name}, generating top-25 recommendations for df_final users...")
    recs = train_and_recommend(train_df, df_final, k=50, top_k=25)
    recs_with_category = merge_with_category_on_app_id(recs, df_final)

    # Save to correct filename
    filename = f"../result/rec/top25_{name}_with_category.csv"
    recs_with_category.to_csv(filename, index=False)
    print(f"Saved: {filename}")

Training on df_1056, generating top-25 recommendations for df_final users...




Saved: ../result/rec/top25_df_1056_with_category.csv


### Computing recommendation entropy per user, per dataset(all 6)

In [5]:
import pandas as pd
from scipy.stats import entropy
import os

# === Function to compute entropy based on app categories ===
def compute_recommendation_entropy(df):
    user_category_counts = df.groupby(['user_id', 'category']).size().unstack(fill_value=0)
    user_category_dist = user_category_counts.div(user_category_counts.sum(axis=1), axis=0)
    user_entropy = user_category_dist.apply(lambda row: entropy(row), axis=1).reset_index()
    user_entropy.columns = ['user_id', 'recommendation_entropy']
    return user_entropy

# === File suffixes for different mobile app recommendation sets ===
file_suffixes = ['final', '240', '528', '1056']  # Match to your filenames: top25_df_240_with_category.csv

# === Paths ===
input_dir = '../result/rec'
output_dir = '../result/rec'
cluster_file = '../preprocessing/df_cluster.csv'  # Make sure this matches your cluster file path

# === Load cluster file once ===
df_cluster = pd.read_csv(cluster_file)

# === Loop through each top-25 recommendation file and compute entropy ===
entropy_datasets = {}

for suffix in file_suffixes:
    input_file = os.path.join(input_dir, f'top25_df_{suffix}_with_category.csv')
    output_file = os.path.join(output_dir, f'entropy_top25_df_{suffix}.csv')
    
    df = pd.read_csv(input_file)
    entropy_df = compute_recommendation_entropy(df)
    entropy_df.to_csv(output_file, index=False)
    
    # Merge entropy values with user clusters
    merged = pd.merge(df_cluster, entropy_df, on='user_id', how='inner')
    merged = merged[['user_id', 'cluster', 'entropy', 'recommendation_entropy']]
    
    # Store result
    globals()[f'RecEnt_df_{suffix}'] = merged
    entropy_datasets[f'RecEnt_df_{suffix}'] = merged

# === Example output for one user (change ID as needed) ===
print(RecEnt_df_final[RecEnt_df_final['user_id'] == 1])


        user_id  cluster   entropy  recommendation_entropy
533205        1        2  2.809783                 2.67795


In [None]:
import pandas as pd
from scipy.stats import entropy
import os

# === Function to compute entropy based on app categories ===
def compute_recommendation_entropy(df):
    user_category_counts = df.groupby(['user_id', 'category']).size().unstack(fill_value=0)
    user_category_dist = user_category_counts.div(user_category_counts.sum(axis=1), axis=0)
    user_entropy = user_category_dist.apply(lambda row: entropy(row), axis=1).reset_index()
    user_entropy.columns = ['user_id', 'recommendation_entropy']
    return user_entropy

# === Setup ===
file_suffixes = ['final', '240', '528', '1056']
input_dir = '../result/rec'
output_dir = '../result/rec'
cluster_file = '../preprocessing/df_cluster.csv'

# === Load cluster info ===
df_cluster = pd.read_csv(cluster_file)

# === Main loop ===
for suffix in file_suffixes:
    input_file = os.path.join(input_dir, f'top25_df_{suffix}_with_category.csv')
    df = pd.read_csv(input_file)

    # Compute entropy
    entropy_df = compute_recommendation_entropy(df)

    # Merge with recommendations
    merged = pd.merge(df, entropy_df, on='user_id', how='left')

    # Merge with cluster info
    merged = pd.merge(merged, df_cluster, on='user_id', how='left')

    # Keep only user_id, app_id, recommendation_entropy, cluster
    merged = merged[['user_id', 'app_id', 'recommendation_entropy', 'cluster']]

    # Save to variable
    globals()[f"RecEnt_df_{suffix}"] = merged


In [6]:
RecEnt_df_1056.head()

Unnamed: 0,user_id,cluster,entropy,recommendation_entropy
0,686826,0,0.450561,0.167944
1,536830,0,0.450561,0.81997
2,433075,0,0.450561,0.0
3,55674,0,0.500402,0.592953
4,620088,0,0.500402,0.167944


In [15]:
# Step 1: Rename recommendation_entropy column in each DataFrame
RecEnt_df_final = RecEnt_df_final.rename(columns={"recommendation_entropy": "final"})
RecEnt_df_240 = RecEnt_df_240.rename(columns={"recommendation_entropy": "240"})
RecEnt_df_528 = RecEnt_df_528.rename(columns={"recommendation_entropy": "528"})
RecEnt_df_1056 = RecEnt_df_1056.rename(columns={"recommendation_entropy": "1056"})

# Step 2: Keep only necessary columns
RecEnt_df_final = RecEnt_df_final[['user_id', 'cluster', 'final']]
RecEnt_df_240 = RecEnt_df_240[['user_id', '240']]
RecEnt_df_528 = RecEnt_df_528[['user_id', '528']]
RecEnt_df_1056 = RecEnt_df_1056[['user_id', '1056']]

# Step 3: Merge all on user_id
merged_entropy_df = RecEnt_df_final \
    .merge(RecEnt_df_240, on="user_id", how="outer") \
    .merge(RecEnt_df_528, on="user_id", how="outer") \
    .merge(RecEnt_df_1056, on="user_id", how="outer")

# Step 4: Optional – sort or fill NaNs
# merged_entropy_df = merged_entropy_df.fillna(0)
merged_entropy_df = merged_entropy_df.sort_values("user_id")

# Step 5:
merged_entropy_df.to_csv("../result//rec/merged_entropy_per_user.csv", index=False)

# Preview
print(merged_entropy_df.head())


        user_id  cluster     final       240       528      1056
533205        1        2  2.677950  2.619729  1.811208  0.958945
575670        2        2  2.553455  0.661477  0.899564  0.690457
575074        3        2  2.754332  2.243999  2.142310  1.356440
668988        4        2  2.643428  2.456143  1.873357  0.849269
675699        5        2  2.809783  2.698880  2.397922  1.241938


In [16]:
# Count users per cluster
cluster_counts = merged_entropy_df['cluster'].value_counts().sort_index()

# Print nicely
print("👥 Number of users per cluster:")
for cluster_id, count in cluster_counts.items():
    print(f"Cluster {cluster_id}: {count} users")


👥 Number of users per cluster:
Cluster 0: 233371 users
Cluster 1: 233370 users
Cluster 2: 233370 users


In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# === Merge cluster info with RecEnt_df_final ===
user_cluster_df = df_cluster[['user_id', 'cluster', 'entropy']].drop_duplicates()
merged_df = pd.merge(RecEnt_df_final[['user_id', 'recommendation_entropy']], user_cluster_df, on='user_id', how='inner')

# === Paths ===
output_folder = '/home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/cluster_histograms/base/'
os.makedirs(output_folder, exist_ok=True)

# === Parameters ===
num_bins = 10
entropy_column = 'recommendation_entropy'

# === Loop per cluster ===
for cluster_id in sorted(merged_df["cluster"].dropna().astype(int).unique()):
    cluster_df = merged_df[merged_df["cluster"] == cluster_id].sort_values("entropy").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df) // num_bins)

    avg_entropy_bins = []
    user_counts = []

    for i in range(num_bins):
        start = i * users_per_bin
        end = (i + 1) * users_per_bin if i < num_bins - 1 else len(cluster_df)
        bin_df = cluster_df.iloc[start:end]
        avg_entropy_bins.append(bin_df[entropy_column].mean())
        user_counts.append(len(bin_df))

    # === Plot ===
    x = np.arange(num_bins)
    width = 0.6

    plt.figure(figsize=(12, 5))
    plt.bar(x, avg_entropy_bins, width=width, color='steelblue')
    plt.xlabel('User Bins (sorted by base entropy)', fontsize=11)
    plt.ylabel('Avg. Recommendation Entropy', fontsize=11)
    plt.title(f'Cluster {cluster_id} - Avg. Recommendation Entropy per Bin', fontsize=13)
    plt.xticks(x, [f'Bin {i+1}\n({user_counts[i]})' for i in range(num_bins)], fontsize=10)
    plt.yticks(fontsize=10)
    plt.tight_layout()

    # Save plot
    plot_filename = os.path.join(output_folder, f"cluster_{cluster_id}_rec_entropy.png")
    plt.savefig(plot_filename)
    plt.close()

    # === Save text output ===
    text_output_path = os.path.join(output_folder, f"cluster_{cluster_id}_rec_entropy.txt")
    with open(text_output_path, 'w') as f:
        f.write(f"Cluster {cluster_id} - Avg. Recommendation Entropy per Bin\n")
        f.write("Bin\tUserCount\tAvgRecEntropy\n")
        for i in range(num_bins):
            f.write(f"{i+1}\t{user_counts[i]}\t{avg_entropy_bins[i]:.4f}\n")

print("\n✅ All cluster plots + text files saved to:", output_folder)



✅ All cluster plots + text files saved to: /home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/cluster_histograms/base/


## histogram

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

# === Paths ===
input_csv = "../result/rec/merged_entropy_per_user.csv"
output_folder_base = "/home/moshtasa/Research/phd-svd-recsys/Recommender Systems/phd-svd-recsys/MobileRec/result/rec/cluster_histograms/base"
os.makedirs(output_folder_base, exist_ok=True)

# === Load merged data ===
df = pd.read_csv(input_csv)
df["cluster"] = df["cluster"].astype(int)
num_bins = 10

# === Define entropy sets ===
entropy_sets = {
    "base": ["final", "240", "528", "1056"]
}

# === Loop per cluster and entropy set ===
for cluster_id in sorted(df["cluster"].dropna().astype(int).unique()):
    cluster_df = df[df["cluster"] == cluster_id].sort_values("final").reset_index(drop=True)
    users_per_bin = max(1, len(cluster_df) // num_bins)

    for set_name, entropy_columns in entropy_sets.items():
        missing_cols = [col for col in entropy_columns if col not in df.columns]
        if missing_cols:
            print(f"Skipping {set_name} — missing columns: {missing_cols}")
            continue

        averages = {col: [] for col in entropy_columns}
        counts = []
        avg_raw_entropy = []

        for i in range(num_bins):
            start = i * users_per_bin
            end = (i + 1) * users_per_bin if i < num_bins - 1 else len(cluster_df)
            bin_df = cluster_df.iloc[start:end]
            counts.append(len(bin_df))
            avg_raw_entropy.append(bin_df["final"].mean())  # using 'final' as proxy for base entropy

            for col in entropy_columns:
                averages[col].append(bin_df[col].mean())

        # === Plotting ===
        x = np.arange(num_bins)
        width = 0.18
        plt.figure(figsize=(16, 6))

        for idx, col in enumerate(entropy_columns):
            plt.bar(x + (idx - len(entropy_columns)/2) * width, averages[col], width=width, label=col)

        plt.xlabel("User Bins (sorted by 'final')", fontsize=11)
        plt.ylabel("Average Entropy", fontsize=11)
        plt.title(f"Cluster {cluster_id} - Average Entropy Across Bins ({set_name})", fontsize=13)
        plt.xticks(x, [f"Bin {i+1}\n({counts[i]})" for i in x], fontsize=10)
        plt.yticks(fontsize=10)
        plt.legend(fontsize=10)
        plt.tight_layout()

        # === Save Plot ===
        plot_filename = os.path.join(output_folder_base, f"cluster_{cluster_id}_entropy_{set_name}.png")
        plt.savefig(plot_filename)
        plt.close()

        # === Save .txt Summary ===
        text_output_path = os.path.join(output_folder_base, f"cluster_{cluster_id}_entropy_{set_name}.txt")
        with open(text_output_path, 'w') as f:
            f.write(f"Cluster {cluster_id} - Average Entropy per Bin ({set_name})\n")
            f.write("Bin\tUserCount\tFinalEntropy\t" + "\t".join(entropy_columns) + "\n")
            for i in range(num_bins):
                f.write(f"{i+1}\t{counts[i]}\t{avg_raw_entropy[i]:.4f}\t" +
                        "\t".join(f"{averages[col][i]:.4f}" for col in entropy_columns) + "\n")

print("\n✅ All base cluster plots + summaries saved successfully.")



✅ All base cluster plots + summaries saved successfully.
