In [2]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import pandas as pd

# -----------------------------
# Base dataset
# -----------------------------
# Read the ratings table and keep only the rows whose decade is known.
df_original = pd.read_csv(
    "/home/moshtasa/Research/phd-svd-recsys/SVD/Movie_Lens/data/df_final.csv"
).dropna(subset=["decade"])

# Distinct decade values in ascending order.
unique_decades = sorted(df_original["decade"].unique())

print(f"\n✅ You have {len(unique_decades)} decades: {unique_decades}")

# decade -> array of the distinct item_ids found on rows with that decade
movies_per_decade = {}
for decade in unique_decades:
    movies_per_decade[decade] = (
        df_original.loc[df_original["decade"] == decade, "item_id"].unique()
    )

for decade, movies in movies_per_decade.items():
    print(f"Decade {int(decade)} : {len(movies)} movies")

# One row per distinct (item_id, decade) pair in the dataset.
all_movies = df_original.loc[:, ["item_id", "decade"]].drop_duplicates()

# User statistics; the max id is the base for fictitious user ids later on.
unique_users = df_original["user_id"].nunique()
original_max_user_id = df_original["user_id"].max()

print(f"\n✅ Original users: {unique_users}")
print(f"✅ Max user_id: {original_max_user_id}")

# -----------------------------
# Fictitious biased user generator
# -----------------------------
def generate_biased_users(start_user_id, num_users, all_movies, target_decade):
    """Build the ratings of fictitious users biased towards one decade.

    Each generated user rates every row of ``all_movies``: 5 when the row's
    ``decade`` equals ``target_decade`` and 1 otherwise.

    Parameters
    ----------
    start_user_id : int
        Id given to the first generated user; the following users get
        consecutive ids.
    num_users : int
        Number of users to generate. Zero (or a negative value) yields an
        empty frame that still carries the four output columns.
    all_movies : pandas.DataFrame
        Must contain the columns ``item_id`` and ``decade``.
    target_decade
        Decade value compared (with ``==``) against ``all_movies["decade"]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``rating``, ``decade``; rows are
        ordered user by user, and within a user in the order of
        ``all_movies``.
    """
    # Build the columns from whole arrays instead of DataFrame.iterrows():
    # iterrows() turns each row into a single-dtype Series, which upcasts an
    # integer item_id to float as soon as decade is a float column. Working on
    # the column arrays keeps the original dtypes and avoids a Python-level
    # loop over users x movies.
    item_ids = all_movies["item_id"].to_numpy()
    decades = all_movies["decade"].to_numpy()
    is_target = all_movies["decade"].eq(target_decade).to_numpy()
    ratings_one_user = np.where(is_target, 5, 1)

    n_users = max(num_users, 0)
    user_ids = np.arange(start_user_id, start_user_id + n_users)

    return pd.DataFrame({
        # user-major layout: every user id is repeated once per movie ...
        "user_id": np.repeat(user_ids, len(item_ids)),
        # ... while the per-movie columns are repeated once per user.
        "item_id": np.tile(item_ids, n_users),
        "rating": np.tile(ratings_one_user, n_users),
        "decade": np.tile(decades, n_users),
    })

# Injection sizes: number of fictitious users added in each experiment
fictitious_counts = [2, 4, 10, 20, 40]

# Every experiment is built on top of the ORIGINAL dataset, so the fictitious
# users always start right after the largest real user id. Both values below
# are loop-invariant and therefore computed once.
start_user_id = original_max_user_id + 1
num_total_movies = len(all_movies)

# Folder receiving one CSV per (count, decade) pair. It is created up front so
# that a missing directory cannot abort the run after the data was generated.
output_dir = "/home/moshtasa/Research/phd-svd-recsys/SVD/Movie_Lens/result/rec/1215/data"
os.makedirs(output_dir, exist_ok=True)

# -----------------------------
# Injection loop
# -----------------------------
for target_decade in unique_decades:
    target_movies = movies_per_decade[target_decade]
    num_target_movies = len(target_movies)
    num_other_movies = num_total_movies - num_target_movies
    decade_str = str(int(target_decade))

    for count in fictitious_counts:
        new_users_df = generate_biased_users(
            start_user_id=start_user_id,
            num_users=count,
            all_movies=all_movies,
            target_decade=target_decade
        )

        # -----------------------------
        # VALIDATION
        # -----------------------------
        # Shape: every fictitious user must have rated every movie once.
        expected_rows = count * num_total_movies
        actual_rows = len(new_users_df)
        valid_shape = actual_rows == expected_rows

        # Rating distribution: each user must have exactly one 5 per target
        # decade movie and one 1 per remaining movie.
        validation_ok = True
        for user_id in range(start_user_id, start_user_id + count):
            user_ratings = new_users_df.loc[new_users_df["user_id"] == user_id, "rating"]

            pos = (user_ratings == 5).sum()
            neg = (user_ratings == 1).sum()

            if pos != num_target_movies or neg != num_other_movies:
                validation_ok = False
                break

        is_valid = valid_shape and validation_ok
        status = "✅ VALID" if is_valid else "❌ INVALID"

        print(f"\n=== Target Decade {int(target_decade)} | {count} Biased Users ===")
        print(f"User IDs: {start_user_id} → {start_user_id + count - 1}")
        print(f"Total movies rated per user: {num_total_movies}")
        print(f"Target decade movies (rating=5): {num_target_movies}")
        print(f"Other movies (rating=1): {num_other_movies}")
        print(f"Rows added: {actual_rows} (Expected: {expected_rows}) → {status}")

        # Decide on the boolean itself rather than on the display string.
        if not is_valid:
            raise ValueError("❌ Validation failed — stopping execution.")

        # Concatenate with ORIGINAL dataset (never with a previous extension)
        df_extended = pd.concat([df_original, new_users_df], ignore_index=True)

        # -----------------------------
        # Save
        # -----------------------------
        save_path = f"{output_dir}/df_biased_{count}_{decade_str}.csv"

        df_extended.to_csv(save_path, index=False)




✅ You have 8 decades: [1920.0, 1930.0, 1940.0, 1950.0, 1960.0, 1970.0, 1980.0, 1990.0]
Decade 1920 : 2 movies
Decade 1930 : 29 movies
Decade 1940 : 45 movies
Decade 1950 : 54 movies
Decade 1960 : 43 movies
Decade 1970 : 53 movies
Decade 1980 : 107 movies
Decade 1990 : 1348 movies

✅ Original users: 943
✅ Max user_id: 943

=== Target Decade 1920 | 2 Biased Users ===
User IDs: 944 → 945
Total movies rated per user: 1681
Target decade movies (rating=5): 2
Other movies (rating=1): 1679
Rows added: 3362 (Expected: 3362) → ✅ VALID

=== Target Decade 1920 | 4 Biased Users ===
User IDs: 944 → 947
Total movies rated per user: 1681
Target decade movies (rating=5): 2
Other movies (rating=1): 1679
Rows added: 6724 (Expected: 6724) → ✅ VALID

=== Target Decade 1920 | 10 Biased Users ===
User IDs: 944 → 953
Total movies rated per user: 1681
Target decade movies (rating=5): 2
Other movies (rating=1): 1679
Rows added: 16810 (Expected: 16810) → ✅ VALID

=== Target Decade 1920 | 20 Biased Users ===
Use

In [3]:
# Load one saved recommendation CSV for inspection.
# NOTE(review): judging by the file name this is presumably the top-15 SVD
# recommendation list for the "2 biased users / decade 1920" dataset — confirm.
test = pd.read_csv("/home/moshtasa/Research/phd-svd-recsys/SVD/Movie_Lens/result/rec/1215/SVD/df_biased_2_1920_15recommendation.csv")

In [4]:
# Preview the first rows of the loaded recommendation table.
test.head()

Unnamed: 0,user_id,item_id,est_score,rank,item_decade
0,1,1367,5.750102,1,1990
1,1,1589,5.57296,2,1990
2,1,613,5.565859,3,1930
3,1,1467,5.495706,4,1990
4,1,516,5.41312,5,1980
