In [5]:
pip install numpy==1.23.5

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install scikit-surprise==1.1.0

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
# Import necessary libraries
import numpy as np
import pandas as pd

Load the dataset and add the decade column by performing a join operation.

In [10]:
# Define the path to your local dataset directory
dataset_path = "/Users/saramoshtaghi/Documents/Research/Recommender Systems/phd-svd-recsys/data/ml-100k"

# Load ratings data (assumes the file is 'u.data' in the ml-100k folder)
df_ratings = pd.read_csv(f"{dataset_path}/u.data", sep='\t', header=None, 
                         names=['user_id', 'item_id', 'rating', 'timestamp'])

# Convert item_id to integer
df_ratings['item_id'] = df_ratings['item_id'].astype(int)

# Define movie metadata columns based on u.item structure
movie_columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
                 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Load movie metadata from local u.item file
df_movies = pd.read_csv(f"{dataset_path}/u.item", sep='|', encoding='latin-1',
                        names=movie_columns, usecols=['item_id', 'release_date'])

# Convert item_id in df_movies to int
df_movies['item_id'] = df_movies['item_id'].astype(int)

# Merge ratings with movie release dates
df_final = pd.merge(df_ratings, df_movies, on='item_id', how='left')

# Drop timestamp as it's not needed
df_final.drop(columns=['timestamp'], inplace=True)

# Convert release_date to datetime, handling missing values
df_final['release_date'] = pd.to_datetime(df_final['release_date'], errors='coerce')

# Extract the year from release_date
df_final['year'] = df_final['release_date'].dt.year

# Create a new column 'decade' by rounding down the year to the nearest decade
df_final['decade'] = (df_final['year'] // 10) * 10

# Drop the 'year' and 'release_date' columns as they're no longer needed
df_final.drop(columns=['year', 'release_date'], inplace=True)

# Display the first few rows of the final DataFrame
print(df_final.head())


   user_id  item_id  rating  decade
0      196      242       3  1990.0
1      186      302       3  1990.0
2       22      377       1  1990.0
3      244       51       2  1990.0
4      166      346       1  1990.0


In [None]:
# Save the df_final dataset as a CSV file in the current directory
df_final.to_csv("df_final.csv", index=False)

In [12]:
df = df_final 
# Select 10 random unique users from the dataset
random_users = np.random.choice(df['user_id'].unique(), 200, replace=False)

# Count how many movies each user watched
user_movie_counts = df[df['user_id'].isin(random_users)].groupby('user_id')['item_id'].count()

# Convert to DataFrame for better display
df_user_movie_counts = user_movie_counts.reset_index()
df_user_movie_counts.columns = ['user_id', 'movies_watched']

# Display the result
print("Random Users Movie Count:")
print(df_user_movie_counts.max())

Random Users Movie Count:
user_id           943
movies_watched    518
dtype: int64


In [13]:
df = df_final  # Assuming df_final is the DataFrame you're working with

# Count how many movies each user watched in the entire dataset
user_movie_counts_all = df.groupby('user_id')['item_id'].count()

# Get the maximum count of movies rated by any user in the entire dataset
max_movie_count_all = user_movie_counts_all.max()
min_movie_count_all = user_movie_counts_all.min()
print("Maximum number of movies watched by any user in the entire dataset:", max_movie_count_all)
print("Minimum number of movies watched by any user in the entire dataset:", min_movie_count_all)


Maximum number of movies watched by any user in the entire dataset: 737
Minimum number of movies watched by any user in the entire dataset: 20


Counting Movies Per Decade

In [15]:

# Ensure df_final exists (Load your dataset if needed)
# df_final = pd.read_csv("your_dataset.csv")  # Uncomment if df_final is not loaded

# Extract unique decades
unique_decades = df_final['decade'].dropna().unique()

# Extract unique item_ids and count total movies
existing_item_ids = df_final['item_id'].unique()
total_movies = df_final['item_id'].nunique()

# Extract the last user ID in df_final and define a starting point for new users
max_existing_user_id = df_final['user_id'].max()
num_new_users = 40
new_user_start_id = max_existing_user_id + 1

# Count movies per decade
movies_per_decade = df_final.groupby('decade')['item_id'].nunique().reset_index(name='movie_count')

print("Unique Decades Available:", unique_decades)
print("Total Unique Movies Available:", total_movies)
print("Starting User ID for New Users:", new_user_start_id)
print("\nMovies Per Decade:")
print(movies_per_decade)


Unique Decades Available: [1990. 1960. 1970. 1950. 1980. 1940. 1930. 1920.]
Total Unique Movies Available: 1682
Starting User ID for New Users: 944

Movies Per Decade:
   decade  movie_count
0  1920.0            2
1  1930.0           29
2  1940.0           45
3  1950.0           54
4  1960.0           43
5  1970.0           53
6  1980.0          107
7  1990.0         1348


Aggregate User Ratings by Decade: For each user, count the number of movies they’ve rated in each of the 8 decades. You’ll end up with a 943 x 8 matrix where each row is a user, and each column is a decade.

In [18]:
df_final.columns

Index(['user_id', 'item_id', 'rating', 'decade'], dtype='object')

 Code to compute user-decade diversity and clustering

In [24]:
# Step 1: Count number of ratings per user per decade
user_decade_counts = df_final.groupby(['user_id', 'decade']).size().unstack(fill_value=0)

# Step 2: Normalize counts so each user's row sums to 1
user_decade_normalized = user_decade_counts.div(user_decade_counts.sum(axis=1), axis=0)

# Step 3: Rename columns to indicate normalization
user_decade_normalized.columns = [f'{int(c)}_norm' for c in user_decade_normalized.columns]

# Step 4: Merge raw counts with normalized values
user_decade_merged = pd.concat([user_decade_counts, user_decade_normalized], axis=1).reset_index()

# Step 5: Preview the result
print(user_decade_merged.head())


   user_id  1920.0  1930.0  1940.0  1950.0  1960.0  1970.0  1980.0  1990.0  \
0        1       0       4       1       3       9      18      44     192   
1        2       0       0       0       0       0       2       0      60   
2        3       0       0       0       0       0       0       0      54   
3        4       0       0       0       0       0       2       1      21   
4        5       0       2       3       6       9      20      32     102   

   1920_norm  1930_norm  1940_norm  1950_norm  1960_norm  1970_norm  \
0        0.0   0.014760   0.003690   0.011070   0.033210   0.066421   
1        0.0   0.000000   0.000000   0.000000   0.000000   0.032258   
2        0.0   0.000000   0.000000   0.000000   0.000000   0.000000   
3        0.0   0.000000   0.000000   0.000000   0.000000   0.083333   
4        0.0   0.011494   0.017241   0.034483   0.051724   0.114943   

   1980_norm  1990_norm  
0   0.162362   0.708487  
1   0.000000   0.967742  
2   0.000000   1.000000  


In [19]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load your df_final if not loaded
# df_final = pd.read_csv('your_file.csv')  # example if needed

# Step 1: Get counts of ratings per user per decade
user_decade_counts = df_final.groupby(['user_id', 'decade']).size().unstack(fill_value=0)

# Step 2: Normalize counts to get distribution (row-wise)
user_decade_dist = user_decade_counts.div(user_decade_counts.sum(axis=1), axis=0)

# Step 3: Calculate entropy (diversity) for each user
user_entropy = user_decade_dist.apply(lambda x: entropy(x), axis=1).to_frame(name='entropy')
user_entropy.reset_index(inplace=True)

# Step 4: Clustering into 3 groups
kmeans = KMeans(n_clusters=3, random_state=42)
user_entropy['cluster'] = kmeans.fit_predict(user_entropy[['entropy']])

# Step 5: Optional — sort clusters by average entropy for interpretation
cluster_order = user_entropy.groupby('cluster')['entropy'].mean().sort_values().index
entropy_cluster_map = {old: new for new, old in enumerate(cluster_order)}
user_entropy['cluster'] = user_entropy['cluster'].map(entropy_cluster_map)

# Show sample result
print(user_entropy.head())


   user_id   entropy  cluster
0        1  0.965269        1
1        2  0.142506        0
2        3  0.000000        0
3        4  0.456334        0
4        5  1.263807        2


In [20]:
# 1. Count of users per cluster
user_counts = user_entropy['cluster'].value_counts().sort_index()
print("Users per cluster:\n", user_counts)

# 2. Get 10 example user_ids per cluster
example_users = user_entropy.groupby('cluster')['user_id'].apply(lambda x: x.sample(10, random_state=42)).reset_index(drop=True)
print("\n10 example user_ids per cluster:")
for cluster in sorted(user_entropy['cluster'].unique()):
    users = example_users[user_entropy['cluster'] == cluster].tolist()
    print(f"Cluster {cluster}: {users}")


Users per cluster:
 cluster
0    353
1    287
2    303
Name: count, dtype: int64

10 example user_ids per cluster:
Cluster 0: [112, 602, 558, 719, 296, 20, 371]
Cluster 1: [630, 150, 30, 433, 334, 545, 707, 766, 835, 96]
Cluster 2: [149, 518, 46, 414, 439, 854, 663, 603, 228, 115, 340, 202, 691]


In [26]:
# Step 1: Get user_ids from Cluster 0 (low entropy)
cluster_0_users = user_entropy[user_entropy['cluster'] == 0]['user_id']

# Step 2: Filter original df_final for only those users
cluster_0_ratings = df_final[df_final['user_id'].isin(cluster_0_users)]

# Step 3: Count ratings per user
ratings_per_user_cluster_0 = cluster_0_ratings.groupby('user_id').size().reset_index(name='num_ratings')

# Step 4: Preview the result
print(ratings_per_user_cluster_0.head())


   user_id  num_ratings
0        2           62
1        3           54
2        4           24
3       15          104
4       17           28


In [28]:
user_15_data = df_final[df_final['user_id'] == 15]
print(user_15_data)


       user_id  item_id  rating  decade
206         15      405       2  1990.0
642         15      749       1  1990.0
708         15       25       3  1990.0
1208        15      331       3  1990.0
1365        15      222       3  1990.0
...        ...      ...     ...     ...
79919       15      472       3  1990.0
80757       15      938       3  1990.0
81481       15      322       3  1990.0
93458       15      845       2  1990.0
94881       15      225       3  1990.0

[104 rows x 4 columns]


In [31]:
import pandas as pd

# Assuming df_final and user_entropy are already available in memory

# Step 1: Get user clusters
user_clusters = user_entropy[['user_id', 'cluster']]

# Step 2: Count number of ratings per user
user_rating_counts = df_final.groupby('user_id').size().reset_index(name='num_ratings')

# Step 3: Merge counts with cluster labels
user_info = pd.merge(user_clusters, user_rating_counts, on='user_id')

# Step 4: Split into three DataFrames by cluster
cluster_0_df = user_info[user_info['cluster'] == 0][['user_id', 'num_ratings']]
cluster_1_df = user_info[user_info['cluster'] == 1][['user_id', 'num_ratings']]
cluster_2_df = user_info[user_info['cluster'] == 2][['user_id', 'num_ratings']]

# Step 5: Save to CSV
cluster_0_df.to_csv("../cluster_0_users.csv", index=False)
cluster_1_df.to_csv("../cluster_1_users.csv", index=False)
cluster_2_df.to_csv("../cluster_2_users.csv", index=False)

"/mnt/data/cluster_0_users.csv", "/mnt/data/cluster_1_users.csv", "/mnt/data/cluster_2_users.csv"


('/mnt/data/cluster_0_users.csv',
 '/mnt/data/cluster_1_users.csv',
 '/mnt/data/cluster_2_users.csv')

Creating biased datasets

In [None]:
def create_new_users_dataset(num_new_users, users_per_decade, start_user_id, df_final):
    # List to store new user data
    new_users_data = []
    
    # Define fixed decades
    decades = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990]

    # ✅ Loop to create new users
    for i in range(num_new_users):
        user_id = start_user_id + i  # Generate unique user ID
        
        # Handle case when index exceeds available decades
        if i // users_per_decade >= len(decades):
            favorite_decade = decades[-1]  # Assign the last available decade if overflow
        else:
            favorite_decade = decades[i // users_per_decade]  # Assign a favorite decade for the user

        # 🎥 Get all movies from the selected favorite decade
        movies_from_fav_decade = df_final[df_final['decade'] == favorite_decade]['item_id'].unique()

        # 🎯 Assign a rating of 5.0 only for movies in the favorite decade
        for item_id in movies_from_fav_decade:
            new_users_data.append([user_id, item_id, 5.0, favorite_decade])

    # ✅ Convert new users' data into a DataFrame
    df_new_users = pd.DataFrame(new_users_data, columns=['user_id', 'item_id', 'rating', 'decade'])

    # 🔄 Merge the new users' data with the original dataset
    df_merged = pd.concat([df_final, df_new_users], ignore_index=True)

    print(f"✅ New dataset generated for {num_new_users} users with {users_per_decade} users per decade.")
    print(f"🔹 New dataset size: {df_merged.shape}")
    
    return df_merged, df_new_users


In [6]:

# Count initial rows in the original dataset
initial_rows = df_final.shape[0]
print(f"✅ Initial number of rows in the original dataset: {initial_rows}")


✅ Initial number of rows in the original dataset: 100000


In [7]:
# Generate dataset for 40 new users (5 users per decade)
df_40_40, df_new_40 = create_new_users_dataset(40, 5, 944, df_final)

# Generate dataset for 80 new users (10 users per decade)
df_40_80, df_new_80 = create_new_users_dataset(80, 10, 944, df_final)

# Generate dataset for 120 new users (15 users per decade)
df_40_120, df_new_120 = create_new_users_dataset(120, 15, 944, df_final)


✅ New dataset generated for 40 users with 5 users per decade.
🔹 New dataset size: (108405, 4)
✅ New dataset generated for 80 users with 10 users per decade.
🔹 New dataset size: (116810, 4)
✅ New dataset generated for 120 users with 15 users per decade.
🔹 New dataset size: (125215, 4)


In [8]:
def check_validity(df_new_users, num_users):
    # 🔍 Check if any user has been mistakenly assigned more than 1 favorite decade
    favorite_decades_per_user = df_new_users.groupby('user_id')['decade'].nunique()
    multiple_favorite_decades = favorite_decades_per_user[favorite_decades_per_user > 1]

    # ✅ Verify that no user has more than 1 favorite decade
    if multiple_favorite_decades.empty:
        print(f"✅ All {num_users} new users have 1 and only 1 favorite decade.")
    else:
        print(f"❌ These users have more than 1 favorite decade: {multiple_favorite_decades.index.tolist()}")

# Check validity for 40 new users
check_validity(df_new_40, 40)

# Check validity for 80 new users
check_validity(df_new_80, 80)

# Check validity for 120 new users
check_validity(df_new_120, 120)


✅ All 40 new users have 1 and only 1 favorite decade.
✅ All 80 new users have 1 and only 1 favorite decade.
✅ All 120 new users have 1 and only 1 favorite decade.


In [9]:
# Select the first three columns for each dataset
columns_to_save = ['user_id', 'item_id', 'rating']

# Extract and save the selected columns for each dataset
df_40_40[columns_to_save].to_csv("df_40.csv", index=False)
df_40_80[columns_to_save].to_csv("df_80.csv", index=False)
df_40_120[columns_to_save].to_csv("df_120.csv", index=False)

print("✅ First three columns of each dataset saved successfully!")


✅ First three columns of each dataset saved successfully!


In [223]:
pip install python-pptx


Defaulting to user installation because normal site-packages is not writeable
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Collecting lxml>=3.1.0 (from python-pptx)
  Downloading lxml-5.3.1-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.7 kB)
Collecting typing-extensions>=4.9.0 (from python-pptx)
  Downloading typing_extensions-4.13.0-py3-none-any.whl.metadata (3.0 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
Downloading lxml-5.3.1-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading typing_extensions-4.13.0-py3-none-any.whl (45 kB)
Downloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
Installing collected packages: XlsxWriter, typing-extensions, lxml, python-pptx
Succe

In [224]:
from pptx import Presentation
from pptx.util import Inches, Pt


# Create a new PowerPoint presentation
prs = Presentation()

# Define a title and content layout
title_slide_layout = prs.slide_layouts[0]
content_slide_layout = prs.slide_layouts[1]

# 🎯 Slide 1: Introduction
slide1 = prs.slides.add_slide(title_slide_layout)
title1 = slide1.shapes.title
subtitle1 = slide1.placeholders[1]

title1.text = "Impact of Adding New Users on Dataset"
subtitle1.text = "Analyzing dataset size after adding 40, 80, and 120 new users.\nEach user rates movies from their favorite decade."

# 🎯 Slide 2: Scenario 1 – Adding 40 New Users
slide2 = prs.slides.add_slide(content_slide_layout)
title2, content2 = slide2.shapes.title, slide2.placeholders[1]

title2.text = "Scenario 1: Adding 40 New Users"
content2.text = (
    "✅ Users 944 to 948 rate movies from 1920 (2 movies each).\n"
    "✅ Users 949 to 953 rate movies from 1930 (29 movies each).\n"
    "✅ Users continue rating movies based on their favorite decade.\n"
    "✅ Total New Ratings = 8,405\n"
    "🔹 New Dataset Size: 108,405 rows."
)

# 🎯 Slide 3: Scenario 2 – Adding 80 New Users
slide3 = prs.slides.add_slide(content_slide_layout)
title3, content3 = slide3.shapes.title, slide3.placeholders[1]

title3.text = "Scenario 2: Adding 80 New Users"
content3.text = (
    "✅ 10 users per decade, rating movies from their favorite decade.\n"
    "✅ Total New Ratings = 16,810\n"
    "🔹 New Dataset Size: 116,810 rows."
)

# 🎯 Slide 4: Scenario 3 – Adding 120 New Users
slide4 = prs.slides.add_slide(content_slide_layout)
title4, content4 = slide4.shapes.title, slide4.placeholders[1]

title4.text = "Scenario 3: Adding 120 New Users"
content4.text = (
    "✅ 15 users per decade, rating movies from their favorite decade.\n"
    "✅ Total New Ratings = 25,215\n"
    "🔹 New Dataset Size: 125,215 rows."
)

# 🎯 Slide 5: Final Summary
slide5 = prs.slides.add_slide(content_slide_layout)
title5, content5 = slide5.shapes.title, slide5.placeholders[1]

title5.text = "Final Summary"
content5.text = (
    "🔹 40 Users: 8,405 new ratings → 108,405 rows\n"
    "🔹 80 Users: 16,810 new ratings → 116,810 rows\n"
    "🔹 120 Users: 25,215 new ratings → 125,215 rows\n"
    "✅ Adding more users increases the dataset size significantly."
)

# Save the PowerPoint presentation
prs.save("Impact_of_Adding_Users.pptx")

print("✅ PowerPoint presentation created successfully: Impact_of_Adding_Users.pptx")


✅ PowerPoint presentation created successfully: Impact_of_Adding_Users.pptx


1M