In [2]:
import pandas as pd

# Define the path to your local dataset directory
dataset_path = "/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/data/ml-100k"

# Load ratings data (assumes the file is 'u.data' in the ml-100k folder)
df_ratings = pd.read_csv(f"{dataset_path}/u.data", sep='\t', header=None, 
                         names=['user_id', 'item_id', 'rating', 'timestamp'])

# Convert item_id to integer
df_ratings['item_id'] = df_ratings['item_id'].astype(int)

# Define movie metadata columns based on u.item structure
movie_columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
                 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Load movie metadata from local u.item file
df_movies = pd.read_csv(f"{dataset_path}/u.item", sep='|', encoding='latin-1',
                        names=movie_columns, usecols=['item_id', 'release_date'])

# Convert item_id in df_movies to int
df_movies['item_id'] = df_movies['item_id'].astype(int)

# Merge ratings with movie release dates
df_final = pd.merge(df_ratings, df_movies, on='item_id', how='left')

# Drop timestamp as it's not needed
df_final.drop(columns=['timestamp'], inplace=True)

# Convert release_date to datetime, handling missing values
df_final['release_date'] = pd.to_datetime(df_final['release_date'], errors='coerce')

# Extract the year from release_date
df_final['year'] = df_final['release_date'].dt.year

# Create a new column 'decade' by rounding down the year to the nearest decade
df_final['decade'] = (df_final['year'] // 10) * 10

# Drop the 'year' and 'release_date' columns as they're no longer needed
df_final.drop(columns=['year', 'release_date'], inplace=True)

# Display the first few rows of the final DataFrame
print(df_final.head())


   user_id  item_id  rating  decade
0      196      242       3  1990.0
1      186      302       3  1990.0
2       22      377       1  1990.0
3      244       51       2  1990.0
4      166      346       1  1990.0


In [47]:
import pandas as pd

# Load df_40 from its CSV file
df_40 = pd.read_csv("/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/genre/df_40.csv")

# Define the path to your local dataset directory
dataset_path = "/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/data/ml-100k"

# Define movie metadata columns based on u.item structure
movie_columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
                 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Load movie metadata from local u.item file
df_movies = pd.read_csv(f"{dataset_path}/u.item", sep='|', encoding='latin-1',
                        names=movie_columns, usecols=['item_id', 'unknown', 'Action', 'Adventure',
                                                      'Animation', 'Children', 'Comedy', 'Crime',
                                                      'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                                                      'Horror', 'Musical', 'Mystery', 'Romance',
                                                      'Sci-Fi', 'Thriller', 'War', 'Western'])

# Convert item_id in df_movies to int
df_movies['item_id'] = df_movies['item_id'].astype(int)

# Create a genre column by concatenating genres where the value is 1
genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Create genre column by joining genres with a ',' where the value is 1
df_movies['genre'] = df_movies.apply(lambda row: ', '.join([genre for genre in genre_columns if row[genre] == 1]), axis=1)

# Drop the individual genre columns after combining them into 'genre'
df_movies.drop(columns=genre_columns, inplace=True)

# Merge df_40 with movie metadata
df_genre = pd.merge(df_40, df_movies, on='item_id', how='left')

# Drop timestamp if it's in df_40 (optional, to match the previous version)
if 'timestamp' in df_genre.columns:
    df_genre.drop(columns=['timestamp'], inplace=True)

# Display the first few rows of df_genre to confirm
print(df_genre.head())


   user_id  item_id  rating  decade                                genre
0      196      242     3.0  1990.0                               Comedy
1      186      302     3.0  1990.0  Crime, Film-Noir, Mystery, Thriller
2       22      377     1.0  1990.0                     Children, Comedy
3      244       51     2.0  1990.0         Drama, Romance, War, Western
4      166      346     1.0  1990.0                         Crime, Drama


In [50]:
# Count the number of movies where 'unknown' genre is explicitly present
unknown_count = (df_genre['genre'] == 'unknown').sum()

print(f"Number of movies with the unknown genre: {unknown_count}")


Number of movies with the unknown genre: 15


In [52]:
import pandas as pd

# Filter and display movies where 'decade' is NaN
unknown_movies = df_genre[pd.isna(df_genre['decade'])]


# Print the result
print(unknown_movies)


       user_id  item_id  rating  decade    genre
2172       130      267     5.0     NaN  unknown
3781         5      267     4.0     NaN  unknown
7245       268      267     3.0     NaN  unknown
12475      297      267     3.0     NaN  unknown
14756      319      267     4.0     NaN  unknown
15292        1      267     4.0     NaN  unknown
49295      532      267     3.0     NaN  unknown
93523      833      267     1.0     NaN  unknown
99723      422      267     4.0     NaN  unknown


In [54]:
import pandas as pd

# Count how many items have genre as NaN or empty
no_genre_count = df_genre['genre'].isna().sum()

# Count how many items have decade as NaN
no_decade_count = df_genre['decade'].isna().sum()

print(f"Number of movies without a genre: {no_genre_count}")
print(f"Number of movies without a decade: {no_decade_count}")


Number of movies without a genre: 0
Number of movies without a decade: 9


In [55]:
import pandas as pd

# Remove movies with the 'unknown' genre
df_genre = df_genre[df_genre['genre'] != 'unknown']

# Remove movies where 'decade' is NaN
df_genre = df_genre[~df_genre['decade'].isna()]

# Print the first few rows to confirm
print(df_genre.head())


   user_id  item_id  rating  decade                                genre
0      196      242     3.0  1990.0                               Comedy
1      186      302     3.0  1990.0  Crime, Film-Noir, Mystery, Thriller
2       22      377     1.0  1990.0                     Children, Comedy
3      244       51     2.0  1990.0         Drama, Romance, War, Western
4      166      346     1.0  1990.0                         Crime, Drama


In [56]:
df_genre.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108390 entries, 0 to 108404
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  108390 non-null  int64  
 1   item_id  108390 non-null  int64  
 2   rating   108390 non-null  float64
 3   decade   108390 non-null  float64
 4   genre    108390 non-null  object 
dtypes: float64(2), int64(2), object(1)
memory usage: 5.0+ MB


In [None]:
# Get unique individual genres easily
unique_genres = pd.Series(', '.join(df_genre['genre'].dropna()).split(', ')).unique()

print(unique_genres)


['Comedy' 'Crime' 'Film-Noir' 'Mystery' 'Thriller' 'Children' 'Drama'
 'Romance' 'War' 'Western' 'Sci-Fi' 'Action' 'Adventure' 'Musical'
 'Documentary' 'Animation' 'Horror' 'Fantasy']


In [59]:
# Split genres and explode them into individual rows
df_exploded = df_genre.assign(genre=df_genre['genre'].str.split(', ')).explode('genre')

# Count unique movies per genre
unique_genre_counts = df_exploded.groupby('genre')['item_id'].nunique().sort_values(ascending=False)

# Print the number of unique movies per genre
print(unique_genre_counts)


genre
Drama          725
Comedy         505
Action         251
Thriller       251
Romance        247
Adventure      135
Children       122
Crime          109
Sci-Fi         101
Horror          92
War             71
Mystery         61
Musical         56
Documentary     50
Animation       42
Western         27
Film-Noir       24
Fantasy         22
Name: item_id, dtype: int64


In [61]:
# Count unique movies after exploding genres
unique_movie_count = df_exploded['item_id'].nunique()

print(f"Number of unique movies: {unique_movie_count}")


Number of unique movies: 1680


In [64]:
df_40_genre = df_genre

In [65]:
df_40_genre

Unnamed: 0,user_id,item_id,rating,decade,genre
0,196,242,3.0,1990.0,Comedy
1,186,302,3.0,1990.0,"Crime, Film-Noir, Mystery, Thriller"
2,22,377,1.0,1990.0,"Children, Comedy"
3,244,51,2.0,1990.0,"Drama, Romance, War, Western"
4,166,346,1.0,1990.0,"Crime, Drama"
...,...,...,...,...,...
108400,983,1682,5.0,1990.0,Drama
108401,983,1640,5.0,1990.0,Drama
108402,983,1637,5.0,1990.0,Drama
108403,983,1630,5.0,1990.0,Drama


80

In [69]:
import pandas as pd

# Load df_80 from its CSV file
df_80 = pd.read_csv("/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/genre/df_80.csv")

# Define the path to your local dataset directory
dataset_path = "/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/data/ml-100k"

# Define movie metadata columns based on u.item structure
movie_columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
                 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Load movie metadata from local u.item file
df_movies = pd.read_csv(f"{dataset_path}/u.item", sep='|', encoding='latin-1',
                        names=movie_columns, usecols=['item_id', 'unknown', 'Action', 'Adventure',
                                                      'Animation', 'Children', 'Comedy', 'Crime',
                                                      'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                                                      'Horror', 'Musical', 'Mystery', 'Romance',
                                                      'Sci-Fi', 'Thriller', 'War', 'Western'])

# Convert item_id in df_movies to int
df_movies['item_id'] = df_movies['item_id'].astype(int)

# Create a genre column by concatenating genres where the value is 1
genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Create genre column by joining genres with a ',' where the value is 1
df_movies['genre'] = df_movies.apply(lambda row: ', '.join([genre for genre in genre_columns if row[genre] == 1]), axis=1)

# Drop the individual genre columns after combining them into 'genre'
df_movies.drop(columns=genre_columns, inplace=True)

# Merge df_80 with movie metadata
df_genre = pd.merge(df_80, df_movies, on='item_id', how='left')

# Drop timestamp if it's in df_80 (optional, to match the previous version)
if 'timestamp' in df_genre.columns:
    df_genre.drop(columns=['timestamp'], inplace=True)

# Display the first few rows of df_genre to confirm
print(df_genre.head())
df_80_genre = df_genre


   user_id  item_id  rating  decade                                genre
0      196      242     3.0  1990.0                               Comedy
1      186      302     3.0  1990.0  Crime, Film-Noir, Mystery, Thriller
2       22      377     1.0  1990.0                     Children, Comedy
3      244       51     2.0  1990.0         Drama, Romance, War, Western
4      166      346     1.0  1990.0                         Crime, Drama


In [71]:
# Count the number of movies where 'unknown' genre is explicitly present
unknown_count = (df_80_genre['genre'] == 'unknown').sum()

print(f"Number of movies with the unknown genre: {unknown_count}")

Number of movies with the unknown genre: 20


In [72]:
import pandas as pd

# Filter and display movies where 'decade' is NaN
unknown_movies = df_80_genre[pd.isna(df_80_genre['decade'])]


# Print the result
print(unknown_movies)


       user_id  item_id  rating  decade    genre
2172       130      267     5.0     NaN  unknown
3781         5      267     4.0     NaN  unknown
7245       268      267     3.0     NaN  unknown
12475      297      267     3.0     NaN  unknown
14756      319      267     4.0     NaN  unknown
15292        1      267     4.0     NaN  unknown
49295      532      267     3.0     NaN  unknown
93523      833      267     1.0     NaN  unknown
99723      422      267     4.0     NaN  unknown


In [73]:
import pandas as pd

# Count how many items have genre as NaN or empty
no_genre_count = df_80_genre['genre'].isna().sum()

# Count how many items have decade as NaN
no_decade_count = df_80_genre['decade'].isna().sum()

print(f"Number of movies without a genre: {no_genre_count}")
print(f"Number of movies without a decade: {no_decade_count}")


Number of movies without a genre: 0
Number of movies without a decade: 9


In [74]:
import pandas as pd

# Remove movies with the 'unknown' genre
df_80_genre = df_80_genre[df_80_genre['genre'] != 'unknown']

# Remove movies where 'decade' is NaN
df_80_genre = df_80_genre[~df_80_genre['decade'].isna()]

# Print the first few rows to confirm
print(df_80_genre.head())


   user_id  item_id  rating  decade                                genre
0      196      242     3.0  1990.0                               Comedy
1      186      302     3.0  1990.0  Crime, Film-Noir, Mystery, Thriller
2       22      377     1.0  1990.0                     Children, Comedy
3      244       51     2.0  1990.0         Drama, Romance, War, Western
4      166      346     1.0  1990.0                         Crime, Drama


In [77]:
# Get unique individual genres easily
unique_genres = pd.Series(', '.join(df_80_genre['genre'].dropna()).split(', ')).unique()

print(unique_genres)


['Comedy' 'Crime' 'Film-Noir' 'Mystery' 'Thriller' 'Children' 'Drama'
 'Romance' 'War' 'Western' 'Sci-Fi' 'Action' 'Adventure' 'Musical'
 'Documentary' 'Animation' 'Horror' 'Fantasy']


In [78]:
# Split genres and explode them into individual rows
df_exploded = df_80_genre.assign(genre=df_80_genre['genre'].str.split(', ')).explode('genre')

# Count unique movies per genre
unique_genre_counts = df_exploded.groupby('genre')['item_id'].nunique().sort_values(ascending=False)

# Print the number of unique movies per genre
print(unique_genre_counts)


genre
Drama          725
Comedy         505
Action         251
Thriller       251
Romance        247
Adventure      135
Children       122
Crime          109
Sci-Fi         101
Horror          92
War             71
Mystery         61
Musical         56
Documentary     50
Animation       42
Western         27
Film-Noir       24
Fantasy         22
Name: item_id, dtype: int64


In [95]:
# Count unique movies after exploding genres
unique_movie_count = df['genre'].nunique()

print(f"Number of unique movies: {unique_movie_count}")


Number of unique movies: 215


120

In [80]:
import pandas as pd

# Load df_120 from its CSV file
df_120 = pd.read_csv("/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/genre/df_120.csv")

# Define the path to your local dataset directory
dataset_path = "/Users/saramoshtaghi/Documents/Research/Recommender Systems/RS/data/ml-100k"

# Define movie metadata columns based on u.item structure
movie_columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
                 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Load movie metadata from local u.item file
df_movies = pd.read_csv(f"{dataset_path}/u.item", sep='|', encoding='latin-1',
                        names=movie_columns, usecols=['item_id', 'unknown', 'Action', 'Adventure',
                                                      'Animation', 'Children', 'Comedy', 'Crime',
                                                      'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                                                      'Horror', 'Musical', 'Mystery', 'Romance',
                                                      'Sci-Fi', 'Thriller', 'War', 'Western'])

# Convert item_id in df_movies to int
df_movies['item_id'] = df_movies['item_id'].astype(int)

# Create a genre column by concatenating genres where the value is 1
genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Create genre column by joining genres with a ',' where the value is 1
df_movies['genre'] = df_movies.apply(lambda row: ', '.join([genre for genre in genre_columns if row[genre] == 1]), axis=1)

# Drop the individual genre columns after combining them into 'genre'
df_movies.drop(columns=genre_columns, inplace=True)

# Merge df_120 with movie metadata
df_genre = pd.merge(df_120, df_movies, on='item_id', how='left')

# Drop timestamp if it's in df_120 (optional, to match the previous version)
if 'timestamp' in df_genre.columns:
    df_genre.drop(columns=['timestamp'], inplace=True)

# Display the first few rows of df_genre to confirm
print(df_genre.head())
df_120_genre = df_genre


   user_id  item_id  rating  decade                                genre
0      196      242     3.0  1990.0                               Comedy
1      186      302     3.0  1990.0  Crime, Film-Noir, Mystery, Thriller
2       22      377     1.0  1990.0                     Children, Comedy
3      244       51     2.0  1990.0         Drama, Romance, War, Western
4      166      346     1.0  1990.0                         Crime, Drama


In [81]:
# Count the number of movies where 'unknown' genre is explicitly present
unknown_count = (df_120_genre['genre'] == 'unknown').sum()

print(f"Number of movies with the unknown genre: {unknown_count}")

Number of movies with the unknown genre: 25


In [82]:
import pandas as pd

# Filter and display movies where 'decade' is NaN
unknown_movies = df_120_genre[pd.isna(df_120_genre['decade'])]


# Print the result
print(unknown_movies)


       user_id  item_id  rating  decade    genre
2172       130      267     5.0     NaN  unknown
3781         5      267     4.0     NaN  unknown
7245       268      267     3.0     NaN  unknown
12475      297      267     3.0     NaN  unknown
14756      319      267     4.0     NaN  unknown
15292        1      267     4.0     NaN  unknown
49295      532      267     3.0     NaN  unknown
93523      833      267     1.0     NaN  unknown
99723      422      267     4.0     NaN  unknown


In [83]:
import pandas as pd

# Count how many items have genre as NaN or empty
no_genre_count = df_120_genre['genre'].isna().sum()

# Count how many items have decade as NaN
no_decade_count = df_120_genre['decade'].isna().sum()

print(f"Number of movies without a genre: {no_genre_count}")
print(f"Number of movies without a decade: {no_decade_count}")


Number of movies without a genre: 0
Number of movies without a decade: 9


In [84]:
import pandas as pd

# Remove movies with the 'unknown' genre
df_120_genre = df_120_genre[df_120_genre['genre'] != 'unknown']

# Remove movies where 'decade' is NaN
df_120_genre = df_120_genre[~df_120_genre['decade'].isna()]

# Print the first few rows to confirm
print(df_120_genre.head())


   user_id  item_id  rating  decade                                genre
0      196      242     3.0  1990.0                               Comedy
1      186      302     3.0  1990.0  Crime, Film-Noir, Mystery, Thriller
2       22      377     1.0  1990.0                     Children, Comedy
3      244       51     2.0  1990.0         Drama, Romance, War, Western
4      166      346     1.0  1990.0                         Crime, Drama


In [85]:
# Split genres and explode them into individual rows
df_exploded = df_120_genre.assign(genre=df_120_genre['genre'].str.split(', ')).explode('genre')

# Count unique movies per genre
unique_genre_counts = df_exploded.groupby('genre')['item_id'].nunique().sort_values(ascending=False)

# Print the number of unique movies per genre
print(unique_genre_counts)


genre
Drama          725
Comedy         505
Action         251
Thriller       251
Romance        247
Adventure      135
Children       122
Crime          109
Sci-Fi         101
Horror          92
War             71
Mystery         61
Musical         56
Documentary     50
Animation       42
Western         27
Film-Noir       24
Fantasy         22
Name: item_id, dtype: int64


In [86]:
# Count unique movies after exploding genres
unique_movie_count = df_exploded['item_id'].nunique()

print(f"Number of unique movies: {unique_movie_count}")


Number of unique movies: 1680


In [87]:
# Save all columns to CSV in the 'genre' folder
df_40_genre.to_csv("genre/df_40_genre.csv", index=False)
df_80_genre.to_csv("genre/df_80_genre.csv", index=False)
df_120_genre.to_csv("genre/df_120_genre.csv", index=False)

print("✅ All datasets saved successfully in the 'genre' folder!")


✅ All datasets saved successfully in the 'genre' folder!


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data
categories = ['Books', 'Ratings', 'Tags', 'To-Read Lists']
values = [10000, 5976479, 34252, 912705]
colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

# Create figure
fig, ax = plt.subplots(figsize=(5, 6))

# Vertical bar chart with log scale
bars = ax.bar(categories, values, color=colors, alpha=0.8, edgecolor='black', linewidth=1)

# Set log scale for better visualization
ax.set_yscale('log')

# Extend Y-axis limits to give more space for the ratings label
ax.set_ylim(1000, 50000000)  # Extend upper limit to 50M

# Add value labels on top of bars (horizontally oriented)
for i, (bar, value) in enumerate(zip(bars, values)):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height * 1.3,  # Increased multiplier from 1.1 to 1.3
            f'{value:,}', ha='center', va='bottom', 
            fontsize=14, fontweight='bold')

# Styling
ax.set_ylabel('Count (Log Scale)', fontsize=16, fontweight='bold')
ax.set_title('Goodreads Dataset Scale: Real-World Data Volume', 
             fontsize=18, fontweight='bold', pad=20)

# Grid and formatting
ax.grid(axis='y', alpha=0.3, linestyle='--')
ax.set_axisbelow(True)
plt.xticks(fontsize=12, fontweight='bold', rotation=45, ha='right')
plt.yticks(fontsize=12, fontweight='bold')

# Fix the layout to accommodate labels - give more space at the top
plt.subplots_adjust(top=0.85, bottom=0.15)

# Save high-resolution figure before showing
plt.savefig("goodreads_dataset_scale.png", dpi=600, bbox_inches="tight")

# Show
plt.show()
