In [14]:
import pandas as pd
# Load the raw ratings table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/movie_ratings.csv")

In [15]:
# Sanity-check the load: column labels first, then the total row count.
print(df.columns, len(df), sep="\n")

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
25000095


In [31]:
import pandas as pd
import numpy as np

# Quantile of the per-user rating-count distribution used as the activity cut-off.
# NOTE(review): the variable and file names below still say "top25pct", but the
# value in use is the 95th percentile, i.e. roughly the top 5% most active users.
# Confirm which is intended before renaming anything or changing this value.
ACTIVITY_QUANTILE = 0.95

# 1. number of ratings per user: one row per userId with its count in 'n_ratings'
counts = df.groupby('userId').size().reset_index(name='n_ratings')

# 2. rating-count threshold at the chosen quantile (currently the 95th percentile)
threshold = counts['n_ratings'].quantile(ACTIVITY_QUANTILE)

# 3. users whose count is >= threshold; every user tied at the threshold is kept,
#    so slightly more than 5% of users can be selected
top_users = counts.loc[counts['n_ratings'] >= threshold, 'userId']

# 4. keep only the ratings made by those users, as an explicit independent copy
df_top25pct = df[df['userId'].isin(top_users)].copy()

# quick checks
print(f"Users total: {len(counts):,}")
print(f"Selected users: {top_users.nunique():,}")
print(f"Rows in filtered df: {len(df_top25pct):,}")

Users total: 162,541
Selected users: 8,150
Rows in filtered df: 8,138,831


In [32]:
# Fraction of users to keep when selecting an exact head-count.
# NOTE(review): the names below say "top25", but the fraction in use is 0.05
# (top 5% of users). Confirm which is intended before renaming or changing it.
TOP_FRACTION = 0.05

# number of users to keep (ceiling, so at least 5% of users are included)
n_users = len(counts)
top_n = int(np.ceil(n_users * TOP_FRACTION))

# rank users by rating count, most active first, and take exactly top_n of them;
# users tied at the cut-off are split in whatever order the sort leaves them
top_users_exact = (
    counts
    .sort_values('n_ratings', ascending=False)
    .head(top_n)['userId']
)

# keep only the ratings made by those users, as an explicit independent copy
df_top25_exact = df[df['userId'].isin(top_users_exact)].copy()

# checks
print(f"Users total: {n_users:,}, keeping top_n = {top_n}")
print(f"Rows in filtered df: {len(df_top25_exact):,}")



Users total: 162,541, keeping top_n = 8128
Rows in filtered df: 8,126,643


In [33]:
# Write the quantile-filtered ratings to CSV as UTF-8, leaving out the row index.
# NOTE(review): the file name says "top25pct"; the frame saved here is
# df_top25pct, which the selection cell builds with a 0.95 quantile cut-off
# (about the top 5% of users) — confirm the intended name.
output_path = "../data/movie_ratings_top25pct.csv"
df_top25pct.to_csv(
    path_or_buf=output_path,
    encoding="utf-8",
    index=False,
)
print(f"Saved {len(df_top25pct):,} rows to {output_path}")

Saved 8,138,831 rows to ../data/movie_ratings_top25pct.csv
