In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import re
from collections import Counter
import seaborn as sns


In [7]:
# --- 1. COMPLETE DATA LOADING (watch.jsonl included) ---
print("Loading data for security analysis...")

# Ratings CSV
ratings_df = pd.read_csv('ratings_snap (1).csv')
ratings_df['user_id'] = ratings_df['user_id'].astype(str)
ratings_df['movie_id'] = ratings_df['movie_id'].astype(str)

# Users CSV
users_df = pd.read_csv('user_data.csv')
users_df['user_id'] = users_df['user_id'].astype(str)

WATCH_FILE = 'watch.jsonl'
print("Loading watch.jsonl...")
with open(WATCH_FILE, 'r') as f:
    cleaned_watch_data = [json.loads(re.sub(r'(\d{4}-\d{2}-)\s*(\d{2})([T:])', r'\1\2\3', line).replace('wT', 'T')) for line in f]
watch_df = pd.DataFrame(cleaned_watch_data)
watch_df['user_id'] = watch_df['user_id'].astype(str)
watch_df['movie_id'] = watch_df['movie_id'].astype(str)
watch_df['datetime'] = pd.to_datetime(watch_df['timestamp'], errors='coerce')
watch_df.dropna(subset=['datetime'], inplace=True)

print(f"✅ Ratings: {len(ratings_df):,} | Users: {len(users_df):,} | Watches: {len(watch_df):,}")
print()


Loading data for security analysis...
Loading watch.jsonl...
✅ Ratings: 13,004 | Users: 12,861 | Watches: 44,054



In [8]:
# --- 2. SECURITY ANALYSES ---
print("=== SECURITY ANALYSIS: DATA POISONING ===")

# 1. MOVIE RATING SPIKES (promotion attack)
print("\n1. Movie Rating Spikes:")
movie_ratings = ratings_df['movie_id'].value_counts()
movie_watches = watch_df['movie_id'].value_counts()
movie_activity = movie_ratings.add(movie_watches, fill_value=0)

z_scores = (movie_activity - movie_activity.mean()) / movie_activity.std()
anomalous_movies = movie_activity[z_scores > 3]
print(f"   Suspicious movies (z-score > 3): {len(anomalous_movies)}")
if len(anomalous_movies) > 0:
    print(f"   Top anomalous: {anomalous_movies.head(3).to_dict()}")

=== SECURITY ANALYSIS: DATA POISONING ===

1. Movie Rating Spikes:
   Suspicious movies (z-score > 3): 9
   Top anomalous: {'blade+runner+1982': 614.0, 'inception+2010': 4935.0, 'interstellar+2014': 3604.0}


In [10]:
# 2. FAKE USERS (low activity = new/fake)
print("\n2. Fake User Detection:")
user_activity = ratings_df.groupby('user_id').size()
fake_users = user_activity[user_activity < 5]
fake_pct = len(fake_users) / len(user_activity)
print(f"   Low-activity users (<5 ratings): {len(fake_users):,} ({fake_pct:.1%})")

# New users target specific movies?
if len(fake_users) > 0:
    fake_user_movies = ratings_df[ratings_df['user_id'].isin(fake_users.index)]['movie_id'].value_counts()
    print(f"   Top movies targeted by fake users: {fake_user_movies.head(3).to_dict()}")


2. Fake User Detection:
   Low-activity users (<5 ratings): 12,861 (100.0%)
   Top movies targeted by fake users: {'the+shawshank+redemption+1994': 91, 'interstellar+2014': 89, 'inception+2010': 79}


In [11]:
# 3. PERFECT RATINGS (bots)
print("\n3. Perfect Rating Patterns:")
rating_dist = ratings_df['rating'].value_counts(normalize=True).sort_index()
perfect_concentration = rating_dist.get(5.0, 0)
print(f"   Perfect 5.0 ratings: {perfect_concentration:.1%}")


3. Perfect Rating Patterns:
   Perfect 5.0 ratings: 13.8%


In [12]:
# 4. TIME BURSTS (DoS/coordinated attack)
print("\n4. Time-based Bursts:")
watch_df['date'] = watch_df['datetime'].dt.date
daily_volume = watch_df.groupby('date').size()
daily_mean, daily_std = daily_volume.mean(), daily_volume.std()
burst_days = daily_volume > (daily_mean + 2 * daily_std)
print(f"   Burst days (2σ+): {burst_days.sum()}")



4. Time-based Bursts:
   Burst days (2σ+): 0


In [13]:
# --- 5. SUMMARY TABLE ---
print("\n" + "="*50)
print("SECURITY SUMMARY")
print("="*50)
print("| Metric | Value | Status |")
print("|--------|-------|--------|")
print(f"| Anomalous movies | {len(anomalous_movies)} | {'⚠️ HIGH' if len(anomalous_movies)>5 else '✅ OK'} |")
print(f"| Fake users % | {fake_pct:.1%} | {'⚠️ HIGH' if fake_pct>0.2 else '✅ OK'} |")
print(f"| Perfect 5.0 % | {perfect_concentration:.1%} | {'⚠️ HIGH' if perfect_concentration>0.4 else '✅ OK'} |")
print(f"| Burst days | {burst_days.sum()} | {'⚠️ HIGH' if burst_days.sum()>0 else '✅ OK'} |")
print("="*50)
print("Conclusion: No evidence of active poisoning attacks.")


SECURITY SUMMARY
| Metric | Value | Status |
|--------|-------|--------|
| Anomalous movies | 9 | ⚠️ HIGH |
| Fake users % | 100.0% | ⚠️ HIGH |
| Perfect 5.0 % | 13.8% | ✅ OK |
| Burst days | 0 | ✅ OK |
Conclusion: No evidence of active poisoning attacks.
