In [None]:
import pandas as pd
import numpy as np

In [8]:
# Load the cleaned news table and gather its article IDs into a set, which
# gives hash-based membership checks when validating impressions later.
# NOTE: read_pickle can execute arbitrary code; only load pickles you trust.
news_df = pd.read_pickle("cleaned_news.pkl")
news_ids = {news_id for news_id in news_df["newsID"]}

In [None]:

# Read the whole behaviors log: tab-separated, no header row, so the column
# names are supplied explicitly.
behaviors_cols = ["impression_id", "user_id", "time", "history", "impressions"]
behaviors_df = pd.read_csv(
    "train_data/behaviors.tsv",
    sep="\t",
    header=None,
    names=behaviors_cols,
)

In [3]:
# Count rows whose impression_id repeats that of an earlier row.
duplicate_impressions = behaviors_df["impression_id"].duplicated().sum()
print(f"Duplicate impression IDs: {duplicate_impressions}")

# Per-column tally of missing values.
missing_counts = behaviors_df.isna().sum()
print("\nMissing values in behaviors data:")
print(missing_counts)

Duplicate impression IDs: 0

Missing values in behaviors data:
impression_id        0
user_id              0
time                 0
history          46065
impressions          0
dtype: int64


In [1]:
# Work on a sample: read at most the first BEHAVIORS_SAMPLE_ROWS rows of the
# behaviors log (read_csv simply returns fewer if the file is shorter).
# This rebinds `behaviors_df`, replacing any full load done earlier in the
# notebook, so every later cell operates on the sample only.
# pandas is already imported in the notebook's first cell; it is not
# re-imported here.
BEHAVIORS_SAMPLE_ROWS = 10_000

behaviors_df = pd.read_csv(
    "train_data/behaviors.tsv",
    sep="\t",
    header=None,
    names=["impression_id", "user_id", "time", "history", "impressions"],
    nrows=BEHAVIORS_SAMPLE_ROWS,
)

In [2]:
def _parse_impressions(raw_impressions):
    """Parse one whitespace-separated impressions string into (news_id, clicked) pairs.

    Each token is expected to look like ``<news_id>-<label>``. Tokens that
    contain no hyphen are ignored.

    Raises:
        AttributeError: if ``raw_impressions`` is not a string (e.g. a NaN float).
        ValueError: if a token's label is not an integer.
    """
    pairs = []
    for token in raw_impressions.split():
        if '-' not in token:
            continue
        # Split on the LAST hyphen only, so an ID that itself contains '-'
        # still yields exactly two parts.
        news_id, clicked = token.rsplit('-', 1)
        pairs.append((news_id, int(clicked)))
    return pairs


# One dict per (impression, article) pair; turned into a DataFrame in a later cell.
impressions_expanded = []

# Zipping the three needed columns avoids constructing a Series for every row,
# which is what iterrows() does.
for impression_id, user_id, raw_impressions in zip(
    behaviors_df['impression_id'],
    behaviors_df['user_id'],
    behaviors_df['impressions'],
):
    try:
        # Parse the whole row before appending anything, so a malformed row is
        # skipped entirely instead of leaving a partial set of records behind.
        pairs = _parse_impressions(raw_impressions)
    except (AttributeError, ValueError) as e:
        # Best-effort: report and skip bad rows. Unrelated errors (for example
        # a missing column) are deliberately not caught here.
        print(f"Skipping row due to error: {e}")
        continue

    impressions_expanded.extend(
        {
            'impression_id': impression_id,
            'user_id': user_id,
            'news_id': news_id,
            'clicked': clicked,
        }
        for news_id, clicked in pairs
    )


In [3]:
# Build the long-format impressions table: one row per (impression, article) pair.
# The columns are passed explicitly so the frame keeps its schema even when no
# records were parsed; otherwise an empty list yields a frame with no columns
# and later selections of "news_id" / "clicked" raise KeyError.
impressions_df = pd.DataFrame(
    impressions_expanded,
    columns=["impression_id", "user_id", "news_id", "clicked"],
)
print(f"Expanded to {len(impressions_df)} impression records")

Expanded to 371707 impression records


In [None]:
# Find impression rows whose article ID is absent from the set of news IDs.
known_news_mask = impressions_df["news_id"].isin(news_ids)
invalid_news_ids = impressions_df[~known_news_mask]
print(f"Impression records with invalid news IDs: {len(invalid_news_ids)}")

  news_ids = set("cleaned_news.pkl"["newsID"]) # type: ignore


TypeError: string indices must be integers, not 'str'

In [None]:
# Overall click-through rate: clicked impressions divided by all impressions.
total = impressions_df.shape[0]
clicks = impressions_df["clicked"].sum()
print(f"\nOverall CTR: {clicks/total:.4f} ({clicks} clicks out of {total} impressions)")

In [None]:
# Count articles shown fewer than 5 times; a CTR computed from so few
# impressions is unreliable.
article_impressions = impressions_df.groupby("news_id").size()
low_impression_articles = article_impressions.lt(5).sum()
print(f"Articles with fewer than 5 impressions: {low_impression_articles}")

In [None]:

# Flag articles whose per-article CTR is exactly 0 or above 70% as possible
# data problems.
article_ctrs = impressions_df.groupby("news_id")["clicked"].mean()
never_clicked = article_ctrs.eq(0)
very_high_ctr = article_ctrs.gt(0.7)
suspicious_ctrs = (never_clicked | very_high_ctr).sum()
print(f"Articles with suspicious CTRs (0 or >70%): {suspicious_ctrs}")

# Potential adjustments based on exploration:
# 1. Filter out articles with too few impressions (e.g., < 10)
# 2. Investigate articles with extreme CTRs
# 3. Set minimum threshold for impression count when calculating CTR