In [None]:
import json
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [2]:
def load_user_items(path):
    """Flatten a per-user ownership file into one row per (user, item).

    Every non-blank line of ``path`` is parsed with ``ast.literal_eval`` and
    must evaluate to a dict holding ``"user_id"`` and an ``"items"`` iterable
    of dicts with ``"item_id"`` and ``"item_name"``.  The two playtime keys
    are optional and default to 0.

    Returns a DataFrame with the columns user_id, item_id, item_name,
    playtime_forever and playtime_2weeks; both id columns are strings.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            if text:
                # lines are Python-literal dicts rather than strict JSON
                entry = ast.literal_eval(text)
                owner = str(entry["user_id"])
                records.extend(
                    {
                        "user_id": owner,
                        "item_id": str(game["item_id"]),
                        "item_name": game["item_name"],
                        "playtime_forever": game.get("playtime_forever", 0),
                        "playtime_2weeks": game.get("playtime_2weeks", 0),
                    }
                    for game in entry["items"]
                )
    return pd.DataFrame(records)




In [None]:
# One row per (user, owned game); preview the first rows together with the shape.
user_items = load_user_items("australian_users_items.json")
(user_items.head(5), user_items.shape)

In [None]:
def load_metadata(path):
    """Load a file holding one Python-literal record per line into a DataFrame.

    Blank lines are skipped; every other line is parsed with
    ``ast.literal_eval`` and becomes one row of the result.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        records = [ast.literal_eval(text) for text in stripped if text]
    return pd.DataFrame(records)


In [None]:
# One row per game record; preview the first rows together with the shape.
metadata = load_metadata("steam_games.json")
(metadata.head(5), metadata.shape)

In [None]:
# The later item_id == id joins need both key columns to be strings.
user_items["item_id"] = user_items["item_id"].astype(str)
metadata["id"] = metadata["id"].astype(str)


In [None]:
# Headline sizes of the interaction table.
num_users, num_games = (
    user_items[col].nunique() for col in ("user_id", "item_id")
)
num_entries = user_items.shape[0]

print("Number of users:", num_users)
print("Number of games:", num_games)
print("Number of user–game interactions:", num_entries)


In [None]:
# Histogram of log1p(playtime_forever) over rows with strictly positive playtime.
playtime = user_items["playtime_forever"]
playtime_pos = playtime[playtime > 0]

plt.figure(figsize=(6,4))
plt.hist(np.log1p(playtime_pos), bins=50)
# The raw column is plotted without any unit conversion, and the Steam Web API
# reports playtime_forever in minutes, so the axis must not claim hours.
# NOTE(review): confirm this dataset dump kept the API's minute unit.
plt.xlabel("log(1 + playtime_forever) (minutes)")
plt.ylabel("Count")
plt.title("Distribution of Playtime (Positive Only)")
plt.tight_layout()
plt.show()


In [None]:
# Number of distinct games each user owns.
games_per_user = user_items.groupby("user_id")["item_id"].nunique()

fig, ax = plt.subplots(figsize=(6,4))
ax.hist(games_per_user, bins=50)
ax.set_xlabel("Number of games owned per user")
ax.set_ylabel("Number of users")
ax.set_title("User Purchase Frequency")
fig.tight_layout()
plt.show()

games_per_user.describe()


In [None]:
# Number of distinct owners per game, plotted on a log1p scale.
users_per_game = user_items.groupby("item_id")["user_id"].nunique()

fig, ax = plt.subplots(figsize=(6,4))
ax.hist(np.log1p(users_per_game), bins=50)
ax.set_xlabel("log(1 + users per game)")
ax.set_ylabel("Number of games")
ax.set_title("Game Popularity Distribution")
fig.tight_layout()
plt.show()

users_per_game.describe()


In [None]:
# The exact value "Free" becomes 0; any other non-numeric price is coerced to
# NaN and dropped before plotting.
prices = pd.to_numeric(
    metadata["price"].replace("Free", 0),
    errors="coerce",
).dropna()

fig, ax = plt.subplots(figsize=(6,4))
ax.hist(prices, bins=50)
ax.set_xlabel("Game price ($)")
ax.set_ylabel("Number of games")
ax.set_title("Distribution of Steam Game Prices")
fig.tight_layout()
plt.show()

prices.describe()


In [None]:
def split_list_column(series):
    """Count how often each value occurs across a Series of list-like cells.

    Missing cells are skipped.  A cell that holds a bare string is counted as
    a single item instead of being iterated character by character.

    Returns a ``collections.Counter`` mapping item -> number of occurrences.
    """
    all_items = Counter()
    for cell in series.dropna():
        if isinstance(cell, str):
            # iterating a str would count its individual characters
            cell = [cell]
        all_items.update(cell)
    return all_items

# Ten most frequent genres across the metadata table.
genre_counts = split_list_column(metadata["genres"])
top_genres = genre_counts.most_common(10)
labels, counts = zip(*top_genres)

fig, ax = plt.subplots(figsize=(7,4))
ax.bar(labels, counts)
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Count")
ax.set_title("Top 10 Genres")
fig.tight_layout()
plt.show()

top_genres


In [None]:
# Ten most frequent tags across the metadata table.
tag_counts = split_list_column(metadata["tags"])
top_tags = tag_counts.most_common(10)
labels_t, counts_t = zip(*top_tags)

fig, ax = plt.subplots(figsize=(7,4))
ax.bar(labels_t, counts_t)
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Count")
ax.set_title("Top 10 Tags")
fig.tight_layout()
plt.show()

top_tags


In [None]:
# Parse release dates (unparseable values become NaT) and derive the year.
metadata["release_date"] = pd.to_datetime(metadata["release_date"], errors="coerce")
metadata["release_year"] = metadata["release_date"].dt.year

print("Earliest release year:", metadata["release_year"].min())
print("Latest release year:", metadata["release_year"].max())

# Games per release year, in chronological order.
year_counts = metadata["release_year"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(7,4))
ax.plot(year_counts.index, year_counts.values, marker="o")
ax.set_xlabel("Release year")
ax.set_ylabel("Number of games")
ax.set_title("Number of Games Released per Year")
fig.tight_layout()
plt.show()


In [None]:

# Attach game metadata to every owned (user, item) row and mark it as a
# positive example.  With how="left", items without a metadata match keep NaN
# in all metadata columns.
df = pd.merge(
    user_items,
    metadata,
    how="left",
    left_on="item_id",
    right_on="id",
).assign(label=1)

(df.head(5), df.shape)

In [None]:
import random

# Build one sampled negative (label 0) per positive for every user, then
# combine, shuffle and add the per-user game count and numeric price.

# Dedicated seeded generator so the sampled negatives are reproducible; the
# seed matches the random_state used for the shuffle below.
NEG_SAMPLE_SEED = 42
neg_rng = random.Random(NEG_SAMPLE_SEED)

all_games = set(metadata["id"])
# Fixed candidate order: iteration order of a set of strings can differ
# between interpreter runs, which would change the sample despite the seed.
catalog = sorted(all_games, key=str)

neg_rows = []

for uid, group in df.groupby("user_id"):
    bought = set(group["item_id"])
    not_bought = [g for g in catalog if g not in bought]

    # Skip users for whom a 1:1 positive/negative ratio is impossible.
    if len(not_bought) < len(bought):
        continue

    sampled_negs = neg_rng.sample(not_bought, len(bought))
    neg_rows.extend(
        {"user_id": uid, "item_id": g, "label": 0} for g in sampled_negs
    )

# Explicit columns keep the merge below valid even if no negatives were drawn.
neg_df = pd.DataFrame(neg_rows, columns=["user_id", "item_id", "label"])
# NOTE(review): negatives are drawn from metadata ids and therefore always match
# a metadata row, while positives come from a left merge and may not; missing
# metadata may correlate with the label - verify before trusting model scores.
neg_df = neg_df.merge(metadata, left_on="item_id", right_on="id", how="left")
full_df = pd.concat([df, neg_df], ignore_index=True)
full_df = full_df.sample(frac=1, random_state=42)  # shuffle

user_game_counts = user_items.groupby("user_id")["item_id"].nunique()
full_df["user_game_count"] = full_df["user_id"].map(user_game_counts)

# Non-numeric prices are coerced to NaN and then set to 0.
full_df["price"] = pd.to_numeric(full_df["price"], errors="coerce").fillna(0)

list(full_df.columns)


In [None]:
# Take a random subset of full_df for modeling
MODEL_N = 200_000  # try 200k first; increase later if memory allows

# DataFrame.sample raises ValueError when n exceeds the number of rows (without
# replacement), so cap the request at the size of full_df.
model_df = full_df.sample(n=min(MODEL_N, len(full_df)), random_state=42).copy()
print("model_df shape:", model_df.shape)


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd

# Clean genres in metadata
def to_list_or_empty(x):
    """Normalise a genres cell: lists pass through, missing -> [], anything else -> [x]."""
    if isinstance(x, list):
        return x
    return [] if pd.isna(x) else [x]

metadata["genres_clean"] = metadata["genres"].apply(to_list_or_empty)

# One 0/1 indicator column per genre, one row per metadata record.
mlb = MultiLabelBinarizer()
genre_array = mlb.fit_transform(metadata["genres_clean"])

genre_cols = [f"genre_{g}" for g in mlb.classes_]

# int8 indicators keep the lookup table small.
genre_df = pd.DataFrame(
    genre_array,
    columns=genre_cols,
    index=metadata["id"]
).astype(np.int8)

# Join onto the *smaller* model_df.  Genre columns left over from a previous
# run of this cell are dropped first; otherwise join() raises
# "columns overlap but no suffix specified" on re-execution.
model_df = model_df.drop(columns=genre_cols, errors="ignore").join(genre_df, on="item_id")

print("model_df shape after genres:", model_df.shape)


In [None]:
# Ordinal encoding of the review-summary labels, centred on "Mixed" = 0.
# "Mostly Negative" sits between "Mixed" and "Negative", mirroring how
# "Mostly Positive" sits between "Mixed" and "Positive"; the previous table
# ranked "Negative" above "Mostly Negative".  Centring on 0 also means the 0
# used to fill missing scores reads as neutral instead of as "Negative".
sentiment_map = {
    "Overwhelmingly Positive": 4,
    "Very Positive": 3,
    "Positive": 2,
    "Mostly Positive": 1,
    "Mixed": 0,
    "Mostly Negative": -1,
    "Negative": -2,
    "Very Negative": -3,
    "Overwhelmingly Negative": -4,
}

def sentiment_to_score(s):
    """Map a sentiment label to its ordinal score.

    Returns NaN for missing values, for strings containing "user reviews",
    and for any label that is not a key of ``sentiment_map``.
    """
    if pd.isna(s):
        return np.nan
    s = str(s)
    if "user reviews" in s:
        return np.nan
    return sentiment_map.get(s, np.nan)

# Score each row's sentiment label; rows without a usable score get 0.
model_df["sentiment_score"] = (
    model_df["sentiment"].apply(sentiment_to_score).fillna(0)
)


In [None]:
from sklearn.model_selection import train_test_split

# Target vector.
y = model_df["label"]

# Columns kept out of the feature matrix: the target, the playtime columns
# (leakage), identifiers, and raw text/list columns.
drop_cols = [
    "label",
    "playtime_forever", "playtime_2weeks",
    "user_id", "item_id", "id",
    "item_name", "app_name", "title",
    "url", "reviews_url",
    "release_date",
    "tags", "genres",
    "publisher", "developer",
    "specs",
    "sentiment",  # raw text; sentiment_score is used instead
]

# Drop whichever of those columns exist, keep numeric/bool columns, fill NaN with 0.
X = (
    model_df
    .drop(columns=drop_cols, errors="ignore")
    .select_dtypes(include=[np.number, bool])
    .fillna(0)
)

print("Feature columns (first 20):", X.columns[:20].tolist())
print("X shape:", X.shape)

# 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Standardise the features, then fit a logistic regression on the training split.
log_reg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

# Positive-class probabilities and hard predictions on the held-out split.
y_prob_lr = log_reg_pipeline.predict_proba(X_test)[:, 1]
y_pred_lr = log_reg_pipeline.predict(X_test)

print("\n=== Logistic Regression (Linear) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_prob_lr))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=12, n_jobs=-1, random_state=42
).fit(X_train, y_train)

# Positive-class probabilities and hard predictions on the held-out split.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest (Non-linear) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))

# Importances labelled with the training column names, largest first.
importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
print("\nTop 15 RF Features:")
print(importances.sort_values(ascending=False).iloc[:15])


In [None]:
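# The cells below repeat the feature engineering and modelling on the full dataset (full_df) instead of the model_df sample.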

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# ───────────── A. Clean genres in metadata ─────────────

def to_list_or_empty(x):
    """Normalise a genres cell to a list.

    Lists are returned unchanged, missing values become ``[]`` and any other
    value (for example the string "Action") is wrapped in a one-element list.
    """
    if isinstance(x, list):
        return x
    return [] if pd.isna(x) else [x]

metadata["genres_clean"] = metadata["genres"].apply(to_list_or_empty)

# ───────────── B. Fit MultiLabelBinarizer on metadata genres ─────────────

mlb = MultiLabelBinarizer()
genre_array = mlb.fit_transform(metadata["genres_clean"])

# Prefix columns with "genre_" to keep things clear
genre_cols = [f"genre_{g}" for g in mlb.classes_]

# int8 indicators, matching the genre table built for model_df.
genre_df = pd.DataFrame(
    genre_array,
    columns=genre_cols,
    index=metadata["id"]  # index by app id
).astype(np.int8)

# ───────────── C. Join genre one-hot features into full_df ─────────────

# full_df has "item_id" which matches metadata["id"].  Genre columns left over
# from a previous run of this cell are dropped first; otherwise join() raises
# "columns overlap but no suffix specified" on re-execution.
full_df = full_df.drop(columns=genre_cols, errors="ignore").join(genre_df, on="item_id")




In [None]:
# Convert sentiment -> numeric score

# Ordinal encoding of the review-summary labels, centred on "Mixed" = 0.
# "Mostly Negative" sits between "Mixed" and "Negative", mirroring how
# "Mostly Positive" sits between "Mixed" and "Positive"; the previous table
# ranked "Negative" above "Mostly Negative".  Centring on 0 also means the 0
# used to fill missing scores reads as neutral instead of as "Negative".
sentiment_map = {
    "Overwhelmingly Positive": 4,
    "Very Positive": 3,
    "Positive": 2,
    "Mostly Positive": 1,
    "Mixed": 0,
    "Mostly Negative": -1,
    "Negative": -2,
    "Very Negative": -3,
    "Overwhelmingly Negative": -4,
}

def sentiment_to_score(s):
    """Map a sentiment label to its ordinal score.

    Returns NaN for missing values, for strings containing "user reviews"
    (entries like "8 user reviews" carry too few reviews for a label), and
    for any label that is not a key of ``sentiment_map``.
    """
    if pd.isna(s):
        return np.nan
    s = str(s)
    # Entries like "8 user reviews", too few reviews, treat as missing
    if "user reviews" in s:
        return np.nan
    return sentiment_map.get(s, np.nan)

# Score each row's sentiment label; rows without a usable score get 0.
full_df["sentiment_score"] = (
    full_df["sentiment"].apply(sentiment_to_score).fillna(0)
)


In [None]:
from sklearn.model_selection import train_test_split

# Build feature matrix X and target y

# Target vector.
y = full_df["label"]

# Columns kept out of the feature matrix: the target, post-purchase playtime
# (leaks the label), identifiers, and raw text/list columns.
drop_cols = [
    "label",
    "playtime_forever", "playtime_2weeks",
    "user_id", "item_id", "id",
    "item_name", "app_name", "title",
    "url", "reviews_url",
    "release_date",
    "tags", "genres",
    "publisher", "developer",
    "specs",
    "sentiment",  # raw text; sentiment_score is used instead
]

# Drop whichever of those columns exist, keep numeric/bool columns, fill NaN with 0.
X = (
    full_df
    .drop(columns=drop_cols, errors="ignore")
    .select_dtypes(include=[np.number, bool])
    .fillna(0)
)

# 80/20 split, stratified to preserve the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Linear model — Logistic Regression

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Standardise the features, then fit a logistic regression on the training split.
log_reg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

# Positive-class probabilities and hard predictions on the held-out split.
y_prob_lr = log_reg_pipeline.predict_proba(X_test)[:, 1]
y_pred_lr = log_reg_pipeline.predict(X_test)

print("\n=== Logistic Regression (Linear Model) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_prob_lr))


In [None]:
# Non-linear model — Random Forest

from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=12, n_jobs=-1, random_state=42
).fit(X_train, y_train)

# Positive-class probabilities and hard predictions on the held-out split.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest (Non-linear Model) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))

# Importances labelled with the training column names, largest first.
importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
print("\nTop 15 Random Forest Features:")
print(importances.sort_values(ascending=False).iloc[:15])


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# ───────────── 1. Build X, y from full_df ─────────────

# Columns that must NOT be used as features
# Columns that must not become features: the target, post-purchase playtime,
# identifiers, and raw text/list columns.
leak_cols = [
    "label",
    "playtime_forever", "playtime_2weeks",
    "item_id", "user_id",
    "item_name", "app_name", "title",
    "url",
    "release_date",
    "tags",
    "publisher", "developer",
    "reviews_url",
    "specs",
    "genres",
    "id",  # duplicate of item_id from metadata
]

y = full_df["label"]

# Drop whichever of those columns exist, keep numeric/bool columns, fill NaN with 0.
X = (
    full_df
    .drop(columns=leak_cols, errors="ignore")
    .select_dtypes(include=[np.number, bool])
    .fillna(0)
)

print("Feature columns used:", list(X.columns))
print("X shape:", X.shape)

# 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# ───────────── 2. Linear model: Logistic Regression ─────────────

# Standardise the features, then fit a logistic regression on the training split.
log_reg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

# Positive-class probabilities and hard predictions on the held-out split.
y_prob_lr = log_reg_pipeline.predict_proba(X_test)[:, 1]
y_pred_lr = log_reg_pipeline.predict(X_test)

print("\n=== Logistic Regression (Linear Model) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_prob_lr))

In [None]:
# ───────────── 3. Non-linear model: Random Forest ─────────────
# Depth-limited random forest fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10, n_jobs=-1, random_state=42
).fit(X_train, y_train)

# Positive-class probabilities and hard predictions on the held-out split.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest (Non-linear Model) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))

# Importances labelled with the training column names, largest first.
importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
print("\nTop Random Forest Features:")
print(importances.sort_values(ascending=False).iloc[:10])

In [None]:
# Preview the first five rows of the combined positive/negative table.
full_df.head(n=5)

In [None]:
# Share of each label value in y, most frequent first.
y.value_counts(normalize=True, sort=True, ascending=False)


In [None]:
# Ten features with the highest correlation with the label.
X.corrwith(y).sort_values(ascending=False).iloc[:10]

In [None]:
# Refit a plain logistic regression after removing suspected leakage columns
# from the existing feature matrix X.
leak_cols = [
    "label",
    "popularity" ,
    "playtime_forever",
    "playtime_2weeks",
    "item_id",
    "user_id"
]
# The list is named `leak_cols`; the previous reference to `leaky_cols` was an
# undefined name and raised NameError.
X_noleak = X.drop(columns=[c for c in leak_cols if c in X.columns])

X_train, X_test, y_train, y_test = train_test_split(
    X_noleak, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Target, post-purchase playtime and identifiers are kept out of the features.
leak_cols = ["label", "playtime_forever", "playtime_2weeks", "item_id", "user_id"]

y = full_df["label"]

# Drop whichever of those columns exist, keep numeric/bool columns, fill NaN with 0.
X_clean = (
    full_df
    .drop(columns=leak_cols, errors="ignore")
    .select_dtypes(include=[np.number, bool])
    .fillna(0)
)

X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y,
    test_size=0.2,
    random_state=42,
)

# Unscaled logistic regression on the cleaned features.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

import numpy as np
from sklearn.model_selection import train_test_split

# Target, post-purchase playtime and identifiers are kept out of the features.
leak_cols = ["label", "playtime_forever", "playtime_2weeks", "item_id", "user_id"]

y = full_df["label"]

# Drop whichever of those columns exist, keep numeric/bool columns, fill NaN with 0.
X_clean = (
    full_df
    .drop(columns=leak_cols, errors="ignore")
    .select_dtypes(include=[np.number, bool])
    .fillna(0)
)

X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y,
    test_size=0.2,
    random_state=42,
)

# Depth-limited random forest on the cleaned features.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10, n_jobs=-1, random_state=42
).fit(X_train, y_train)

y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest ROC AUC:", roc_auc_score(y_test, y_prob_rf))


In [None]:
import pandas as pd

# Ten largest random-forest importances, labelled with the training column names.
importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).iloc[:10]


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def to_list_or_empty(x):
    """Normalise a genres cell to a list.

    Lists are returned unchanged, missing values become ``[]`` and any other
    value is wrapped in a one-element list.  This is the same rule as the
    earlier definitions of this helper in the notebook; the copy that stood
    here discarded string genres instead.
    """
    if isinstance(x, list):
        return x
    elif pd.isna(x):
        return []
    else:
        return [x]

# Work on metadata first
metadata["genres_clean"] = metadata["genres"].apply(to_list_or_empty)

mlb = MultiLabelBinarizer()
genre_array = mlb.fit_transform(metadata["genres_clean"])

# Same "genre_" prefix as the earlier genre cells, so these columns replace the
# existing indicators instead of adding a second, unprefixed copy of every genre.
genre_cols = [f"genre_{g}" for g in mlb.classes_]

genre_df = pd.DataFrame(
    genre_array,
    columns=genre_cols,
    index=metadata["id"]
)

# Drop genre columns already present in full_df before joining; otherwise
# join() raises "columns overlap but no suffix specified".
full_df = full_df.drop(columns=genre_cols, errors="ignore").join(genre_df, on="item_id")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Target, post-purchase playtime and identifiers are kept out of the features.
leak_cols = [
    "label",
    "playtime_forever",
    "playtime_2weeks",
    "item_id",
    "user_id",
]

# Drop whichever of those columns exist, keep numeric/bool columns, fill NaN with 0.
X_clean = (
    full_df
    .drop(columns=leak_cols, errors="ignore")
    .select_dtypes(include=[np.number, bool])
    .fillna(0)
)

y = full_df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y,
    test_size=0.2,
    random_state=42,
)

# Depth-limited random forest fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10, n_jobs=-1, random_state=42
)
rf.fit(X_train, y_train)


In [None]:
# Fifteen largest random-forest importances, labelled with the training column names.
importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).iloc[:15]
