 # Anime Popularity Prediction: Model Training

## Imports and Setup

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

## Data Preparation

In [12]:
# Load the processed dataset from CSV.
anime_df = pd.read_csv("processed_anime_data.csv")

# Ordered 80/20 holdout: the leading rows train the model and the trailing
# rows are held back for evaluation. Rows are taken in file order (no shuffle).
train_size = int(0.8 * len(anime_df))
train_df, test_df = anime_df.iloc[:train_size], anime_df.iloc[train_size:]

# "anime_id" is excluded from the feature matrix; "popularity" is the target.
non_feature_cols = ["anime_id", "popularity"]
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df["popularity"]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df["popularity"]

# Scaling statistics are learned from the training split only and then
# applied unchanged to both splits, so the test split never influences them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Initial Model Training

In [None]:
# Baseline model: train an XGBoost booster through the native API with fixed,
# untuned hyperparameters and score it on the held-out split.
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

y_pred = model.predict(dtest)

# Error metrics plus two rank-correlation measures between truth and prediction.
mae = mean_absolute_error(y_test, y_pred)
# root_mean_squared_error already returns the RMSE; wrapping it in another
# square root (as before) reported the fourth root of the MSE instead.
rmse = root_mean_squared_error(y_test, y_pred)
spearman, _ = spearmanr(y_test, y_pred)
kendall, _ = kendalltau(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Spearman's rank correlation: {spearman:.2f}")
print(f"Kendall's Tau: {kendall:.2f}")

## Hyperparameter Tuning

In [None]:
# Exhaustive grid search over XGBoost hyperparameters, scored by
# cross-validated MAE on the training split.
xgb_model = xgb.XGBRegressor(n_estimators=100, objective="reg:squarederror")

# 3 * 3 * 2 * 2 * 3 = 108 candidate settings, each fitted once per CV fold.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    min_child_weight=[1, 3, 5],
)

grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
# The scorer is *negative* MAE (higher is better), so flip the sign to report MAE.
print("Best MAE:", -grid_search.best_score_)

## Final Model Training and Evaluation

In [None]:
# Refit a fresh regressor with the best grid-search settings on the full
# training split, then score it on the held-out split.
best_model = xgb.XGBRegressor(
    **grid_search.best_params_,
    objective="reg:squarederror",
    # Same number of trees as the estimator that was tuned, stated explicitly
    # so the final model cannot drift from the tuned configuration.
    n_estimators=100,
)
best_model.fit(X_train_scaled, y_train)

y_pred_final = best_model.predict(X_test_scaled)

mae_final = mean_absolute_error(y_test, y_pred_final)
# root_mean_squared_error already returns the RMSE; wrapping it in another
# square root (as before) reported the fourth root of the MSE instead.
rmse_final = root_mean_squared_error(y_test, y_pred_final)
spearman_final, _ = spearmanr(y_test, y_pred_final)
kendall_final, _ = kendalltau(y_test, y_pred_final)

print(f"Final MAE: {mae_final:.2f}")
print(f"Final RMSE: {rmse_final:.2f}")
print(f"Final Spearman's rank correlation: {spearman_final:.2f}")
print(f"Final Kendall's Tau: {kendall_final:.2f}")

## Feature Importance Analysis

In [None]:
# Bar chart of the most important features according to the fitted model.
importance = best_model.feature_importances_
feature_names = X_train.columns

# Feature positions ordered from most to least important.
indices = np.argsort(importance)[::-1]

# Show at most 20 bars. Capping at the number of available features keeps
# plt.bar / plt.xticks from raising a length mismatch on narrow datasets.
top_n = min(20, len(indices))
top_indices = indices[:top_n]

plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.bar(range(top_n), importance[top_indices])
plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=90)
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

## Model Interpretation

In [None]:
# Load SHAP's JavaScript bundle into the notebook. Without it, the interactive
# force plot at the end of this cell renders only a
# "Javascript library not loaded" placeholder.
shap.initjs()

explainer = shap.TreeExplainer(best_model)

# Attributions are computed on the scaled matrix, i.e. the same representation
# the model was fitted on.
shap_values = explainer.shap_values(X_test_scaled)

# Global view: one bar per feature summarising its SHAP values over the test split.
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=feature_names)

# Local view for the first test row. The unscaled X_test row is passed so the
# plot displays original feature values next to each contribution; the
# contributions themselves come from the scaled inputs above.
shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :], feature_names=feature_names)