In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
import warnings
warnings.filterwarnings('ignore')
import json
from pathlib import Path
from datetime import datetime
import gc

print("=" * 60)
print("НАЧАЛО РАБОТЫ ПАЙПЛАЙНА")
print("=" * 60)

DATA_DIR = Path("data/raw")
MODEL_DIR = Path("output/models")
SUBMISSION_DIR = Path("output/submissions")

for dir_path in [DATA_DIR, MODEL_DIR, SUBMISSION_DIR]:
    dir_path.mkdir(exist_ok=True)

def safe_load_csv(filepath):
    try:
        return pd.read_csv(filepath, low_memory=False)
    except Exception as e:
        print(f"Ошибка загрузки {filepath}: {e}")
        return pd.DataFrame()

print("\nЗАГРУЗКА ДАННЫХ")
train = safe_load_csv(DATA_DIR / "train.csv")
books = safe_load_csv(DATA_DIR / "books.csv")
users = safe_load_csv(DATA_DIR / "users.csv")
candidates = safe_load_csv(DATA_DIR / "candidates.csv")
targets = safe_load_csv(DATA_DIR / "targets.csv")

try:
    genres = safe_load_csv(DATA_DIR / "genres.csv")
    book_genres = safe_load_csv(DATA_DIR / "book_genres.csv")
    book_descriptions = safe_load_csv(DATA_DIR / "book_descriptions.csv")
except:
    genres, book_genres, book_descriptions = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

if 'age' in users.columns:
    users['age'] = pd.to_numeric(users['age'], errors='coerce')
    users['age'] = users['age'].fillna(users['age'].median())
    age_bins = [0, 18, 25, 35, 50, 100]
    age_labels = ['teen', 'young', 'adult', 'middle', 'senior']
    users['age_group'] = pd.cut(users['age'], bins=age_bins, labels=age_labels, include_lowest=True).astype(str)
    users['age_group'] = users['age_group'].fillna('adult')

if 'gender' in users.columns:
    users['gender'] = users['gender'].fillna('unknown')
    gender_encoder = LabelEncoder()
    users['gender_encoded'] = gender_encoder.fit_transform(users['gender'])

if 'description' in books.columns:
    books['desc_len'] = books['description'].str.len().fillna(0)
    books['has_desc'] = books['description'].notna().astype('int8')
    books['desc_word_count'] = books['description'].str.split().str.len().fillna(0)

if 'publication_year' in books.columns:
    books['publication_year'] = pd.to_numeric(books['publication_year'], errors='coerce')
    books['publication_year'] = books['publication_year'].fillna(books['publication_year'].median())
    books['book_age'] = 2025 - books['publication_year']
    books['is_old_book'] = (books['book_age'] > 20).astype(int)
    books['is_recent_book'] = (books['book_age'] <= 5).astype(int)

if 'avg_rating' in books.columns:
    books['avg_rating'] = pd.to_numeric(books['avg_rating'], errors='coerce')
    books['avg_rating'] = books['avg_rating'].fillna(books['avg_rating'].median())
    books['high_rated'] = (books['avg_rating'] >= 8).astype(int)
    books['low_rated'] = (books['avg_rating'] <= 4).astype(int)

if not book_genres.empty and not genres.empty:
    train_with_genres = train.merge(book_genres, on='book_id', how='left')
    genre_popularity = train_with_genres.groupby('genre_id').agg({
        'has_read': ['sum', 'count', 'mean']
    }).reset_index()
    genre_popularity.columns = ['genre_id', 'genre_read_sum', 'genre_interactions', 'genre_read_rate']
    book_genres_extended = book_genres.merge(genre_popularity, on='genre_id', how='left')
    book_genre_stats = book_genres_extended.groupby('book_id').agg({
        'genre_id': 'nunique',
        'genre_read_rate': ['mean', 'max', 'min', 'std']
    }).reset_index()
    book_genre_stats.columns = ['book_id', 'num_genres', 'avg_genre_read_rate', 'max_genre_read_rate', 
                               'min_genre_read_rate', 'std_genre_read_rate']
    books = books.merge(book_genre_stats, on='book_id', how='left')
    for col in ['num_genres', 'avg_genre_read_rate', 'max_genre_read_rate', 'min_genre_read_rate', 'std_genre_read_rate']:
        if col in books.columns:
            books[col] = books[col].fillna(0)

user_stats = train.groupby('user_id').agg({
    'has_read': ['sum', 'count', 'mean'],
    'book_id': 'nunique'
}).reset_index()
user_stats.columns = ['user_id', 'read_sum', 'total_interactions', 'read_rate', 'unique_books']
user_stats['plan_rate'] = 1 - user_stats['read_rate']
user_stats['diversity'] = user_stats['unique_books'] / user_stats['total_interactions'].replace(0, 1)
user_stats['engagement'] = user_stats['total_interactions'] / user_stats['unique_books'].replace(0, 1)
user_stats['conversion_rate'] = user_stats['read_sum'] / user_stats['total_interactions'].replace(0, 1)

if 'timestamp' in train.columns:
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    user_time_stats = train.groupby('user_id').agg({
        'timestamp': ['min', 'max', 'nunique']
    }).reset_index()
    user_time_stats.columns = ['user_id', 'first_interaction', 'last_interaction', 'unique_dates']
    user_time_stats['activity_duration_days'] = (user_time_stats['last_interaction'] - user_time_stats['first_interaction']).dt.days
    user_time_stats['activity_duration_days'] = user_time_stats['activity_duration_days'].clip(lower=1)
    user_stats = user_stats.merge(user_time_stats, on='user_id', how='left')
    user_stats['interactions_per_day'] = user_stats['total_interactions'] / user_stats['activity_duration_days'].replace(0, 1)
    user_stats['reading_frequency'] = user_stats['read_sum'] / user_stats['activity_duration_days'].replace(0, 1)

book_stats = train.groupby('book_id').agg({
    'has_read': ['sum', 'count', 'mean'],
    'user_id': 'nunique'
}).reset_index()
book_stats.columns = ['book_id', 'book_read_sum', 'book_interactions', 'book_read_rate', 'unique_users']
book_stats['book_plan_rate'] = 1 - book_stats['book_read_rate']
book_stats['book_diversity'] = book_stats['unique_users'] / book_stats['book_interactions'].replace(0, 1)
book_stats['book_popularity'] = book_stats['book_interactions'] / len(user_stats)
book_stats['book_conversion_rate'] = book_stats['book_read_sum'] / book_stats['book_interactions'].replace(0, 1)
book_stats['book_popularity_norm'] = (book_stats['book_interactions'] - book_stats['book_interactions'].min()) / (book_stats['book_interactions'].max() - book_stats['book_interactions'].min() + 1e-10)
book_stats['book_read_rate_norm'] = (book_stats['book_read_rate'] - book_stats['book_read_rate'].min()) / (book_stats['book_read_rate'].max() - book_stats['book_read_rate'].min() + 1e-10)

user_features = users.merge(user_stats, on='user_id', how='left')
user_feature_cols = ['read_sum', 'total_interactions', 'read_rate', 'unique_books', 'plan_rate', 'diversity', 'engagement', 'conversion_rate']
if 'interactions_per_day' in user_features.columns:
    user_feature_cols.extend(['interactions_per_day', 'reading_frequency', 'activity_duration_days'])
for col in user_feature_cols:
    if col in user_features.columns:
        user_features[col] = user_features[col].fillna(0)

book_features = books.merge(book_stats, on='book_id', how='left')
book_feature_cols = ['book_read_sum', 'book_interactions', 'book_read_rate', 'unique_users', 'book_plan_rate', 'book_diversity', 'book_popularity', 'book_conversion_rate', 'book_popularity_norm', 'book_read_rate_norm']
if 'num_genres' in book_features.columns:
    book_feature_cols.extend(['num_genres', 'avg_genre_read_rate', 'max_genre_read_rate', 'min_genre_read_rate', 'std_genre_read_rate'])
for col in book_feature_cols:
    if col in book_features.columns:
        book_features[col] = book_features[col].fillna(0)

train_pairs_set = set(zip(train['user_id'], train['book_id']))
np.random.seed(42)
negative_samples = []
top_books = train['book_id'].value_counts().head(200).index.tolist()
all_users = user_stats['user_id'].unique()
n_users_to_sample = min(3000, len(all_users))

for user_id in all_users[:n_users_to_sample]:
    user_books = set(train[train['user_id'] == user_id]['book_id'].unique())
    candidate_books = [b for b in top_books if b not in user_books]
    if len(candidate_books) >= 2:
        selected_books = np.random.choice(candidate_books, min(2, len(candidate_books)), replace=False)
        for book_id in selected_books:
            negative_samples.append({'user_id': user_id, 'book_id': book_id, 'has_read': -1})

for user_id in np.random.choice(all_users, min(1500, len(all_users)), replace=False):
    user_row = user_features[user_features['user_id'] == user_id]
    if len(user_row) > 0:
        user_read_rate = user_row['read_rate'].values[0]
        user_diversity = user_row['diversity'].values[0]
        temp_books = book_features.copy()
        temp_books['compatibility'] = (user_read_rate * temp_books['book_read_rate'] + user_diversity * temp_books['book_diversity'])
        user_books = set(train[train['user_id'] == user_id]['book_id'].unique())
        low_compat_books = temp_books[~temp_books['book_id'].isin(user_books)]
        low_compat_books = low_compat_books.nsmallest(10, 'compatibility')['book_id'].tolist()
        if len(low_compat_books) >= 1:
            selected_books = np.random.choice(low_compat_books, min(1, len(low_compat_books)), replace=False)
            for book_id in selected_books:
                negative_samples.append({'user_id': user_id, 'book_id': book_id, 'has_read': -1})

negative_df = pd.DataFrame(negative_samples)
train_extended = pd.concat([train, negative_df], ignore_index=True)
del negative_df, negative_samples
gc.collect()

train_features = train_extended.merge(user_features, on='user_id', how='left')
train_features = train_features.merge(book_features, on='book_id', how='left')
train_features['read_rate_diff'] = train_features['read_rate'] - train_features['book_read_rate']
train_features['interaction_ratio'] = train_features['total_interactions'] / (train_features['book_interactions'] + 1)
train_features['compatibility_score'] = (train_features['read_rate'] * train_features['book_read_rate'] + train_features['diversity'] * train_features['book_diversity'])
train_features['user_book_engagement_ratio'] = train_features['engagement'] / (train_features['book_popularity'] + 1e-10)
train_features['user_book_diversity_product'] = train_features['diversity'] * train_features['book_diversity']
train_features['has_interaction'] = train_features.apply(lambda row: 1 if (row['user_id'], row['book_id']) in train_pairs_set else 0, axis=1)
train_features['is_likely_read'] = ((train_features['read_rate'] > 0.5) & (train_features['book_read_rate'] > 0.3)).astype(int)
train_features['is_likely_plan'] = ((train_features['plan_rate'] > 0.5) & (train_features['book_read_rate'] < 0.3)).astype(int)
train_features['target'] = train_features['has_read'].map({1: 2, 0: 1, -1: 0})

feature_columns = [
    'read_rate', 'total_interactions', 'unique_books', 'plan_rate', 'diversity', 'engagement', 'conversion_rate',
    'age', 'gender_encoded',
    'book_read_rate', 'book_interactions', 'unique_users', 'book_plan_rate', 'book_diversity', 'book_popularity', 'book_conversion_rate',
    'desc_len', 'has_desc', 'desc_word_count', 'avg_rating', 'high_rated', 'low_rated', 'book_age', 'is_old_book', 'is_recent_book',
    'book_popularity_norm', 'book_read_rate_norm',
    'read_rate_diff', 'interaction_ratio', 'compatibility_score', 'user_book_engagement_ratio', 'user_book_diversity_product',
    'num_genres', 'avg_genre_read_rate', 'max_genre_read_rate', 'min_genre_read_rate', 'std_genre_read_rate',
    'interactions_per_day', 'reading_frequency', 'activity_duration_days',
    'has_interaction', 'is_likely_read', 'is_likely_plan'
]

available_features = [f for f in feature_columns if f in train_features.columns]
X = train_features[available_features].copy()
y = train_features['target'].copy()
del train_extended
gc.collect()

params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': ['multi_logloss', 'multi_error'],
    'boosting_type': 'gbdt',
    'num_leaves': 127,
    'max_depth': 10,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'min_split_gain': 0.01,
    'reg_alpha': 0.3,
    'reg_lambda': 0.3,
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'max_bin': 63,
    'tree_learner': 'data',
    'is_unbalance': True,
    'scale_pos_weight': 1.2,
}

groups = train_features['user_id']
group_kfold = GroupKFold(n_splits=3)
cv_models = []
cv_scores = []
cv_accuracies = []

for fold, (train_idx, val_idx) in enumerate(group_kfold.split(X, y, groups)):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=2000,
        callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(100)],
    )
    val_pred = model.predict(X_val_fold)
    val_pred_class = np.argmax(val_pred, axis=1)
    accuracy = (val_pred_class == y_val_fold).mean()
    cv_accuracies.append(accuracy)
    y_val_relevance = np.zeros((len(y_val_fold), 3))
    for i, class_id in enumerate(y_val_fold):
        y_val_relevance[i, class_id] = class_id + 1
    ndcg_val = ndcg_score(y_val_relevance, val_pred, k=20)
    cv_scores.append(ndcg_val)
    cv_models.append(model)
    del train_data, val_data, X_train_fold, X_val_fold, y_train_fold, y_val_fold
    gc.collect()

for i, model in enumerate(cv_models):
    model.save_model(str(MODEL_DIR / f"model_fold_{i}.txt"))

with open(MODEL_DIR / "features.json", "w") as f:
    json.dump(available_features, f)

def expand_candidates_simple(df):
    expanded = []
    for _, row in df.iterrows():
        user_id = row['user_id']
        book_list_str = row['book_id_list']
        if isinstance(book_list_str, str):
            try:
                book_str = book_list_str.strip('[]')
                book_ids = [int(b.strip()) for b in book_str.split(',') if b.strip()]
            except:
                book_ids = []
        else:
            book_ids = []
        for book_id in book_ids:
            expanded.append({'user_id': int(user_id), 'book_id': int(book_id)})
    return pd.DataFrame(expanded)

def generate_submission_with_hierarchy(candidates_features, models, train_pairs_set, available_features):
    X_test = candidates_features[available_features].copy()
    test_pred = np.zeros((len(X_test), 3))
    for model in models:
        test_pred += model.predict(X_test)
    test_pred /= len(models)
    candidates_features['prob_class_2'] = test_pred[:, 2]
    candidates_features['prob_class_1'] = test_pred[:, 1]
    candidates_features['prob_class_0'] = test_pred[:, 0]
    candidates_features['predicted_class'] = np.argmax(test_pred, axis=1)
    candidates_features['has_interaction'] = candidates_features.apply(lambda row: 1 if (row['user_id'], row['book_id']) in train_pairs_set else 0, axis=1)
    candidates_features['hierarchical_score'] = (
        candidates_features['predicted_class'] * 1000000 +
        candidates_features['prob_class_2'] * 10000 +
        candidates_features['prob_class_1'] * 1000 +
        candidates_features['prob_class_0'] * 100 +
        candidates_features['book_read_rate'] * 10 * candidates_features['predicted_class'].map({2: 1, 1: 0.5, 0: 0.1}) +
        candidates_features['read_rate'] * 5 * candidates_features['predicted_class'].map({2: 1, 1: 0.8, 0: 0.1}) +
        candidates_features['has_interaction'] * 50000
    )
    candidates_sorted = candidates_features.sort_values(['user_id', 'hierarchical_score'], ascending=[True, False]).drop_duplicates(subset=['user_id', 'book_id'], keep='first')
    submission_rows = []
    for user_id in candidates_sorted['user_id'].unique():
        user_books = candidates_sorted[candidates_sorted['user_id'] == user_id]
        seen = set()
        top_books = []
        for class_id in [2, 1, 0]:
            class_books = user_books[user_books['predicted_class'] == class_id]
            for book_id in class_books['book_id'].values:
                if book_id not in seen:
                    seen.add(book_id)
                    top_books.append(book_id)
                if len(top_books) >= 20:
                    break
            if len(top_books) >= 20:
                break
        submission_rows.append({'user_id': user_id, 'book_id_list': ','.join(map(str, top_books[:20]))})
    return pd.DataFrame(submission_rows)

candidates_expanded = expand_candidates_simple(candidates)
candidates_features = candidates_expanded.merge(user_features, on='user_id', how='left')
candidates_features = candidates_features.merge(book_features, on='book_id', how='left')
candidates_features['read_rate_diff'] = candidates_features['read_rate'] - candidates_features['book_read_rate']
candidates_features['interaction_ratio'] = candidates_features['total_interactions'] / (candidates_features['book_interactions'] + 1)
candidates_features['compatibility_score'] = (candidates_features['read_rate'] * candidates_features['book_read_rate'] + candidates_features['diversity'] * candidates_features['book_diversity'])
candidates_features['user_book_engagement_ratio'] = candidates_features['engagement'] / (candidates_features['book_popularity'] + 1e-10)
candidates_features['user_book_diversity_product'] = candidates_features['diversity'] * candidates_features['book_diversity']
candidates_features['is_likely_read'] = ((candidates_features['read_rate'] > 0.5) & (candidates_features['book_read_rate'] > 0.3)).astype(int)
candidates_features['is_likely_plan'] = ((candidates_features['plan_rate'] > 0.5) & (candidates_features['book_read_rate'] < 0.3)).astype(int)
candidates_features['has_interaction'] = candidates_features.apply(lambda row: 1 if (row['user_id'], row['book_id']) in train_pairs_set else 0, axis=1)

for col in available_features:
    if col not in candidates_features.columns:
        candidates_features[col] = 0
    elif candidates_features[col].isna().any():
        candidates_features[col] = candidates_features[col].fillna(0)

submission = generate_submission_with_hierarchy(candidates_features, cv_models, train_pairs_set, available_features)

target_users_set = set(targets['user_id'])
submission_users_set = set(submission['user_id'])
missing_users = target_users_set - submission_users_set

if missing_users:
    missing_rows = [{'user_id': user_id, 'book_id_list': ''} for user_id in missing_users]
    submission = pd.concat([submission, pd.DataFrame(missing_rows)], ignore_index=True)

submission = submission.sort_values('user_id')

for idx, row in submission.iterrows():
    if pd.notna(row['book_id_list']) and row['book_id_list'] != '':
        books = list(map(int, str(row['book_id_list']).split(',')))
        if len(books) != len(set(books)):
            unique_books = []
            seen = set()
            for book in books:
                if book not in seen:
                    seen.add(book)
                    unique_books.append(book)
            submission.at[idx, 'book_id_list'] = ','.join(map(str, unique_books[:20]))

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
final_path = SUBMISSION_DIR / f"submission_gpu_enhanced_{timestamp}.csv"
submission[['user_id', 'book_id_list']].to_csv(final_path, index=False)

print(f"\nРЕЗУЛЬТАТЫ:")
print(f"Средняя точность: {np.mean(cv_accuracies):.4f}")
print(f"Средний NDCG@20: {np.mean(cv_scores):.4f}")
print(f"Submission сохранен: {final_path}")

del X, y, train_features, candidates_features
gc.collect()

print("\nГотово!")

НАЧАЛО РАБОТЫ ПАЙПЛАЙНА

ЗАГРУЗКА ДАННЫХ
[100]	valid_0's multi_logloss: 0.442086	valid_0's multi_error: 0.19277
[200]	valid_0's multi_logloss: 0.385062	valid_0's multi_error: 0.192585
[100]	valid_0's multi_logloss: 0.437953	valid_0's multi_error: 0.188724
[200]	valid_0's multi_logloss: 0.379992	valid_0's multi_error: 0.188583
[100]	valid_0's multi_logloss: 0.437546	valid_0's multi_error: 0.18916
[200]	valid_0's multi_logloss: 0.380143	valid_0's multi_error: 0.18878

РЕЗУЛЬТАТЫ:
Средняя точность: 0.8103
Средний NDCG@20: 0.9300
Submission сохранен: output/submissions/submission_gpu_enhanced_20251209_075527.csv

Готово!
