In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.auto import tqdm
import warnings

# --- Notebook-wide configuration ---
# Seed shared by the model parameter dictionaries defined in later cells.
RANDOM_STATE = 42
# Directory prefix of the competition CSV files; point this at your own data location.
DATA_PATH = '/kaggle/input/nto-ai-second-stage-individual/'

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Switch on tqdm's pandas integration.
tqdm.pandas()

In [None]:
print("–ó–∞–≥—Ä—É–∑–∫–∞ –≤—Å–µ—Ö —Ñ–∞–π–ª–æ–≤")

# Tables read with pandas' default parser settings.
train_df = pd.read_csv(f'{DATA_PATH}train.csv', sep=',')
test_df = pd.read_csv(f'{DATA_PATH}test.csv', sep=',')
users_df = pd.read_csv(f'{DATA_PATH}users.csv', sep=',')
book_genres_df = pd.read_csv(f'{DATA_PATH}book_genres.csv', sep=',')

# The two book tables are read with the Python engine, a BOM-aware encoding,
# and malformed lines skipped instead of raising.
_tolerant_csv_options = dict(
    sep=',',
    engine='python',
    encoding='utf-8-sig',
    on_bad_lines='skip',
)
books_df = pd.read_csv(f'{DATA_PATH}books.csv', **_tolerant_csv_options)
book_descriptions_df = pd.read_csv(f'{DATA_PATH}book_descriptions.csv', **_tolerant_csv_options)

# Strip stray whitespace around every column name so later key lookups match.
loaded_frames = (train_df, test_df, users_df, books_df, book_genres_df, book_descriptions_df)
for frame in loaded_frames:
    frame.columns = frame.columns.str.strip()

# The time-based split in a later cell sorts and compares on this column.
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])

print("\n–î–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
print("\n–ü—Ä–æ–≤–µ—Ä–∫–∞ –∫–æ–ª–æ–Ω–æ–∫ –≤ books_df:")
print(books_df.columns.tolist())

In [None]:
print("–û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ —Ç–∞–±–ª–∏—Ü")


def _attach_metadata(interactions):
    """Left-join user and book attributes onto an interactions frame.

    Args:
        interactions: DataFrame with ``user_id`` and ``book_id`` columns.

    Returns:
        A new DataFrame with the columns of ``users_df`` and ``books_df``
        appended; rows without a match get NaN in the appended columns.

    Raises:
        ValueError: if the joins changed the number of rows.
    """
    merged = (
        interactions
        .merge(users_df, on='user_id', how='left')
        .merge(books_df, on='book_id', how='left')
    )
    # A left join emits one row per matching right-hand row, so a duplicated
    # user_id/book_id in the lookup tables would silently multiply interactions
    # (and, for the test frame, the rows later written to the submission).
    if len(merged) != len(interactions):
        raise ValueError(
            f"Metadata merge changed the row count from {len(interactions)} to "
            f"{len(merged)}; check users_df/books_df for duplicate keys."
        )
    return merged


train_df = _attach_metadata(train_df)
test_df = _attach_metadata(test_df)

print("\n–î–∞–Ω–Ω—ã–µ —É—Å–ø–µ—à–Ω–æ –æ–±—ä–µ–¥–∏–Ω–µ–Ω—ã")
train_df.head()

In [None]:
# Keep only rows flagged has_read == 1 (presumably the rows that carry a real
# rating -- confirm against the data description), ordered oldest to newest.
train_read_df = train_df[train_df['has_read'] == 1].copy()
train_read_df = train_read_df.sort_values('timestamp').reset_index(drop=True)

print(f"–í—Å–µ–≥–æ –æ—Ü–µ–Ω–æ–∫ –≤ —Ç—Ä–µ–π–Ω–µ: {len(train_read_df)}")

# Position of the first validation row: the newest ~10% of rows are held out.
split_index = int(len(train_read_df) * 0.9)
if not 0 < split_index < len(train_read_df):
    raise ValueError(
        f"Need at least two rated rows for a 90/10 time split, got {len(train_read_df)}"
    )

# Split on the timestamp found at the cut position rather than on the position
# itself: rows sharing the boundary timestamp then all land in validation, so the
# strict ordering checked below cannot be broken by ties. Without a tie at the
# boundary this selects exactly the rows a positional cut at split_index would.
split_timestamp = train_read_df['timestamp'].iloc[split_index]
is_before_split = train_read_df['timestamp'] < split_timestamp
train_part = train_read_df[is_before_split]
# Negating the mask (instead of testing >=) keeps rows with a missing timestamp,
# which sort last, in the validation part.
val_part = train_read_df[~is_before_split]

print(f"–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π —á–∞—Å—Ç–∏: {len(train_part)}")
print(f"–†–∞–∑–º–µ—Ä –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π —á–∞—Å—Ç–∏: {len(val_part)}")

# Still fails loudly when the training part ends up empty (max() is then NaT).
assert train_part['timestamp'].max() < val_part['timestamp'].min(), \
    "training rows must be strictly older than validation rows"
print("–í—Ä–µ–º–µ–Ω–Ω–∞—è –≤–∞–ª–∏–¥–∞—Ü–∏—è –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∞ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ")

In [None]:
def create_features(df, train_data):
    """Attach per-user and per-book rating statistics to ``df``.

    Statistics (mean, count, standard deviation of ``rating``) are computed from
    ``train_data`` only and left-joined onto a copy of ``df`` by ``user_id`` and
    ``book_id``. Two extra columns hold the user's / book's mean rating minus the
    global mean rating of ``train_data``.

    Missing statistics (a user or book absent from ``train_data``, or a standard
    deviation that is undefined for a single rating) get a neutral fallback per
    column: the global mean for the mean columns, 0 for counts, standard
    deviations and diffs. Columns that were already in ``df`` are not filled, so
    their missing values stay NaN instead of being overwritten with a rating.

    Note:
        When ``df`` and ``train_data`` contain the same rows, each row's own
        rating is part of the statistics attached to it.

    Args:
        df: DataFrame with ``user_id`` and ``book_id`` columns; not modified.
        train_data: DataFrame with ``user_id``, ``book_id`` and ``rating``.

    Returns:
        A new DataFrame: ``df`` plus ``user_mean_rating``, ``user_ratings_count``,
        ``user_std_rating``, ``book_mean_rating``, ``book_ratings_count``,
        ``book_std_rating``, ``user_rating_diff`` and ``book_rating_diff``.
    """
    result_df = df.copy()

    global_mean_rating = train_data['rating'].mean()

    user_stats = (
        train_data.groupby('user_id')['rating']
        .agg(user_mean_rating='mean', user_ratings_count='count', user_std_rating='std')
        .reset_index()
    )
    result_df = result_df.merge(user_stats, on='user_id', how='left')

    book_stats = (
        train_data.groupby('book_id')['rating']
        .agg(book_mean_rating='mean', book_ratings_count='count', book_std_rating='std')
        .reset_index()
    )
    result_df = result_df.merge(book_stats, on='book_id', how='left')

    # Fill each generated column with a value that is meaningful for it; one
    # frame-wide fill would put the mean rating into counts, stds and every
    # unrelated column that happens to contain NaN.
    fallback_values = {
        'user_mean_rating': global_mean_rating,
        'book_mean_rating': global_mean_rating,
        'user_ratings_count': 0,
        'book_ratings_count': 0,
        'user_std_rating': 0.0,
        'book_std_rating': 0.0,
    }
    result_df = result_df.fillna(value=fallback_values)

    # Computed after the fill, so an unseen user/book gets a diff of exactly 0.
    result_df['user_rating_diff'] = result_df['user_mean_rating'] - global_mean_rating
    result_df['book_rating_diff'] = result_df['book_mean_rating'] - global_mean_rating

    return result_df

# Training and validation rows both take their statistics from the training
# split, so nothing computed from validation ratings reaches the features.
train_featured, val_featured = (
    create_features(split_rows, train_part) for split_rows in (train_part, val_part)
)
# Test rows take their statistics from every rated training interaction.
test_featured = create_features(df=test_df, train_data=train_read_df)

print("–ü—Ä–∏–∑–Ω–∞–∫–∏ —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω—ã.")
display(train_featured.head())

In [None]:
target = 'rating'

# Model inputs. The order is kept fixed because it defines the column order of
# every feature matrix built below.
features = [
    # columns expected from the merged user/book tables (not produced in this notebook)
    'age', 'gender', 'publication_year', 'avg_rating',
    # per-user statistics from create_features
    'user_mean_rating', 'user_ratings_count', 'user_std_rating',
    # per-book statistics from create_features
    'book_mean_rating', 'book_ratings_count', 'book_std_rating',
    # offsets of the user/book mean from the global mean
    'user_rating_diff', 'book_rating_diff'
]

X_train, y_train = train_featured[features], train_featured[target]
X_val, y_val = val_featured[features], val_featured[target]

# Baseline configuration: the L1 objective is optimised while RMSE is the metric
# reported on the evaluation set.
lgb_params = dict(
    objective='regression_l1',
    metric='rmse',
    n_estimators=2000,
    learning_rate=0.06,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=1,
    lambda_l2=1,
    num_leaves=31,
    verbose=-1,
    n_jobs=-1,
    seed=RANDOM_STATE,
    boosting_type='gbdt',
)

model = lgb.LGBMRegressor(**lgb_params)

# Stop once validation RMSE has not improved for 100 consecutive rounds.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)],
)

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.auto import tqdm
import random

# --- SETTINGS ---
# Validation RMSE a candidate seed has to beat in order to be kept.
TARGET_RMSE = 2.68781
# For reference only: seeds are filtered on RMSE, the main metric.
TARGET_MAE = 1.89
# Number of accepted seeds to collect before the search stops.
DESIRED_SEEDS_COUNT = 50
# Guard against an endless loop (in case seed 42 was too good to beat).
MAX_ATTEMPTS = 10000

# Parameters shared by every model trained during the seed search.
lgb_params_base = {
    # objective and reported metric
    'objective': 'regression_l1',
    'metric': 'rmse',
    # boosting schedule
    'boosting_type': 'gbdt',
    'n_estimators': 3000,
    'learning_rate': 0.06,
    'num_leaves': 31,
    # row / column subsampling
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    # regularisation
    'lambda_l1': 1,
    'lambda_l2': 1,
    # runtime
    'seed': RANDOM_STATE,
    'n_jobs': -1,
    'verbose': -1,
}

# --- Accumulators ---
# Seeds whose model beat TARGET_RMSE, plus running sums of their clipped predictions.
good_seeds = []
test_preds_accum = np.zeros(len(test_featured))
val_preds_accum = np.zeros(len(X_val))

print(f"üé∞ –ó–ê–ü–£–°–ö–ê–ï–ú –ö–ê–ó–ò–ù–û! –¶–µ–ª—å: –Ω–∞–π—Ç–∏ {DESIRED_SEEDS_COUNT} —Å–∏–¥–æ–≤ —Å RMSE < {TARGET_RMSE}")

# Seeded generator: a Restart & Run All draws the same candidate seeds each time.
seed_rng = np.random.default_rng(RANDOM_STATE)
# Seeds that must not be trained: the baseline seed and everything already tried.
tried_seeds = {RANDOM_STATE}
# The test feature matrix does not change between attempts, so slice it once.
X_test_search = test_featured[features]

attempts = 0

# NOTE(review): seeds are accepted by their score on the validation split, and the
# ensemble is later scored on that same split, so that score is optimistic.
with tqdm(total=DESIRED_SEEDS_COUNT) as pbar:
    while len(good_seeds) < DESIRED_SEEDS_COUNT and attempts < MAX_ATTEMPTS:
        # Skipped draws count as attempts too, which keeps the loop bounded.
        attempts += 1

        current_seed = int(seed_rng.integers(1, 100000))
        if current_seed in tried_seeds:
            continue
        tried_seeds.add(current_seed)

        # Override 'seed' itself. LightGBM documents 'random_state' as an alias of
        # 'seed'; adding the alias next to the base dict's 'seed' entry leaves two
        # conflicting values, and the per-attempt one is expected to be ignored.
        params = {**lgb_params_base, 'seed': current_seed}

        # Use a dedicated name so the baseline `model` from the training cell is
        # not replaced by whichever candidate happened to be trained last.
        seed_model = lgb.LGBMRegressor(**params)
        seed_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )

        val_pred = np.clip(seed_model.predict(X_val), 0, 10)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))

        if rmse < TARGET_RMSE:
            print(f"üíé –ù–ê–ô–î–ï–ù –ê–õ–ú–ê–ó! Seed: {current_seed} | RMSE: {rmse:.5f} (Better by {TARGET_RMSE - rmse:.5f})")

            t_pred = np.clip(seed_model.predict(X_test_search), 0, 10)

            test_preds_accum += t_pred
            val_preds_accum += val_pred
            good_seeds.append(current_seed)
            pbar.update(1)
        elif attempts % 10 == 0:
            # Only every tenth attempt is logged to keep the output short.
            print(f"Attempt {attempts}: Seed {current_seed} gave RMSE {rmse:.5f} (Too bad)")

# --- FINAL ---
if not good_seeds:
    # Nothing was accepted, so there is nothing to average or to write.
    print("‚ùå –ù–µ –Ω–∞–π–¥–µ–Ω–æ –Ω–∏ –æ–¥–Ω–æ–≥–æ —Å–∏–¥–∞ –ª—É—á—à–µ 42. –ü–æ–ø—Ä–æ–±—É–π —Å–º—è–≥—á–∏—Ç—å —É—Å–ª–æ–≤–∏–µ –∏–ª–∏ —É–≤–µ–ª–∏—á—å learning_rate.")
else:
    print(f"\n‚úÖ –°–æ–±—Ä–∞–Ω–æ {len(good_seeds)} —Å–∏–¥–æ–≤: {good_seeds}")

    # Turn the running sums into per-row means over the accepted seeds.
    final_test_preds, final_val_preds = (
        summed / len(good_seeds) for summed in (test_preds_accum, val_preds_accum)
    )

    # Score the averaged validation predictions against the same threshold.
    ensemble_rmse = np.sqrt(mean_squared_error(y_val, final_val_preds))
    print(f"\nüèÜ RMSE –ê–ù–°–ê–ú–ë–õ–Ø: {ensemble_rmse:.5f}")
    if ensemble_rmse < TARGET_RMSE:
        print(f"üöÄ –ü–†–û–ë–û–ô! –ê–Ω—Å–∞–º–±–ª—å –ª—É—á—à–µ –æ–¥–∏–Ω–æ—á–Ω–æ–π –º–æ–¥–µ–ª–∏ –Ω–∞ {TARGET_RMSE - ensemble_rmse:.5f}")
    else:
        print("ü§î –ê–Ω—Å–∞–º–±–ª—å –Ω–µ –ø–æ–±–∏–ª –ª—É—á—à—É—é –æ–¥–∏–Ω–æ—á–Ω—É—é (–±—ã–≤–∞–µ—Ç, –µ—Å–ª–∏ —Å–∏–¥—ã —Å–ª–∏—à–∫–æ–º –∫–æ—Ä—Ä–µ–ª–∏—Ä—É—é—Ç)")

    # Submission: the test identifiers next to the averaged prediction.
    filename = 'submission_casino_royale_final_pupupu.csv'
    submission = pd.DataFrame(
        {
            'user_id': test_df['user_id'],
            'book_id': test_df['book_id'],
            'rating_predict': final_test_preds,
        }
    )
    submission.to_csv(filename, index=False)
    print(f"–§–∞–π–ª {filename} –≥–æ—Ç–æ–≤.")

In [None]:
import matplotlib.pyplot as plt

# Show the 20 most important features of `model`, once ranked by total gain and
# once by split count.
# plot_importance receives figsize and no ax, so it creates its own figure; a
# separate plt.figure() call beforehand would only add an empty figure to the output.
importance_views = (
    ('gain', 'Feature Importance (Gain)'),
    ('split', 'Feature Importance (Split)'),
)
for importance_type, plot_title in importance_views:
    lgb.plot_importance(
        model,
        max_num_features=20,
        importance_type=importance_type,
        figsize=(12, 8),
        title=plot_title,
    )
    plt.show()

In [None]:
# Score `model` on the hold-out split; predictions are clamped to [0, 10]
# (presumably the rating scale -- confirm) before the metrics are computed.
val_preds = np.clip(model.predict(X_val), 0, 10)

mae = mean_absolute_error(y_val, val_preds)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))

print(
    f"–õ–æ–∫–∞–ª—å–Ω–∞—è –≤–∞–ª–∏–¥–∞—Ü–∏—è:",
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    sep='\n',
)

In [None]:
print("–û–±—É—á–µ–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö")

# Training and validation rows together form the final training set.
X_full = pd.concat([train_featured[features], val_featured[features]])
y_full = pd.concat([train_featured[target], val_featured[target]])

# Reuse the number of trees selected by early stopping on `model`. Fall back to
# the configured n_estimators when no best iteration is available; the falsy
# check covers both None and 0, so a zero-tree final model is never requested.
best_iteration = model.best_iteration_
if not best_iteration:
    best_iteration = lgb_params['n_estimators']

# Override n_estimators on a copy so the shared lgb_params dict keeps its
# configured value and re-running cells stays idempotent.
final_params = {**lgb_params, 'n_estimators': best_iteration}

final_model = lgb.LGBMRegressor(**final_params)
final_model.fit(X_full, y_full)

print("–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –¥–ª—è —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö...")
X_test = test_featured[features]
# Clamp predictions to [0, 10], the same range used for validation scoring.
test_preds = np.clip(final_model.predict(X_test), 0, 10)

submission_df = pd.DataFrame({
    'user_id': test_df['user_id'],
    'book_id': test_df['book_id'],
    'rating_predict': test_preds
})

submission_path = 'submission_final_pupupu.csv'
submission_df.to_csv(submission_path, index=False, sep=',')
# Report the file that was actually written rather than a hard-coded name.
print(f"–§–∞–π–ª {submission_path} —É—Å–ø–µ—à–Ω–æ —Å–æ–∑–¥–∞–Ω.")
display(submission_df.head())