<a href="https://colab.research.google.com/github/seludkoaleksandr95-coder/Nto/blob/main/%D0%9A%D0%BE%D0%BF%D0%B8%D1%8F_%D0%B1%D0%BB%D0%BE%D0%BA%D0%BD%D0%BE%D1%82%D0%B0_%22Untitled11_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /content/stage1_individual_data.zip

Archive:  /content/stage1_individual_data.zip
  inflating: book_descriptions.csv   
  inflating: book_genres.csv         
  inflating: books.csv               
  inflating: genres.csv              
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
  inflating: users.csv               


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import joblib
from scipy.optimize import minimize

# Load the interaction tables; both CSVs are expected in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# When a `has_read` flag is present, keep only the rows where it equals 1.
if 'has_read' in train.columns:
    read_mask = train['has_read'] == 1
    train = train.loc[read_mask].copy()

# Target values and the overall average rating (used later as a fill value).
ratings = train['rating']
y_original = ratings.values
global_mean = ratings.mean()

# Per-user rating profile: one row per user_id summarising every rating that
# user has in `train`. Named aggregation assigns the final column names directly.
user_stats = (
    train.groupby('user_id')['rating']
    .agg(
        user_mean='mean',
        user_std='std',
        user_count='count',
        user_min='min',
        user_max='max',
        user_median='median',
        user_count_10=lambda r: (r == 10).sum(),
        user_count_high=lambda r: (r >= 8).sum(),
        user_count_low=lambda r: (r <= 5).sum(),
        # Skewness is only computed for groups with more than two ratings.
        user_skew=lambda r: r.skew() if len(r) > 2 else 0,
    )
    .reset_index()
)

# Turn the threshold counts into shares of the user's total rating count.
for ratio_col, count_col in (
    ('user_10_ratio', 'user_count_10'),
    ('user_high_ratio', 'user_count_high'),
    ('user_low_ratio', 'user_count_low'),
):
    user_stats[ratio_col] = user_stats[count_col] / user_stats['user_count']

# Per-book rating profile: one row per book_id summarising every rating that
# book received in `train`. Named aggregation assigns the final column names.
book_stats = (
    train.groupby('book_id')['rating']
    .agg(
        book_mean='mean',
        book_std='std',
        book_count='count',
        book_min='min',
        book_max='max',
        book_median='median',
        book_count_10=lambda r: (r == 10).sum(),
        book_count_high=lambda r: (r >= 8).sum(),
        book_count_low=lambda r: (r <= 5).sum(),
        # Skewness is only computed for groups with more than two ratings.
        book_skew=lambda r: r.skew() if len(r) > 2 else 0,
    )
    .reset_index()
)

# Turn the threshold counts into shares of the book's total rating count.
for ratio_col, count_col in (
    ('book_10_ratio', 'book_count_10'),
    ('book_high_ratio', 'book_count_high'),
    ('book_low_ratio', 'book_count_low'),
):
    book_stats[ratio_col] = book_stats[count_col] / book_stats['book_count']

# Calendar features are only built when the training table carries a timestamp.
if 'timestamp' in train.columns:
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    stamp = train['timestamp'].dt

    # Raw calendar components.
    train['year'] = stamp.year
    train['month'] = stamp.month
    train['day'] = stamp.day
    train['dayofweek'] = stamp.dayofweek
    train['hour'] = stamp.hour

    # Cyclical (sin/cos) encoding of the periodic components:
    # (source column, period, sin column, cos column).
    for source_col, period, sin_col, cos_col in (
        ('month', 12, 'month_sin', 'month_cos'),
        ('dayofweek', 7, 'dayofweek_sin', 'dayofweek_cos'),
        ('hour', 24, 'hour_sin', 'hour_cos'),
    ):
        angle = 2 * np.pi * train[source_col] / period
        train[sin_col] = np.sin(angle)
        train[cos_col] = np.cos(angle)

# Assemble the training feature frame. Column order matters downstream because
# the feature list is later derived from `X_data.columns`, so the order here is:
# ids, time features, user stats, book stats, then the interaction features.
id_cols = ['user_id', 'book_id']
if 'timestamp' in train.columns:
    time_features = ['year', 'month', 'day', 'dayofweek', 'hour',
                     'month_sin', 'month_cos', 'dayofweek_sin', 'dayofweek_cos',
                     'hour_sin', 'hour_cos']
    X_data = train[id_cols + time_features].copy()
else:
    X_data = train[id_cols].copy()

# NOTE(review): user_stats / book_stats are aggregated over `train` itself, so
# for every training row these columns include that row's own rating. Scores
# measured on these rows are therefore likely optimistic -- confirm/fix upstream.
X_data = (
    X_data
    .merge(user_stats, on='user_id', how='left')
    .merge(book_stats, on='book_id', how='left')
)

# User-vs-book comparisons of central tendency and spread.
u_mean, b_mean = X_data['user_mean'], X_data['book_mean']
X_data['mean_diff'] = u_mean - b_mean
X_data['mean_avg'] = (u_mean + b_mean) / 2
X_data['std_avg'] = (X_data['user_std'] + X_data['book_std']) / 2
X_data['count_ratio'] = (
    np.log1p(X_data['user_count']) / (np.log1p(X_data['book_count']) + 1)
)

# Differences between the user's and the book's rating-share features.
for diff_col, user_col, book_col in (
    ('10_ratio_diff', 'user_10_ratio', 'book_10_ratio'),
    ('high_ratio_diff', 'user_high_ratio', 'book_high_ratio'),
    ('low_ratio_diff', 'user_low_ratio', 'book_low_ratio'),
):
    X_data[diff_col] = X_data[user_col] - X_data[book_col]

# Saturating "confidence" in [0, 1) that grows with the number of ratings.
X_data['user_confidence'] = 1 - np.exp(-X_data['user_count'] / 10)
X_data['book_confidence'] = 1 - np.exp(-X_data['book_count'] / 5)
u_conf, b_conf = X_data['user_confidence'], X_data['book_confidence']
X_data['combined_confidence'] = (u_conf + b_conf) / 2

# Confidence-weighted blend of the two means; the epsilon avoids 0/0.
X_data['weighted_mean'] = (
    X_data['user_mean'] * u_conf +
    X_data['book_mean'] * b_conf
) / (u_conf + b_conf + 1e-10)

# Impute missing values in every non-id column: columns whose name ends in
# '_mean' or '_median' get the global mean rating, everything else gets 0.
mean_like_suffixes = ('_mean', '_median')
for col in [c for c in X_data.columns if c not in ('user_id', 'book_id')]:
    fill_value = global_mean if col.endswith(mean_like_suffixes) else 0
    X_data[col] = X_data[col].fillna(fill_value)

# Everything except the two id columns is a model feature.
feature_cols = [c for c in X_data.columns if c not in ('user_id', 'book_id')]

X = X_data[feature_cols]
y = train['rating'].values

# One shared 5-fold splitter; the registries collect fitted models and
# out-of-fold predictions per model family.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models_predictions = {}
models = {}

# --- LightGBM: out-of-fold predictions, one model per fold ---
lgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    max_depth=8,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbose=-1,
    n_jobs=-1,
)

lgb_predictions = np.zeros(len(X))
lgb_models = []

for fold, (fit_idx, holdout_idx) in enumerate(kf.split(X)):
    # The seed is offset by the fold number so each fold's model differs.
    booster = lgb.LGBMRegressor(random_state=42 + fold, **lgb_params)
    booster.fit(X.iloc[fit_idx], y[fit_idx])
    lgb_predictions[holdout_idx] = booster.predict(X.iloc[holdout_idx])
    lgb_models.append(booster)

lgb_rmse = np.sqrt(mean_squared_error(y, lgb_predictions))
models['lgb'] = lgb_models
models_predictions['lgb'] = lgb_predictions

# --- XGBoost: out-of-fold predictions, one model per fold ---
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_jobs=-1,
    verbosity=0,
)

xgb_predictions = np.zeros(len(X))
xgb_models = []

for fold, (fit_idx, holdout_idx) in enumerate(kf.split(X)):
    # The seed is offset by the fold number so each fold's model differs.
    regressor = xgb.XGBRegressor(random_state=42 + fold, **xgb_params)
    regressor.fit(X.iloc[fit_idx], y[fit_idx])
    xgb_predictions[holdout_idx] = regressor.predict(X.iloc[holdout_idx])
    xgb_models.append(regressor)

xgb_rmse = np.sqrt(mean_squared_error(y, xgb_predictions))
models['xgb'] = xgb_models
models_predictions['xgb'] = xgb_predictions

# --- Random forest: out-of-fold predictions, one model per fold ---
rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features=0.5,
    n_jobs=-1,
    verbose=0,
)

rf_predictions = np.zeros(len(X))
rf_models = []

for fold, (fit_idx, holdout_idx) in enumerate(kf.split(X)):
    # The seed is offset by the fold number so each fold's model differs.
    forest = RandomForestRegressor(random_state=42 + fold, **rf_params)
    forest.fit(X.iloc[fit_idx], y[fit_idx])
    rf_predictions[holdout_idx] = forest.predict(X.iloc[holdout_idx])
    rf_models.append(forest)

rf_rmse = np.sqrt(mean_squared_error(y, rf_predictions))
models['rf'] = rf_models
models_predictions['rf'] = rf_predictions

# --- Non-learned baselines that also take part in the blend ---
train_user_conf = X_data['user_confidence']
train_book_conf = X_data['book_confidence']

# Predict each row with its user's mean rating (global mean where missing).
user_mean_pred = X_data['user_mean'].fillna(global_mean).values
user_mean_rmse = np.sqrt(mean_squared_error(y, user_mean_pred))

# Predict each row with its book's mean rating (global mean where missing).
book_mean_pred = X_data['book_mean'].fillna(global_mean).values
book_mean_rmse = np.sqrt(mean_squared_error(y, book_mean_pred))

# Confidence-weighted mix of the two baselines; the epsilon avoids 0/0.
weighted_pred = (
    user_mean_pred * train_user_conf +
    book_mean_pred * train_book_conf
) / (train_user_conf + train_book_conf + 1e-10)
weighted_rmse = np.sqrt(mean_squared_error(y, weighted_pred))

# Register in this order: it fixes the column order of `all_predictions`,
# and the blend weights are matched to columns by position.
models_predictions['user_mean'] = user_mean_pred
models_predictions['book_mean'] = book_mean_pred
models_predictions['weighted'] = weighted_pred

all_predictions = pd.DataFrame(models_predictions)

def objective(weights):
    """Return the RMSE of a weighted blend of the stored predictions.

    Reads the module-level ``all_predictions`` frame (one column per model)
    and target vector ``y``.

    Args:
        weights: One raw weight per column of ``all_predictions``, in column
            order. Values are clipped to [0, 1] and rescaled to sum to 1.

    Returns:
        The root-mean-squared error of the blended prediction against ``y``.
    """
    weights = np.clip(weights, 0, 1)
    total = weights.sum()
    if total > 0:
        weights = weights / total
    else:
        # All weights clipped to zero: normalising would divide by zero and
        # pass NaNs to the metric. Fall back to a uniform blend instead.
        weights = np.full(len(weights), 1.0 / len(weights))

    # Weighted sum of the prediction columns as one matrix-vector product.
    ensemble_pred = all_predictions.values @ weights

    return np.sqrt(mean_squared_error(y, ensemble_pred))

# Starting point for the weight search: favour the tree models, give every
# other column a small weight, then normalise to sum to 1.
blend_columns = list(all_predictions.columns)
prior_weight = {'lgb': 3.0, 'xgb': 2.0, 'rf': 1.5}
initial_weights = np.array([prior_weight.get(name, 0.5) for name in blend_columns])
initial_weights = initial_weights / initial_weights.sum()

# Each weight is box-constrained to [0, 1]; `objective` renormalises internally.
bounds = [(0, 1)] * len(blend_columns)

result = minimize(
    objective,
    initial_weights,
    method='SLSQP',
    bounds=bounds,
    options={'maxiter': 100, 'disp': False}
)

# Re-apply the same clip + normalisation the objective uses.
optimal_weights = np.clip(result.x, 0, 1)
optimal_weights = optimal_weights / optimal_weights.sum()

# Blend the stored predictions with the final weights and score the blend.
ensemble_oof = np.zeros(len(y))
for name, weight in zip(blend_columns, optimal_weights):
    ensemble_oof += all_predictions[name].values * weight

ensemble_rmse = np.sqrt(mean_squared_error(y, ensemble_oof))

# Build the test feature frame with the same columns as the training frame.
X_test = test[['user_id', 'book_id']].copy()

if 'timestamp' in train.columns:
    if 'timestamp' in test.columns:
        # The test set carries its own timestamps: derive the same calendar
        # and cyclical features as for train instead of a constant.
        test_ts = pd.to_datetime(test['timestamp'], errors='coerce')
        test_time = pd.DataFrame({
            'year': test_ts.dt.year,
            'month': test_ts.dt.month,
            'day': test_ts.dt.day,
            'dayofweek': test_ts.dt.dayofweek,
            'hour': test_ts.dt.hour,
        }, index=test.index)
        test_time['month_sin'] = np.sin(2 * np.pi * test_time['month'] / 12)
        test_time['month_cos'] = np.cos(2 * np.pi * test_time['month'] / 12)
        test_time['dayofweek_sin'] = np.sin(2 * np.pi * test_time['dayofweek'] / 7)
        test_time['dayofweek_cos'] = np.cos(2 * np.pi * test_time['dayofweek'] / 7)
        test_time['hour_sin'] = np.sin(2 * np.pi * test_time['hour'] / 24)
        test_time['hour_cos'] = np.cos(2 * np.pi * test_time['hour'] / 24)
        for feat in time_features:
            # Unparseable timestamps fall back to the training average.
            X_test[feat] = test_time[feat].fillna(train[feat].mean())
    else:
        # No timestamps in test: use the training average of each time
        # feature as a constant placeholder.
        for feat in time_features:
            X_test[feat] = train[feat].mean()

# Attach the per-user / per-book statistics; unseen ids yield NaNs here.
X_test = X_test.merge(user_stats, on='user_id', how='left')
X_test = X_test.merge(book_stats, on='book_id', how='left')

# User-vs-book comparison features (same formulas as for the training frame).
X_test['mean_diff'] = X_test['user_mean'] - X_test['book_mean']
X_test['mean_avg'] = (X_test['user_mean'] + X_test['book_mean']) / 2
X_test['std_avg'] = (X_test['user_std'] + X_test['book_std']) / 2
X_test['count_ratio'] = np.log1p(X_test['user_count']) / (np.log1p(X_test['book_count']) + 1)
X_test['10_ratio_diff'] = X_test['user_10_ratio'] - X_test['book_10_ratio']
X_test['high_ratio_diff'] = X_test['user_high_ratio'] - X_test['book_high_ratio']
X_test['low_ratio_diff'] = X_test['user_low_ratio'] - X_test['book_low_ratio']

# Saturating "confidence" in [0, 1) that grows with the number of ratings.
X_test['user_confidence'] = 1 - np.exp(-X_test['user_count'] / 10)
X_test['book_confidence'] = 1 - np.exp(-X_test['book_count'] / 5)
X_test['combined_confidence'] = (X_test['user_confidence'] + X_test['book_confidence']) / 2

# Confidence-weighted blend of the two means; the epsilon avoids 0/0.
X_test['weighted_mean'] = (
    X_test['user_mean'] * X_test['user_confidence'] +
    X_test['book_mean'] * X_test['book_confidence']
) / (X_test['user_confidence'] + X_test['book_confidence'] + 1e-10)

# Impute missing test features with the same rule as for train: columns ending
# in '_mean' or '_median' get the global mean rating, everything else gets 0.
for col in feature_cols:
    if col not in X_test.columns:
        continue
    fill_value = global_mean if col.endswith(('_mean', '_median')) else 0
    X_test[col] = X_test[col].fillna(fill_value)

# Select the features in exactly the order the models were trained on.
X_test_features = X_test[feature_cols]

test_predictions_all = {}

# For each learned model family, average the predictions of its fold models.
for name, model_list in models.items():
    if name not in ('lgb', 'xgb', 'rf'):
        continue
    # Pre-allocated float64 buffer: one column per fold model.
    fold_preds = np.zeros((len(X_test_features), len(model_list)))
    for slot, fitted in enumerate(model_list):
        fold_preds[:, slot] = fitted.predict(X_test_features)
    test_predictions_all[name] = fold_preds.mean(axis=1)

# Baseline predictors, mirroring the ones built on the training rows.
test_user_conf = X_test['user_confidence']
test_book_conf = X_test['book_confidence']
test_predictions_all['user_mean'] = X_test['user_mean'].fillna(global_mean).values
test_predictions_all['book_mean'] = X_test['book_mean'].fillna(global_mean).values
test_predictions_all['weighted'] = (
    test_predictions_all['user_mean'] * test_user_conf +
    test_predictions_all['book_mean'] * test_book_conf
) / (test_user_conf + test_book_conf + 1e-10)

# Blend using the optimised weights, matched to columns by position.
test_ensemble = np.zeros(len(X_test_features))
for position, name in enumerate(all_predictions.columns):
    if name in test_predictions_all:
        test_ensemble += test_predictions_all[name] * optimal_weights[position]

# Clamp the blend to the [0, 10] range.
test_ensemble = np.clip(test_ensemble, 0, 10)

# Share of each distinct rating value in train, ordered by rating value.
rating_shares = train['rating'].value_counts(normalize=True).sort_index()
target_distribution = dict(rating_shares)

def adjust_distribution(preds, target_dist):
    """Map predictions onto discrete values so their shares follow ``target_dist``.

    Predictions are ranked from lowest to highest; each one receives the value
    whose cumulative share in ``target_dist`` first covers its rank position.

    Args:
        preds: 1-D array-like of numeric predictions.
        target_dist: Mapping ``value -> share`` in ascending value order.
            Shares need not sum to 1; they are normalised by their total.

    Returns:
        Array with the same shape and dtype as ``preds`` holding the assigned
        values, in the original order of ``preds``.

    Raises:
        ValueError: If ``target_dist`` is empty or its shares do not sum to a
            positive number.
    """
    preds = np.asarray(preds)
    if not target_dist:
        raise ValueError("target_dist must contain at least one value")

    values = np.array(list(target_dist.keys()))
    cum_share = np.cumsum(list(target_dist.values()), dtype=float)
    if not cum_share[-1] > 0:
        raise ValueError("target_dist shares must sum to a positive number")
    cum_share = cum_share / cum_share[-1]

    n = len(preds)
    adjusted = np.zeros_like(preds)
    if n == 0:
        return adjusted

    # Stable sort so tied predictions are ranked by original position.
    order = np.argsort(preds, kind='stable')

    # Mid-point rank positions (i + 0.5) / n: each bin receives round(share * n)
    # items. Using i / n would hand every lower bin one extra item.
    rank_pos = (np.arange(n) + 0.5) / n

    # First bin whose cumulative share is >= the rank position.
    bins = np.searchsorted(cum_share, rank_pos, side='left')
    bins = np.minimum(bins, len(values) - 1)

    adjusted[order] = values[bins]
    return adjusted

# Remap the blended predictions onto the training rating distribution.
test_adjusted = adjust_distribution(test_ensemble, target_distribution)

# If the remapped mean drifts more than 0.1 from the training mean, shift all
# predictions by half the gap, clamp to [0, 10] and round to whole ratings.
# NOTE(review): if the remapped ratings are whole numbers, a shift smaller than
# 0.5 is undone by the rounding -- confirm this is intended.
# NOTE(review): only this branch casts to int; otherwise `test_adjusted` keeps
# the dtype returned by adjust_distribution.
current_mean = test_adjusted.mean()
mean_gap = global_mean - current_mean
if abs(mean_gap) > 0.1:
    adjustment = mean_gap * 0.5
    shifted = np.clip(test_adjusted + adjustment, 0, 10)
    test_adjusted = np.round(shifted).astype(int)

# Two candidate post-processing strategies for the final ratings:
#   smart_dist   - distribution-matched predictions from the step above
#   simple_round - blended predictions rounded and clamped to [0, 10]
rounded_blend = np.clip(np.round(test_ensemble), 0, 10).astype(int)
strategies = {
    'smart_dist': test_adjusted.copy(),
    'simple_round': rounded_blend,
}

# The submission uses the distribution-matched strategy.
submission_main = test[['user_id', 'book_id']].assign(
    rating_predict=strategies['smart_dist']
)
submission_main.to_csv('submission_ensemble.csv', index=False, float_format='%.1f')

# Persist the fitted models, the feature order and the blend weights.
for artifact, path in (
    (models, 'ensemble_models.pkl'),
    (feature_cols, 'feature_cols.pkl'),
    (optimal_weights, 'optimal_weights.pkl'),
):
    joblib.dump(artifact, path)

['optimal_weights.pkl']