# Predicting a User's New Rating on Codeforces

In [1]:
import requests
import pandas as pd
import os
import time
from typing import List, Dict, Any, Optional
import numpy as np
import joblib # Required for saving and loading the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Seed passed to the train/test splits and to the random forest so results are repeatable.
RANDOM_STATE = 42

## 1. Data Collection: Creating Dataset using Contest List (`dataset_1.csv`)

This section fetches rating change data from specific Codeforces contests. We use **`ratingDelta`** ($\text{newRating} - \text{oldRating}$) as the target variable for improved model robustness. The **`contestId`** is retained as a feature, recognizing its potential importance for contest age/difficulty.

In [2]:
# --- Contest selection and on-disk location of the first dataset ---
contestId_list = [1000, 1955, 1956, 1253, 1500, 1957]
output_file_path_1 = 'data_for_user_rating_prediction/dataset_1.csv'

# Create the CSV's parent folder only when the path has one and it is missing.
output_dir = os.path.dirname(output_file_path_1)
if output_dir:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created data directory: {output_dir}")

Created data directory: data_for_user_rating_prediction


In [3]:
# Gather one frame per contest and concatenate once after the loop: calling
# pd.concat inside the loop copies every previously fetched row again on
# each iteration and starts from an empty seed frame.
contest_frames = []
for contestId in contestId_list:
    url = f"https://codeforces.com/api/contest.ratingChanges?contestId={contestId}"
    print(f"Fetching data for Contest ID: {contestId}...")

    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for Contest ID {contestId}: {e}")
        continue

    if data.get('status') != 'OK':
        print(f"API request for Contest ID {contestId} failed: {data.get('comment', 'No comment provided')}. Skipping.")
        continue

    if 'result' not in data or not data['result']:
        print(f"No rating changes found for Contest ID: {contestId}. Skipping.")
        continue

    df_temp = pd.DataFrame(data['result'])
    df_temp['contestId'] = contestId
    contest_frames.append(df_temp)
    # Pause between successful requests to stay polite to the API.
    time.sleep(0.5)

# pd.concat raises on an empty list, so keep df1 an empty frame when nothing
# was fetched; later cells rely on `df1.empty` to skip their work.
df1 = pd.concat(contest_frames, ignore_index=True) if contest_frames else pd.DataFrame()

Fetching data for Contest ID: 1000...
Fetching data for Contest ID: 1955...
Fetching data for Contest ID: 1956...
Fetching data for Contest ID: 1253...
Fetching data for Contest ID: 1500...
Fetching data for Contest ID: 1957...


In [4]:
# --- Final Processing and Feature Engineering on df1 ---
if df1.empty:
    print("No data was successfully fetched to process. Exiting.")
else:
    # Drop non-feature columns, keeping 'contestId'
    df1 = df1.drop(columns=['handle', 'ratingUpdateTimeSeconds'], errors='ignore')

    # Target variable: the rating change produced by the contest.
    df1['ratingDelta'] = df1['newRating'] - df1['oldRating']

    # One 0/1 indicator per division label found in the contest name.
    # regex=False treats the '.' in "Div. N" as a literal dot instead of a
    # match-any-character regex token. Columns are created in Div. 1..4 order
    # so the feature layout stays the same.
    division_labels = [f'Div. {div}' for div in (1, 2, 3, 4)]
    for label in division_labels:
        df1[label] = (
            df1['contestName']
            .str.contains(label, case=False, regex=False, na=False)
            .astype(int)
        )

    # 'unknown' is 1 if none of the explicit divisions were found
    df1['unknown'] = (df1[division_labels].sum(axis=1) == 0).astype(int)

    df1.to_csv(output_file_path_1, index=False)
    print(f"\nSuccessfully saved {len(df1)} rating change entries to: {output_file_path_1} 🎉")
    print(df1.head())


Successfully saved 59994 rating change entries to: data_for_user_rating_prediction/dataset_1.csv 🎉
   contestId                                        contestName  rank  \
0       1000  Educational Codeforces Round 46 (Rated for Div...     1   
1       1000  Educational Codeforces Round 46 (Rated for Div...     2   
2       1000  Educational Codeforces Round 46 (Rated for Div...     3   
3       1000  Educational Codeforces Round 46 (Rated for Div...     4   
4       1000  Educational Codeforces Round 46 (Rated for Div...     5   

   oldRating  newRating  ratingDelta  Div. 1  Div. 2  Div. 3  Div. 4  unknown  
0       1959       2241          282       0       1       0       0        0  
1       1919       2181          262       0       1       0       0        0  
2       2083       2277          194       0       1       0       0        0  
3       1916       2147          231       0       1       0       0        0  
4       1915       2127          212       0       1       0 

## 2. Data Collection: Creating Dataset using List of Handles (`dataset_2.csv`)

In [5]:
# --- Configuration for the handle-based dataset ---
# Endpoint queried once per handle for its rating history.
API_URL = "https://codeforces.com/api/user.rating"

# Handles whose rating histories are combined into the second dataset.
handles_list = [
    'sangam2ishra',
    'sangammishra',
    'tourist',
    'jiangly',
    'Benq',
]

# Destination CSV for the combined, feature-engineered records.
output_file_path_2 = 'data_for_user_rating_prediction/dataset_2.csv'

In [6]:
# Make sure the folder that will hold the second CSV exists before fetching.
output_dir = os.path.dirname(output_file_path_2)
if output_dir:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

# Flat list of rating-change records exactly as returned by the API,
# accumulated across all handles.
all_ratings_df_list = []

for handle in handles_list:
    print(f"Fetching rating history for handle: {handle}...")

    try:
        response = requests.get(f"{API_URL}?handle={handle}", timeout=10)
        response.raise_for_status()
        payload = response.json()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching data for {handle}: {err}. Skipping.")
        continue

    if payload.get('status') != 'OK':
        print(f"API request for {handle} failed: {payload.get('comment', 'No comment provided')}. Skipping.")
        continue

    records = payload.get('result')
    if records:
        all_ratings_df_list.extend(records)
    else:
        print(f"No rating changes found for handle: {handle}.")

    # Pause before the next handle's request.
    time.sleep(0.5)


Fetching rating history for handle: sangam2ishra...
Fetching rating history for handle: sangammishra...
Fetching rating history for handle: tourist...
Fetching rating history for handle: jiangly...
Fetching rating history for handle: Benq...


In [7]:
# --- Merging and Final Processing on df2 ---

if not all_ratings_df_list:
    print("No data was successfully fetched to process. Exiting.")
    df2 = pd.DataFrame()
else:
    df2 = pd.DataFrame(all_ratings_df_list)
    print(f"\nSuccessfully retrieved {len(df2)} total rating change entries.")

    # Drop specified columns, keeping contestId
    df2 = df2.drop(columns=['ratingUpdateTimeSeconds', 'handle'], errors='ignore')

    # Target variable: the rating change produced by the contest.
    df2['ratingDelta'] = df2['newRating'] - df2['oldRating']

    # One 0/1 indicator per division label found in 'contestName'.
    # regex=False treats the '.' in "Div. N" as a literal dot instead of a
    # match-any-character regex token. Columns are created in Div. 1..4 order
    # so the feature layout stays the same.
    division_labels = [f'Div. {div}' for div in (1, 2, 3, 4)]
    for label in division_labels:
        df2[label] = (
            df2['contestName']
            .str.contains(label, case=False, regex=False, na=False)
            .astype(int)
        )

    # 'unknown' is 1 if none of the explicit divisions were found
    df2['unknown'] = (df2[division_labels].sum(axis=1) == 0).astype(int)

    # --- Final Save ---
    df2.to_csv(output_file_path_2, index=False)
    print(f"Successfully saved {len(df2)} rating change entries to: {output_file_path_2} 🎉")
    print(df2.head())


Successfully retrieved 728 total rating change entries.
Successfully saved 728 rating change entries to: data_for_user_rating_prediction/dataset_2.csv 🎉
   contestId                                        contestName   rank  \
0       1851                      Codeforces Round 888 (Div. 3)   9413   
1       1849  Educational Codeforces Round 152 (Rated for Di...  10560   
2       1855                      Codeforces Round 889 (Div. 2)   5517   
3       1856  Codeforces Round 890 (Div. 2) supported by Con...   9812   
4       1857                      Codeforces Round 891 (Div. 3)   8948   

   oldRating  newRating  ratingDelta  Div. 1  Div. 2  Div. 3  Div. 4  unknown  
0          0        427          427       0       0       1       0        0  
1        427        695          268       0       1       0       0        0  
2        695        956          261       0       1       0       0        0  
3        956       1047           91       0       1       0       0        0  
4

## 3. Model Preparation: Defining Features (X) and Target (y)

In [8]:
if not df1.empty:
    # Target: the rating change. The inputs are every remaining column except
    # the contest name, the post-contest rating and the target itself.
    y1 = df1['ratingDelta']
    non_feature_columns = ['contestName', 'newRating', 'ratingDelta']
    X1 = df1.drop(non_feature_columns, axis=1)
    print(f"Features (X1) for Dataset 1: {list(X1.columns)}")

Features (X1) for Dataset 1: ['contestId', 'rank', 'oldRating', 'Div. 1', 'Div. 2', 'Div. 3', 'Div. 4', 'unknown']


In [9]:
if not df2.empty:
    # Same layout as Dataset 1: the target is the rating change, and the inputs
    # exclude the contest name, the post-contest rating and the target itself.
    y2 = df2['ratingDelta']
    non_feature_columns = ['contestName', 'newRating', 'ratingDelta']
    X2 = df2.drop(non_feature_columns, axis=1)
    print(f"Features (X2) for Dataset 2: {list(X2.columns)}")

Features (X2) for Dataset 2: ['contestId', 'rank', 'oldRating', 'Div. 1', 'Div. 2', 'Div. 3', 'Div. 4', 'unknown']


## 4. Model Training and Evaluation Functions

In [10]:
def split_dataset(X_data: pd.DataFrame, y_data: pd.Series):
    """Partition features and target into train / cross-validation / test subsets.

    A fifth of the rows is held out as the test set first; a quarter of the
    remainder (20% of the total) then becomes the cross-validation set, which
    leaves 60% for training. Both splits are seeded with RANDOM_STATE.
    Missing feature values are replaced with 0 in every subset; the target
    subsets are returned as produced by the split.

    Returns:
        Tuple (X_train, X_cv, X_test, y_train, y_cv, y_test).
    """
    # First cut: hold out the test rows.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )
    # Second cut: carve the cross-validation rows out of what is left.
    X_train, X_cv, y_train, y_cv = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=RANDOM_STATE
    )

    # Replace NaNs with 0 in each feature subset.
    X_train, X_cv, X_test = (part.fillna(0) for part in (X_train, X_cv, X_test))

    for label, part in (
        ("Train set:", X_train),
        ("Cross-validation set:", X_cv),
        ("Test set:", X_test),
    ):
        print(label, part.shape)

    return X_train, X_cv, X_test, y_train, y_cv, y_test

In [11]:
def _build_candidate_models() -> Dict[str, Any]:
    """Return the unfitted estimators to compare, keyed by display name (dict order = training order)."""
    def polynomial_regression(degree):
        # Polynomial feature expansion of the given degree followed by ordinary least squares.
        return Pipeline([
            ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
            ('linear', LinearRegression()),
        ])

    return {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Poly Deg 2': polynomial_regression(2),
        'Poly Deg 3': polynomial_regression(3),
        'Poly Deg 4': polynomial_regression(4),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=-1),
    }


def _rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between observed and predicted values."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _save_models(models: Dict[str, Any], best_model: Any, best_model_name: str, dataset_name: str) -> None:
    """Write every model, plus a separate copy of the best one, under models_user_rating_prediction/<dataset_name>/."""
    base_dir = 'models_user_rating_prediction'
    save_dir = os.path.join(base_dir, dataset_name)

    if not os.path.exists(save_dir):
        # exist_ok covers the directory appearing between the check and the call.
        os.makedirs(save_dir, exist_ok=True)
        print(f"Created model directory: {save_dir}")

    print("\n--- Saving All Models ---")
    # 1. Save ALL models. Saving is best-effort: a failure is reported and the
    #    remaining models are still written.
    for name, model in models.items():
        # Clean up model name for filename
        safe_name = name.replace(' ', '_').replace('-', '_')
        model_filename = os.path.join(save_dir, f'{safe_name}_{dataset_name}.pkl')
        try:
            joblib.dump(model, model_filename)
            print(f"  Saved {name} to {model_filename}")
        except Exception as e:
            print(f"  ❌ Failed to save {name}: {e}")

    # 2. Save the best model specifically for easy retrieval
    best_model_filename = os.path.join(save_dir, f'best_model_{dataset_name}.pkl')
    try:
        joblib.dump(best_model, best_model_filename)
        print(f"\n✅ Successfully saved BEST MODEL ({best_model_name}) to {best_model_filename}")
    except Exception as e:
        print(f"\n❌ Failed to save best model: {e}")


def train_models(X_train: pd.DataFrame, y_train: pd.Series, X_cv: pd.DataFrame, y_cv: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, dataset_name: str):
    """Fit every candidate model, select the best by CV RMSE, report its test RMSE and save all models.

    Args:
        X_train, y_train: Data every candidate is fitted on.
        X_cv, y_cv: Data used to compare candidates; the lowest RMSE wins and
            the earliest candidate is kept on a tie.
        X_test, y_test: Data used only to report the winner's RMSE.
        dataset_name: Label used for the output sub-directory and file names.

    Returns:
        Tuple (models, best_model_name): the dict of fitted models keyed by
        display name, and the key of the model with the lowest CV RMSE.
    """
    print("\n--- Starting Model Training ---")

    models = _build_candidate_models()

    best_model_name = None
    min_cv_error = float('inf')
    best_model = None

    # Train and evaluate on CV set
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        rmse = _rmse(y_cv, model.predict(X_cv))
        print(f"  {name} CV RMSE (Delta): {rmse:.2f}")

        # Track the best model (strict '<' keeps the first of equal scores)
        if rmse < min_cv_error:
            min_cv_error = rmse
            best_model_name = name
            best_model = model

    print(f"\n🏆 Best Model (by CV RMSE): {best_model_name} with RMSE: {min_cv_error:.2f}")

    # Report error on test data for the best model
    test_rmse = _rmse(y_test, best_model.predict(X_test))
    print(f"Test RMSE for {best_model_name}: {test_rmse:.2f}")

    _save_models(models, best_model, best_model_name, dataset_name)

    return models, best_model_name

In [12]:
def get_contest_division(contestId: int) -> Optional[Dict[str, int]]:
    """Look up a contest by id and derive its division indicator features.

    Downloads the Codeforces contest list, finds the entry whose ``id`` equals
    ``contestId`` and flags which of "Div. 1" .. "Div. 4" occur in its name.
    The match is case-insensitive so the flags agree with the training cells,
    which build the same columns with ``str.contains(..., case=False)``.

    Args:
        contestId: Id of the contest to look up.

    Returns:
        Dict with 0/1 values for 'Div. 1' .. 'Div. 4' and 'unknown' (1 only
        when no division label was found). Returns None when the request
        fails, the API status is not 'OK', or the id is not in the list.
    """
    url = "https://codeforces.com/api/contest.list"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contest list: {e}")
        return None

    if data.get('status') != 'OK':
        print("API request for contest list failed.")
        return None

    # A payload without 'result' is treated like an empty contest list rather
    # than raising KeyError.
    contests = data.get('result') or []
    target_contest = next((c for c in contests if c.get('id') == contestId), None)

    if not target_contest:
        print(f"Contest ID {contestId} not found.")
        return None

    # `or ''` also covers an entry whose name is present but null.
    contest_name = target_contest.get('name') or ''
    lowered_name = contest_name.lower()

    # Form datapoint features for divisions
    features = {
        label: int(label.lower() in lowered_name)
        for label in ('Div. 1', 'Div. 2', 'Div. 3', 'Div. 4')
    }

    # 'unknown' is set only when no explicit division label matched
    features['unknown'] = int(not any(features.values()))

    print(f"Contest Name: {contest_name}, Division Features: {features}")
    return features


def predict_new_rating(models: Dict[str, Any], best_model_name: str, X_template: pd.DataFrame, old_rating: int, contestId: int, rank: int) -> None:
    """Build one feature row, predict the rating delta with every model and print the results.

    Args:
        models: Fitted models keyed by display name; each must expose ``predict``.
        best_model_name: Key in ``models`` whose prediction is printed as the headline.
        X_template: Frame whose columns define the feature names and their order.
        old_rating: Rating before the contest.
        contestId: Contest id, used both as a feature and to look up the division flags.
        rank: Rank achieved in the contest.

    Returns:
        None. Results are printed. Nothing is predicted when the division
        lookup returns None.
    """
    division_features = get_contest_division(contestId)
    if division_features is None:
        return

    # Build the row in one step with plain numeric values. Filling an empty
    # frame cell by cell via .loc leaves every column as object dtype.
    known_values = {
        **division_features,
        'oldRating': old_rating,
        'contestId': contestId,
        'rank': rank,
    }
    feature_columns = list(X_template.columns)
    # Template columns without a known value default to 0; values that are not
    # template columns are left out so the row matches the training layout.
    X_predict = pd.DataFrame(
        [{column: known_values.get(column, 0) for column in feature_columns}],
        columns=feature_columns,
    )

    print(f"\n--- Prediction for oldRating={old_rating}, ContestId={contestId}, Rank={rank} ---")

    # Predict using each model; a failing model is reported, not fatal.
    predictions = {}
    for name, model in models.items():
        try:
            # Predicts the delta
            pred_delta = model.predict(X_predict)[0]
            # New Rating = Old Rating + Predicted Delta
            pred_new_rating = round(old_rating + pred_delta)
            predictions[name] = {
                'delta': round(pred_delta),
                'new_rating': pred_new_rating
            }
        except Exception as e:
            predictions[name] = f"Error: {e}"

    print("\nPredictions from All Models (Predicting Delta):")
    for name, result in predictions.items():
        if isinstance(result, dict):
             print(f"  {name}: Delta={result['delta']}, New Rating={result['new_rating']}")
        else:
             print(f"  {name}: {result}")

    # The stored value is an error string when the best model failed, so only
    # read 'new_rating' from a real result dict.
    best_result = predictions.get(best_model_name)
    best_new_rating = best_result['new_rating'] if isinstance(best_result, dict) else 'N/A'
    print(f"\n✨ Predicted newRating (Best Model: {best_model_name}): {best_new_rating}")

## 5. Execution for Dataset 1 (`X1/y1`)

In [13]:
if not df1.empty:
    print("\n*** Running for Dataset 1 (Contest-based Data) ***")

    # Step 1: train / cross-validation / test partition
    (X1_train, X1_cv, X1_test,
     y1_train, y1_cv, y1_test) = split_dataset(X_data=X1, y_data=y1)

    # Step 2: fit every candidate model and save them under 'dataset1'
    models_1, best_model_name_1 = train_models(
        X_train=X1_train,
        y_train=y1_train,
        X_cv=X1_cv,
        y_cv=y1_cv,
        X_test=X1_test,
        y_test=y1_test,
        dataset_name='dataset1',
    )


*** Running for Dataset 1 (Contest-based Data) ***
Train set: (35996, 8)
Cross-validation set: (11999, 8)
Test set: (11999, 8)

--- Starting Model Training ---
Training Linear Regression...
  Linear Regression CV RMSE (Delta): 70.48
Training Ridge Regression...
  Ridge Regression CV RMSE (Delta): 70.48
Training Poly Deg 2...
  Poly Deg 2 CV RMSE (Delta): 36.35
Training Poly Deg 3...
  Poly Deg 3 CV RMSE (Delta): 32.81
Training Poly Deg 4...
  Poly Deg 4 CV RMSE (Delta): 31.02
Training Random Forest...
  Random Forest CV RMSE (Delta): 27.34
Training K-Nearest Neighbors...
  K-Nearest Neighbors CV RMSE (Delta): 33.28

🏆 Best Model (by CV RMSE): Random Forest with RMSE: 27.34
Test RMSE for Random Forest: 28.79
Created model directory: models_user_rating_prediction/dataset1

--- Saving All Models ---
  Saved Linear Regression to models_user_rating_prediction/dataset1/Linear_Regression_dataset1.pkl
  Saved Ridge Regression to models_user_rating_prediction/dataset1/Ridge_Regression_dataset1

### 5.2. Prediction Example

This section uses the best model trained on Dataset 1 to predict the new rating for a specific scenario (e.g., a user with a 1500 rating ranking 122nd in Contest ID 1956, which is a Div. 2 contest). The results from all trained models are displayed for comparison.

In [14]:
if not df1.empty:
    # Example scenario: a 1500-rated user finishing 122nd in contest 1956 (Div. 2)
    predict_new_rating(
        models=models_1,
        best_model_name=best_model_name_1,
        X_template=X1,
        old_rating=1500,
        contestId=1956,
        rank=122,
    )

Contest Name: Codeforces Round 939 (Div. 2), Division Features: {'Div. 1': 0, 'Div. 2': 1, 'Div. 3': 0, 'Div. 4': 0, 'unknown': 0}

--- Prediction for oldRating=1500, ContestId=1956, Rank=122 ---

Predictions from All Models (Predicting Delta):
  Linear Regression: Delta=32, New Rating=1532
  Ridge Regression: Delta=32, New Rating=1532
  Poly Deg 2: Delta=114, New Rating=1614
  Poly Deg 3: Delta=126, New Rating=1626
  Poly Deg 4: Delta=178, New Rating=1678
  Random Forest: Delta=251, New Rating=1751
  K-Nearest Neighbors: Delta=188, New Rating=1688

✨ Predicted newRating (Best Model: Random Forest): 1751


  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]


## 6. Execution for Dataset 2 (`X2/y2`)

In [15]:
if not df2.empty:
    print("\n*** Running for Dataset 2 (Handle-based Data) ***")

    # Step 1: train / cross-validation / test partition
    (X2_train, X2_cv, X2_test,
     y2_train, y2_cv, y2_test) = split_dataset(X_data=X2, y_data=y2)

    # Step 2: fit every candidate model and save them under 'dataset2'
    models_2, best_model_name_2 = train_models(
        X_train=X2_train,
        y_train=y2_train,
        X_cv=X2_cv,
        y_cv=y2_cv,
        X_test=X2_test,
        y_test=y2_test,
        dataset_name='dataset2',
    )


*** Running for Dataset 2 (Handle-based Data) ***
Train set: (436, 8)
Cross-validation set: (146, 8)
Test set: (146, 8)

--- Starting Model Training ---
Training Linear Regression...
  Linear Regression CV RMSE (Delta): 94.97
Training Ridge Regression...
  Ridge Regression CV RMSE (Delta): 93.51
Training Poly Deg 2...
  Poly Deg 2 CV RMSE (Delta): 89.14
Training Poly Deg 3...
  Poly Deg 3 CV RMSE (Delta): 2900.16
Training Poly Deg 4...
  Poly Deg 4 CV RMSE (Delta): 76363.55
Training Random Forest...
  Random Forest CV RMSE (Delta): 35.77
Training K-Nearest Neighbors...
  K-Nearest Neighbors CV RMSE (Delta): 80.57

🏆 Best Model (by CV RMSE): Random Forest with RMSE: 35.77
Test RMSE for Random Forest: 65.51
Created model directory: models_user_rating_prediction/dataset2

--- Saving All Models ---
  Saved Linear Regression to models_user_rating_prediction/dataset2/Linear_Regression_dataset2.pkl
  Saved Ridge Regression to models_user_rating_prediction/dataset2/Ridge_Regression_dataset2.p

### 6.2. Prediction Example

This section uses the best model trained on Dataset 2 to predict the new rating for a specific scenario (e.g., a user with a 2000 rating ranking 332nd in Contest ID 1955, which is a Div. 3 contest). The results from all trained models are displayed for comparison.

In [16]:
if not df2.empty:
    # Example scenario: a 2000-rated user finishing 332nd in contest 1955
    # (the contest lookup reports it as a Div. 3 round)
    predict_new_rating(
        models=models_2,
        best_model_name=best_model_name_2,
        X_template=X2,
        old_rating=2000,
        contestId=1955,
        rank=332,
    )

Contest Name: Codeforces Round 938 (Div. 3), Division Features: {'Div. 1': 0, 'Div. 2': 0, 'Div. 3': 1, 'Div. 4': 0, 'unknown': 0}

--- Prediction for oldRating=2000, ContestId=1955, Rank=332 ---

Predictions from All Models (Predicting Delta):
  Linear Regression: Delta=117, New Rating=2117
  Ridge Regression: Delta=121, New Rating=2121
  Poly Deg 2: Delta=-46, New Rating=1954
  Poly Deg 3: Delta=1013, New Rating=3013
  Poly Deg 4: Delta=-1095, New Rating=905
  Random Forest: Delta=28, New Rating=2028
  K-Nearest Neighbors: Delta=39, New Rating=2039

✨ Predicted newRating (Best Model: Random Forest): 2028


  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
  pred_delta = model.predict(X_predict.fillna(0))[0]
