In [1]:
# Install pinned wheels from the attached offline-packages dataset (local files, no index lookup).
# The installs run in this order, so numpy is installed last and its pinned version
# replaces whatever was present before.
# NOTE(review): the captured output of this cell reports a dependency-resolver conflict
# after the numpy install — confirm the numpy pin is actually required.
!pip install /kaggle/input/offline-packages/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
!pip install /kaggle/input/offline-packages/xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl
!pip install /kaggle/input/offline-packages/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/offline-packages/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3
Processing /kaggle/input/offline-packages/xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.3
    Uninstalling xgboost-2.0.3:
      Successfully uninstalled xgboost-2.0.3
Successfully installed xgboost-3.0.2
Processing /kaggle/input/offline-packages/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframe

In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*')

from rdkit.Chem import AllChem, MACCSkeys, Descriptors, DataStructs
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import joblib



# Read the competition's train and test tables from the Kaggle input directory
train_df, test_df = (
    pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv'),
    pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv'),
)


# Function: Combine Morgan + MACCS fingerprints
def featurize_combo(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit vector concatenated with the MACCS keys for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string to featurize.
    radius : int, default 2
        Radius passed to the Morgan fingerprint.
    n_bits : int, default 2048
        Length of the Morgan bit vector.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_bits + 167`` (Morgan bits first, then
        MACCS keys). The array is all zeros when `smiles` is not a string
        (for example a NaN read from a CSV) or when RDKit cannot parse it.
    """
    maccs_bits = 167  # size of the buffer the MACCS keys are copied into
    total_bits = n_bits + maccs_bits

    # Missing CSV values arrive as float NaN; only hand real strings to the parser
    # so a missing SMILES is treated like an unparseable one instead of raising.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Same integer dtype as the success path
        return np.zeros(total_bits, dtype=int)

    # Morgan fingerprint (bit vector)
    morgan_fp = GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    morgan_arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(morgan_fp, morgan_arr)

    # MACCS fingerprint
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    maccs_arr = np.zeros((maccs_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(maccs_fp, maccs_arr)

    return np.concatenate([morgan_arr, maccs_arr])

# Function: Calculate basic RDKit descriptors to enrich features
def calc_descriptors(smiles):
    """Compute a small fixed set of RDKit descriptors for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string to describe.

    Returns
    -------
    numpy.ndarray
        1-D float array holding, in order: MolWt, TPSA, NumRotatableBonds,
        NumHDonors, NumHAcceptors. The array is all zeros when `smiles` is not
        a string (for example a NaN read from a CSV) or cannot be parsed.
    """
    # Single source of truth for both the values and the fallback length
    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.TPSA,
        Descriptors.NumRotatableBonds,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
    )

    # Only hand real strings to the parser so a missing SMILES is treated
    # like an unparseable one instead of raising.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return np.zeros(len(descriptor_fns))

    return np.array([fn(mol) for fn in descriptor_fns], dtype=float)

# Training features: fingerprint columns first, descriptor columns appended on the right
print("Featurizing training data...")
train_smiles = train_df['SMILES']
fps = list(map(featurize_combo, train_smiles))
descs = list(map(calc_descriptors, train_smiles))
X_full = np.hstack((fps, descs))

# Test features are built the same way so the column layout matches
print("Featurizing test data...")
test_smiles = test_df['SMILES']
fps_test = list(map(featurize_combo, test_smiles))
descs_test = list(map(calc_descriptors, test_smiles))
X_test_full = np.hstack((fps_test, descs_test))

# Keep only the rows that have a Tg label; the same mask selects the matching feature rows
has_tg = train_df['Tg'].notna()
train_df_cleaned = train_df[has_tg].reset_index(drop=True)
y = train_df_cleaned['Tg'].values
X_full_cleaned = X_full[has_tg]

# Standardize features: statistics come from the labelled training rows and are reused for test
scaler = StandardScaler().fit(X_full_cleaned)
X_scaled = scaler.transform(X_full_cleaned)
X_test_scaled = scaler.transform(X_test_full)

# Hold out 20% of the labelled rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in XGBoost's DMatrix container; the test matrix has no labels
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test_scaled)

# Booster configuration shared by cross-validation and the final fit
params = {
    'objective': 'reg:squarederror',
    'max_depth': 10,
    'eta': 0.005,               # learning rate
    'subsample': 0.85,          # row sampling per tree
    'colsample_bytree': 0.85,   # column sampling per tree
    'reg_alpha': 0.2,           # L1 regularization
    'reg_lambda': 1.5,          # L2 regularization
    'seed': 42,
    'tree_method': 'hist',
    'eval_metric': 'rmse'
}

print("Running cross-validation to find best num_boost_round...")
# 5-fold CV on the training split, stopping once test RMSE stalls for 100 rounds
cv_options = dict(
    num_boost_round=5000,
    nfold=5,
    early_stopping_rounds=100,
    verbose_eval=50,
    seed=42,
    stratified=False,
)
cv_results = xgb.cv(params, dtrain, **cv_options)

# The CV history holds one entry per retained round, so its length is the round count to use
best_num_boost_round = len(cv_results)
print(f"Best number of boosting rounds: {best_num_boost_round}")

# Final model training.
# xgb.train uses the LAST entry of `evals` for early stopping, so the held-out
# validation set must come last. With the training set last, stopping would be
# driven by training RMSE, which keeps improving and never triggers a stop.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_round,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=100,
    verbose_eval=50
)

# train() returns the booster from the final round, not the best one, so limit
# prediction to the trees up to and including the best validation iteration.
best_iteration_range = (0, model.best_iteration + 1)

# Evaluate
# Note: the validation split also drives early stopping, so these scores are
# slightly optimistic as an estimate of performance on unseen data.
y_pred = model.predict(dval, iteration_range=best_iteration_range)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation R² Score: {r2:.4f}")

# Test prediction
test_preds = model.predict(dtest, iteration_range=best_iteration_range)

# Persist the trained booster and the fitted scaler next to the notebook
for artifact, artifact_path in (
    (model, "polymer_xgb_model_optimized.pkl"),
    (scaler, "feature_scaler.pkl"),
):
    joblib.dump(artifact, artifact_path)
print("Model and scaler saved.")

# Assemble the submission table: one row per test id with its predicted Tg
# NOTE(review): the source column is 'id' but the output header is 'Id', and only
# Tg is written — confirm both against the competition's sample submission file.
submission_columns = {
    'Id': test_df['id'],
    'Tg': test_preds
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")

Featurizing training data...
Featurizing test data...
Running cross-validation to find best num_boost_round...
[0]	train-rmse:113.65349+1.37889	test-rmse:114.24663+5.66958
[50]	train-rmse:94.04533+1.12750	test-rmse:100.60565+4.59634
[100]	train-rmse:78.40049+0.95420	test-rmse:91.03423+3.58592
[150]	train-rmse:65.83382+0.87503	test-rmse:84.36004+3.16892
[200]	train-rmse:55.71389+0.81529	test-rmse:79.95716+3.05049
[250]	train-rmse:47.52456+0.80983	test-rmse:77.02370+3.24486
[300]	train-rmse:40.88511+0.81612	test-rmse:75.10768+3.36889
[350]	train-rmse:35.38014+0.79934	test-rmse:73.88128+3.50057
[400]	train-rmse:30.87602+0.79301	test-rmse:73.07670+3.65957
[450]	train-rmse:27.11261+0.78828	test-rmse:72.57217+3.75291
[500]	train-rmse:23.95987+0.78870	test-rmse:72.31601+3.83519
[550]	train-rmse:21.31687+0.77355	test-rmse:72.23800+3.90688
[600]	train-rmse:19.07720+0.74463	test-rmse:72.17841+3.99257
[650]	train-rmse:17.19440+0.73598	test-rmse:72.18731+4.04223
[700]	train-rmse:15.56738+0.70820	t