In [1]:
# Install RDKit from the local wheel, import dependencies, and list the available input files
!pip install --no-index --find-links=/kaggle/input/rdkit-2025-3-3-cp311-cp311 rdkit
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from tqdm import tqdm
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.preprocessing import StandardScaler
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

Looking in links: /kaggle/input/rdkit-2025-3-3-cp311-cp311
Processing /kaggle/input/rdkit-2025-3-3-cp311-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3
/kaggle/input/extradataset1/Density_ExtraDataSet.csv
/kaggle/input/extradataset1/Tg2_ExtraDataSet.csv
/kaggle/input/extradataset1/Tg_ExtraDataSet.csv
/kaggle/input/extradataset1/Tg3_ExtraDataSet.csv
/kaggle/input/rdkit-2025-3-3-cp311-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-

In [2]:
# === 1. Load Data ===
# Read the competition training table into `data`. The code below reads its
# 'SMILES' column to build features and its 'Tg', 'FFV', 'Tc', 'Density' and
# 'Rg' columns as regression targets.
data = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')

# === 2. Generate Molecular Descriptors ===
def smiles_to_features(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to featurize.

    Returns
    -------
    dict or None
        Mapping from feature name to descriptor value (key order is stable,
        so DataFrames built from these dicts share one column order), or
        ``None`` when ``smiles`` is not a string or RDKit cannot parse it.
    """
    # A missing cell reaches us as float NaN / None rather than a string.
    # Treat it like an unparseable SMILES so callers' `if features:` check
    # skips the row instead of the RDKit call raising on a non-string.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Partial-charge descriptors (MaxAbsPartialCharge, MaxPartialCharge,
    # MinAbsPartialCharge, MinPartialCharge) are intentionally left out:
    # the original author noted that MaxAbsPartialCharge contains NaN values.
    features = {
        'MolWt': Descriptors.MolWt(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'NumAromaticRings': Descriptors.NumAromaticRings(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'RingCount': Descriptors.RingCount(mol),
        'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
        'FractionCSP3': Descriptors.FractionCSP3(mol),
        'MorganFingDens': Descriptors.FpDensityMorgan3(mol),
        'HeavyMolWt': Descriptors.HeavyAtomMolWt(mol),
        'NumRadElec': Descriptors.NumRadicalElectrons(mol),
        'NumValElec': Descriptors.NumValenceElectrons(mol),
    }
    return features

print("Generating features...")
feature_list, valid_rows = [], []
# Walk the SMILES column directly; `i` is the row's index label in `data`.
for i, smiles in tqdm(data['SMILES'].items(), total=len(data)):
    features = smiles_to_features(smiles)
    if not features:
        # Unparseable SMILES: leave the row out of both X and y_raw.
        continue
    valid_rows.append(i)
    feature_list.append(features)

# Feature matrix and target table, restricted to rows that produced features
# and re-indexed from zero so the two stay positionally aligned.
X = pd.DataFrame(feature_list)
y_raw = (
    data.loc[valid_rows, ['Tg', 'FFV', 'Tc', 'Density', 'Rg']]
    .reset_index(drop=True)
)

Generating features...


100%|██████████| 7973/7973 [00:13<00:00, 609.52it/s]


In [3]:
# === 3. Train Models for Each Property ===
# Fit one XGBoost regressor per target column of `y_raw`, using only the rows
# where that target is present, and report hold-out metrics for each.
#   models[target]  -> fitted XGBRegressor (absent if the target was skipped)
#   results[target] -> {'RMSE', 'R2', 'Normalized_RMSE'} on the 20% hold-out
models = {}
results = {}
# NOTE(review): the three names below are initialised here but never updated
# in this cell; kept in case another cell reads them.
num_train_data = {}
num_test_data = {}
sum_num_data = 0
# The loop variable is deliberately not called `property`: at notebook scope
# that would rebind the builtin for every cell executed afterwards.
for target in y_raw.columns:
    print(f"\nTraining model for {target}...")

    # Select current target
    y_property = y_raw[target]

    # Filter out rows where target is NaN or inf
    mask = y_property.notna() & np.isfinite(y_property)
    X1 = X.loc[mask].reset_index(drop=True)
    y_property = y_property.loc[mask].reset_index(drop=True)

    # Too few labelled rows to split meaningfully: leave this target without
    # a model.
    if len(X1) < 5:
        print(f"⚠️ Skipping {target} due to too few valid samples ({len(X1)})")
        continue

    # Split into train/test
    X_train, X_test, y_train, y_test = train_test_split(X1, y_property, test_size=0.2, random_state=42)

    # Train model with fixed hyperparameters. A RandomizedSearchCV sweep over
    # n_estimators, max_depth, learning_rate, subsample, colsample_bytree,
    # gamma, reg_alpha and reg_lambda used to sit here as a disabled block.
    model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
    has_nan = X_train.isna().any().any()
    print(f"DataFrame contains any NaN: {has_nan}")
    model.fit(X_train, y_train)
    models[target] = model

    # === Evaluation ===
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Normalise by the range of all valid values for this target (train and
    # hold-out together); undefined when every value is identical.
    value_range = y_property.max() - y_property.min()
    norm_rmse = rmse / value_range if value_range != 0 else None

    results[target] = {
        'RMSE': rmse,
        'R2': r2,
        'Normalized_RMSE': norm_rmse
    }

    if norm_rmse is not None:
        print(f"{target} - RMSE: {rmse:.3f}, R²: {r2:.3f}, Normalized RMSE: {norm_rmse:.4f}")
    else:
        print(f"{target} - RMSE: {rmse:.3f}, R²: {r2:.3f}, Normalized RMSE: undefined (zero range)")


Training model for Tg...
DataFrame contains any NaN: False
Tg - RMSE: 72.111, R²: 0.455, Normalized RMSE: 0.1163

Training model for FFV...
DataFrame contains any NaN: False
FFV - RMSE: 0.022, R²: 0.545, Normalized RMSE: 0.0394

Training model for Tc...
DataFrame contains any NaN: False
Tc - RMSE: 0.047, R²: 0.747, Normalized RMSE: 0.0974

Training model for Density...
DataFrame contains any NaN: False
Density - RMSE: 0.099, R²: 0.447, Normalized RMSE: 0.0902

Training model for Rg...
DataFrame contains any NaN: False
Rg - RMSE: 3.767, R²: 0.355, Normalized RMSE: 0.1510


In [4]:
# === 4. Predict on the test set and write the submission ===
test_df = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')

# Prepare test features
test_feature_list = []
test_valid_rows = []  # index labels of test rows whose SMILES produced features
test_ids = []         # ids of those same rows

for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    features = smiles_to_features(row['SMILES'])
    if features:
        test_feature_list.append(features)
        test_valid_rows.append(i)
        test_ids.append(row['id'])

# Create feature matrix for valid SMILES
testX = pd.DataFrame(test_feature_list)

# Value written when no prediction is available: either no model was trained
# for a target, or the row's SMILES could not be featurized.
fallback_value = 0.0

# Predict using trained models (one array per target, one entry per valid row)
preds = {}
for target in y_raw.columns:
    model = models.get(target)
    # Compare against None explicitly rather than relying on the estimator's
    # truthiness, and do not call predict() on an empty feature matrix.
    if model is not None and not testX.empty:
        preds[target] = model.predict(testX)
    else:
        preds[target] = np.full(len(testX), fallback_value)

# Align predictions back onto every row of test_df so the submission contains
# all test ids; rows without features receive the fallback value instead of
# being dropped from the file.
target_columns = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
valid_predictions = pd.DataFrame(
    {name: preds[name] for name in target_columns},
    index=test_valid_rows,
)
submission_df = valid_predictions.reindex(test_df.index, fill_value=fallback_value)
submission_df.insert(0, 'id', test_df['id'])

# Save submission
submission_df.to_csv('submission.csv', index=False)
print(submission_df)

100%|██████████| 3/3 [00:00<00:00, 355.71it/s]

           id          Tg       FFV        Tc   Density         Rg
0  1109053969  140.131500  0.375114  0.197688  1.151430  18.322979
1  1422188626  120.043037  0.378099  0.201501  1.128946  18.805056
2  2032016830   63.870346  0.359711  0.252389  1.126352  14.396860



