In [1]:
!pip install --no-index --find-links=/kaggle/input/rdkit-2025-3-3-cp311-cp311 rdkit
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from tqdm import tqdm
import os
# Print the full path of every file found under /kaggle/input so the
# available datasets are visible in the cell output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

Looking in links: /kaggle/input/rdkit-2025-3-3-cp311-cp311
Processing /kaggle/input/rdkit-2025-3-3-cp311-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3
/kaggle/input/rdkit-2025-3-3-cp311-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv


In [2]:
# === 1. Load Data ===
# Competition training table. Later cells read its 'SMILES' column (to build
# descriptors) and the 'Tg', 'FFV', 'Tc', 'Density' and 'Rg' columns (targets).
data = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')

# === 2. Generate Molecular Descriptors ===
def smiles_to_features(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to featurise.

    Returns
    -------
    dict or None
        Mapping of descriptor name to value, always with the same ten keys in
        the same order. ``None`` is returned when ``smiles`` is not a string
        (e.g. a NaN read from a CSV), is blank, or cannot be parsed by RDKit.
    """
    # A missing CSV cell arrives as float NaN. Handing a non-string to
    # MolFromSmiles raises instead of returning None, so treat it as an
    # unparsable entry here. Blank strings carry no structure either.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Key order defines the column order of the resulting feature matrix.
    return {
        'MolWt': Descriptors.MolWt(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'NumAromaticRings': Descriptors.NumAromaticRings(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'RingCount': Descriptors.RingCount(mol),
        'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
        'FractionCSP3': Descriptors.FractionCSP3(mol),
    }

print("Generating features...")
feature_list = []
valid_rows = []
# Iterate the SMILES column directly; only that column is needed, so there is
# no reason to materialise a full row object per record.
for i, smiles in tqdm(data['SMILES'].items(), total=len(data)):
    features = smiles_to_features(smiles)
    if features is not None:
        feature_list.append(features)
        valid_rows.append(i)  # index label in `data`, used later to pick matching targets

# Keep only valid entries: one row per successfully parsed SMILES, indexed
# 0..n-1 in the same order as `valid_rows`.
# Rows are not filtered on target availability here; the training cell
# filters each target column separately.
X = pd.DataFrame(feature_list)

Generating features...


100%|██████████| 7973/7973 [00:10<00:00, 772.70it/s]


"\ny_raw = data.loc[valid_rows, ['Tg', 'FFV', 'Tc', 'Density', 'Rg']].reset_index(drop=True)\n\n# Remove rows with NaN or inf in targets\nmask = y_raw.notna().all(axis=1) & np.isfinite(y_raw).all(axis=1)\nX = X.loc[mask].reset_index(drop=True)\ny = y_raw.loc[mask].reset_index(drop=True)\n"

In [3]:
# Target table: all five property columns, restricted to the rows whose SMILES
# parsed, then re-indexed 0..n-1 so it lines up positionally with X.
y = (
    data[['Tg', 'FFV', 'Tc', 'Density', 'Rg']]
    .loc[valid_rows]
    .reset_index(drop=True)
)

In [4]:
# === 3. Train Models for Each Property ===
# One independent XGBoost regressor is fitted per target column. Fitted models
# and their hold-out metrics are stored by target name.
models = {}
results = {}
for target_name, target_values in y.items():
    print(f"\nTraining model for {target_name}...")

    # Restrict features and target to the rows where this target is present
    # and finite, and re-index both so they stay aligned.
    usable = target_values.notna() & np.isfinite(target_values)
    X_valid = X.loc[usable].reset_index(drop=True)
    y_valid = target_values.loc[usable].reset_index(drop=True)

    # Not enough labelled rows to split and fit: leave this target without a model.
    n_valid = len(X_valid)
    if n_valid < 5:
        print(f"⚠️ Skipping {target_name} due to too few valid samples ({n_valid})")
        continue

    # 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_valid, y_valid, test_size=0.2, random_state=42
    )

    # Fit the regressor on the training part only.
    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out part.
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    models[target_name] = model
    results[target_name] = {'RMSE': rmse, 'R2': r2}

    print(f"{target_name} - RMSE: {rmse:.3f}, R²: {r2:.3f}")


Training model for Tg...
Tg - RMSE: 75.644, R²: 0.401

Training model for FFV...
FFV - RMSE: 0.022, R²: 0.546

Training model for Tc...
Tc - RMSE: 0.046, R²: 0.751

Training model for Density...
Density - RMSE: 0.092, R²: 0.519

Training model for Rg...
Rg - RMSE: 4.053, R²: 0.254


In [5]:
test_df = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')

# Prepare test features.
# Every test row is kept, because the submission needs one row per test id.
# A row whose SMILES cannot be parsed contributes an empty record, which
# becomes an all-NaN feature row below instead of being dropped.
test_feature_list = []
test_valid_rows = []
for i, smiles in tqdm(test_df['SMILES'].items(), total=len(test_df)):
    features = smiles_to_features(smiles)
    if features is not None:
        test_valid_rows.append(i)
        test_feature_list.append(features)
    else:
        test_feature_list.append({})
test_ids = test_df['id'].tolist()

n_unparsed = len(test_df) - len(test_valid_rows)
if n_unparsed:
    print(f"⚠️ {n_unparsed} test SMILES could not be parsed; predicting them from all-missing features")

# Feature matrix with one row per test id.
# `columns=X.columns` pins the same column names and order as the training
# matrix and creates the columns even for empty records. The float cast keeps
# every column numeric when missing values are present.
# XGBoost treats NaN as a missing value, so such rows still get a prediction.
testX = pd.DataFrame(test_feature_list, columns=X.columns).astype(float)

# Predict using trained models
preds = {}
for target_name in y.columns:
    model = models.get(target_name)
    if model is not None:
        preds[target_name] = model.predict(testX)
    else:
        preds[target_name] = [0.0] * len(testX)  # fallback if model is missing

# Create submission DataFrame with predicted values
submission_df = pd.DataFrame({
    'id': test_ids,
    'Tg': preds['Tg'],
    'FFV': preds['FFV'],
    'Tc': preds['Tc'],
    'Density': preds['Density'],
    'Rg': preds['Rg']
})

# Guard against silently losing test rows.
assert len(submission_df) == len(test_df), "submission must contain every test id"

# Save submission
submission_df.to_csv('submission.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 412.50it/s]
