In [None]:
### Infer with your own fine-tuned Qwen 2.5 model for Polymer property prediction--Ricky Li, Sept. 2025, UIUC

In [1]:
from unsloth import FastLanguageModel
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.


Skipping import of cpp extensions due to incompatible torch version 2.7.0+cu126 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [2]:
BASE_MODEL_ID   = "unsloth/Qwen2.5-32B"          # your base
LORA_ADAPTER_ID = ""         # <<â€” change this

In [3]:
HF_TOKEN = ""  # <<â€” change this

In [4]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = BASE_MODEL_ID,  #your fine tuned model
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
    token          = HF_TOKEN,     # only needed if the base is gated/private
)

==((====))==  Unsloth 2025.9.4: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.252 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [01:21<00:00, 20.31s/it]


In [5]:
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_ID)

In [6]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [7]:
print("eos checked")

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

inputs = tokenizer(
[
    alpaca_prompt.format(
        "Based on the given SMILES string, predict the monomer's relevant properties. Answer the task in this format: The monomer compound has sigma of ? GM at 780 nm, maximum sigma of ? GM, ISC of ? eV, boiling point of ? Â°C, logP of ?, aromaticity of ?, solubility of ? ug/mol, molecular weight of ? g/mol.", # instruction
        "CC12NC1C1C(C#N)C21", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs)
print("sample checked")

eos checked
sample checked
tensor([[ 38214,    374,    458,   7600,    429,  16555,    264,   3383,     11,
          34426,    448,    458,   1946,    429,   5707,   4623,   2266,     13,
           9645,    264,   2033,    429,  34901,  44595,    279,   1681,    382,
          14374,  29051,    510,  28715,    389,    279,   2661,  13716,  45978,
            914,     11,   7023,    279,   1615,  25359,    594,   9760,   5888,
            382,  14374,   5571,    510,   3706,     16,     17,   9949,     16,
             34,     16,     34,   3025,      2,     45,      8,     34,     17,
             16,    271,  14374,   5949,    510,    785,   1615,  25359,  23628,
            702,  20254,    315,    220,     22,     16,     13,     23,     24,
          19172,    518,    220,     22,     23,     15,  25723,     11,    220,
           7192,  20254,    315,    220,     23,     19,     13,     15,     16,
          19172,     11,  43125,  45306,   2088,    315,    220,     21,     15,
 

In [9]:
import pandas as pd
# === Load SMILES ===
file_path = "3000.csv"
smiles_df = pd.read_csv(file_path)
all_outputs = []

print("3000.csv loaded")

3000.csv loaded


In [10]:
for i, smi in enumerate(smiles_df["SMILES"].head(3000)):  # just first 100
    prompt = alpaca_prompt.format(
        "Based on the given SMILES string, predict the monomer's relevant properties. Answer the task in this format: The monomer compound has sigma of ? GM at 780 nm, maximum sigma of ? GM, ISC of ? eV, boiling point of ? Â°C, logP of ?, aromaticity of ?, solubility of ? ug/mol, molecular weight of ? g/mol.",
        smi,
        ""  # leave blank for generation
    )
    print(smi)

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    all_outputs.append(decoded)

    print(f"Completed {i+1}/3000")  # progress bar in notebook

# === Save results ===
with open("3000outputs.txt", "w", encoding="utf-8") as f:
    for out in all_outputs:
        f.write(out + "\n\n")

FC=CF
Completed 1/3000
FC(F)=CN
Completed 2/3000
N=C(F)CF
Completed 3/3000
CC(F)NF
Completed 4/3000
FCNCF
Completed 5/3000
FC(F)C=O
Completed 6/3000
C=C(F)OF
Completed 7/3000
FC1C(F)O1
Completed 8/3000
FC(F)CO
Completed 9/3000
OC(F)CF
Completed 10/3000
FCCOF
Completed 11/3000
CCF
Completed 12/3000
N=NC#CF
Completed 13/3000
N#CC=NF
Completed 14/3000
FN=C1N=C1
Completed 15/3000
FNN1C#C1
Completed 16/3000
FC1=NN=C1
Completed 17/3000
NC1=C(F)N1
Completed 18/3000
FC=1N(C)N1
Completed 19/3000
N=C1N(F)C1
Completed 20/3000
NC(F)=C=N
Completed 21/3000
N=NC=CF
Completed 22/3000
FCN1N=C1
Completed 23/3000
FN1N2CC12
Completed 24/3000
FN1C2NC12
Completed 25/3000
NC1(F)N=C1
Completed 26/3000
N=C(F)NC
Completed 27/3000
NN(F)C=C
Completed 28/3000
CC1(F)NN1
Completed 29/3000
NC1(F)NC1
Completed 30/3000
CNC=NF
Completed 31/3000
NN=CCF
Completed 32/3000
FNC1NC1
Completed 33/3000
FNN1CC1
Completed 34/3000
N=C(F)CN
Completed 35/3000
CNCNF
Completed 36/3000
NCCNF
Completed 37/3000
FN1C=C1
Completed 38/3000
