In [3]:
import pandas as pd

In [37]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from collections import Counter

class ChemicalFormulaVAE:
    def __init__(self, latent_dim=64, max_length=20):
        self.latent_dim = latent_dim
        self.max_length = max_length
        self.tokenizer = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        
    def tokenize_formula(self, formula):
        """
        Tokenize công thức hóa học thành các token
        Ví dụ: 'Li2MnSiO4' -> ['Li', '2', 'Mn', 'Si', 'O', '4']
        """
        # Regex để tách element và số
        pattern = r'([A-Z][a-z]*)(\d*)'
        tokens = []
        
        matches = re.findall(pattern, formula)
        for element, number in matches:
            tokens.append(element)
            if number:
                tokens.append(number)
        
        return tokens
    
    def prepare_data(self, formulas):
        """
        Chuẩn bị dữ liệu cho training
        """
        # Tokenize tất cả công thức
        tokenized_formulas = []
        for formula in formulas:
            tokens = self.tokenize_formula(formula)
            tokenized_formulas.append(' '.join(tokens))
        
        # Tạo tokenizer
        # self.tokenizer = Tokenizer(char_level=False)
        self.tokenizer = Tokenizer(char_level=False, lower=False, oov_token='<OOV>') # Thêm để tránh các token thành lowcase hết
        self.tokenizer.fit_on_texts(tokenized_formulas)
        self.vocab_size = len(self.tokenizer.word_index) + 1
        
        # Chuyển đổi thành sequences
        sequences = self.tokenizer.texts_to_sequences(tokenized_formulas)
        
        # Padding
        padded_sequences = pad_sequences(sequences, maxlen=self.max_length, padding='post')
        
        print(f"Vocab size: {self.vocab_size}")
        print(f"Max sequence length: {self.max_length}")
        print(f"Sample tokenized formula: {tokenized_formulas[0]}")
        print(f"Sample sequence: {sequences[0]}")
        
        return padded_sequences
    
    def build_encoder(self):
        """
        Xây dựng encoder
        """
        encoder_inputs = Input(shape=(self.max_length,), name='encoder_input')
        
        # Embedding layer
        x = Embedding(self.vocab_size, 128, mask_zero=True)(encoder_inputs)
        
        # LSTM layers
        x = LSTM(256, return_sequences=True, dropout=0.2)(x)
        x = LSTM(128, dropout=0.2)(x)
        
        # Dense layers
        x = Dense(128, activation='relu')(x)
        
        # Latent space
        z_mean = Dense(self.latent_dim, name='z_mean')(x)
        z_log_var = Dense(self.latent_dim, name='z_log_var')(x)
        
        encoder = Model(encoder_inputs, [z_mean, z_log_var], name='encoder')
        return encoder
    
    def build_decoder(self):
        """
        Xây dựng decoder
        """
        latent_inputs = Input(shape=(self.latent_dim,), name='z_sampling')
        
        # Dense layers
        x = Dense(128, activation='relu')(latent_inputs)
        x = Dense(256, activation='relu')(x)
        
        # Repeat for sequence generation
        x = RepeatVector(self.max_length)(x)
        
        # LSTM layers
        x = LSTM(256, return_sequences=True, dropout=0.2)(x)
        x = LSTM(128, return_sequences=True, dropout=0.2)(x)
        
        # Output layer
        decoder_outputs = Dense(self.vocab_size, activation='softmax', name='decoder_output')(x)
        
        decoder = Model(latent_inputs, decoder_outputs, name='decoder')
        return decoder
    
    def sampling(self, z_mean, z_log_var):
        """
        Reparameterization trick
        """
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.random.normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon
    
    def build_models(self):
        """
        Xây dựng encoder và decoder
        """
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()
        
        # Compile models
        self.encoder.compile(optimizer=Adam(learning_rate=0.001))
        self.decoder.compile(optimizer=Adam(learning_rate=0.001), 
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
    
    def train_step(self, x_batch):
        """
        Custom training step
        """
        with tf.GradientTape() as tape:
            # Encode
            z_mean, z_log_var = self.encoder(x_batch)
            
            # Sample
            z = self.sampling(z_mean, z_log_var)
            
            # Decode
            reconstruction = self.decoder(z)
            
            # Reconstruction loss
            reconstruction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                x_batch, reconstruction
            )
            reconstruction_loss = tf.reduce_mean(reconstruction_loss)
            
            # KL divergence loss
            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            )
            
            # Total loss
            total_loss = reconstruction_loss + 0.1 * kl_loss
        
        # Get trainable variables from both encoder and decoder
        trainable_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
        
        # Compute gradients
        gradients = tape.gradient(total_loss, trainable_vars)
        
        # Apply gradients
        optimizer = Adam(learning_rate=0.001)
        optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        return {
            'total_loss': total_loss,
            'reconstruction_loss': reconstruction_loss,
            'kl_loss': kl_loss
        }
    
    def train(self, sequences, epochs=100, batch_size=32):
        """
        Huấn luyện VAE với custom training loop
        """
        if self.encoder is None or self.decoder is None:
            self.build_models()
        
        dataset = tf.data.Dataset.from_tensor_slices(sequences)
        dataset = dataset.batch(batch_size).shuffle(1000)
        
        history = {
            'total_loss': [],
            'reconstruction_loss': [],
            'kl_loss': []
        }
        
        print("Starting training...")
        for epoch in range(epochs):
            epoch_losses = {
                'total_loss': [],
                'reconstruction_loss': [],
                'kl_loss': []
            }
            
            for batch in dataset:
                losses = self.train_step(batch)
                for key in epoch_losses:
                    epoch_losses[key].append(losses[key])
            
            # Calculate average losses for this epoch
            avg_losses = {}
            for key in epoch_losses:
                avg_losses[key] = tf.reduce_mean(epoch_losses[key])
                history[key].append(float(avg_losses[key]))
            
            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs} - "
                      f"Total Loss: {avg_losses['total_loss']:.4f}, "
                      f"Recon Loss: {avg_losses['reconstruction_loss']:.4f}, "
                      f"KL Loss: {avg_losses['kl_loss']:.4f}")
        
        return history
    
    def generate_formulas(self, n_samples=10, temperature=1.0):
        """
        Generate công thức mới
        """
        if self.decoder is None:
            raise ValueError("Model chưa được train!")
        
        # Sample từ latent space
        z_samples = np.random.normal(0, 1, (n_samples, self.latent_dim))
        
        # Decode
        generated_sequences = self.decoder.predict(z_samples, verbose=0)
        
        # Convert về công thức
        formulas = []
        for seq in generated_sequences:
            # Sample với temperature
            if temperature != 1.0:
                seq = seq / temperature
                seq = tf.nn.softmax(seq, axis=-1).numpy()
            
            # Greedy decoding
            tokens = np.argmax(seq, axis=-1)
            
            # Convert tokens về text
            try:
                formula_tokens = []
                for token_id in tokens:
                    if token_id == 0:  # Padding token
                        break
                    if token_id in self.tokenizer.index_word:
                        formula_tokens.append(self.tokenizer.index_word[token_id])
                
                # Ghép thành công thức
                formula = ''.join(formula_tokens)
                if formula and self.validate_formula(formula):
                    formulas.append(formula)
            except:
                continue
        
        return formulas
    
    def interpolate_formulas(self, formula1, formula2, n_steps=5):
        """
        Interpolate giữa 2 công thức trong latent space
        """
        # Encode 2 công thức
        seq1 = self.prepare_single_formula(formula1)
        seq2 = self.prepare_single_formula(formula2)
        
        z1_mean, z1_log_var = self.encoder.predict(seq1.reshape(1, -1), verbose=0)
        z2_mean, z2_log_var = self.encoder.predict(seq2.reshape(1, -1), verbose=0)
        
        # Use mean values for interpolation
        interpolated_formulas = []
        for i in range(n_steps):
            alpha = i / (n_steps - 1) if n_steps > 1 else 0
            z_interp = (1 - alpha) * z1_mean + alpha * z2_mean
            
            # Decode
            generated_seq = self.decoder.predict(z_interp, verbose=0)
            tokens = np.argmax(generated_seq[0], axis=-1)
            
            # Convert về công thức
            formula_tokens = []
            for token_id in tokens:
                if token_id == 0:
                    break
                if token_id in self.tokenizer.index_word:
                    formula_tokens.append(self.tokenizer.index_word[token_id])
            
            formula = ''.join(formula_tokens)
            if formula and self.validate_formula(formula):
                interpolated_formulas.append(formula)
        
        return interpolated_formulas
    
    def prepare_single_formula(self, formula):
        """
        Chuẩn bị 1 công thức để encode
        """
        tokens = self.tokenize_formula(formula)
        tokenized = ' '.join(tokens)
        sequence = self.tokenizer.texts_to_sequences([tokenized])
        padded = pad_sequences(sequence, maxlen=self.max_length, padding='post')
        return padded[0]
    
    def validate_formula(self, formula):
        """
        Kiểm tra tính hợp lệ cơ bản của công thức
        """
        # Kiểm tra có chứa ít nhất 1 element
        if not re.search(r'[A-Z]', formula):
            return False
        
        # Kiểm tra format cơ bản
        if not re.match(r'^([A-Z][a-z]*\d*)+$', formula):
            return False
        
        # Kiểm tra không có số 0
        if '0' in formula:
            return False
        
        return True

# Ví dụ sử dụng
if __name__ == "__main__":
    data = pd.read_csv("/Users/sonn/Sonn/Workspace/Projects/IonBatteryQML/data/CrystalLithiumIonBattery.csv")
    sample_formulas = data['Formula']
    
    # Tạo và train model
    vae = ChemicalFormulaVAE(latent_dim=32, max_length=15)
    
    # Chuẩn bị dữ liệu
    sequences = vae.prepare_data(sample_formulas)
    print(f"Prepared {len(sequences)} sequences")
    
    # Build models
    vae.build_models()
    print("\nModel built successfully!")
    print("Encoder summary:")
    vae.encoder.summary()
    print("\nDecoder summary:")
    vae.decoder.summary()
    
    # Train
    print("\nTraining model...")
    history = vae.train(sequences, epochs=50, batch_size=8)
    
    # Generate công thức mới
    print("\nGenerating new formulas:")
    new_formulas = vae.generate_formulas(n_samples=20, temperature=0.8)
    
    print(f"\nOriginal formulas ({len(sample_formulas)}):")
    for f in sample_formulas:
        print(f"  {f}")
    
    print(f"\nGenerated valid formulas ({len(new_formulas)}):")
    for f in new_formulas:
        print(f"  {f}")
    
    # Interpolation example
    if len(sample_formulas) >= 2:
        print(f"\nInterpolation between {sample_formulas[0]} and {sample_formulas[1]}:")
        interpolated = vae.interpolate_formulas(sample_formulas[0], sample_formulas[1], n_steps=5)
        for i, f in enumerate(interpolated):
            print(f"  Step {i}: {f}")

Vocab size: 25
Max sequence length: 15
Sample tokenized formula: Li 2 Mn Si O 4
Sample sequence: [2, 5, 9, 3, 4, 6]
Prepared 339 sequences

Model built successfully!
Encoder summary:



Decoder summary:



Training model...
Starting training...


KeyboardInterrupt: 

In [38]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import re
import pandas as pd

class ChemicalFormulaVAE:
    def __init__(self, latent_dim=64, max_length=20):
        self.latent_dim = latent_dim
        self.max_length = max_length + 2  # +2 cho <START> và <END>
        self.tokenizer = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        self.optimizer = Adam(learning_rate=0.001)

    def tokenize_formula(self, formula):
        pattern = r'([A-Z][a-z]*)(\d*)'
        tokens = []
        matches = re.findall(pattern, formula)
        for element, number in matches:
            tokens.append(element)
            if number:
                tokens.append(number)
        return tokens

    def prepare_data(self, formulas):
        tokenized_formulas = []
        for formula in formulas:
            tokens = self.tokenize_formula(formula)
            tokenized = '<START> ' + ' '.join(tokens) + ' <END>'
            tokenized_formulas.append(tokenized)

        #self.tokenizer = Tokenizer(char_level=False, oov_token='<OOV>')
        self.tokenizer = Tokenizer(char_level=False, lower=False, oov_token='<OOV>')
        self.tokenizer.fit_on_texts(tokenized_formulas)
        self.vocab_size = len(self.tokenizer.word_index) + 1

        sequences = self.tokenizer.texts_to_sequences(tokenized_formulas)
        padded = pad_sequences(sequences, maxlen=self.max_length, padding='post')

        return padded

    def build_encoder(self):
        inputs = Input(shape=(self.max_length - 1,))  # <-- Sửa ở đây
        x = Embedding(self.vocab_size, 128, mask_zero=True)(inputs)
        x = LSTM(256, return_sequences=True)(x)
        x = LSTM(128)(x)
        z_mean = Dense(self.latent_dim, name='z_mean')(x)
        z_log_var = Dense(self.latent_dim, name='z_log_var')(x)
        return Model(inputs, [z_mean, z_log_var], name='encoder')

    def build_decoder(self):
        latent_inputs = Input(shape=(self.latent_dim,))
        x = Dense(128, activation='relu')(latent_inputs)
        x = Dense(256, activation='relu')(x)
        x = RepeatVector(self.max_length - 1)(x)
        x = LSTM(256, return_sequences=True)(x)
        x = LSTM(128, return_sequences=True)(x)
        outputs = Dense(self.vocab_size, activation='softmax')(x)
        return Model(latent_inputs, outputs, name='decoder')

    def sampling(self, z_mean, z_log_var):
        epsilon = tf.random.normal(shape=tf.shape(z_mean))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def build_models(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

    def train_step(self, encoder_input, decoder_target):
        with tf.GradientTape() as tape:
            z_mean, z_log_var = self.encoder(encoder_input)
            z = self.sampling(z_mean, z_log_var)
            reconstruction = self.decoder(z)

            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
            reconstruction_loss = loss_fn(decoder_target, reconstruction)
            reconstruction_loss = tf.reduce_mean(reconstruction_loss)

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            )
            total_loss = reconstruction_loss + 0.1 * kl_loss

        grads = tape.gradient(total_loss, self.encoder.trainable_variables + self.decoder.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.encoder.trainable_variables + self.decoder.trainable_variables))

        return {
            'total_loss': total_loss,
            'reconstruction_loss': reconstruction_loss,
            'kl_loss': kl_loss
        }

    def train(self, sequences, epochs=50, batch_size=32):
        decoder_input = sequences[:, :-1]
        decoder_target = sequences[:, 1:]
        dataset = tf.data.Dataset.from_tensor_slices((decoder_input, decoder_target))
        dataset = dataset.shuffle(1000).batch(batch_size)

        history = {'total_loss': [], 'reconstruction_loss': [], 'kl_loss': []}

        for epoch in range(epochs):
            epoch_loss = {'total_loss': [], 'reconstruction_loss': [], 'kl_loss': []}
            for x_batch, y_batch in dataset:
                losses = self.train_step(x_batch, y_batch)
                for k in losses:
                    epoch_loss[k].append(losses[k])

            for k in epoch_loss:
                history[k].append(np.mean(epoch_loss[k]))

            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs} - "
                      f"Loss: {history['total_loss'][-1]:.4f}, "
                      f"Recon: {history['reconstruction_loss'][-1]:.4f}, "
                      f"KL: {history['kl_loss'][-1]:.4f}")
        return history

    # def generate_formulas(self, n_samples=10, temperature=1.0):
    #     z_samples = np.random.normal(0, 1, (n_samples, self.latent_dim))
    #     predictions = self.decoder.predict(z_samples, verbose=0)
    #     formulas = []

    #     for seq in predictions:
    #         if temperature != 1.0:
    #             seq = seq / temperature
    #             seq = tf.nn.softmax(seq, axis=-1).numpy()

    #         token_ids = np.argmax(seq, axis=-1)
    #         tokens = []
    #         for token_id in token_ids:
    #             word = self.tokenizer.index_word.get(token_id)
    #             if word == '<END>':
    #                 break
    #             if word and word not in ['<START>', '<OOV>']:
    #                 tokens.append(word)
    #         formula = ''.join(tokens)
    #         if self.validate_formula(formula):
    #             formulas.append(formula)
    #     return formulas
    # def generate_formulas(self, n_samples=10, temperature=1.0):
    #     if self.decoder is None:
    #         raise ValueError("Model chưa được train!")

    #     # Sample z từ latent space
    #     z_samples = np.random.normal(0, 1, (n_samples, self.latent_dim))
    #     predictions = self.decoder.predict(z_samples, verbose=0)

    #     formulas = []

    #     for seq in predictions:
    #         # Temperature scaling
    #         if temperature != 1.0:
    #             seq = seq / temperature
    #             seq = tf.nn.softmax(seq, axis=-1).numpy()

    #         token_ids = np.argmax(seq, axis=-1)
    #         tokens = []
    #         for token_id in token_ids:
    #             word = self.tokenizer.index_word.get(token_id)
    #             if word == '<END>':
    #                 break
    #             if word and word not in ['<START>', '<OOV>']:
    #                 tokens.append(word)
    #         formula = ''.join(tokens)
    #         if self.validate_formula(formula):
    #             formulas.append(formula)

    #     return formulas

    def generate_formulas(self, n_samples=10, temperature=1.0):
        if self.decoder is None:
            raise ValueError("Model chưa được train!")

        z_samples = np.random.normal(0, 1, (n_samples, self.latent_dim))
        sequences = self.decoder.predict(z_samples, verbose=0)

        formulas = []

        for i, seq in enumerate(sequences):
            if temperature != 1.0:
                seq = seq / temperature
                seq = tf.nn.softmax(seq, axis=-1).numpy()

            token_ids = np.argmax(seq, axis=-1)
            tokens = []
            for token_id in token_ids:
                word = self.tokenizer.index_word.get(token_id, None)
                if word == '<END>':
                    break
                if word and word not in ['<START>', '<OOV>']:
                    tokens.append(word)

            formula = ''.join(tokens)
            print(f"[DEBUG] Sample {i}: token_ids = {token_ids}")
            print(f"[DEBUG] Sample {i}: raw tokens = {tokens}")
            print(f"[DEBUG] Sample {i}: joined formula = {formula}")

            if self.validate_formula(formula):
                formulas.append(formula)

        return formulas



    # def interpolate_formulas(self, formula1, formula2, n_steps=5):
    #     seq1 = self.prepare_single_formula(formula1)
    #     seq2 = self.prepare_single_formula(formula2)

    #     z1_mean, _ = self.encoder.predict(seq1[np.newaxis, :])
    #     z2_mean, _ = self.encoder.predict(seq2[np.newaxis, :])

    #     results = []
    #     for alpha in np.linspace(0, 1, n_steps):
    #         z = (1 - alpha) * z1_mean + alpha * z2_mean
    #         pred = self.decoder.predict(z, verbose=0)
    #         token_ids = np.argmax(pred[0], axis=-1)
    #         tokens = []
    #         for token_id in token_ids:
    #             word = self.tokenizer.index_word.get(token_id)
    #             if word == '<END>':
    #                 break
    #             if word and word not in ['<START>', '<OOV>']:
    #                 tokens.append(word)
    #         formula = ''.join(tokens)
    #         if self.validate_formula(formula):
    #             results.append(formula)
    #     return results
    def interpolate_formulas(self, formula1, formula2, n_steps=5):
        # Chuẩn bị 2 công thức đầu vào
        seq1 = self.prepare_single_formula(formula1)
        seq2 = self.prepare_single_formula(formula2)

        # Lấy latent vectors (chỉ lấy mean, bỏ logvar)
        z1_mean, _ = self.encoder.predict(seq1[np.newaxis, :], verbose=0)
        z2_mean, _ = self.encoder.predict(seq2[np.newaxis, :], verbose=0)

        # Tạo dãy vector nội suy
        alphas = np.linspace(0, 1, n_steps)
        z_interp = np.vstack([
            (1 - alpha) * z1_mean + alpha * z2_mean for alpha in alphas
        ])

        preds = self.decoder.predict(z_interp, verbose=0)
        results = []

        for seq in preds:
            token_ids = np.argmax(seq, axis=-1)
            tokens = []
            for token_id in token_ids:
                word = self.tokenizer.index_word.get(token_id)
                if word == '<END>':
                    break
                if word and word not in ['<START>', '<OOV>']:
                    tokens.append(word)
            formula = ''.join(tokens)
            if self.validate_formula(formula):
                results.append(formula)

        return results
        


    # def prepare_single_formula(self, formula):
    #     tokens = self.tokenize_formula(formula)
    #     sequence = '<START> ' + ' '.join(tokens) + ' <END>'
    #     seq = self.tokenizer.texts_to_sequences([sequence])
    #     return pad_sequences(seq, maxlen=self.max_length, padding='post')[0]
    def prepare_single_formula(self, formula):
        tokens = self.tokenize_formula(formula)
        sequence = '<START> ' + ' '.join(tokens) + ' <END>'
        seq = self.tokenizer.texts_to_sequences([sequence])
        padded = pad_sequences(seq, maxlen=self.max_length, padding='post')
        return padded[0][:-1]  # <-- Trả về input giống lúc train encoder


    def validate_formula(self, formula):
        if not re.search(r'[A-Z]', formula): return False
        if not re.match(r'^([A-Z][a-z]*\d*)+$', formula): return False
        # if '0' in formula: return False
        return True


In [39]:
if __name__ == "__main__":
    data = pd.read_csv("/Users/sonn/Sonn/Workspace/Projects/IonBatteryQML/data/CrystalLithiumIonBattery.csv")
    sample_formulas = data['Formula'].dropna().unique()

    vae = ChemicalFormulaVAE(latent_dim=32, max_length=15)
    sequences = vae.prepare_data(sample_formulas)

    print("Building model...")
    vae.build_models()

    print("Training...")
    vae.train(sequences, epochs=50, batch_size=8)


    print("\nGenerated formulas:")
    new_formulas = vae.generate_formulas(n_samples=10, temperature=0.6)
    for f in new_formulas:
        print(f)

    print("\nInterpolation example:")
    if len(sample_formulas) >= 2:
        f1, f2 = sample_formulas[0], sample_formulas[1]
        print(f"Between {f1} and {f2}:")
        inter = vae.interpolate_formulas(f1, f2, n_steps=5)
        for i, f in enumerate(inter):
            print(f"  Step {i}: {f}")


Building model...
Training...
Epoch 10/50 - Loss: 0.8214, Recon: 0.8107, KL: 0.1069
Epoch 20/50 - Loss: 0.6203, Recon: 0.5951, KL: 0.2526
Epoch 30/50 - Loss: 0.5243, Recon: 0.5019, KL: 0.2239
Epoch 40/50 - Loss: 0.4896, Recon: 0.4584, KL: 0.3112
Epoch 50/50 - Loss: 0.4631, Recon: 0.4354, KL: 0.2773

Generated formulas:
[DEBUG] Sample 0: token_ids = [ 3  7  9  7  4  5 12  6  0  0  0  0  0  0  0  0]
[DEBUG] Sample 0: raw tokens = ['Li', '2', 'Fe', '2', 'Si', 'O', '5', 'END']
[DEBUG] Sample 0: joined formula = Li2Fe2SiO5END
[DEBUG] Sample 1: token_ids = [ 3  7  9  7  4  5 10  6  0  0  0  0  0  0  0  0]
[DEBUG] Sample 1: raw tokens = ['Li', '2', 'Fe', '2', 'Si', 'O', '4', 'END']
[DEBUG] Sample 1: joined formula = Li2Fe2SiO4END
[DEBUG] Sample 2: token_ids = [3 7 9 4 7 5 6 6 0 0 0 0 0 0 0 0]
[DEBUG] Sample 2: raw tokens = ['Li', '2', 'Fe', 'Si', '2', 'O', 'END', 'END']
[DEBUG] Sample 2: joined formula = Li2FeSi2OENDEND
[DEBUG] Sample 3: token_ids = [ 3  7  9  7  4  5 10  6  0  0  0  0  0  0 

In [34]:
data = pd.read_csv("/Users/sonn/Sonn/Workspace/Projects/IonBatteryQML/data/CrystalLithiumIonBattery.csv")
data.head(5)

Unnamed: 0,Materials Id,Formula,Spacegroup,Formation Energy (eV),E Above Hull (eV),Band Gap (eV),Nsites,Density (gm/cc),Volume,Has Bandstructure,Crystal System
0,mp-849394,Li2MnSiO4,Pc,-2.699,0.006,3.462,16,2.993,178.513,True,monoclinic
1,mp-783909,Li2MnSiO4,P21/c,-2.696,0.008,2.879,32,2.926,365.272,True,monoclinic
2,mp-761311,Li4MnSi2O7,Cc,-2.775,0.012,3.653,28,2.761,301.775,True,monoclinic
3,mp-761598,Li4Mn2Si3O10,C2/c,-2.783,0.013,3.015,38,2.908,436.183,True,monoclinic
4,mp-767709,Li2Mn3Si3O10,C2/c,-2.747,0.016,2.578,36,3.334,421.286,True,monoclinic


In [35]:
print("Token index for Mn:", vae.tokenizer.word_index.get("Mn", "Not found"))
print("Vocabulary size:", vae.vocab_size)
print("All tokens:", list(vae.tokenizer.word_index.keys()))


Token index for Mn: Not found
Vocabulary size: 27
All tokens: ['<OOV>', 'start', 'li', 'si', 'o', 'end', '2', '3', 'fe', '4', 'mn', '5', '7', 'co', '10', '8', '6', '16', '13', '11', '9', '15', '17', '19', '32', '24']


In [36]:
print(vae.tokenize_formula("Li2MnSiO4"))


['Li', '2', 'Mn', 'Si', 'O', '4']
