In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
BASE_PATH = "/content/drive/MyDrive/droplet_model"


In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from google.colab import drive
import os

In [9]:
DATA_PATH = '/content/drive/MyDrive/droplet_model/dataset/We_2_Mu_0.01_t_80.dat'
MODEL_SAVE_PATH = '/content/drive/MyDrive/droplet_model/notebooks'

In [7]:
print(f"Loading data from {DATA_PATH}...") #failsafe methods

Loading data from /content/drive/MyDrive/droplet_model/dataset...


In [13]:
try:
    df = pd.read_csv(DATA_PATH, sep='\s+', engine='python')
    print("Data loaded successfully.")
    print(f"Columns found: {list(df.columns)}")
except FileNotFoundError:
    print("ERROR: File not found. Please check the DATA_PATH variable.")
    # Stop execution if file is missing
    raise

Data loaded successfully.
Columns found: ['11595', '80.01', '0', '2138.97', '59.9913', '59.9959', '59.9534', '793.853']


  df = pd.read_csv(DATA_PATH, sep='\s+', engine='python')


In [15]:
try:
    df = pd.read_csv(DATA_PATH, sep='\s+', header=None, engine='python')
#filter raw data
    df.columns = [
        'Computational step',
        'Physical time',
        'Id',
        'volume',
        'x',
        'y',
        'z',
        'surface area'
    ]

    print("Data loaded successfully.")
    print(f"First 5 rows:\n{df.head()}")
except FileNotFoundError:
    print("ERROR: File not found. Please check the DATA_PATH variable.")
    raise

Data loaded successfully.
First 5 rows:
   Computational step  Physical time  Id   volume        x        y        z  \
0               11595          80.01   0  2138.97  59.9913  59.9959  59.9534   
1               11598          80.02   0  2138.95  59.9823  59.9918  59.9050   
2               11600          80.03   0  2138.95  59.9734  59.9878  59.8563   
3               11602          80.04   0  2138.89  59.9643  59.9835  59.8070   
4               11604          80.05   0  2138.90  59.9552  59.9793  59.7572   

   surface area  
0       793.853  
1       793.899  
2       794.324  
3       793.965  
4       794.706  


  df = pd.read_csv(DATA_PATH, sep='\s+', header=None, engine='python')


In [17]:
df = df.sort_values(by=['Id', 'Computational step'])
# We predict position at t based on t-1
df['x_prev'] = df.groupby('Id')['x'].shift(1)
df['y_prev'] = df.groupby('Id')['y'].shift(1)
df['z_prev'] = df.groupby('Id')['z'].shift(1)
# Calculate Targets (Velocity/Delta)
df['dx'] = df['x'] - df['x_prev']
df['dy'] = df['y'] - df['y_prev']
df['dz'] = df['z'] - df['z_prev']
# Remove rows with NaN (the first step of every droplet has no history)
df_clean = df.dropna()
print(f"Training data size: {len(df_clean)} rows")

Training data size: 10281 rows


In [19]:
feature_cols = ['Physical time', 'volume', 'surface area', 'x_prev', 'y_prev', 'z_prev']
# Targets: Changes in x, y, z
targets = ['dx', 'dy', 'dz']
X = df_clean[feature_cols]

In [20]:
models = {}
# Fast, efficient parameters for home/colab use
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'force_col_wise': True # Optimizes for speed
}

print("Starting training...")

for target in targets:
    print(f"Training model for {target}...")
    y = df_clean[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    bst = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    models[target] = bst

Starting training...
Training model for dx...
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 8224, number of used features: 6
[LightGBM] [Info] Start training from score -0.000104
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[81]	valid_0's rmse: 0.186996
Training model for dy...
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 8224, number of used features: 6
[LightGBM] [Info] Start training from score 0.001852
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.934089
Training model for dz...
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 8224, number of used features: 6
[LightGBM] [Info] Start training from score -0.000774
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[13]	

In [24]:
print("Saving models...")
for target, model in models.items():
    # Construct the file path by joining the directory path with the filename
    file_name = f'{target}_model.txt'
    save_path = os.path.join(MODEL_SAVE_PATH, file_name)
    model.save_model(save_path)
    print(f"Saved {target} model to: {save_path}")

Saving models...
Saved dx model to: /content/drive/MyDrive/droplet_model/notebooks/dx_model.txt
Saved dy model to: /content/drive/MyDrive/droplet_model/notebooks/dy_model.txt
Saved dz model to: /content/drive/MyDrive/droplet_model/notebooks/dz_model.txt
