In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Define the path to the dataset directory
# Dataset file paths in this notebook are built by appending a file name to
# this directory (looks like a Kaggle input mount -- adjust when running elsewhere).
DATA_DIR = "/kaggle/input/open-problems-single-cell-perturbations"

# Function to read the dataset in chunks
def read_in_chunks(file_path, chunk_size=10000):
    """Lazily yield a dataset file as a sequence of pandas DataFrames.

    Args:
        file_path: Path to a ``.parquet`` or ``.csv`` file. The reader is
            chosen from the file extension.
        chunk_size: Maximum number of rows in each yielded DataFrame.

    Yields:
        pandas.DataFrame: Consecutive slices of the file, each holding at
        most ``chunk_size`` rows.

    Raises:
        ValueError: If the extension is neither ``.parquet`` nor ``.csv``.
            Because this is a generator, the error surfaces on the first
            iteration rather than when the function is called.
    """
    path_text = str(file_path)  # also accept pathlib.Path-like arguments

    if path_text.endswith('.parquet'):
        parquet_file = pq.ParquetFile(file_path)
        # Stream fixed-size record batches instead of indexing row groups.
        # The number of row groups is decided by whoever wrote the file and is
        # unrelated to num_rows // chunk_size, so read_row_group(i) could run
        # past the last group and never honoured chunk_size.
        for batch in parquet_file.iter_batches(batch_size=chunk_size):
            yield batch.to_pandas()
    elif path_text.endswith('.csv'):
        # The context manager closes the underlying file handle as soon as the
        # generator is exhausted or closed, even if only one chunk was consumed.
        with pd.read_csv(file_path, chunksize=chunk_size) as reader:
            yield from reader
    else:
        raise ValueError(
            f"Unsupported file type for {file_path!r}: expected a .parquet or .csv file"
        )

## Data Exploration

In [None]:
# Logical dataset name -> location of the file under DATA_DIR.
datasets = dict(
    adata_obs_meta=f"{DATA_DIR}/adata_obs_meta.csv",
    adata_train=f"{DATA_DIR}/adata_train.parquet",
    de_train=f"{DATA_DIR}/de_train.parquet",
    id_map=f"{DATA_DIR}/id_map.csv",
    multiome_obs_meta=f"{DATA_DIR}/multiome_obs_meta.csv",
    multiome_train=f"{DATA_DIR}/multiome_train.parquet",
    multiome_var_meta=f"{DATA_DIR}/multiome_var_meta.csv",
)

# Preview every dataset by pulling just its first chunk and printing the
# leading rows under a banner with the upper-cased dataset name.
for name in datasets:
    path = datasets[name]
    data_chunk_gen = read_in_chunks(path)
    data_first_chunk = next(data_chunk_gen)
    print(f"--- {name.upper()} ---")
    print(data_first_chunk.head())

# Visualization and summary stats
# DataFrame.hist draws one subplot per numeric/datetime column, so the number
# of plotted columns is capped to keep the figure readable and the cell fast.
# NOTE(review): de_train presumably has one column per gene (thousands of
# columns) -- confirm, and raise or lower the cap as needed.
MAX_HIST_COLUMNS = 20

sample_data = next(read_in_chunks(datasets["de_train"], chunk_size=1000))
hist_columns = sample_data.select_dtypes(
    include=["number", "datetime", "datetimetz"]
).columns[:MAX_HIST_COLUMNS]
sample_data[hist_columns].hist(bins=50, figsize=(20, 15))
plt.show()

## Preprocessing

In [None]:
# Module-level scaler shared by every preprocess_data call, so that a scaler
# fitted on one chunk can later be applied unchanged to another (fit=False).
scaler = MinMaxScaler()

def preprocess_data(chunk, fit=True):
    """Impute missing values and min-max scale every column except the last.

    Args:
        chunk: DataFrame whose last column is assumed to be the target; that
            column is left out of the result. The frame is passed through
            ``handle_missing_data`` first.
        fit: When True (the default, matching the previous behaviour) the
            shared ``scaler`` is re-fitted on this chunk before transforming.
            Pass False to reuse the statistics learned by an earlier call,
            e.g. for validation or test rows.

    Returns:
        pandas.DataFrame: The scaled feature columns, carrying the same column
        names and the same row index as ``chunk``.
    """
    chunk = handle_missing_data(chunk)
    features = chunk.iloc[:, :-1]  # Assuming the last column is the target
    if fit:
        scaled_data = scaler.fit_transform(features)
    else:
        scaled_data = scaler.transform(features)
    # Keep the source row labels: a fresh RangeIndex would silently misalign
    # the features with the target column of any chunk whose index does not
    # start at zero (for example every CSV chunk after the first).
    return pd.DataFrame(scaled_data, columns=features.columns, index=features.index)

def handle_missing_data(chunk):
    """Fill missing values in each numeric column with that column's median.

    Non-numeric columns are left untouched, since a median is not defined
    for them.

    Args:
        chunk: DataFrame to impute. It is modified in place.

    Returns:
        pandas.DataFrame: The same ``chunk`` object, for call chaining.
    """
    numeric_columns = chunk.select_dtypes(include="number").columns
    for column in numeric_columns:
        values = chunk[column]
        if values.isna().any():
            # Assign the filled column back explicitly. The chained form
            # chunk[column].fillna(..., inplace=True) operates on a temporary
            # and leaves the frame unchanged under pandas Copy-on-Write.
            chunk[column] = values.fillna(values.median())
    return chunk

## Model Development

In [None]:
# LightGBM settings: gradient-boosted trees fitted as a regression, with RMSE
# as the metric reported on the validation set.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt'
}

de_train_gen = read_in_chunks(datasets["de_train"])
first_chunk = next(de_train_gen)
train_data = preprocess_data(first_chunk)
# preprocess_data already leaves out the last column, so when the target is
# that column a plain drop() raises KeyError; errors='ignore' covers both layouts.
X = train_data.drop(columns='target_column', errors='ignore')  # replace 'target_column' with the actual column name
y = first_chunk['target_column']
# Fixed seed so the split (and therefore the reported score) is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = lgb.Dataset(X_train, label=y_train)
val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset)

# Early stopping is configured through a callback: the early_stopping_rounds
# keyword of lgb.train() was removed in LightGBM 4.0, while the callback works
# on both older and newer releases.
model = lgb.train(
    params,
    train_dataset,
    num_boost_round=500,
    valid_sets=[val_dataset],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)
model.save_model('model.txt')

## Evaluation

In [None]:
# Assuming de_train dataset will also be split for evaluation
# Score the held-out split produced by train_test_split in the model cell.
# Predicting on first_chunk would mostly re-score the rows the model was
# trained on and report an over-optimistic RMSE, and calling preprocess_data
# again would re-fit the shared scaler.
# NOTE(review): this split also drove early stopping, so the number is still
# slightly optimistic -- use a separate test split for a final figure.
X_test = X_val
y_test = y_val

predictions = model.predict(X_test)
mse = ((predictions - y_test) ** 2).mean()
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")