<a href="https://www.kaggle.com/code/colewelkins/pert30?scriptVersionId=143165959" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Open Problems in Single Cell Perturbations - Kaggle Competition

## Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
import gc

## Load and Preprocess Data

In [None]:
# Build the wide training frame `train_df` (one row per obs_id) plus the
# names later cells rely on: `genes`, `categorical_cols`, `label_encoders`.
train_parquet_path = '/kaggle/input/open-problems-single-cell-perturbations/adata_train.parquet'

# Load only the columns needed for the expression matrix. The pivot below
# relies on the file being long-format: one row per (obs_id, gene) pair.
gene_expression_data = pd.read_parquet(
    train_parquet_path,
    columns=['obs_id', 'gene', 'normalized_count'],
)

# Pivot to a wide format: one row per obs_id, one column per gene
pivot_data = gene_expression_data.pivot(index='obs_id', columns='gene', values='normalized_count')
del gene_expression_data
gc.collect()

# Gene (target) column names, captured before obs_id becomes a regular column
genes = pivot_data.columns.tolist()

# Store expression values as float32 to limit the footprint of the wide matrix
pivot_data = pivot_data.astype('float32').reset_index()
# NOTE(review): (obs_id, gene) pairs absent from the file become NaN in the
# pivot - confirm whether they should be filled (e.g. with 0) before training.

# Load metadata.
# NOTE(review): confirm these columns really live in this parquet file; the
# schema-inspection cell below prints the available column names.
metadata = pd.read_parquet(
    train_parquet_path,
    columns=['obs_id', 'cell_type', 'sm_name', 'SMILES', 'sm_lincs_id', 'control'],
)
# The long-format file repeats each obs_id once per gene; keep a single
# metadata row per obs_id so the merge does not multiply rows.
metadata = metadata.drop_duplicates(subset='obs_id')

# Merge with metadata
train_df = metadata.merge(pivot_data, on='obs_id')

# Cleanup
del pivot_data, metadata
gc.collect()

# Encode categorical columns as integers. Values are cast to str first so
# every column has a single comparable type; the fitted encoders are kept
# so the test data can be encoded the same way.
categorical_cols = ['cell_type', 'sm_name', 'SMILES', 'sm_lincs_id']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    label_encoders[col] = le

In [None]:
import pyarrow.parquet as pq

# Inspect the parquet file's metadata to list the columns it provides
train_parquet_file = '/kaggle/input/open-problems-single-cell-perturbations/adata_train.parquet'
file_metadata = pq.read_metadata(train_parquet_file)
all_columns = file_metadata.schema.names

# Show the available column names
print(all_columns)

## Model Training

In [None]:
# Train one LightGBM regressor per gene on the encoded categorical features
# and report a pooled validation MSE.

# Use only the encoded categorical columns as features. Dropping just the
# gene columns would leave identifier/flag columns such as `obs_id` and
# `control` in X; these cannot be reproduced for the test rows.
X_train, X_val, y_train, y_val = train_test_split(
    train_df[categorical_cols],
    train_df[genes],
    test_size=0.2,
    random_state=42,
)

# Train individual models for each gene
models = {}
for gene in genes:
    model = LGBMRegressor()
    model.fit(X_train, y_train[gene])
    models[gene] = model
    # Free up memory after each model training
    gc.collect()

# Validate models: one prediction column per gene, in the same order as `genes`
val_preds = pd.DataFrame(
    {gene: model.predict(X_val) for gene, model in models.items()},
    index=y_val.index,
)

# Calculate overall validation score (MSE averaged over all genes)
overall_val_score = mean_squared_error(y_val, val_preds)
print(f"Overall validation MSE: {overall_val_score:.6f}")

## Prepare the Test Data

In [None]:
# Build `test_df` from id_map, encoding its categorical columns with the
# encoders fitted on the training data.

# Load the id_map data
id_map = pd.read_csv('/kaggle/input/open-problems-single-cell-perturbations/id_map.csv')

# Fail early with a clear message if id_map lacks any feature column.
# NOTE(review): this assumes id_map carries every column in categorical_cols - confirm.
missing_cols = [col for col in categorical_cols if col not in id_map.columns]
if missing_cols:
    raise KeyError(f"id_map is missing feature columns: {missing_cols}")

# Start from id_map itself. Merging with train_df is not needed: it would
# compare already-encoded training values against raw labels and pull
# training-only columns into the test frame. train_df is left untouched.
test_df = id_map.copy()

# Encode the categorical columns using the label encoders from training data.
# Values are cast to str first to match how the encoders were fitted;
# `transform` raises ValueError for a label that was not seen during training.
for col, le in label_encoders.items():
    test_df[col] = le.transform(test_df[col].astype(str))

## Generate Predictions for Submission

In [None]:
# Predict every gene for each test row and write a long-format submission
# file with columns: id, gene, predicted_expression.

# Predict on the encoded categorical columns only, not on the whole test frame
# NOTE(review): assumes the models were fitted on `categorical_cols` in this order - confirm.
test_features = test_df[categorical_cols]

# Making predictions: build the frame in one step rather than inserting
# thousands of columns one at a time into an empty object-dtype frame
predictions = pd.DataFrame(
    {gene: model.predict(test_features) for gene, model in models.items()},
    index=test_df.index,
)

# Attach the row identifier so it can serve as the id variable when melting.
# NOTE(review): assumes id_map provides an `id` column that is carried into test_df.
predictions.insert(0, 'id', test_df['id'])

# Creating the submission dataframe
submission = predictions.melt(id_vars=['id'], value_vars=genes,
                              var_name='gene', value_name='predicted_expression')
submission = submission.sort_values(by=['id', 'gene']).reset_index(drop=True)

# Save the submission file
submission.to_csv('submission.csv', index=False)

## End of Notebook