In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import modlee
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# Read the Modlee API key from the environment rather than embedding it in the
# notebook. Export MODLEE_API_KEY before starting the kernel.
# NOTE(review): a key was previously stored in this cell in plain text; treat
# it as exposed and rotate it.
api_key = os.environ.get('MODLEE_API_KEY')

# An empty string is treated the same as an unset variable.
if not api_key:
    print("Module key not set")
else:
    modlee.init(api_key=api_key)
    print("Module initialized")

In [None]:
# Location of the housing CSV. Defaults to the original path; set the
# HOUSING_CSV_PATH environment variable to load the file from elsewhere
# without editing the notebook.
file_path = os.environ.get('HOUSING_CSV_PATH', '/mnt/data/housing.csv')
df = pd.read_csv(file_path)

In [None]:
# Separate the regression target from the predictor columns.
TARGET_COL = 'median_house_value'
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Columns to one-hot encode (adjust to match your data); every other feature
# column is treated as numerical.
categorical_cols = ['ocean_proximity']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Preprocessing for numerical data: standardize each numerical column.
# NOTE(review): StandardScaler leaves NaN values in place on transform; if the
# CSV contains missing values an imputation step is needed first - confirm.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: one-hot encode; categories not seen
# during fit are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
# sparse_threshold=0 forces a dense array as output, so rows can be indexed
# and converted to tensors directly.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0,
)

# Fit on the training split only, then overwrite X_train with the transformed
# matrix so the dataset built from X_train receives numeric features (the
# result was previously bound to `X`, leaving X_train as the raw DataFrame).
X_train = preprocessor.fit_transform(X_train)
# Transform the validation data with the statistics learned from training data.
# Re-running this cell requires re-running the split cell first, because both
# names are rebound from DataFrames to arrays here.
X_val = preprocessor.transform(X_val)

In [None]:
# Define custom dataset
class TextRegressionDataset(Dataset):
    """Map-style dataset that pairs one row of features with its target value.

    Inputs are converted to float32 NumPy arrays once at construction, so
    ``dataset[i]`` always means "the i-th row by position". Indexing a pandas
    Series/DataFrame directly with an integer is label- or column-based, which
    raises ``KeyError`` or returns the wrong row when the index is not
    ``0..n-1`` (for example after a shuffled train/validation split).

    Args:
        features: array-like of numeric feature rows (NumPy array, pandas
            DataFrame, nested list, or a sparse matrix exposing ``toarray()``).
        labels: array-like of numeric targets with one entry per feature row.

    Raises:
        ValueError: if ``features`` and ``labels`` differ in length, or if
            either cannot be converted to float32.
    """

    def __init__(self, features, labels):
        self.features = self._to_float_array(features)
        self.labels = self._to_float_array(labels)
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    @staticmethod
    def _to_float_array(data):
        """Return ``data`` as a dense float32 NumPy array, dropping any pandas index."""
        if hasattr(data, "toarray"):
            # Sparse matrices are densified so rows can be indexed directly.
            data = data.toarray()
        return np.asarray(data, dtype=np.float32)

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for position ``idx`` as float32 tensors."""
        sample = self.features[idx]
        label = self.labels[idx]
        return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

# Create datasets
# y_train / y_val are pandas Series that keep their original row labels after
# the shuffled split, so integer indexing on them is label-based. Passing
# plain float32 arrays makes the dataset index targets by position.
train_dataset = TextRegressionDataset(X_train, np.asarray(y_train, dtype=np.float32))
val_dataset = TextRegressionDataset(X_val, np.asarray(y_val, dtype=np.float32))

In [None]:
# Wrap both datasets in dataloaders that share one batch size. Only the
# training loader shuffles; validation keeps the dataset order.
BATCH_SIZE = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Build a Modlee recommender for the declared modality/task, fit it on the
# training dataloader, and keep the model it proposes.
# NOTE(review): the features come from a tabular CSV, yet the modality is
# declared as 'text' - confirm which modality value Modlee expects here.
recommender_spec = {'modality': 'text', 'task': 'regression'}
recommender = modlee.recommender.from_modality_task(**recommender_spec)
recommender.fit(train_dataloader)

modlee_model = recommender.model
print(f"\nRecommended model: \n{modlee_model}")

In [None]:
# Train the recommended model for a single epoch inside a Modlee run context
# (presumably so Modlee can record the run - confirm against Modlee docs).
with modlee.start_run() as run:
    trainer = pl.Trainer(max_epochs=1)
    trainer.fit(modlee_model, train_dataloaders=train_dataloader)

In [None]:
# Validation
# Evaluate the trained model on the held-out split. Trainer.validate takes
# the loaders through the `dataloaders` keyword; the former `val_dataloaders`
# keyword is not accepted by current Lightning releases.
trainer.validate(
    model=modlee_model,
    dataloaders=val_dataloader
)

In [None]:
# Locate the most recent Modlee run directory and report it.
last_run_path = modlee.last_run_path()
print(f"Run path: {last_run_path}")

# List the contents of the run's `artifacts` subdirectory in sorted order.
artifacts_path = os.path.join(last_run_path, 'artifacts')
artifacts = os.listdir(artifacts_path)
artifacts.sort()
print(f"Saved artifacts: {artifacts}")