In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
import evaluate
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Trainer, TrainingArguments
import transformers
transformers.set_seed(35)
from datasets import Features, Value, Dataset, DatasetDict
import comet_ml
import comet_llm
import os
import numpy as np
import pickle
import json
import pandas as pd
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




In [2]:
import pandas as pd

file_path = "data/HomeC.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(df.head())

         time  use [kW]  gen [kW]  House overall [kW]  Dishwasher [kW]  \
0  1451624400  0.932833  0.003483            0.932833         0.000033   
1  1451624401  0.934333  0.003467            0.934333         0.000000   
2  1451624402  0.931817  0.003467            0.931817         0.000017   
3  1451624403  1.022050  0.003483            1.022050         0.000017   
4  1451624404  1.139400  0.003467            1.139400         0.000133   

   Furnace 1 [kW]  Furnace 2 [kW]  Home office [kW]  Fridge [kW]  \
0        0.020700        0.061917          0.442633     0.124150   
1        0.020717        0.063817          0.444067     0.124000   
2        0.020700        0.062317          0.446067     0.123533   
3        0.106900        0.068517          0.446583     0.123133   
4        0.236933        0.063983          0.446533     0.122850   

   Wine cellar [kW]  ...  visibility  summary  apparentTemperature  pressure  \
0          0.006983  ...        10.0    Clear                29.26

  df = pd.read_csv(file_path)


In [3]:
# Rename columns to remove spaces and the kW unit 
df.columns = [col[:-5].replace(' ','_') if 'kW' in col else col for col in df.columns]

# Drop rows with nan values 
df = df.dropna()

# The columns "use" and "house_overall" are the same, so let's remove the 'house_overall' column
df.drop(['House_overall'], axis=1, inplace=True)

# The columns "gen" and "solar" are the same, so let's remove the 'solar' column
df.drop(['Solar'], axis=1, inplace=True)

# drop rows with cloudCover column values that are not numeric (bug in sensors) and convert column to numeric
df = df[df['cloudCover']!='cloudCover']
df["cloudCover"] = pd.to_numeric(df["cloudCover"])

# Create columns that regroup kitchens and furnaces 
df['kitchen'] = df['Kitchen_12'] + df['Kitchen_14'] + df['Kitchen_38']
df['Furnace'] = df['Furnace_1'] + df['Furnace_2']

# Convert "time" column (which is a unix timestamp) to a Y-m-d H-M-S 
import time 
start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(df['time'].iloc[0])))
time_index = pd.date_range(start_time, periods=len(df), freq='min')  
time_index = pd.DatetimeIndex(time_index)
df = df.set_index(time_index)
df = df.drop(['time'], axis=1)

In [4]:
df.columns

Index(['use', 'gen', 'Dishwasher', 'Furnace_1', 'Furnace_2', 'Home_office',
       'Fridge', 'Wine_cellar', 'Garage_door', 'Kitchen_12', 'Kitchen_14',
       'Kitchen_38', 'Barn', 'Well', 'Microwave', 'Living_room', 'temperature',
       'icon', 'humidity', 'visibility', 'summary', 'apparentTemperature',
       'pressure', 'windSpeed', 'cloudCover', 'windBearing', 'precipIntensity',
       'dewPoint', 'precipProbability', 'kitchen', 'Furnace'],
      dtype='object')

In [None]:
output_file = "homec_prompts_responses.jsonl"

# Create prompt-response pairs and save to JSONL
with open(output_file, "w") as f:
    for index, row in df.iterrows():
        # Create a prompt asking for all feature values
        prompt = f"What are the details of the home energy and environmental metrics at {index}?"
        
        # Create a natural language response for all columns
        response_parts = []
        for column in df.columns:
            if column in ["use", "Furnance_1", "Furnance_2", "Home_office", "Wine_cellar", "Garage_door", "Barn", "Well", "Living_room",
                          "Kitchen_12", "Kitchen_14", "Kitchen_38", "kitchen", "Dishwasher", "Furnace", "Microwave", "Fridge"]:
                response_parts.append(f"{column.replace('_', ' ')} uses {row[column]} energy")
            elif column in ["temperature", "humidity", "apparentTemperature", "dewPoint", "visibility", "pressure", "windSpeed",
                             "cloudCover", "windBearing", "precipIntensity", "precipProbability"]:
                response_parts.append(f"{column.replace('_', ' ')} is {row[column]}")
            elif column in ["icon", "summary"]:
                response_parts.append(f"{column} indicates '{row[column]}'")
            else:
                response_parts.append(f"{column.replace('_', ' ')} is {row[column]}")

        # Join the response parts into a single text
        response = ". ".join(response_parts) + "."

        # Write the prompt-response pair to the JSONL file
        json_line = {"prompt": prompt, "completion": response}
        f.write(json.dumps(json_line) + "\n")

print(f"Transformed dataset saved to {output_file}")

Transformed dataset saved to homec_prompts_responses.jsonl
Transformed dataset saved to homec_prompts_responses.jsonl


In [6]:
# Load the dataset from your JSONL file
dataset = load_dataset("json", data_files="homec_prompts_responses.jsonl")

# Split the dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

# Inspect the dataset structure
print(train_dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'What are the details of the home energy and environmental metrics at 2016-01-25 06:01:58?', 'completion': "use uses 0.591866667 energy. gen is 0.33925. Dishwasher uses 0.000233333 energy. Furnace 1 is 0.459266667. Furnace 2 is 0.191. Home office uses 0.039866667 energy. Fridge uses 0.004833333 energy. Wine cellar uses 0.00715 energy. Garage door uses 0.012366667 energy. Kitchen 12 uses 0.00075 energy. Kitchen 14 uses 0.000216667 energy. Kitchen 38 uses 0.0 energy. Barn uses 0.028883333 energy. Well uses 0.001 energy. Microwave uses 0.004 energy. Living room uses 0.001433333 energy. temperature is 25.52. icon indicates 'clear-night'. humidity is 0.81. visibility is 9.43. summary indicates 'Clear'. apparentTemperature is 20.49. pressure is 1018.89. windSpeed is 4.08. cloudCover is 0.05. windBearing is 178.0. precipIntensity is 0.0. dewPoint is 20.42. precipProbability is 0.0. kitchen uses 0.000966667 energy. Furnace uses 0.650266667 energy."}


In [7]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [8]:
def preprocess_function(examples):
    return tokenizer(
        examples["prompt"],
        text_target=examples["completion"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/453466 [00:00<?, ? examples/s]

Map:   0%|          | 0/50386 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,
    weight_decay=0.01,
    logging_dir="./logs",
    push_to_hub=False,
)

comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


In [10]:
trainer = Trainer(
    model=model.to("cuda"),
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.5143,0.49313
2,0.4896,0.467416
3,0.4777,0.458459


TrainOutput(global_step=170052, training_loss=0.513531099184459, metrics={'train_runtime': 30542.9759, 'train_samples_per_second': 44.54, 'train_steps_per_second': 5.568, 'total_flos': 2.3288563318888858e+17, 'train_loss': 0.513531099184459, 'epoch': 3.0})

In [11]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\spiece.model',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')

In [None]:
from transformers import pipeline

# Load the fine-tuned model
fine_tuned_model = pipeline("text2text-generation", model="./fine_tuned_model", tokenizer=tokenizer)

# Test with a query
query = "What are the details of the home energy and environmental metric at 2015-12-31 21:03:58"
response = fine_tuned_model(query, max_length=50)
print(response[0]["generated_text"])

Device set to use cuda:0


use uses 0.3098 energy. gen is 0.00345. Dishwasher uses 0.0 energy. Furnace 1 is 0.020766667. Furnace 2 is 0.0641. Home office uses 0.0


: 

In [21]:
df

Unnamed: 0,use,gen,Dishwasher,Furnace_1,Furnace_2,Home_office,Fridge,Wine_cellar,Garage_door,Kitchen_12,...,apparentTemperature,pressure,windSpeed,cloudCover,windBearing,precipIntensity,dewPoint,precipProbability,kitchen,Furnace
2015-12-31 21:00:58,0.714200,0.003417,0.000033,0.021083,0.309983,0.043067,0.005167,0.123317,0.013183,0.000667,...,29.40,1016.25,8.29,0.75,285.0,0.0000,23.90,0.00,0.000733,0.331067
2015-12-31 21:01:58,0.497067,0.003417,0.000017,0.096983,0.062867,0.043283,0.005000,0.123283,0.012883,0.000750,...,29.40,1016.25,8.29,0.75,285.0,0.0000,23.90,0.00,0.000833,0.159850
2015-12-31 21:02:58,0.465133,0.003450,0.000017,0.064500,0.062633,0.043250,0.005017,0.123350,0.012950,0.000717,...,29.40,1016.25,8.29,0.75,285.0,0.0000,23.90,0.00,0.000817,0.127133
2015-12-31 21:03:58,0.512933,0.003417,0.000017,0.111333,0.063883,0.043300,0.004967,0.123867,0.012883,0.000733,...,29.40,1016.25,8.29,0.75,285.0,0.0000,23.90,0.00,0.000833,0.175217
2015-12-31 21:04:58,0.651283,0.003417,0.000017,0.114583,0.063200,0.043283,0.114167,0.124267,0.012917,0.000467,...,29.40,1016.25,8.29,0.75,285.0,0.0000,23.90,0.00,0.000500,0.177783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-15 18:27:58,1.601233,0.003183,0.000050,0.085267,0.642417,0.041783,0.005267,0.008667,0.013483,0.000467,...,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51,0.000633,0.727683
2016-12-15 18:28:58,1.599333,0.003233,0.000050,0.104017,0.625033,0.041750,0.005233,0.008433,0.013433,0.000467,...,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51,0.000600,0.729050
2016-12-15 18:29:58,1.924267,0.003217,0.000033,0.422383,0.637733,0.042033,0.004983,0.008467,0.012933,0.000533,...,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51,0.000600,1.060117
2016-12-15 18:30:58,1.978200,0.003217,0.000050,0.495667,0.620367,0.042100,0.005333,0.008233,0.012817,0.000517,...,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51,0.000650,1.116033
