In [1]:

import pandas as pd

# Load the device names that the cells below embed and cluster.
# NOTE(review): read_pickle unpickles arbitrary Python objects — only load this file from a trusted source.
device_list = pd.read_pickle('./Energy_graph/embeddings/device_list.pkl')

In [31]:
# Display the loaded names for inspection.
# NOTE(review): the printed list contains entries that differ only by trailing whitespace
# (e.g. 'electric heater ' vs 'electric heater', 'tumble dryer ' vs 'tumble dryer'); each variant is
# embedded separately below. Presumably the raw labels are kept on purpose — confirm.
device_list

['coffeemachine',
 'food mixer',
 'electric oven',
 'games console',
 'electric stove',
 'refrigerator',
 'stereo',
 'treadmill',
 'air conditioner',
 'bouncy castle pump',
 'electric heating element',
 'food processor',
 'hair straighteners',
 'oven',
 'water purifier',
 'audio system',
 'tumble dryer',
 'heat kitchen',
 'heat bedroom #3',
 'electric heater ',
 'fridge freezer',
 'kimchi fridge',
 'heat basement',
 'router',
 'washing machine ',
 'heat garage',
 'whirlpool bath',
 'freezer',
 'office desk',
 'microwave',
 'combination microwave',
 'electric heater',
 'hi fi',
 'solar thermal pumping station',
 'htpc',
 'air conditioning',
 'charger',
 'usb hub',
 'ce appliance',
 'audio amplifier',
 'fridge',
 'air exhaust',
 'cooker',
 'laptops',
 'handmixer',
 'coffee maker',
 'dishwasher',
 'mobile phone charger',
 'heat bedroom #2',
 'blender',
 'tumble dryer ',
 'breadmaker',
 'set top box',
 'fan',
 'stove oven',
 'vivarium',
 'coffee machine',
 'television',
 'dish washer',
 'a

### BERT

In [4]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Initialize BERT tokenizer and model
# get_bert_embedding reads `tokenizer` and `model` as globals. The GPT-2 cell rebinds both names and the
# MiniLM cell rebinds `model`, so re-run this cell before calling get_bert_embedding after either of those.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embedding
def get_bert_embedding(text):
    """Return mean-pooled BERT embeddings for `text`.

    Uses the module-level `tokenizer` and `model`. The last hidden layer is
    averaged over the token axis, weighting each position by the attention
    mask so that padding tokens (present when a list of texts of different
    lengths is passed) do not dilute the average. For a single string there
    is no padding, so the result matches a plain mean over tokens.

    Args:
        text: A string, or a list of strings, accepted by the tokenizer.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    # Broadcast the (batch, tokens) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero should a row have no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_embedding = summed / token_counts
    return sentence_embedding.numpy()

# Device names to embed.
# NOTE(review): this path ('./Energy_graph/device_list.pkl') differs from the one used in the
# first cell ('./Energy_graph/embeddings/device_list.pkl') — confirm both point at the same list.
device_list = pd.read_pickle('./Energy_graph/device_list.pkl')

# One BERT vector per device name: each call's result is collapsed to 1-D
# and the vectors are stacked into a single array (one row per device).
device_embeddings = np.array(
    [get_bert_embedding(name).reshape(-1) for name in device_list]
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Fasttext
https://fasttext.cc/docs/en/crawl-vectors.html

In [9]:
import fasttext
import fasttext.util
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load FastText model
ft = fasttext.load_model('./Energy_graph/embeddings/fastText/cc.en.300.bin')

# Device names to be clustered
# device_list = ["laptop", "mobile", "tablet", "desktop", "router", "fridge", "washing machine", "oven"]

# Generate embeddings for each device name.
# Computed once and stored as float64 ('double'); the earlier duplicate pass without a dtype
# was overwritten immediately and only doubled the work.
device_vectors = np.array([ft.get_sentence_vector(device) for device in device_list], dtype='double')


ModuleNotFoundError: No module named 'fasttext'

### GPT-2

In [5]:
from transformers import GPT2Model, GPT2Tokenizer

# NOTE: this cell rebinds the global names `model` and `tokenizer` that the BERT cell also assigns.
model_name = 'gpt2-medium'  # You can choose other versions as well: 'gpt2', 'gpt2-large', 'gpt2-xl'
# output_hidden_states=True is required because get_embedding reads `outputs.hidden_states`.
model = GPT2Model.from_pretrained(model_name, output_hidden_states=True)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Fall back to the end-of-sequence token for padding when the tokenizer defines no pad token.
# Presumably this attribute read is what logs "Using pad_token, but it is not set yet." in the output.
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

def get_embedding(text, model, tokenizer):
    """Embed `text` as the mean of its token states from the second-to-last layer.

    The input is padded to 50 tokens, so the average is weighted by the
    attention mask: only real tokens contribute. Averaging over all 50
    positions would let the padding positions dominate short inputs such as
    device names.

    Args:
        text: The string to embed.
        model: A model whose output exposes `hidden_states` (one tensor per layer).
        tokenizer: A tokenizer returning `input_ids` and `attention_mask` tensors.

    Returns:
        A 1-D tensor holding the masked mean over the token axis for the first
        (and only) sequence in the batch. If the mask is all zeros the result
        is a zero vector.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=50)
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the embeddings from one of the hidden layers, here the second to last layer
    token_states = outputs.hidden_states[-2][0]

    # (tokens, 1) weights: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"][0].unsqueeze(-1).to(token_states.dtype)
    # clamp avoids dividing by zero when no position is marked as a real token.
    real_token_count = mask.sum().clamp(min=1)
    return (token_states * mask).sum(dim=0) / real_token_count

# Embed every device name with GPT-2; `embeddings` becomes a Python list with one NumPy vector per name.
# The MiniLM cell later rebinds `embeddings`, and the Clustering cell fits on whichever was assigned last.
word_list = device_list
embeddings = [get_embedding(word, model, tokenizer).numpy() for word in word_list]


Using pad_token, but it is not set yet.


### Spacy

In [2]:
import spacy
from sklearn.cluster import KMeans
import numpy as np

# NOTE(review): N_CLUSTERS is not referenced by the clustering cells visible in this notebook
# (they use n_clusters = 85 and a 10-50 sweep) — confirm whether it is still needed.
N_CLUSTERS = 25

# Load the large English spaCy pipeline (ships with word vectors).
# Alternative pipeline kept for reference:
# nlp = spacy.load('en_core_web_trf')
nlp = spacy.load('en_core_web_lg')

# Names to embed (alias of the loaded device list).
devices = device_list

# Run each name through the pipeline and stack the resulting document vectors,
# one row per device. This rebinds `device_vectors`, which the silhouette sweep reads.
device_vectors = np.array([doc.vector for doc in map(nlp, devices)])


2023-09-07 07:49:38.602106: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### MiniLM

In [13]:
%pip install -U sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting torchvision
  Using cached torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125925 sha256=f279d55d2e5505204a458d4cedb5af903b405bec3acdb80cd62f71df1879e920
  Stored in directory: /home/jovyan/shared/anaconda/pip/wheels/

In [17]:
from sentence_transformers import SentenceTransformer

# Rebinds the global `model` (previously assigned by the BERT and GPT-2 cells) to the MiniLM sentence encoder.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Encode all device names in one call; rebinds `embeddings`, which the Clustering cell fits on.
embeddings = model.encode(device_list)

# Clustering

In [22]:
RANDOM_SEED = 170

# Fit a seeded k-means on whichever `embeddings` was assigned last.
n_clusters = 85  # You can choose a different number based on your requirements
kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED).fit(embeddings)
labels = kmeans.labels_

# Bucket device names under their cluster label, in first-seen label order.
grouped_devices = {}
for label, device in zip(labels, device_list):
    grouped_devices.setdefault(label, []).append(device)

# Report each cluster and its members.
for label, devices in grouped_devices.items():
    print(f"Group {label}: {devices}")


Group 5: ['coffeemachine', 'coffee maker', 'coffee machine']
Group 75: ['food mixer']
Group 0: ['electric oven', 'oven', 'stove oven']
Group 65: ['games console']
Group 11: ['electric stove', 'stove']
Group 72: ['refrigerator', 'fridge']
Group 52: ['stereo']
Group 29: ['treadmill']
Group 8: ['air conditioner', 'air conditioning']
Group 63: ['bouncy castle pump']
Group 84: ['electric heating element']
Group 31: ['food processor']
Group 14: ['hair straighteners', 'hairdryer straightener']
Group 9: ['water purifier']
Group 12: ['audio system']
Group 7: ['tumble dryer', 'tumble dryer ', 'tumble dryer 3']
Group 80: ['heat kitchen']
Group 6: ['heat bedroom #3', 'heat bedroom #2', 'heat bedroom #1']
Group 30: ['electric heater ', 'electric heater', 'heater']
Group 1: ['fridge freezer', 'fridge freezer ']
Group 73: ['kimchi fridge']
Group 61: ['heat basement']
Group 22: ['router', 'broadband router']
Group 13: ['washing machine ', 'washing machine']
Group 51: ['heat garage']
Group 55: ['whirlp

In [1]:
from sklearn.metrics import silhouette_score
from tqdm import tqdm

# Clusters with fewer members than this are treated as "outliers" and merged into a larger cluster.
MIN_CLUSTER_SIZE = 3

# --- Choose the number of clusters by silhouette score ---------------------
best_score = -1
best_n_clusters = 0
best_labels = None
best_kmeans = None

for n_clusters in tqdm(range(10, 51, 5)):  # Ranging from 10 to 50 clusters
    if n_clusters >= len(device_vectors):
        # silhouette_score needs fewer clusters than samples, so larger k cannot be scored.
        break
    # Seeded with RANDOM_SEED (defined in the Clustering cell) so the sweep is reproducible.
    candidate = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED)
    candidate_labels = candidate.fit_predict(device_vectors)
    score = silhouette_score(device_vectors, candidate_labels)

    if score > best_score:
        best_score = score
        best_n_clusters = n_clusters
        best_labels = candidate_labels
        best_kmeans = candidate

if best_kmeans is None:
    raise ValueError("Need more than 10 device vectors to run the 10-50 cluster sweep")

# Reuse the winning fitted model. Refitting a fresh, unseeded KMeans could produce a
# different partition from the one that earned best_score.
kmeans = best_kmeans
labels = best_labels

# --- Detect and reassign outliers ------------------------------------------
# Row positions of the devices in each cluster.
indices_by_label = {}
for index, label in enumerate(labels):
    indices_by_label.setdefault(int(label), []).append(index)

appliance_names = list(device_list)
large_labels = sorted(
    label for label, indices in indices_by_label.items()
    if len(indices) >= MIN_CLUSTER_SIZE
)

if large_labels:
    # Distance from every device to every centroid, computed from the same vectors the model
    # was fitted on (no second embedding model involved).
    distances = kmeans.transform(device_vectors)

    # Start from the clusters that are kept, then move each outlier to the nearest kept centroid.
    # Building a new dict avoids mutating a dict or list while iterating over it.
    grouped_appliances = {
        label: [appliance_names[i] for i in indices_by_label[label]]
        for label in large_labels
    }
    for label, indices in indices_by_label.items():
        if label in grouped_appliances:
            continue
        for index in indices:
            nearest = large_labels[int(distances[index, large_labels].argmin())]
            grouped_appliances[nearest].append(appliance_names[index])
else:
    # Every cluster is below the size threshold: there is nothing to merge into, keep them as they are.
    grouped_appliances = {
        label: [appliance_names[i] for i in indices]
        for label, indices in indices_by_label.items()
    }


ModuleNotFoundError: No module named 'tqdm'