In [1]:
import openai

In [2]:
import os
from openai import OpenAI
import numpy as np
import matplotlib.pyplot as plt
import pandas as pdf

In [3]:
# Read the OpenAI API key from a local key file (kept outside the notebook's
# directory so the key itself never appears in the notebook).
with open('../../keys/openai_gpt_siji.txt', 'r') as file:
    # Drop every whitespace character, not only "\n": a trailing "\r" from
    # Windows line endings or a stray space would otherwise end up in the key
    # and make authentication fail.
    key = "".join(file.read().split())


In [4]:
# Shared API client built from the key loaded above.
# This is the author's API key; use your own when running the notebook.
client = OpenAI(api_key=key)

def get_chat_completion(
    messages,
    client=client,
    model="gpt-3.5-turbo-0125",
    max_tokens=12000,
    store=True,
    system_prompt="You are a data scientist working on anomaly detection.",
    temperature=0,
):
    """Send one user prompt to the chat completions endpoint and return the response.

    Parameters
    ----------
    messages : str
        Text used as the content of the single ``user`` message. Despite the
        plural name this is one prompt string, not a list of message dicts.
    client : optional
        Client whose ``chat.completions.create`` method is called. Defaults to
        the module-level ``client`` that exists when this function is defined.
    model : str, optional
        Model identifier passed through to the API.
    max_tokens : int, optional
        Passed through unchanged as the ``max_tokens`` request option.
        NOTE(review): the default of 12000 may be larger than the completion
        limit of the default model -- confirm against the model documentation,
        or pass an explicit value.
    store : bool, optional
        Passed through unchanged as the ``store`` request option.
    system_prompt : str, optional
        Content of the ``system`` message. The default reproduces the
        previously hard-coded text.
    temperature : float, optional
        Passed through as the ``temperature`` request option. The default of 0
        reproduces the previously hard-coded value.

    Returns
    -------
    Whatever ``client.chat.completions.create`` returns; it is not post-processed.
    """
    # Author's note on the default model: the cheapest model I could find;
    # input $0.0005 / 1K tokens; output $0.0015 / 1K tokens
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": messages},
    ]
    return client.chat.completions.create(
        messages=conversation,
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        store=store,
    )

# Candidate chat models. Entries are selected by position elsewhere in the
# notebook, so the order is significant.
MODEL_LIST = [
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4-turbo",
    "gpt-3.5-turbo-0125",
]


In [5]:
def print_response(response, debug=False):
    """Print the message text of the first choice in ``response``.

    When ``debug`` is true, a line with the total, prompt and completion
    token counts read from ``response.usage`` is printed first.
    """
    if debug:
        usage = response.usage
        # One line summarising the token counts of the request.
        print(
            f"total tokens: {usage.total_tokens} "
            f"prompt tokens: {usage.prompt_tokens} "
            f"completion_tokens tokens: {usage.completion_tokens}"
        )
    first_choice = response.choices[0]
    print(first_choice.message.content)
    

In [6]:
# Load the readings table from CSV.
# pandas is imported here under the `pd` alias; the import cell near the top
# of the notebook binds it as `pdf`, so `pd` is not available before this line.
import pandas as pd
readings_csv = 'data/data_selection_17_23_drop_missing_nodes_dropna.csv'
df = pd.read_csv(readings_csv)



In [7]:
# Author's sampling assumption: one reading every 30 seconds, i.e. 120 per hour.
# NOTE(review): presumably the 'timestamp' column counts these 30-second
# steps -- confirm against the data description.
one_hour = 120
one_day = 24 * one_hour

# Keep rows whose timestamp lies strictly between hour 10 and hour 15.
window_start = 10 * one_hour
window_end = 15 * one_hour
in_window = (df['timestamp'] > window_start) & (df['timestamp'] < window_end)
sample = df[in_window]
print(sample.shape)

(3449, 5)


In [8]:
# Serialise the sample with pandas' "split" orientation and without the index.
sample_json = sample.to_json(orient="split", index=False)
print(len(sample_json))
# Print only the head of the payload: the full string is tens of thousands of
# characters long and would bury the rest of the notebook output.
print(sample_json[:500] + (" ..." if len(sample_json) > 500 else ""))

91359
{"columns":["timestamp","nodeid","temperature","humidity","voltage"],"data":[[1207,1,21.81,34.64,2.49],[1207,2,22.33,35.23,2.46],[1207,3,21.89,35.02,2.5],[1207,10,22.36,34.33,2.5],[1207,13,19.93,39.01,2.44],[1207,34,23.26,32.72,2.47],[1207,35,22.92,35.4,2.39],[1207,54,22.85,31.07,2.49],[1208,39,24.57,29.97,2.52],[1210,27,25.44,28.97,2.52],[1210,45,20.68,35.02,2.47],[1211,30,26.78,27.29,2.54],[1211,43,24.05,29.54,2.5],[1212,10,22.36,34.29,2.49],[1212,11,22.48,34.47,2.47],[1212,41,25.82,26.28,2.52],[1213,43,24.04,29.54,2.51],[1213,51,20.74,34.33,2.47],[1216,12,23.28,31.6,2.49],[1217,1,21.98,34.4,2.5],[1217,22,21.94,34.12,2.46],[1217,32,23.11,32.62,2.5],[1217,35,23.06,35.09,2.39],[1217,54,22.88,31.24,2.49],[1218,12,23.34,31.53,2.49],[1219,31,25.92,28.65,2.54],[1225,7,20.76,36.61,2.47],[1225,24,23.26,31.38,2.5],[1225,54,22.93,30.96,2.49],[1226,12,23.28,31.53,2.49],[1227,37,23.57,34.61,2.5],[1230,46,19.99,36.72,2.35],[1230,54,22.94,30.85,2.49],[1231,3,21.93,34.64,2.49],[1231,33,22.71,

In [9]:
# Example value series for the anomaly-type definitions in the prompt below.
single_point_shifts_dramatically = [
    24.37, 90.92, 24.42, 24.41,
]
multiple_points_oscillate = [
    36.64, 71.76, 49.87, 77.15, 94.38,
]
# No example values are provided for this anomaly type.
persistent_level_shift = list()
# Prompt text sent to the model: task framing, the three anomaly-type
# definitions (the first two with an inline example series), the required
# output format, and finally the JSON-serialised sample interpolated in full.
# NOTE(review): `persistent_level_shift` is defined but not interpolated here,
# so the third anomaly type has no example in the prompt.
# NOTE(review): the name is spelled "promt"; the later cell that sends the
# request uses this spelling, so a rename has to update both places together.
few_shots_promt=f"""
You are a data scientist working on anomaly detection. The dataset is below in json format. 
You're looking for anomalies in 3 columns: "temperature","humidity","voltage". 
Please analyze the data with the highest level of diligence and caution.
Use your knowledge about temperature, humidity and battery voltage to find anomaly.
Anomaly Types definitions:
    - Single point shifts dramatically from other points. example: {single_point_shifts_dramatically}
    - Multiple points oscillate. example: {multiple_points_oscillate}
    - Persistent level shift. 
Brief me your thought in each step. But don't return the code.
Directly give me the results. The anomalies for the three columns the csv format: [timestamp, nodeid,temperature_anomaly, humidity_anomaly, voltage_anomaly] format. Use 0 indicates normal and 1 indicates anomaly. 
If given timestamp and nodeid has normal for all three columns, don't include it in the csv.

Data: ```{sample_json}```.
"""
# Report the prompt size, then show only its beginning: the prompt embeds the
# whole JSON sample, so printing it in full floods the cell output.
print(len(few_shots_promt))
prompt_preview_chars = 1500
print(few_shots_promt[:prompt_preview_chars])
if len(few_shots_promt) > prompt_preview_chars:
    print(f"... [{len(few_shots_promt) - prompt_preview_chars} more characters not shown]")

92307

You are a data scientist working on anomaly detection. The dataset is below in json format. 
You're looking for anomalies in 3 columns: "temperature","humidity","voltage". 
Please analyze the data with the highest level of diligence and caution.
Use your knowledge about temperature, humidity and battery voltage to find anomaly.
Anomaly Types definitions:
    - Single point shifts dramatically from other points. example: [24.37, 90.92, 24.42, 24.41]
    - Multiple points oscillate. example: [36.64, 71.76, 49.87, 77.15, 94.38]
    - Persistent level shift. 
Brief me your thought in each step. But don't return the code.
Directly give me the results. The anomalies for the three columns the csv format: [timestamp, nodeid,temperature_anomaly, humidity_anomaly, voltage_anomaly] format. Use 0 indicates normal and 1 indicates anomaly. 
If given timestamp and nodeid has normal for all three columns, don't include it in the csv.

Data: ```{"columns":["timestamp","nodeid","temperature","hum

In [10]:
# Send the prompt to the second entry of MODEL_LIST with a 4096-token cap on
# the completion, then print the reply together with the token usage.
response = get_chat_completion(
    messages=few_shots_promt,
    model=MODEL_LIST[1],
    max_tokens=4096,
)
print_response(response=response, debug=True)

total tokens: 59391 prompt tokens: 58905 completion_tokens tokens: 486
To identify anomalies in the dataset based on the columns "temperature", "humidity", and "voltage", I will follow a systematic approach:

1. **Single Point Anomalies**: I will look for individual data points that deviate significantly from the surrounding values. For temperature, values above 40°C or below 0°C are generally considered outliers. For humidity, values above 100% or below 0% are anomalies. For voltage, values significantly lower than the expected range (e.g., below 2.0V) or higher than normal (e.g., above 3.0V) will be flagged.

2. **Multiple Point Oscillations**: I will check for sequences of values that oscillate significantly. This can be identified by looking for patterns where values fluctuate rapidly over a short period, indicating instability.

3. **Persistent Level Shifts**: I will analyze the data for any sustained changes in the mean level of the measurements. This can be identified by looking