<a href="https://colab.research.google.com/github/selvamanishiva18/GenAI/blob/main/Copy_of_IsolationForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import pipeline, GPT2Tokenizer, GPT2Model
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import pandas as pd

# Sample data (replace this with your log data).
# One timestamp string and one free-text message per log entry.
data = {
    'Timestamp': [
        '2023-01-01 12:00:00',
        '2023-01-01 13:00:00',
        '2023-01-01 14:00:00',
    ],
    'LogText': [
        'Normal operation',
        'Connection timeout',
        'Unexpected error',
    ],
}

# Build the frame in one chain: parse the timestamp strings into datetimes,
# then promote that column to the index so only 'LogText' remains as a column.
df = (
    pd.DataFrame(data)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
)

# Tokenize and embed textual data using GPT-2.
# The GPT-2 network is bound to `gpt2_model` so that it is not confused with
# the IsolationForest, which is assigned to `model` further down in this cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2Model.from_pretrained("gpt2")
text_embedding = pipeline('feature-extraction', model=gpt2_model, tokenizer=tokenizer)


def embed_log_text(text):
    """Return one fixed-length embedding (a list of floats) for a log message.

    The feature-extraction pipeline returns one vector per token, nested as
    [batch][token][hidden]. Indexing `[0][0]` would keep only the FIRST
    token's vector and ignore the rest of the message, so the per-token
    vectors are averaged instead (mean pooling) to represent the whole text.
    """
    token_vectors = np.asarray(text_embedding(text)[0], dtype=float)
    return token_vectors.mean(axis=0).tolist()


# Embed textual data: one pooled vector per log line.
df['TextEmbedding'] = df['LogText'].apply(embed_log_text)

# Combine text embeddings with numerical features if available.
# Only columns with a numeric dtype are picked up: taking "every other
# column" would also pull in string/datetime columns from real log data,
# which cannot be standardized.
numerical_features = (
    df.drop(columns=['LogText', 'TextEmbedding'], errors='ignore')
    .select_dtypes(include='number')
    .columns
)

# Stack the per-row embedding vectors next to the numeric columns. Both parts
# are forced to float so the result is a plain numeric matrix even when there
# are no numeric columns (that part then has zero columns).
embedding_matrix = np.asarray(df['TextEmbedding'].to_list(), dtype=float)
numeric_matrix = df[numerical_features].to_numpy(dtype=float)
combined_features = np.column_stack((embedding_matrix, numeric_matrix))

# Standardize the combined features
scaler = StandardScaler()
combined_features_scaled = scaler.fit_transform(combined_features)

# Train Isolation Forest model.
# random_state is fixed so the randomly built trees -- and therefore the
# scores printed below -- are the same every time this cell is re-run.
# contamination=0.1 is the assumed share of anomalous rows; tune it for real data.
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(combined_features_scaled)

# Score each log line. decision_function returns a continuous score, not a
# label: lower values are more anomalous, and negative values fall on the
# outlier side of the threshold implied by `contamination`.
df['AnomalyScore'] = model.decision_function(combined_features_scaled)

# Display the results
print(df[['LogText', 'AnomalyScore']])

                                LogText  AnomalyScore
Timestamp                                            
2023-01-01 12:00:00    Normal operation      0.018858
2023-01-01 13:00:00  Connection timeout     -0.000896
2023-01-01 14:00:00    Unexpected error      0.003583


In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Ins