In [1]:
import os
import json
import pandas as pd
from glob import glob
import numpy as np

In [2]:
def load_json_files(directory):
    """Load every ``*.json`` file in ``directory`` into one DataFrame.

    Each file may hold either a JSON array of records or a single JSON
    object, which is treated as one record.

    Args:
        directory: Folder to scan (non-recursive) for ``*.json`` files.

    Returns:
        A pandas DataFrame with one row per record, files concatenated in
        sorted filename order. An empty DataFrame if nothing matches.
    """
    all_data = []
    # glob() yields files in arbitrary, OS-dependent order. Sorting keeps the
    # row order (and the index labels that survive de-duplication) stable
    # from run to run.
    json_files = sorted(glob(os.path.join(directory, "*.json")))

    for filename in json_files:
        # Decode as UTF-8 explicitly instead of relying on the platform
        # default; "utf-8-sig" also skips a leading BOM if one is present.
        with open(filename, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        if isinstance(data, dict):
            # extend() on a dict would add its keys, not the record itself.
            all_data.append(data)
        else:
            all_data.extend(data)

    return pd.DataFrame(all_data)

In [3]:
# Read every snapshot in the folder and discard exact duplicate rows.
directory = r"registry_snapshots"
data = load_json_files(directory).drop_duplicates()
data.head()

Unnamed: 0,path,value_name,value_data,value_type
0,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,SecurityHealth,%windir%\system32\SecurityHealthSystray.exe,2
1,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,RtkAudUService,"""C:\WINDOWS\System32\RtkAudUService64.exe"" -ba...",1
2,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,AdobeAAMUpdater-1.0,"""C:\Program Files (x86)\Common Files\Adobe\OOB...",1
3,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,Common Administrative Tools,C:\ProgramData\Microsoft\Windows\Start Menu\Pr...,1
4,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,Common AppData,C:\ProgramData,1


In [4]:
# Length of each registry value, computed element by element with len().
data["value_data_len"] = data["value_data"].map(len)

In [5]:
from sklearn.ensemble import IsolationForest

# Isolation forest trained on the value length alone.
# contamination=0.1 sets the threshold so that about 10% of the training rows
# are labelled as outliers.
# random_state is pinned because the forest is built from random sub-samples
# and random splits; without a seed the flagged rows differ between runs.
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(data[["value_data_len"]])

In [6]:
# Label every row with the fitted forest: 1 for inliers, -1 for outliers.
data["anomaly"] = model.predict(data.loc[:, ["value_data_len"]])
data.head()

Unnamed: 0,path,value_name,value_data,value_type,value_data_len,anomaly
0,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,SecurityHealth,%windir%\system32\SecurityHealthSystray.exe,2,43,1
1,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,RtkAudUService,"""C:\WINDOWS\System32\RtkAudUService64.exe"" -ba...",1,54,1
2,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,AdobeAAMUpdater-1.0,"""C:\Program Files (x86)\Common Files\Adobe\OOB...",1,84,-1
3,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,Common Administrative Tools,C:\ProgramData\Microsoft\Windows\Start Menu\Pr...,1,73,1
4,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,Common AppData,C:\ProgramData,1,14,-1


In [7]:
# How many rows received each label.
data.loc[:, "anomaly"].value_counts()

anomaly
 1    88
-1    10
Name: count, dtype: int64

In [8]:
# Keep only the rows labelled -1 (outliers) and preview them.
anomalous_rows = data.loc[data["anomaly"].eq(-1)]
anomalous_rows.head()

Unnamed: 0,path,value_name,value_data,value_type,value_data_len,anomaly
2,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,AdobeAAMUpdater-1.0,"""C:\Program Files (x86)\Common Files\Adobe\OOB...",1,84,-1
4,HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\...,Common AppData,C:\ProgramData,1,14,-1
15,HKEY_CURRENT_USER\Software\Microsoft\Windows\C...,EpicGamesLauncher,"""C:\Program Files (x86)\Epic Games\Launcher\Po...",1,116,-1
17,HKEY_CURRENT_USER\Software\Microsoft\Windows\C...,GoogleDriveFS,C:\Program Files\Google\Drive File Stream\94.0...,1,83,-1
20,HKEY_CURRENT_USER\Software\Microsoft\Windows\C...,MicrosoftEdgeAutoLaunch_B8188322885C6DD24FAC5C...,"""C:\Program Files (x86)\Microsoft\Edge\Applica...",1,102,-1


In [9]:
# Plain Python list of the raw value strings, in row order.
text_data = data['value_data'].to_list()

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with scikit-learn's default settings: fit_transform() learns the
# vocabulary from text_data and returns the document-term matrix in one pass.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)

In [11]:
# Second forest, fitted on the TF-IDF matrix X instead of the value length.
# contamination='auto' lets scikit-learn choose the decision threshold rather
# than forcing a fixed share of rows to be flagged; the seed keeps the result
# repeatable.
iso_forest = IsolationForest(random_state=42, contamination='auto')
iso_forest.fit(X)

In [12]:
# predict() labels outliers as -1; comparing against -1 gives a boolean mask
# that is True for flagged rows.
anomalies = iso_forest.predict(X) == -1
print("Anomalies detected:", anomalies.sum())

Anomalies detected: 1


In [13]:
# Store the boolean mask in the "anomaly" column (replacing whatever it held
# before) and preview the rows it flags.
data["anomaly"] = anomalies
anomalous_rows = data.loc[data["anomaly"]]
anomalous_rows.head()

Unnamed: 0,path,value_name,value_data,value_type,value_data_len,anomaly
15,HKEY_CURRENT_USER\Software\Microsoft\Windows\C...,EpicGamesLauncher,"""C:\Program Files (x86)\Epic Games\Launcher\Po...",1,116,True
