In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MeanShift
import numpy as np

In [25]:
# Load the merged log file. Forward slashes are used because the original
# "data\m..." contained an invalid escape sequence ("\m") and only resolved on
# Windows; "/" works on Windows and POSIX alike.
logs = pd.read_csv("data/merged_logs_small.csv")
# Keep only the "Log" column of the first 500 rows as a small test sample.
df = logs[["Log"]].iloc[0:500]

Unnamed: 0,Log
0,081109 203615 148 INFO dfs.DataNode$PacketResp...
1,081109 203807 222 INFO dfs.DataNode$PacketResp...
2,081109 204005 35 INFO dfs.FSNamesystem: BLOCK*...
3,081109 204015 308 INFO dfs.DataNode$PacketResp...
4,081109 204106 329 INFO dfs.DataNode$PacketResp...
...,...
495,081110 103819 19 INFO dfs.FSDataset: Deleting ...
496,081110 103821 19 INFO dfs.FSDataset: Deleting ...
497,081110 103827 26 INFO dfs.FSNamesystem: BLOCK*...
498,081110 103840 19 INFO dfs.FSDataset: Deleting ...


In [26]:
def generate_logs(df, tfidf_matrix, mean_shift, labels, num_representative=3):
    """Pick representative ``[template, log]`` pairs for every cluster.

    For each distinct label, the member rows are ranked by Euclidean distance
    between their feature vector and the cluster center. Up to
    ``num_representative`` rows are then taken at evenly spaced ranks,
    starting with the closest row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'cluster'``, ``'Log'`` and
        ``'LogTemplate'``. Rows are matched to ``tfidf_matrix`` by position,
        so the index labels of ``df`` do not matter.
    tfidf_matrix : scipy sparse matrix or numpy.ndarray
        One feature row per row of ``df``, in the same order.
    mean_shift : object
        Fitted clusterer exposing ``cluster_centers_`` indexable by label.
    labels : array-like of int
        Cluster labels; only the distinct values are used.
    num_representative : int, default 3
        Maximum number of rows returned per cluster. Values ``<= 0`` yield an
        empty result.

    Returns
    -------
    list of list
        ``[template, log]`` pairs, grouped by ascending cluster label.
    """
    # Nothing requested; also avoids dividing by zero when computing the step.
    if num_representative <= 0:
        return []

    representative_logs = []
    row_clusters = df['cluster'].to_numpy()
    for cluster in np.unique(labels):
        # Negative labels have no center; indexing with them would silently
        # wrap around to the last cluster center.
        if cluster < 0:
            continue

        # Positional row numbers, so a non-default DataFrame index cannot
        # select the wrong rows of the feature matrix.
        positions = np.flatnonzero(row_clusters == cluster)
        if positions.size == 0:
            continue

        cluster_center = mean_shift.cluster_centers_[cluster]
        cluster_vectors = tfidf_matrix[positions]
        if hasattr(cluster_vectors, 'toarray'):
            cluster_vectors = cluster_vectors.toarray()

        # distance of each log to its cluster center
        distances = np.linalg.norm(np.asarray(cluster_vectors) - cluster_center, axis=1)

        # member ranks ordered from closest to farthest
        closest_indices = np.argsort(distances)
        # evenly spaced ranks, beginning with the closest member
        step = max(1, len(closest_indices) // num_representative)
        selected_positions = positions[closest_indices[::step][:num_representative]]

        for pos in selected_positions:
            row = df.iloc[pos]
            representative_logs.append([row['LogTemplate'], row['Log']])

    return representative_logs

In [27]:
def get_representative_logs(input_path, num_representative=3):
    """Cluster the logs in a CSV file and return representative entries.

    The file at ``input_path`` is read with pandas, the ``'Log'`` column is
    turned into TF-IDF vectors, and MeanShift is fitted on the densified
    matrix. The resulting labels are stored in a ``'cluster'`` column and the
    selection itself is delegated to ``generate_logs``.

    Parameters
    ----------
    input_path : str or path-like
        CSV file to read; it must provide a ``'Log'`` column.
    num_representative : int, default 3
        Forwarded unchanged to ``generate_logs``.

    Returns
    -------
    Whatever ``generate_logs`` returns for the clustered frame.
    """
    frame = pd.read_csv(input_path)

    # TF-IDF features, one row per log line
    features = TfidfVectorizer().fit_transform(frame['Log'])

    # mean shift runs on the dense form of the matrix
    clusterer = MeanShift()
    clusterer.fit(features.toarray())
    cluster_ids = clusterer.labels_

    # attach the cluster label of every row to the frame
    frame['cluster'] = cluster_ids

    return generate_logs(frame, features, clusterer, cluster_ids, num_representative)

In [28]:
# Forward slashes keep the path free of backslash escapes (the original "\T"
# was an invalid escape sequence) and make it resolve on Windows and POSIX.
a = get_representative_logs("data/TESTparsed_0_generated_logs_COPY.csv", 3)
a

[['BLOCK* ask <*>:<*> to replicate blk_<*> to datanode(s) <*>:<*>',
  '<B>081111 080934 19 INFO dfs.FSNamesystem: BLOCK* ask 10.250.11.85:50010 to replicate blk_2377150260128098806 to datanode(s) 10.251.203.80:50010<E>'],
 ['<*>:<*>:Got exception while serving blk_<*> to /<*>:',
  '<B>081109 214043 2561 WARN dfs.DataNode$DataXceiver: 10.250.11.85:50010:Got exception while serving blk_509586258217225674 to /10.251.203.80:50010<E>'],
 ['Received block blk_<*> of size <*> from /<*>',
  '<B>081109 204722 567 INFO dfs.DataNode$PacketResponder: Received block blk_1234567890123456789 of size 9876543210987654321 from /10.251.203.80 <E>']]