In [1]:
import csv
import json
import os


# Every JSONL source feeding the analysis: the 300-sample pilot file first,
# followed by results1.jsonl .. results6.jsonl from the numbered folders.
base_path = "Full Samples"
jsonl_paths = ["Sample 300/results.jsonl"]
jsonl_paths += [f"{base_path}/folder_{i}/results{i}.jsonl" for i in range(1, 7)]

# One decoded JSON object per input line, accumulated in file order.
data = []
for jsonl_path in jsonl_paths:
    with open(jsonl_path, "r") as file:
        data.extend([json.loads(line) for line in file])
    

In [2]:
filename = 'not_scanned_by_kaspersky.txt'
kaspersky_results, not_scanned_files = [], []

# Split the loaded entries into "has a Kaspersky verdict" and "no Kaspersky section".
for entry in data:
    try:
        engine_results = entry['result']['data']['attributes']['results']
        verdict = engine_results['Kaspersky']['result']
    except KeyError:
        # Some key along the path is missing: report the entry and remember its id.
        missing_id = entry['analysis_id']
        print(f"Kaspersky result not found for analysis_id: {missing_id}.")
        not_scanned_files.append(missing_id)
    else:
        kaspersky_results.append(verdict)

# Persist the ids without a Kaspersky verdict, one per line.
with open(filename, 'w') as f:
    f.writelines(f"{analysis_id}\n" for analysis_id in not_scanned_files)

# Report how many entries had no Kaspersky verdict.
print(f"Number of files not scanned by Kaspersky: {len(not_scanned_files)}")


Kaspersky result not found for analysis_id: YWM1OWI4MTkxOTNhOTY0YjRiMjZhZWI1Mzk2N2ViNmU6MTY5MjIwMzI5NA==.
Kaspersky result not found for analysis_id: ZGJiNDc2NmNkZDlmNGE3NTU4NzJkMjQ4ZjM5NmVmMjk6MTY5MjIwMzI5NQ==.
Kaspersky result not found for analysis_id: Y2QyMDA3ZTNmYTIwNWQ0ZGMwYjI0ZTVkNTc3Y2QwYjA6MTY5MjIwMzI5NQ==.
Kaspersky result not found for analysis_id: NDg3YjI1Mzc1MjFiYjkzMjU4OTg0NDdjNWQ4NWUxZTk6MTY5MjIwNTg4Nw==.
Kaspersky result not found for analysis_id: YzI3ZmUxMTllOTM2YzU0ZDMzZGE0NWJmYTljOTllMmM6MTY5MjIwNTg4Nw==.
Kaspersky result not found for analysis_id: MGE1MTA2OGQ2ZDAzZmNhMjIxM2I2N2Q2YmJjOTdmYjM6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: NDk0ODBmMjgzNjg2N2JkNmIyNmU1MTJmYTNmZjMwNjk6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: YzExMTliYTc5MTM3M2IxOTQ3NDMxZmVjYzk2MDI3YWY6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: YTU1Y2ViYTNiYTNmZmQ3ZWEyNWZkYzlhNDAzODMyZWU6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: MT

In [3]:
# Tally how often each distinct Kaspersky verdict occurs
from collections import Counter

result_counts = Counter(kaspersky_results)

# most_common() with no argument returns every (verdict, count) pair,
# highest count first; ties keep their first-seen order.
sorted_counts = result_counts.most_common()

# Print one "verdict: count" line per category, most frequent first
print("\nResult Counts:")
for verdict, occurrences in sorted_counts:
    print(f"{verdict}: {occurrences}")


Result Counts:
None: 7186
HEUR:Trojan.Script.Generic: 2554
not-a-virus:HEUR:RiskTool.HTML.Miner.gen: 1002
HEUR:Trojan.JS.Miner.gen: 268
HEUR:Hoax.HTML.Phish.gen: 259
HEUR:Trojan.HTML.Phish.gen: 210
Trojan-Downloader.JS.Iframe.dfw: 168
Trojan-Downloader.HTML.JScript.dj: 157
Trojan-Dropper.VBS.Agent.bp: 89
Trojan-Downloader.JS.Iframe.cqo: 42
Trojan-Downloader.JS.Agent.hbs: 33
Trojan.JS.Redirector.afa: 31
HEUR:Trojan.JS.Infect.gen: 30
not-a-virus:HEUR:AdWare.Script.Generic: 29
Trojan.JS.FBook.bk: 21
Trojan-Downloader.JS.Iframe.cvp: 20
Trojan.JS.AdInject.a: 15
Trojan.JS.HideLink.a: 12
HEUR:Trojan.Script.Miner.gen: 11
Hoax.HTML.Phish.zf: 10
Trojan.HTML.Redirector.cv: 9
Trojan.JS.Iframe.ku: 9
Trojan.JS.FBook.av: 8
HEUR:Trojan-Downloader.Script.SLoad.gen: 6
HEUR:Trojan.Script.Iframer: 5
Trojan-Downloader.HTML.JScript.dr: 5
HEUR:Trojan-PSW.Script.Generic: 5
Trojan.HTML.Agent.si: 5
HEUR:Trojan.Script.Agent.gen: 5
HEUR:Trojan-Downloader.Script.Generic: 4
Trojan-Clicker.HTML.IFrame.afm: 4
UDS:Da

In [4]:
# filepath + analysisid + type

import json

def process_folder(folder_number, base_path="Full Samples"):
    """Extract the Kaspersky verdicts of one sample folder into a CSV file.

    Reads ``<base_path>/folder_<n>/analysis_ids.txt`` (lines of the form
    ``<filepath>: <analysis_id>``) and ``<base_path>/folder_<n>/results<n>.jsonl``
    (one JSON object per line), then writes
    ``<base_path>/folder_<n>/output_results<n>.txt`` with one
    ``filepath,analysis_id,result`` row per entry that has a Kaspersky result.

    Args:
        folder_number: Number substituted into the folder and file names.
        base_path: Directory that contains the ``folder_<n>`` sub-directories.
            Defaults to "Full Samples", the location that used to be hard-coded.

    Returns:
        None. Entries without a Kaspersky result are reported on stdout and
        left out of the output file.

    Raises:
        ValueError: If a non-blank line of analysis_ids.txt contains no ": ".
    """
    folder = f'{base_path}/folder_{folder_number}'

    # Step 1: Read analysis_ids.txt and create a dictionary mapping analysis_id to filepath
    analysis_id_to_filepath = {}
    with open(f'{folder}/analysis_ids.txt', 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of failing to unpack.
                continue
            # Split on the LAST ": " so a filepath that itself contains ": " stays intact.
            filepath, analysis_id = line.rsplit(": ", 1)
            analysis_id_to_filepath[analysis_id] = filepath

    # Step 2: Read results jsonl and extract results
    kaspersky_results = []
    with open(f'{folder}/results{folder_number}.jsonl', 'r') as file:
        for line in file:
            if not line.strip():
                # json.loads would raise on an empty line; skip it.
                continue
            entry = json.loads(line)
            try:
                result = entry['result']['data']['attributes']['results']['Kaspersky']['result']
            except KeyError:
                print(f"Kaspersky result not found for analysis_id: {entry['analysis_id']}.")
                continue
            filepath = analysis_id_to_filepath.get(entry['analysis_id'], "UNKNOWN_FILEPATH")
            kaspersky_results.append((filepath, entry['analysis_id'], result))

    # Step 3: Write results to a new file
    # csv.writer quotes any field containing a comma, quote or newline, so the
    # file stays parseable by a CSV reader; fields without such characters are
    # written exactly as the plain "a,b,c" lines were before.
    with open(f'{folder}/output_results{folder_number}.txt', 'w', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        for filepath, analysis_id, result in kaspersky_results:
            # str() keeps a null verdict as the literal text "None", as before
            # (csv.writer on its own would emit an empty field for None).
            writer.writerow([filepath, analysis_id, str(result)])

# Run the extraction once per numbered folder: folder_1 .. folder_<number_of_folders>
number_of_folders = 6  # Set this to the total number of folders you have
for folder_number in range(1, number_of_folders + 1):
    process_folder(folder_number)


Kaspersky result not found for analysis_id: YWM1OWI4MTkxOTNhOTY0YjRiMjZhZWI1Mzk2N2ViNmU6MTY5MjIwMzI5NA==.
Kaspersky result not found for analysis_id: ZGJiNDc2NmNkZDlmNGE3NTU4NzJkMjQ4ZjM5NmVmMjk6MTY5MjIwMzI5NQ==.
Kaspersky result not found for analysis_id: Y2QyMDA3ZTNmYTIwNWQ0ZGMwYjI0ZTVkNTc3Y2QwYjA6MTY5MjIwMzI5NQ==.
Kaspersky result not found for analysis_id: NDg3YjI1Mzc1MjFiYjkzMjU4OTg0NDdjNWQ4NWUxZTk6MTY5MjIwNTg4Nw==.
Kaspersky result not found for analysis_id: YzI3ZmUxMTllOTM2YzU0ZDMzZGE0NWJmYTljOTllMmM6MTY5MjIwNTg4Nw==.
Kaspersky result not found for analysis_id: MGE1MTA2OGQ2ZDAzZmNhMjIxM2I2N2Q2YmJjOTdmYjM6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: NDk0ODBmMjgzNjg2N2JkNmIyNmU1MTJmYTNmZjMwNjk6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: YzExMTliYTc5MTM3M2IxOTQ3NDMxZmVjYzk2MDI3YWY6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: YTU1Y2ViYTNiYTNmZmQ3ZWEyNWZkYzlhNDAzODMyZWU6MTY5MjIwNTg4NA==.
Kaspersky result not found for analysis_id: MT

In [5]:
import pandas as pd

# Locations of the six per-folder result files
filepaths = [f"Full Samples/folder_{i}/output_results{i}.txt" for i in range(1, 7)]

# Load each header-less file into its own DataFrame
dfs = []
for results_path in filepaths:
    dfs.append(pd.read_csv(results_path, header=None))

# Stack the frames into one table with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Name the three positional columns
combined_df.columns = ['filepath', 'analysis_id', 'result']

# Write the combined table next to the folders, without the index column
combined_df.to_csv('Full Samples/combined_output_results.csv', index=False)


In [7]:
# Keep only rows that carry an actual verdict: 'result' is neither NaN nor the text 'None'
has_verdict = combined_df['result'].notna() & combined_df['result'].ne('None')
filtered_df = combined_df.loc[has_verdict]


AttributeError: 'DataFrame' object has no attribute 'summary'

In [8]:
# Summary of the filtered rows; as the cell's last expression it is rendered as the output table.
filtered_df.describe()

Unnamed: 0,filepath,analysis_id,result
count,4968,4968,4968
unique,4968,4968,87
top,/Users/sq/Downloads/VirusShare/Sorted_VS_Divid...,MjNkYjUwNDYyNTAwM2Y3M2UwYzQ4NmYyYWZhODg5MjA6MT...,HEUR:Trojan.Script.Generic
freq,1,1,2484


In [9]:
# Save the filtered table (not the full combined one) to its own CSV, without the index column
filtered_df.to_csv('Full Samples/filtered_output_results.csv', index=False)