In [1]:
import json
from io import StringIO

import pandas as pd

In [2]:
# Open VT reports and get SHA256 and first_submission_date values for each json (line).
def extract_first_submission_dates(file_path) -> pd.DataFrame:
    malwares_first_sub_date = []
    with open(file_path, "r") as reports:
        sha256_key, first_sub_date_key = "sha256", "first_submission_date"
        # Iterate through all reports
        for report in reports:
            df_report = pd.read_json(StringIO(report))["data"]["attributes"]
            sha256, first_sub_date = (
                df_report[sha256_key],
                df_report[first_sub_date_key],
            )
            malwares_first_sub_date.append((sha256, first_sub_date))

    return pd.DataFrame(
        malwares_first_sub_date, columns=[sha256_key, first_sub_date_key]
    )

In [4]:
# Location of the VT reports file (one JSON report per line), relative to the
# notebook's working directory.
reports_path = "../../vt_reports/vt_reports67k.jsons"
# Build the sha256 / first_submission_date table from the reports.
df_malwares_first_sub_date = extract_first_submission_dates(file_path=reports_path)

# Preview the first rows as a sanity check.
df_malwares_first_sub_date.head()

Unnamed: 0,sha256,first_submission_date
0,98f8e26e12b978102fa39c197f300ebe5fe535617737d5...,1630575593
1,7b2999ffadbc3b5b5c5e94145ca4e2f8de66ac1e3ddd52...,1629375559
2,e7569d494fe00be04ef6c9fcc5e54720c0df623b08e79d...,1362057319
3,1ed60c04f572b6acb9f64c31db55ef5c6b5465bd4da1eb...,1630624233
4,4c4aaff20a57213d9a786e56ad22f1eaa94694a2f1042b...,1592186154


In [None]:
def extract_malware_family(file_path) -> pd.DataFrame:
    """Load the SHA256 -> family labels from a CSV file.

    Only the ``SHA256`` and ``FAMILY`` columns are read; every other column
    in the file is ignored.

    Parameters
    ----------
    file_path : str, path-like or file-like
        CSV source handed to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The two selected columns with their names lower-cased
        (``sha256`` and ``family``).
    """
    wanted_columns = ["SHA256", "FAMILY"]
    families = pd.read_csv(file_path, usecols=wanted_columns)
    # Lower-case the headers so "sha256" can serve as a join key downstream.
    return families.rename(columns=str.lower)

In [None]:
# CSV providing the SHA256 and FAMILY columns for the samples, relative to the
# notebook's working directory.
malware_families_path = "../../vt_reports/siggregator_all_samples_no_fuzzy_hash.csv"
# Load the sha256 / family table.
df_malware_families = extract_malware_family(file_path=malware_families_path)
# Preview the first rows as a sanity check.
df_malware_families.head()

In [None]:
# Join the first-submission dates with the family labels on the shared
# "sha256" column. pd.merge defaults to an inner join, so a hash that is
# missing from either frame does not appear in the result.
df = pd.merge(left=df_malwares_first_sub_date, right=df_malware_families, on="sha256")
df.head()

In [None]:
# Persist the merged table; index=False keeps the row index out of the CSV.
df.to_csv("../../vt_reports/merge.csv", index=False)