In [1]:
#!pip install scikit-learn

In [32]:
# importing necessary libraries
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Function to split file name into components - operating system, name, and version
def split_file_name(file_name):
    split_name = file_name.split('_')
    version = split_name[1]
    split_name = split_name[0].split('-')
    os = split_name[0]
    name = split_name[1]
    return os, name, version

# Load and prepare training data
file_dir = "../data/dataset_prepped/training"
corpus = []
file_names = []

# iterate over files in training directory
for file in os.listdir(file_dir):
    if file.endswith('.txt'):
        file_path = os.path.join(file_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
                contents = f.read()
                corpus.append(contents)
                file_names.append(file[:-4])  # storing filename without .txt extension

# Create a TfidfVectorizer and transform the corpus
vectorizer = TfidfVectorizer(stop_words=None, token_pattern='.+') 
feature_matrix = vectorizer.fit_transform(corpus)

# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(feature_matrix, file_names) 

# Load and prepare testing data
test_dir = "../data/dataset_prepped/testing"
test_corpus = []
test_file_names = []

# iterate over files in testing directory
for file in os.listdir(test_dir):
    if file.endswith('.txt'):
        file_path = os.path.join(test_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            contents = f.read()
            test_corpus.append(contents)
            test_file_names.append(file[:-4])  # storing filename without .txt extension

# Transform the test corpus using the trained vectorizer
feature_matrix = vectorizer.transform(test_corpus)
predictions = clf.predict(feature_matrix)  # predict with trained classifier

# Compare predictions with actual data
df_rows = []
for i, pred in enumerate(predictions):
    if len(pred) == 0:
        pred_name = 'no match found'
    else:
        pred_name = pred

    test_os, test_name, test_version = split_file_name(test_file_names[i])
    pred_os, pred_name, pred_version = split_file_name(pred_name)
    
    # Store results in a dictionary
    df_rows.append({
        'Index': i+1,
        'Test File OS': test_os, 
        'Predicted File OS': pred_os,
        'OS Match': 'O' if test_os == pred_os else 'X',
        'Test File Name': test_name, 
        'Predicted File Name': pred_name, 
        'Name Match': 'O' if test_name == pred_name else 'X',
        'Test File Version': test_version,
        'Predicted File Version': pred_version,
        'Version Match': 'O' if test_version == pred_version else 'X'
    })

# convert the results to a pandas dataframe
df = pd.DataFrame(df_rows)

# Save the dataframe to a CSV file
df.to_csv('../out/test_prediction.csv', index=False)

# Print the dataframe to console
print(df.to_string(index=False))

# Count 'O's and 'X's in each match column
os_match_count = df['OS Match'].value_counts().reindex(['O', 'X'], fill_value=0).to_dict()
name_match_count = df['Name Match'].value_counts().reindex(['O', 'X'], fill_value=0).to_dict()
version_match_count = df['Version Match'].value_counts().reindex(['O', 'X'], fill_value=0).to_dict()

print("\nOS Match:\n", os_match_count)
print("\nName Match:\n", name_match_count)
print("\nVersion Match:\n", version_match_count)



 Index Test File OS Predicted File OS OS Match    Test File Name Predicted File Name Name Match       Test File Version Predicted File Version Version Match
     1          deb               deb        O               dns                 dns          O             samba-4.9.5         samba-4.18.1-1             X
     2           ar                ar        O           libsort             libsort          O              gedit-3.38           gedit-44.2-1             X
     3           ar               deb        X          winbindd            winbindd          O             samba-4.9.5         samba-4.18.1-1             X
     4           ar                ar        O        libdocinfo          libdocinfo          O              gedit-3.38           gedit-44.2-1             X
     5          deb               deb        O          winbindd            winbindd          O             samba-4.9.5         samba-4.18.1-1             X
     6          deb               deb        O            

In [28]:
#Validation file
#make into txt
input_file = '../data/validation_file.so'
output_file = '../data/validation_file.txt'
output_prediction = '../out/predicted_library.txt'

# Run the 'strings' command on the input file and capture the output
output = subprocess.check_output(['strings', input_file]).decode('utf-8')

# Save the output as the predicted library in the specified output file
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output)

# Read the contents of the document
with open(output_prediction, 'r', encoding='utf-8') as f:
    val_contents = f.read()

val_feature_matrix = vectorizer.transform([val_contents])
prediction = clf.predict(val_feature_matrix)


# Print or store the prediction result
if len(prediction) == 0:
    pred_name = 'no match found'
else:
    pred_name = prediction[0]
    print(f"The input file is most closely related to the training library {pred_name}")
      
    stream = os.popen(f"echo {pred_name} | awk -F'[-_]+' '{{print $2}}' ")
    output =stream.read()
    
    if not os.path.exists(output_prediction):
        with open(output_prediction, 'w', encoding='utf-8'):
            pass  # Creates an empty file

    with open(output_prediction, 'w', encoding='utf-8') as f:
        f.write(output)

print(f"Now searching NVD for the predicted name {output}")
stream = os.popen('python3 ../util/parse_nvd.py')
output = stream.read()
print(output)

The input file is most closely related to the training library ar-dcerpc_samba-4.18.1-1
Now searching NVD for the predicted name dcerpc

            CVE ID  ... Base Score
0    CVE-2003-0428  ...        5.0
1    CVE-2003-0715  ...       10.0
2    CVE-2005-2361  ...        5.0
3    CVE-2006-1939  ...        5.0
4    CVE-2007-1748  ...       10.0
5    CVE-2009-3550  ...        4.3
6    CVE-2012-4661  ...        9.0
7    CVE-2012-4662  ...        7.1
8    CVE-2012-4663  ...        7.1
9    CVE-2013-4408  ...        8.3
10   CVE-2015-6423  ...        3.5
11   CVE-2016-2118  ...        6.8
12   CVE-2016-2115  ...        4.3
13   CVE-2016-0907  ...        4.3
14   CVE-2016-5350  ...        4.3
15   CVE-2016-9373  ...        4.3
16   CVE-2017-9766  ...        5.0
17  CVE-2017-13766  ...        5.0
18  CVE-2019-10903  ...        5.0

[19 rows x 4 columns]
For more details please refer to the csv file.

