In [1]:
# importing necessary libraries
import os
import pandas as pd
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Function to split file name into components - operating system, name, and version
def split_file_name(file_name):
    split_name = file_name.split('_')
    version = split_name[1]
    split_name = split_name[0].split('-')
    os = split_name[0]
    name = split_name[1]
    return os, name, version

# Load and prepare training data
# training and testing data has been cleaned before usage 
file_dir = "../data/training"
corpus = []
file_names = []

# iterate over files in training directory
for file in os.listdir(file_dir):
    if file.endswith('.txt'):
        file_path = os.path.join(file_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            contents = f.read()
            corpus.append(contents)
            file_names.append(file[:-4])  # storing filename without .txt extension

# Create a TfidfVectorizer and transform the corpus
vectorizer = TfidfVectorizer(token_pattern='.+', max_df=0.7)

feature_matrix = vectorizer.fit_transform(corpus)


# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(feature_matrix, file_names) 




RandomForestClassifier()

In [2]:
# Load and prepare testing data
test_dir = "../data/testing"
test_corpus = []
test_file_names = []

# Iterate over files in testing directory
for file in os.listdir(test_dir):
    if file.endswith('.txt'):
        file_path = os.path.join(test_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            contents = f.read()
            test_corpus.append(contents)
            test_file_names.append(file[:-4])  # storing filename without .txt extension

# Transform the test corpus using the trained vectorizer
feature_matrix = vectorizer.transform(test_corpus)

# Define the threshold for prediction
threshold = 0.07

# Initialize a list to store the results for each test file
results = []

# Get the predictions for each test file
for i in range(len(test_file_names)):
    file_name = test_file_names[i]
    prediction_scores = clf.predict_proba(feature_matrix[i])[0]  # Get the prediction probabilities
    predictions = [(clf.classes_[j], score) for j, score in enumerate(prediction_scores) if score >= threshold]
    predictions.sort(key=lambda x: x[1], reverse=True)  # Sort predictions by probability in descending order
    results.append((file_name, predictions))

# Print the results in table format for each test file
for file_name, predictions in results:
    df = pd.DataFrame(predictions, columns=['Prediction', 'Probability'])
    print(f"Results for {file_name}:")
    print(df)
    print()


Results for wireshark:
                  Prediction  Probability
0            libexpat.so.1.6         0.20
1         libdvdcss.so.2.2.0         0.18
2                    liblzo2         0.16
3      libgnutls-xssl.so.0.0         0.14
4  libgnutls-openssl.so.13.3         0.07
5                    liblzma         0.07

Results for vlc:
                  Prediction  Probability
0            libexpat.so.1.6         0.26
1         libdvdcss.so.2.2.0         0.19
2                    liblzo2         0.13
3      libgnutls-xssl.so.0.0         0.12
4  libgnutls-openssl.so.13.3         0.07
5        libpng16.so.16.37.0         0.07

Results for openttd:
                  Prediction  Probability
0            libexpat.so.1.6         0.21
1         libdvdcss.so.2.2.0         0.16
2      libgnutls-xssl.so.0.0         0.15
3                    liblzo2         0.15
4  libgnutls-openssl.so.13.3         0.08

Results for gimp-2.10:
                  Prediction  Probability
0            libexpat.so.1.6   

In [5]:
folder_path = '../data/target_file/'
output_file = f'../out/{target_file_name}.txt'
output_prediction = f'../out/{target_file_name}_predictions.txt'
threshold = 0.05  # Set your desired threshold value

# Get the list of files in the folder
files = os.listdir(folder_path)

# Select the first file in the folder
if files:
    target_file_name = files[0]
    input_file = os.path.join(folder_path, files[0])
else:
    print("../data/input_folder/ is empty. Please add a file to apply LIVUS to.")
    exit(1)

# Run the 'strings' command on the input file and capture the output
output = subprocess.check_output(['strings', input_file]).decode('utf-8')

# Save the output as the predicted library in the specified output file
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output)

# Read the contents of the document
with open(output_file, 'r', encoding='utf-8') as f:
    val_contents = f.read()

val_feature_matrix = vectorizer.transform([val_contents])
prediction_scores = clf.predict_proba(val_feature_matrix)[0]  # Get the prediction probabilities

# Get predictions above the threshold
predictions = [(clf.classes_[i], score) for i, score in enumerate(prediction_scores) if score >= threshold]
predictions.sort(key=lambda x: x[1], reverse=True)  # Sort predictions by probability in descending order

# Print the results in table format and write predictions to output prediction file
df = pd.DataFrame(predictions, columns=['Prediction', 'Probability'])
print(f"Results for {input_file}:")
print(df)

# Clear names after printing and write predictions to output prediction file
cleaned_predictions = [prediction[0].split('.')[0] for prediction in predictions]
with open(output_prediction, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_predictions))

Results for ../data/target_file/openttd:
                  Prediction  Probability
0            libexpat.so.1.6         0.21
1         libdvdcss.so.2.2.0         0.16
2      libgnutls-xssl.so.0.0         0.15
3                    liblzo2         0.15
4  libgnutls-openssl.so.13.3         0.08
5                    liblzma         0.06
6        libpng16.so.16.37.0         0.05
7             libtiff.so.5.2         0.05
Now searching NVD for the predicted libraries...



Traceback (most recent call last):
  File "/home/user/Desktop/MA/LIVUS/src/../util/parse_nvd.py", line 110, in <module>
    main()
  File "/home/user/Desktop/MA/LIVUS/src/../util/parse_nvd.py", line 84, in main
    save_json_data(file_path, json_data)
  File "/home/user/Desktop/MA/LIVUS/src/../util/parse_nvd.py", line 28, in save_json_data
    if os.path.exists(file_path):
NameError: name 'os' is not defined


In [7]:
print(f"Now searching NVD for the predicted libraries...")
stream = os.popen('python3 ../util/parse_nvd.py')
output = stream.read()
print(output)

Now searching NVD for the predicted libraries...
Searching for libexpat...
Updated CVE file for libexpat
Processing the CVE data...
Saved to file
Searching for libdvdcss...
No vulnerabilities found for libdvdcss.
Searching for libgnutls-xssl...
No vulnerabilities found for libgnutls-xssl.

