In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

def train_model(file_dir="../data/training", random_state=None):
    """Fit a TF-IDF vectorizer and a random forest on the training text files.

    Every ``*.txt`` file found directly in ``file_dir`` becomes one training
    document; the file name without its ``.txt`` suffix is used as that
    document's class label.

    Args:
        file_dir: Directory holding the training ``.txt`` files.
        random_state: Forwarded to ``RandomForestClassifier``. Pass an int to
            make training reproducible; the default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        A ``(vectorizer, clf)`` tuple: the fitted ``TfidfVectorizer`` and the
        fitted ``RandomForestClassifier``.

    Raises:
        ValueError: If ``file_dir`` contains no ``.txt`` files.
    """
    corpus = []
    file_names = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the corpus
    # order (and therefore a seeded model) identical from run to run.
    for file in sorted(os.listdir(file_dir)):
        if file.endswith('.txt'):
            file_path = os.path.join(file_dir, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                corpus.append(f.read())
            file_names.append(file[:-4])  # strip the ".txt" suffix

    # Fail early with a clear message instead of an obscure error from the
    # vectorizer when there is nothing to fit on.
    if not corpus:
        raise ValueError(f"No .txt training files found in {file_dir!r}")

    # token_pattern='.+' matches a whole run of non-newline characters, so each
    # non-empty line of a file is one token rather than being split into words.
    # max_df=0.8 drops tokens that occur in more than 80% of the documents.
    vectorizer = TfidfVectorizer(token_pattern='.+', max_df=0.8)
    feature_matrix = vectorizer.fit_transform(corpus)

    # One class per training file: the labels are the file names.
    clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    clf.fit(feature_matrix, file_names)

    print("Model training complete.")
    return vectorizer, clf

def prepare_test_data(vectorizer, test_dir="../data/testing"):
    """Load the test text files and vectorize them with a fitted vectorizer.

    Args:
        vectorizer: Already-fitted object exposing ``transform(list_of_str)``,
            e.g. the vectorizer returned by ``train_model``.
        test_dir: Directory holding the ``.txt`` files to classify.

    Returns:
        A ``(test_file_names, feature_matrix)`` tuple. ``test_file_names`` is
        the list of file names without the ``.txt`` suffix, in sorted order;
        ``feature_matrix`` is whatever ``vectorizer.transform`` returns for the
        file contents, in the same order.
    """
    test_corpus = []
    test_file_names = []

    # os.listdir() yields entries in arbitrary order; sorting makes the order
    # of the returned names (and of every report built from them) stable.
    for file in sorted(os.listdir(test_dir)):
        if file.endswith('.txt'):
            file_path = os.path.join(test_dir, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                test_corpus.append(f.read())
            test_file_names.append(file[:-4])  # strip the ".txt" suffix

    # Only transform here: the vocabulary must stay the one learned in training.
    feature_matrix = vectorizer.transform(test_corpus)

    print("Test data preparation complete.")
    return test_file_names, feature_matrix

def highest_predictions(clf, test_file_names, feature_matrix):
    """Print and return the single predicted label for each test file.

    Args:
        clf: Fitted classifier exposing ``predict(feature_matrix)``.
        test_file_names: Names of the test files, aligned with the rows of
            ``feature_matrix``.
        feature_matrix: Vectorized test documents.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Test File Name'`` and
        ``'Prediction'``, one row per test file. (Previously the frame was
        built but ``None`` was returned, so callers could not use the result.)
    """
    predictions = clf.predict(feature_matrix)

    df = pd.DataFrame(
        list(zip(test_file_names, predictions)),
        columns=['Test File Name', 'Prediction'],
    )
    print("\nHighest predictions for each test file:")
    print(df)

    return df

def threshold_predictions(clf, test_file_names, feature_matrix, threshold,
                          output_dir='../out/exec'):
    """Report every class whose predicted probability reaches ``threshold``.

    For each test file the matching class labels are printed as a table and
    written, one per line, to ``<output_dir>/<file name>_predictions.txt``.
    The union of all matching labels, each cut at its first ``"."``, is
    written in sorted order to ``<output_dir>/predictions.txt``.

    Args:
        clf: Fitted classifier exposing ``predict_proba`` and ``classes_``
            (``classes_`` must support boolean-mask indexing, as a NumPy
            array does).
        test_file_names: Names of the test files, aligned with the rows of
            ``feature_matrix``.
        feature_matrix: Vectorized test documents.
        threshold: Minimum probability (inclusive) for a class to be reported.
        output_dir: Directory the prediction files are written to; it is
            created if it does not exist.

    Returns:
        None.
    """
    # Writing used to fail with FileNotFoundError when the directory was missing.
    os.makedirs(output_dir, exist_ok=True)

    threshold_predictions_list = []
    unique_predictions_set = set()  # labels with everything after the first "." removed

    # Score all test files in one call instead of one call per row. The model
    # is not invoked at all when there are no test files, as before.
    if len(test_file_names):
        all_scores = clf.predict_proba(feature_matrix)
    else:
        all_scores = []

    for file_name, prediction_scores in zip(test_file_names, all_scores):
        # Boolean mask over the class labels: keep those at or above threshold.
        above_threshold = clf.classes_[prediction_scores >= threshold]
        threshold_predictions_list.append((file_name, list(above_threshold)))

        # e.g. "libcurl.so.4.0" -> "libcurl"
        unique_predictions_set.update(pred.split(".")[0] for pred in above_threshold)

        output_file = os.path.join(output_dir, f"{file_name}_predictions.txt")
        with open(output_file, 'w') as f:
            for prediction in above_threshold:
                f.write(prediction + '\n')

    # Printing threshold predictions as separate tables for each test file
    for file_name, predictions in threshold_predictions_list:
        df = pd.DataFrame(predictions, columns=['Predictions'])
        print(f"\nPredictions for {file_name}:")
        print(df)

    # Sorted so the file content does not depend on set iteration order.
    with open(os.path.join(output_dir, 'predictions.txt'), 'w') as f:
        for prediction in sorted(unique_predictions_set):
            f.write(prediction + '\n')

    return None


def parse_NVD(script_path='../util/parse_nvd.py'):
    """Run the NVD parsing script as a child process and print its output.

    The script is started with the interpreter that is running this code
    (``sys.executable``) rather than whatever ``python3`` resolves to on PATH,
    so it sees the same installed packages. The child's stdout is printed
    after it finishes; anything it wrote to stderr and a non-zero exit status
    are reported as well instead of being silently dropped.

    Args:
        script_path: Path of the script to execute.

    Returns:
        None.
    """
    # Local imports keep this block self-contained.
    import subprocess
    import sys

    print("Now searching NVD for the predicted libraries...")
    # Argument list, no shell: the path is passed verbatim, and run() waits
    # for the child and closes its pipes (the os.popen stream was never closed).
    completed = subprocess.run(
        [sys.executable, script_path],
        capture_output=True,
        text=True,
    )
    print("Done!")
    print(completed.stdout)

    if completed.stderr:
        print(completed.stderr, file=sys.stderr)
    if completed.returncode != 0:
        print(f"{script_path} exited with status {completed.returncode}", file=sys.stderr)

In [4]:
def main():
    """Run the whole pipeline: train, vectorize the test set, predict, query NVD."""
    # Step 1: fit the vectorizer and the classifier on the training corpus.
    fitted_vectorizer, classifier = train_model()

    # Step 2: load the test files and vectorize them with the fitted vectorizer.
    names, features = prepare_test_data(fitted_vectorizer)

    # Step 3: report every library whose probability reaches the threshold.
    # (To get only the single top prediction per file, e.g. when using it on
    # SOs, call highest_predictions(classifier, names, features) instead.)
    threshold_predictions(classifier, names, features, threshold=0.07)

    # Step 4: parse the NVD. This is currently only possible for the
    # threshold predictions.
    parse_NVD()



In [5]:
# Entry-point guard: run the pipeline only when this code executes as the
# top-level module (`__name__` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Model training complete.
Test data preparation complete.

Predictions for wireshark:
           Predictions
0       libcurl.so.4.0
1      libexpat.so.1.6
2              liblzma
3  libpng16.so.16.37.0
4       libtiff.so.5.2

Predictions for vlc:
           Predictions
0       libcurl.so.4.0
1      libexpat.so.1.6
2              liblzma
3  libpng16.so.16.37.0
4       libtiff.so.5.2

Predictions for openttd:
            Predictions
0        libcurl.so.4.0
1       libexpat.so.1.6
2  libgnutls.so.30.26.0
3               liblzma
4   libpng16.so.16.37.0
5        libtiff.so.5.2

Predictions for gimp-2.10:
           Predictions
0       libcurl.so.4.0
1      libexpat.so.1.6
2              liblzma
3  libpng16.so.16.37.0
4       libtiff.so.5.2
Now searching NVD for the predicted libraries...
Done!
Searching for libexpat...
Processing the CVE data...
Saved to file
Searching for libgnutls...
Processing the CVE data...
Saved to file
Searching for libpng16...
Processing the CVE data...
Saved to file
