In [None]:
# importing necessary libraries
import os
import pandas as pd
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Function to split file name into components - operating system, name, and version
def split_file_name(file_name):
    """Split a file name shaped like ``<os>-<name>_<version>`` into its parts.

    The name is cut on underscores first: the second underscore-separated
    field is the version. The first field is then cut on hyphens, giving the
    operating system (first piece) and the name (second piece). Any further
    underscore or hyphen fields are ignored.

    Args:
        file_name: File name without its extension.

    Returns:
        A tuple ``(os, name, version)`` of strings.

    Raises:
        IndexError: If ``file_name`` has no underscore, or the part before
            the first underscore has no hyphen.
    """
    underscore_fields = file_name.split('_')
    version = underscore_fields[1]
    # Everything before the first underscore holds "<os>-<name>".
    dash_fields = underscore_fields[0].split('-')
    platform, package = dash_fields[0], dash_fields[1]
    return platform, package, version

# Load and prepare training data
# The training and testing data have already been cleaned before use.
file_dir = "../data/dataset_prepped/training"
corpus = []      # raw text of every training file
file_names = []  # matching file names without the .txt extension

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the training samples stable from run to run.
for file in sorted(os.listdir(file_dir)):
    if not file.endswith('.txt'):
        continue  # only plain-text files belong to the data set
    with open(os.path.join(file_dir, file), 'r', encoding='utf-8') as f:
        corpus.append(f.read())
    # The handle is already closed here; drop the 4-character ".txt" suffix.
    file_names.append(file[:-4])

# Build TF-IDF features from the training corpus.
# NOTE(review): the token pattern '.+' cannot match across a newline, so each
# line of a file presumably becomes a single token - confirm this is intended.
vectorizer = TfidfVectorizer(token_pattern='.+')
feature_matrix = vectorizer.fit_transform(raw_documents=corpus)

# Fit a 100-tree random forest; each training file's name serves as its label.
clf = RandomForestClassifier(n_estimators=100).fit(X=feature_matrix, y=file_names)


# Load and prepare testing data
test_dir = "../data/dataset_prepped/testing"
test_corpus = []      # raw text of every test file
test_file_names = []  # matching file names without the .txt extension

# os.listdir returns entries in arbitrary order; sorting keeps the test files
# (and therefore the rows of the test matrix) in a stable order.
for file in sorted(os.listdir(test_dir)):
    if not file.endswith('.txt'):
        continue  # only plain-text files belong to the data set
    with open(os.path.join(test_dir, file), 'r', encoding='utf-8') as f:
        test_corpus.append(f.read())
    # The handle is already closed here; drop the 4-character ".txt" suffix.
    test_file_names.append(file[:-4])

# Transform the test corpus using the trained vectorizer
# NOTE: this rebinds feature_matrix, so from here on it holds the test
# features (row i belongs to test_file_names[i]), not the training features.
feature_matrix = vectorizer.transform(test_corpus)

# Get the top 100 predictions for each test file
top_k = 100  # number of highest-probability classes kept per test file

# Score all test rows with a single batched call instead of one call per row.
# The guard keeps an empty test set a no-op, as the per-row loop was.
all_scores = clf.predict_proba(feature_matrix) if test_file_names else []

top_predictions = []
for prediction_scores in all_scores:
    # argsort is ascending, so reverse it to put the most probable class first
    sorted_indices = prediction_scores.argsort()[::-1][:top_k]
    # Map the column indices back to class labels
    top_predictions.append([clf.classes_[index] for index in sorted_indices])

# Report the ranked predictions, one line per test file
for file_name, predictions in zip(test_file_names, top_predictions):
    print(f"Top 100 predictions for {file_name}: {predictions}")
