In [29]:
import json
# Create a feature column for each entry in the list of features
from tqdm import tqdm, trange
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

# Set up seeds
seed = 42
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
import random
random.seed(seed)


def read_jsonl_file(jsonl_path):
    with open(jsonl_path, 'r') as f:
        for line in f:
            yield json.loads(line)



diff_jsonl_path = 'artifacts/accunuated_featuree_importance_diff.jsonl'
diff_data = list(read_jsonl_file(diff_jsonl_path))

diff_df = pd.DataFrame(diff_data)

for i in trange(len(diff_df['logit_diff'][0])):
    diff_df[f'feature_{i}'] = diff_df['logit_diff'].apply(lambda x: x[i])

diff_df.drop(columns=['logit_diff', 'labels', 'model', 'plot_type'], inplace=True)

100%|██████████| 24576/24576 [00:32<00:00, 748.32it/s] 


In [30]:
train_indexes = np.random.choice(diff_df.index, int(len(diff_df) * 0.8), replace=False)
train_df_diff = diff_df.loc[train_indexes]

In [31]:
train_df_diff.head(2)

Unnamed: 0,index,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_24566,feature_24567,feature_24568,feature_24569,feature_24570,feature_24571,feature_24572,feature_24573,feature_24574,feature_24575
105,93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
vuln_jsonl_path = 'artifacts/accumulated_featuree_importance_vuln.jsonl'
safe_jsonl_path = 'artifacts/accumulated_featuree_importance_safe.jsonl'

vuln_data = list(read_jsonl_file(vuln_jsonl_path))
safe_data = list(read_jsonl_file(safe_jsonl_path))
vuln_df = pd.DataFrame(vuln_data)
safe_df = pd.DataFrame(safe_data)
vuln_df.drop(columns=['labels', "model", "plot_type"], inplace=True)
vuln_df['vuln'] = 1

safe_df.drop(columns=['labels', "model", "plot_type"], inplace=True)
safe_df['vuln'] = 0

for i in trange(len(vuln_df['logit_diff'][0])):
    vuln_df[f'feature_{i}'] = vuln_df['logit_diff'].apply(lambda x: x[i])
    safe_df[f'feature_{i}'] = safe_df['logit_diff'].apply(lambda x: x[i])

100%|██████████| 24576/24576 [01:33<00:00, 262.55it/s]


In [33]:
safe_df_train = safe_df.loc[train_indexes]
safe_df_test = safe_df.drop(train_indexes)

vuln_df_train = vuln_df.loc[train_indexes]
vuln_df_test = vuln_df.drop(train_indexes)

df_train = pd.concat([safe_df_train, vuln_df_train])
df_test  = pd.concat([safe_df_test, vuln_df_test])


df_train = df_train.sample(frac=1).reset_index(drop=True)
df_test = df_test.sample(frac=1).reset_index(drop=True)



In [34]:
df_train.head(1)

Unnamed: 0,index,logit_diff,vuln,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_24566,feature_24567,feature_24568,feature_24569,feature_24570,feature_24571,feature_24572,feature_24573,feature_24574,feature_24575
0,107,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:

df_train.drop(columns=['logit_diff'], inplace=True)
df_test.drop(columns=['logit_diff'], inplace=True)

In [36]:
print(df_train.shape)
print(df_test.shape)
diff_df.head(1)


(256, 24578)
(64, 24578)


Unnamed: 0,index,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_24566,feature_24567,feature_24568,feature_24569,feature_24570,feature_24571,feature_24572,feature_24573,feature_24574,feature_24575
0,0,0.0,0.0,0.0,0.0,0.0,0.033859,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.008174,0.0,0.002857,0.0


In [37]:

most_important_cols = train_df_diff.sum(axis=0).sort_values(ascending=False).index[1:101]

most_important_cols

Index(['feature_6843', 'feature_21821', 'feature_6111', 'feature_1352',
       'feature_22338', 'feature_18861', 'feature_6002', 'feature_19462',
       'feature_24401', 'feature_19337', 'feature_16488', 'feature_22351',
       'feature_7101', 'feature_14084', 'feature_17229', 'feature_9226',
       'feature_9954', 'feature_2122', 'feature_19388', 'feature_916',
       'feature_22779', 'feature_22622', 'feature_2631', 'feature_2333',
       'feature_14477', 'feature_1070', 'feature_11955', 'feature_6807',
       'feature_11786', 'feature_7343', 'feature_4556', 'feature_19133',
       'feature_11233', 'feature_15267', 'feature_20042', 'feature_14941',
       'feature_7261', 'feature_8716', 'feature_8382', 'feature_13303',
       'feature_2782', 'feature_9555', 'feature_12330', 'feature_11356',
       'feature_12184', 'feature_16054', 'feature_12558', 'feature_22328',
       'feature_4018', 'feature_2171', 'feature_15572', 'feature_15654',
       'feature_11722', 'feature_1698', 'feature

In [38]:
# Train a tree model on the training data
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train = df_train[most_important_cols]
y_train = df_train['vuln']

X_test = df_test[most_important_cols]
y_test = df_test['vuln']




In [39]:

clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
clf.fit(X_train, y_train)

clf.feature_importances_

import numpy as np
import matplotlib.pyplot as plt

importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{f+1}. feature {indices[f]} ({importances[indices[f]]})")

Feature ranking:
1. feature 0 (0.07573116812746128)
2. feature 1 (0.0674952771473302)
3. feature 9 (0.045584891010504607)
4. feature 59 (0.03927561005821213)
5. feature 70 (0.03831007290946142)
6. feature 3 (0.032007135785758103)
7. feature 51 (0.028334404615127522)
8. feature 31 (0.026358733666708843)
9. feature 56 (0.022744569959691423)
10. feature 23 (0.02070913590528861)
11. feature 89 (0.019983002277725273)
12. feature 6 (0.019065664513091902)
13. feature 97 (0.017045535274353502)
14. feature 48 (0.01668761493385539)
15. feature 61 (0.016589485876273954)
16. feature 2 (0.015954550929461662)
17. feature 53 (0.015606394003027962)
18. feature 38 (0.015214581639333384)
19. feature 54 (0.015168456775560554)
20. feature 78 (0.01492108721398288)
21. feature 88 (0.014570892362014472)
22. feature 5 (0.014043329805492352)
23. feature 69 (0.012822116615331012)
24. feature 52 (0.012644907675347549)
25. feature 82 (0.012541051508032516)
26. feature 25 (0.011997298965178375)
27. feature 18 (0.0

In [40]:
# USe logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print(f"Logistic Regression: {clf.score(X_test, y_test)}")

# Use a decision tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
print(f"Decision Tree: {clf.score(X_test, y_test)}")

# Use a SVM
from sklearn import svm
clf = svm.SVC().fit(X_train, y_train)
print(f"SVM: {clf.score(X_test, y_test)}")

# Use a KNN
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(f"KNN: {clf.score(X_test, y_test)}")


Logistic Regression: 0.625
Decision Tree: 0.671875
SVM: 0.59375
KNN: 0.59375


In [42]:
# jsonl_path = "artifacts/gbug-java.jsonl"
# # Convert to csv
# import json
# import pandas as pd
# from tqdm import tqdm

# data = list(read_jsonl_file(jsonl_path))

# df = pd.DataFrame(data)

# # rename the columns to be more readable
# #before_func_col: str = "func_before",
# #after_func_col: str = "func_after",
# df.rename(columns={"buggy_code":  "func_before", "fixed_code": "func_after"}, inplace=True)
# #Save the data to a csv file
# df.to_csv("gbug-java.csv", index=False)
