In [1]:
import pandas as pd
import re

In [2]:
APPLICATION_BASE_PATH = "I:/PhD/Model_Building/Stage_4_Building_Vulnerability_Dataset/7_Round_Qark"
unprocessed_csv = "Scanned_Files_Unprocessed_Dataset.csv"
processed_csv = unprocessed_csv.replace("Unprocessed","Processed")

In [3]:
def preprocess_comments_and_strings(code_line):
    
    processed_code_line = code_line

    encryption_hashing_pattern = "AES|aes|SHA-1|sha-1|SHA1|sha1|MD5|md5"
    ip_pattern = "\w*([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\w*"
    string_pattern = "\"[\w|\s|$|&|+|,|:|;|=|?|@|#|_|/|\-|\.|!|`|~|%|\^|\*|\(|\)|\'\\[|\]\{|\}]*\""

    # Checking for encryption related strings
    find_encryption = re.search(encryption_hashing_pattern, processed_code_line)

    # Checking for IP related strings
    find_ip = re.search(ip_pattern, processed_code_line)

    if (find_encryption is None) & (find_ip is None):
        # replacing all strings with dummy string
        processed_code_line = re.sub(string_pattern, "\"user_str\"", processed_code_line)
        
    # replacing comments with dummy comment
    #comment_pattern = "//.*|/\\*(?s:.*?)\\*/|(\"(?:(?<!\\\\)(?:\\\\\\\\)*\\\\\"|[^\r\n\"])*\")"
    comment_pattern = "//.*|/\\*(?s:.*?)\\*/|/\\*(.)*|(.)*\\*/"
    processed_code_line = re.sub(comment_pattern, "//user_comment",processed_code_line)

    return processed_code_line

In [4]:
unprocessed_dataset_df = pd.read_csv(APPLICATION_BASE_PATH+"/"+unprocessed_csv).fillna("")

In [5]:
unprocessed_dataset_df.head()

Unnamed: 0.1,Unnamed: 0,app_name,category,severity,qark_name,Code,CWE_ID,CWE_Desc,Vulnerability_status
0,0,us-travel-converter,,,,/*,,,0
1,1,us-travel-converter,,,,"* GNU GENERAL PUBLIC LICENSE Version 3, 2...",,,0
2,2,us-travel-converter,,,,*,,,0
3,3,us-travel-converter,,,,* This program converts some imperial ...,,,0
4,4,us-travel-converter,,,,* Copyright (C) <2019> <Github: Omikr...,,,0


In [6]:
unprocessed_dataset_df.Code

0                                                        /*
1         *      GNU GENERAL PUBLIC LICENSE Version 3, 2...
2                                                         *
3         *         This program converts some imperial ...
4         *         Copyright (C) <2019>  <Github: Omikr...
                                ...                        
118917    public static ArrayList<WeatherStation> fromJs...
118918    ArrayList<WeatherStation> weatherStations = ne...
118919     for (int i = 0; i < jsonObjects.length(); i++) {
118920    weatherStations.add(new WeatherStation(jsonObj...
118921                              return weatherStations;
Name: Code, Length: 118922, dtype: object

In [7]:
comments_and_strings_processed_codes=[]
for code in unprocessed_dataset_df.Code:
    if(len(code)>150):
        processed_code = "user_str"
    else:
        processed_code=preprocess_comments_and_strings(code)   
        
    comments_and_strings_processed_codes.append(processed_code)

In [8]:
comments_and_strings_processed_codes_series= pd.Series(comments_and_strings_processed_codes)
print(comments_and_strings_processed_codes_series)

0                                            //user_comment
1         *      GNU GENERAL PUBLIC LICENSE Version 3, 2...
2                                                         *
3         *         This program converts some imperial ...
4         *         Copyright (C) <2019>  <Github: Omikr...
                                ...                        
118917    public static ArrayList<WeatherStation> fromJs...
118918    ArrayList<WeatherStation> weatherStations = ne...
118919     for (int i = 0; i < jsonObjects.length(); i++) {
118920    weatherStations.add(new WeatherStation(jsonObj...
118921                              return weatherStations;
Length: 118922, dtype: object


In [9]:
unprocessed_dataset_df["processed_code"]= comments_and_strings_processed_codes

In [10]:
unprocessed_dataset_df.head()

Unnamed: 0.1,Unnamed: 0,app_name,category,severity,qark_name,Code,CWE_ID,CWE_Desc,Vulnerability_status,processed_code
0,0,us-travel-converter,,,,/*,,,0,//user_comment
1,1,us-travel-converter,,,,"* GNU GENERAL PUBLIC LICENSE Version 3, 2...",,,0,"* GNU GENERAL PUBLIC LICENSE Version 3, 2..."
2,2,us-travel-converter,,,,*,,,0,*
3,3,us-travel-converter,,,,* This program converts some imperial ...,,,0,* This program converts some imperial ...
4,4,us-travel-converter,,,,* Copyright (C) <2019> <Github: Omikr...,,,0,* Copyright (C) <2019> <Github: Omikr...


In [11]:
#processed_df = unprocessed_dataset_df.drop('Code', axis=1)
processed_df = unprocessed_dataset_df

In [12]:
processed_df.head()

Unnamed: 0.1,Unnamed: 0,app_name,category,severity,qark_name,Code,CWE_ID,CWE_Desc,Vulnerability_status,processed_code
0,0,us-travel-converter,,,,/*,,,0,//user_comment
1,1,us-travel-converter,,,,"* GNU GENERAL PUBLIC LICENSE Version 3, 2...",,,0,"* GNU GENERAL PUBLIC LICENSE Version 3, 2..."
2,2,us-travel-converter,,,,*,,,0,*
3,3,us-travel-converter,,,,* This program converts some imperial ...,,,0,* This program converts some imperial ...
4,4,us-travel-converter,,,,* Copyright (C) <2019> <Github: Omikr...,,,0,* Copyright (C) <2019> <Github: Omikr...


In [13]:
print(unprocessed_dataset_df.size ,",", processed_df.size)

1189220 , 1189220


In [14]:
processed_df.drop_duplicates(subset=['processed_code', 'CWE_ID'], inplace=True)

In [15]:
processed_df.size

1037230

In [16]:
processed_df.to_csv(APPLICATION_BASE_PATH+"/"+processed_csv, sep=',', encoding='utf-8', index=False)

In [17]:
processed_df.size

1037230