In [1]:
import pandas as pd
import re

In [2]:
APPLICATION_BASE_PATH = "D:/PhD/Model_Building/Stage_4_Building_Vulnerability_Dataset/8_Round_Qark"
unprocessed_csv = "Scanned_App_Unprocessed_Dataset.csv"
processed_csv = unprocessed_csv.replace("Unprocessed","Processed")

In [3]:
def preprocess_comments_and_strings(code_line):
    
    processed_code_line = code_line

    encryption_hashing_pattern = "AES|aes|SHA-1|sha-1|SHA1|sha1|MD5|md5"
    ip_pattern = "\w*([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\w*"
    string_pattern = "\"[\w|\s|$|&|+|,|:|;|=|?|@|#|_|/|\-|\.|!|`|~|%|\^|\*|\(|\)|\'\\[|\]\{|\}]*\""

    # Checking for encryption related strings
    find_encryption = re.search(encryption_hashing_pattern, processed_code_line)

    # Checking for IP related strings
    find_ip = re.search(ip_pattern, processed_code_line)

    if (find_encryption is None) & (find_ip is None):
        # replacing all strings with dummy string
        processed_code_line = re.sub(string_pattern, "\"user_str\"", processed_code_line)
        
    # replacing comments with dummy comment
    #comment_pattern = "//.*|/\\*(?s:.*?)\\*/|(\"(?:(?<!\\\\)(?:\\\\\\\\)*\\\\\"|[^\r\n\"])*\")"
    comment_pattern = "//.*|/\\*(?s:.*?)\\*/|/\\*(.)*|(.)*\\*/"
    processed_code_line = re.sub(comment_pattern, "//user_comment",processed_code_line)

    return processed_code_line

In [5]:
unprocessed_dataset_df = pd.read_csv(APPLICATION_BASE_PATH+"/"+unprocessed_csv).fillna("")

In [6]:
unprocessed_dataset_df.head()

Unnamed: 0.1,Unnamed: 0,app_name,category,severity,qark_name,Code,CWE_ID,CWE_Desc,Vulnerability_status
0,0,yalp-store-fork,,,,package com.github.yeriomin.yalpstore.bugreport;,,,0
1,1,yalp-store-fork,,,,,,,0
2,2,yalp-store-fork,,,,import android.content.Context;,,,0
3,3,yalp-store-fork,,,,import android.util.Log;,,,0
4,4,yalp-store-fork,,,,import com.github.yeriomin.yalpstore.Util;,,,0


In [7]:
unprocessed_dataset_df.Code

0          package com.github.yeriomin.yalpstore.bugreport;
1                                                          
2                           import android.content.Context;
3                                  import android.util.Log;
4                import com.github.yeriomin.yalpstore.Util;
                                ...                        
291953                   Log.v("hidapi", var10.toString());
291954                  Log.v("pythonutil", sb.toString());
291955                   Log.v("SDLAudio", sb5.toString());
291956    class PythonService extends org.kivy.android.P...
291957                    Log.d("hidapi", var7.toString());
Name: Code, Length: 291958, dtype: object

In [9]:
comments_and_strings_processed_codes=[]
for code in unprocessed_dataset_df.Code:
    if(len(code)>150):
        processed_code = "user_str"
    else:
        processed_code=preprocess_comments_and_strings(code)   
        
    comments_and_strings_processed_codes.append(processed_code)

In [10]:
comments_and_strings_processed_codes_series= pd.Series(comments_and_strings_processed_codes)
print(comments_and_strings_processed_codes_series)

0          package com.github.yeriomin.yalpstore.bugreport;
1                                                          
2                           import android.content.Context;
3                                  import android.util.Log;
4                import com.github.yeriomin.yalpstore.Util;
                                ...                        
291953                 Log.v("user_str", var10.toString());
291954                    Log.v("user_str", sb.toString());
291955                   Log.v("user_str", sb5.toString());
291956    class PythonService extends org.kivy.android.P...
291957                  Log.d("user_str", var7.toString());
Length: 291958, dtype: object


In [11]:
unprocessed_dataset_df["processed_code"]= comments_and_strings_processed_codes

In [12]:
unprocessed_dataset_df.head()

Unnamed: 0.1,Unnamed: 0,app_name,category,severity,qark_name,Code,CWE_ID,CWE_Desc,Vulnerability_status,processed_code
0,0,yalp-store-fork,,,,package com.github.yeriomin.yalpstore.bugreport;,,,0,package com.github.yeriomin.yalpstore.bugreport;
1,1,yalp-store-fork,,,,,,,0,
2,2,yalp-store-fork,,,,import android.content.Context;,,,0,import android.content.Context;
3,3,yalp-store-fork,,,,import android.util.Log;,,,0,import android.util.Log;
4,4,yalp-store-fork,,,,import com.github.yeriomin.yalpstore.Util;,,,0,import com.github.yeriomin.yalpstore.Util;


In [13]:
#processed_df = unprocessed_dataset_df.drop('Code', axis=1)
processed_df = unprocessed_dataset_df

In [14]:
processed_df.head()

Unnamed: 0.1,Unnamed: 0,app_name,category,severity,qark_name,Code,CWE_ID,CWE_Desc,Vulnerability_status,processed_code
0,0,yalp-store-fork,,,,package com.github.yeriomin.yalpstore.bugreport;,,,0,package com.github.yeriomin.yalpstore.bugreport;
1,1,yalp-store-fork,,,,,,,0,
2,2,yalp-store-fork,,,,import android.content.Context;,,,0,import android.content.Context;
3,3,yalp-store-fork,,,,import android.util.Log;,,,0,import android.util.Log;
4,4,yalp-store-fork,,,,import com.github.yeriomin.yalpstore.Util;,,,0,import com.github.yeriomin.yalpstore.Util;


In [15]:
print(unprocessed_dataset_df.size ,",", processed_df.size)

2919580 , 2919580


In [16]:
processed_df.drop_duplicates(subset=['processed_code', 'CWE_ID'], inplace=True)

In [17]:
processed_df.size

2428860

In [18]:
processed_df.to_csv(APPLICATION_BASE_PATH+"/"+processed_csv, sep=',', encoding='utf-8', index=False)

In [19]:
processed_df.size

2428860