# GROUP-1-PURPOSE-NO-LEAK
### Link: https://developer.android.com/reference/android/media/ExifInterface

In [1]:
import re
import json
import os
import pandas as pd

## 1. Function

In [2]:
# Function 1 - Get all file in directory
def get_all_files_in_directory(directory_path):
    # Initialize an empty list to store file names
    file_list = []

    # Iterate through all files and directories in the specified directory
    for item in os.listdir(directory_path):
        # Construct the full path of the item
        full_path = os.path.join(directory_path, item)

        # Check if the item is a file (not a directory)
        if os.path.isfile(full_path):
            # Add the file to the list
            file_list.append(full_path)
    return file_list
# Function 2 - read json file
def read_json_file(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data
    else:
        print(f"The file '{file_path}' does not exist.")
        return None
# Function 3 - Check exist value
def check_values_exist(list_of_dicts, purpose):
    # Initialize an empty set to store the found values
    found_values = set()

    # Iterate over each dictionary in the list of dictionaries
    for d in list_of_dicts:
        # Iterate over each value in the dictionary
        for value_list in d.values():
            # Flatten the list of lists into a single list
            flat_list = [item.lower() for sublist in value_list for item in sublist]
            # Check if any value in 'purpose' exists in the flat list
            for val in purpose:
                if any(val.lower() in item for item in flat_list):
                    found_values.add(val)

    return found_values
# Function 4 - get key of dict
def get_dict_keys(input_dict):
    # Get the keys of the dictionary
    keys = list(input_dict.keys())
    # Return the keys
    return keys
# Function 5 -  check string in list
def check_list_contain_string(input_list, search_string):
    # Initialize an empty list to store strings containing the specified substring
    result = []
    # Iterate over each element in the input list
    for element in input_list:
        # Check if the search_string (case-insensitive) is present in the element
        if isinstance(element, str) and search_string.lower() in element.lower():
            # If found, add the element to the result list
            result.append(element)
        # If the element is a list, recursively call the function
        elif isinstance(element, list):
            result.extend(check_list_contain_string(element, search_string))
    # Return the list of strings containing the specified substring
    return result
# Function 6 - remove duplicate value in list
def remove_duplicates(input_list):
    # Convert list to set to remove duplicates
    unique_set = set(input_list)
    # Convert set back to list
    unique_list = list(unique_set)
    return unique_list
# Function 7 - get APK name for CSV
def extract_filename(file_path):
    # Extract the filename from the path
    filename_with_extension = os.path.basename(file_path)
    # Remove the file extension
    filename_without_extension = os.path.splitext(filename_with_extension)[0]
    return filename_without_extension
# Function 8 - evaluation
def update_evaluation(df, row_index):
    # Read values of "metadata_leak" and "code_purpose" at the specified row index
    metadata_leak_value = df.at[row_index, "metadata_leak"]
    code_purpose_value = df.at[row_index, "code_purpose"]
    print(metadata_leak_value)
    print("$$$$$$$$$$$$$$$$$")
    print(code_purpose_value)
    df['evaluation'] = df['evaluation'].astype('object')
    # Compare the two column values
    if all(item in metadata_leak_value for item in code_purpose_value.split('\n')):
        # If "code_purpose" value is contained in "metadata_leak" value, update "evaluation" to TRUE
        #df.at[row_index, "evaluation"] = "True"
        df.loc[row_index,"evaluation"]="TRUE"
        print("TRUE")
    else:
        # If not, update "evaluation" to FALSE
        #df.at[row_index, "evaluation"] = "False"
        df.loc[row_index,"evaluation"]="FALSE"
        print("FALSE")
    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file, index=False)
# Function 9 -  check string in list
def check_for_string_in_list(input_list, search_string):
    for item in input_list:
        if search_string in item:
            return True
    return False
# Function 10 - AND all value in list
def logic_and_all_values(lst):
    # If the list is empty, return True
    if not lst:
        return True
    
    # Check each value in the list
    for value in lst:
        # If any value is False, return False immediately
        if not value:
            return False
    
    # If all values are True, return True
    return True

## 2. Static Variable

In [3]:
group_1_json_path = r"C:\Users\ASUS\anaconda3\metaLeak-ml-code-search\grouping-no-leak\group-2-no-leak\json-no-leak"
purpose = [
    "datetime",
    "DateTime",
    "Time"
    "date",
    "gps",
    "program",
    "maker",
    "model",
    "longitude",
    "latitude",
    "altitude"
]
csv_path = "no_leak_apk_low_size.csv"

## 3. Main

In [4]:
# Read CSV file
df = pd.read_csv(csv_path,low_memory=False)
group_1_json_files = get_all_files_in_directory(group_1_json_path)
arr_row_index = []
for i in range(len(group_1_json_files)):
    print("------------------------------Loop-"+str(i)+"------------------------------")
    json_file = group_1_json_files[i]
    print("File path: "+str(json_file))
    apkfile_name  = extract_filename(json_file)
    apkfile_name = apkfile_name+".apk"
    print("Apk file name: "+apkfile_name)
    
    json_list_of_dict = read_json_file(json_file)
    #print(json_list_of_dict)
    leak_result = []
    for x in range(len(json_list_of_dict)):
        print("==========================Loop-"+str(i)+"."+str(x)+"==========================")
        layer_2_dict = json_list_of_dict[x]
        print(layer_2_dict)
        #print(type(layer_2_dict))
        layer_2_key = get_dict_keys(layer_2_dict)
        print(layer_2_key)
        for y in range(len(layer_2_key)):
            layer_3_value = layer_2_dict[layer_2_key[y]]
            print(layer_3_value)
            #print(type(layer_3_value))
            for z in range(len(purpose)):
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                search_string = purpose[z]
                print("search_string: "+search_string)
                check_result = check_list_contain_string(layer_3_value, search_string)
                if len(check_result) >0:
                    print("check_result")
                    print(check_result)
                    leak_result.append(search_string)
    leak_result = remove_duplicates(leak_result)
    print(leak_result)
    purpose_array = []
    if (len(leak_result)>0):
        for m in range(len(leak_result)):
            leak_value = leak_result[m]
            print(leak_value)
            if(leak_value=="datetime") or (leak_value=="date") or (leak_value=="DateTime"):
                purpose_array.append("Leak datetime taken")
            if(leak_value=="gps") or (leak_value=="longitude") or (leak_value=="latitude") or (leak_value=="altitude"):
                purpose_array.append("Leak gps")
            if(leak_value=="program"):
                purpose_array.append("Leak smart phone software")
            if(leak_value=="maker"):
                purpose_array.append("Leak smart phone brand")
            if(leak_value=="model"):
                purpose_array.append("Leak smart phone model")
    else:
        purpose_array.append("No Leak Sensitive Metadata")
    #print(purpose_array)
    purpose_array = remove_duplicates(purpose_array)
    print(purpose_array)
    insert_csv_string = '\n'.join(purpose_array)
    try:
        row_index = int(df[df['apkfile_name']==apkfile_name].index.values)
        print("APK index: "+str(row_index))
        arr_row_index.append(row_index)
        df['code_purpose'] = df['code_purpose'].astype('object')
        df.loc[row_index,"code_purpose"]=insert_csv_string
        df.to_csv(csv_path, index=False)
    except Exception as e:
        print(f"An error occurred: {e}")
        continue
    #break

------------------------------Loop-0------------------------------
File path: C:\Users\ASUS\anaconda3\metaLeak-ml-code-search\grouping-no-leak\group-2-no-leak\json-no-leak\All-Photo-Frames-2023-1.0.34.json
Apk file name: All-Photo-Frames-2023-1.0.34.apk
{'static_declare': [['private static final int AVIF_BRAND = 1635150182;', 'private static final int AVIS_BRAND = 1635150195;', 'private static final int FTYP_HEADER = 1718909296;', 'private static final int GIF_HEADER = 4671814;', 'private static final int INTEL_TIFF_MAGIC_NUMBER = 18761;', 'private static final String JPEG_EXIF_SEGMENT_PREAMBLE = "Exif\\u0000\\u0000";', 'private static final int MARKER_EOI = 217;', 'private static final int MOTOROLA_TIFF_MAGIC_NUMBER = 19789;', 'private static final int ORIENTATION_TAG_TYPE = 274;', 'private static final int PNG_HEADER = -1991225785;', 'private static final int RIFF_HEADER = 1380533830;', 'private static final int SEGMENT_SOS = 218;', 'private static final String TAG = "DfltImageHeader

------------------------------Loop-25------------------------------
File path: C:\Users\ASUS\anaconda3\metaLeak-ml-code-search\grouping-no-leak\group-2-no-leak\json-no-leak\IU-Fandom-Wallpaper-GIF-Fa-2.2.7.json
Apk file name: IU-Fandom-Wallpaper-GIF-Fa-2.2.7.apk
{'static_declare': [['private static final Object DECODE_LOCK;', 'private static final RequestHandler ERRORING_HANDLER;', 'private static final AtomicInteger SEQUENCE_GENERATOR;'], ['static final Rect EMPTY_RECT;', 'static final RectF EMPTY_RECT_F;', 'static final RectF RECT;'], ['private static final AtomicLong _sequenceNumber;'], ['static final int BYTES_IN_A_GIGABYTE = 1073741824;', 'static final int BYTES_IN_A_KILOBYTE = 1024;', 'static final int BYTES_IN_A_MEGABYTE = 1048576;', 'private static final boolean CLS_TRACE_DEFAULT = false;', 'private static final String CLS_TRACE_PREFERENCE_NAME = "com.crashlytics.Trace";', 'public static final int DEVICE_STATE_BETAOS = 8;', 'public static final int DEVICE_STATE_COMPROMISEDLIBRA

------------------------------Loop-46------------------------------
File path: C:\Users\ASUS\anaconda3\metaLeak-ml-code-search\grouping-no-leak\group-2-no-leak\json-no-leak\Selfie-With-Emma-Watson-Emma-Wallpapers-Editor-1.1.json
Apk file name: Selfie-With-Emma-Watson-Emma-Wallpapers-Editor-1.1.apk
{'static_declare': [['static final Rect a;', 'static final RectF b;', 'static final RectF c;'], ['public static final short ALTITUDE_ABOVE_SEA_LEVEL = 0;', 'public static final short ALTITUDE_BELOW_SEA_LEVEL = 1;', 'public static final int COLOR_SPACE_S_RGB = 1;', 'public static final int COLOR_SPACE_UNCALIBRATED = 65535;', 'public static final short CONTRAST_HARD = 2;', 'public static final short CONTRAST_NORMAL = 0;', 'public static final short CONTRAST_SOFT = 1;', 'public static final int DATA_DEFLATE_ZIP = 8;', 'public static final int DATA_HUFFMAN_COMPRESSED = 2;', 'public static final int DATA_JPEG = 6;', 'public static final int DATA_JPEG_COMPRESSED = 7;', 'public static final int DATA

In [5]:
df = pd.read_csv(csv_path,low_memory=False)
for l in range(len(arr_row_index)):
    row_index = int(arr_row_index[l])
    print("+++++++++++++++++++++++++++++++++++++++++++"+str(row_index)+"+++++++++++++++++++++++++++++++++++++++++++")
    metadata_leak_value = df.at[row_index, "metadata_leak"]
    code_purpose_value = df.at[row_index, "code_purpose"]
    print("metadata_leak_value",metadata_leak_value)
#     arr_metadata_leak_value =  metadata_leak_value.split('\n')
    print("code_purpose_value: ",code_purpose_value)
#     print("*****************")
    if(metadata_leak_value==code_purpose_value):
        df.loc[row_index,"evaluation"]=True
    else:
        df.loc[row_index,"evaluation"]=False   
#     print(code_purpose_value)
#     arr_code_purpose_value = code_purpose_value.split('\n')
#     print("arr_code_purpose_value: ",arr_code_purpose_value)
#     evaluation =  []
#     search_string = "No Leak Sensitive Metadata"
#     real_result = check_for_string_in_list(arr_metadata_leak_value, search_string)
#     evaluation.append(real_result)
#     purpose_result = check_for_string_in_list(arr_metadata_leak_value, search_string)
#     evaluation.append(purpose_result)
#     update_evaluation = logic_and_all_values(evaluation)
#     print("Evaluation:"+str(update_evaluation))
#     df['evaluation'] = df['evaluation'].astype('object')
#     df.loc[row_index,"evaluation"]=update_evaluation
#     break
df.to_csv(csv_path, index=False)

+++++++++++++++++++++++++++++++++++++++++++353+++++++++++++++++++++++++++++++++++++++++++
metadata_leak_value No Leak Sensitive Metadata
code_purpose_value:  Leak smart phone brand
Leak gps
Leak smart phone model
Leak smart phone software
Leak datetime taken
+++++++++++++++++++++++++++++++++++++++++++117+++++++++++++++++++++++++++++++++++++++++++
metadata_leak_value No Leak Sensitive Metadata
code_purpose_value:  Leak smart phone brand
Leak gps
Leak smart phone model
Leak smart phone software
Leak datetime taken
+++++++++++++++++++++++++++++++++++++++++++498+++++++++++++++++++++++++++++++++++++++++++
metadata_leak_value No Leak Sensitive Metadata
code_purpose_value:  Leak smart phone brand
Leak gps
Leak smart phone model
Leak smart phone software
Leak datetime taken
+++++++++++++++++++++++++++++++++++++++++++538+++++++++++++++++++++++++++++++++++++++++++
metadata_leak_value No Leak Sensitive Metadata
code_purpose_value:  Leak smart phone brand
Leak gps
Leak smart phone model
Leak smart

In [6]:
def read_csv_false_true_rows_group(csv_file, true_file, false_file):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)
    
    # Filter rows where the evaluation value is FALSE and group value is 1
    false_group_rows = df[(df['evaluation'] == False) & (df['group'] == 2)]
    true_group_rows = df[(df['evaluation'] == True) & (df['group'] == 2)]
    # Extract the apkfile_name column from the filtered rows
    apkfile_names_false = false_group_rows['apkfile_name']
    apkfile_names_true = true_group_rows['apkfile_name']
    # Write the apkfile_names to a text file
    with open(true_file, 'w') as file_1:
        for name in apkfile_names_true:
            file_1.write(name + '\n')
    with open(false_file, 'w') as file_2:
        for name in apkfile_names_false:
            file_2.write(name + '\n')
# Example usage:
true_file = 'group-2-no-leak-true.txt' 
false_file = 'group-2-no-leak-false.txt'
read_csv_false_true_rows_group(csv_path, true_file,false_file)