# Fingerprinting_detector AI

In [1]:
## Escaner

In [15]:
from pathlib import Path

from custom_command import LinkCountingCommand
from openwpm.command_sequence import CommandSequence
from openwpm.commands.browser_commands import GetCommand
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.task_manager import TaskManager

def scannerPage(page, database):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = page

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)]

    # Update browser configuration (use this for per-browser settings)
    for browser_param in browser_params:
        # Record HTTP Requests and Responses
        browser_param.http_instrument = True
        # Record cookie changes
        browser_param.cookie_instrument = True
        # Record Navigations
        browser_param.navigation_instrument = True
        # Record JS Web API calls
        browser_param.js_instrument = True
        # Record the callstack of all WebRequests made
        browser_param.callstack_instrument = True
        # Record DNS resolution
        browser_param.dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./fingerprinting_detector/")
    manager_params.log_path = Path("./fingerprinting_detector/openwpm.log")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True


    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path(f"./fingerprinting_detector/result/{database}.sqlite")),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)

In [16]:
dataBaseName = "browserleaks"
pages = ["https://browserleaks.com/"]
scannerPage(pages,  dataBaseName)

browser_manager      - INFO     - BROWSER 3570851773: Launching browser...
storage_controller   - INFO     - Awaiting all tasks for visit_id -1
task_manager         - INFO     - 

OpenWPM Version: 0.19.1
Firefox Version: b'98.0'

{
  "_failure_limit": null,
  "data_directory": "/home/in/Escritorio/OpenWPM-0.19.1/fingerprinting_detector",
  "log_path": "/home/in/Escritorio/OpenWPM-0.19.1/fingerprinting_detector/openwpm.log",
  "logger_address": [
    "127.0.0.1",
    44511
  ],
  "memory_watchdog": false,
  "num_browsers": 1,
  "process_watchdog": false,
  "screenshot_path": "/home/in/Escritorio/OpenWPM-0.19.1/fingerprinting_detector/screenshots",
  "source_dump_path": "/home/in/Escritorio/OpenWPM-0.19.1/fingerprinting_detector/sources",
  "storage_controller_address": [
    "127.0.0.1",
    45627
  ],
  "testing": false
}

Keys:
{
  "browser_id": 0,
  "bot_mitigation": 1,
  "browser": 2,
  "callstack_instrument": 3,
  "cookie_instrument": 4,
  "custom_params": 5,
  "display_mode": 6,
 

fatal: no es un repositorio git (ni ninguno de los directorios superiores): .git


browser_manager      - INFO     - Starting to work on CommandSequence with visit_id 8752108791520136 on browser with id 3570851773
browser_manager      - INFO     - BROWSER 3570851773: EXECUTING COMMAND: InitializeCommand()
browser_manager      - INFO     - BROWSER 3570851773: EXECUTING COMMAND: GetCommand(https://browserleaks.com/,3)
browser_manager      - INFO     - BROWSER 3570851773: EXECUTING COMMAND: LinkCountingCommand
custom_command       - INFO     - There are 39 links on https://browserleaks.com/
browser_manager      - INFO     - BROWSER 3570851773: EXECUTING COMMAND: FinalizeCommand(5)
browser_manager      - INFO     - Finished working on CommandSequence with visit_id 8752108791520136 on browser with id 3570851773
storage_controller   - INFO     - Awaiting all tasks for visit_id 8752108791520136
storage_controller   - INFO     - Terminating handler, because the underlying socket closed
storage_controller   - INFO     - Terminating handler, because the underlying socket close

In [17]:
import re
import json
import sqlite3
import pandas as pd
from _collections import defaultdict
from tqdm import tqdm

In [18]:
import sqlite3

con = sqlite3.connect(f"./fingerprinting_detector/result/{dataBaseName}.sqlite")
con.row_factory = sqlite3.Row
cur = con.cursor()
query = """SELECT distinct j.script_url, j.symbol, j.operation, j.value, j.arguments, v.site_url 
        FROM javascript as j 
        JOIN site_visits as v ON j.visit_id = v.visit_id 
        WHERE j.symbol LIKE '%Canvas%' ORDER BY v.site_url
    """
js = pd.read_sql_query(query, con)
print ("Number of javascript calls", len(js))

Number of javascript calls 2


In [19]:
# Script informativeJS
info_ob= ["window.navigator.appCodeName", "window.navigator.appName", "window.navigator.appVersion", "window.navigator.buildID", "window.navigator.cookieEnabled", "window.navigator.doNotTrack", "window.navigator.geolocation", "window.navigator.language", "window.navigator.languages", "window.navigator.onLine", "window.navigator.oscpu", "window.navigator.platform", "window.navigator.product", "window.navigator.productSub", "window.navigator.userAgent", "window.navigator.vendorSub", "window.navigator.vendor", "window.screen.pixelDepth", "window.screen.colorDepth"]
navplug= "window.navigator.plugins"
navmim= "window.navigator.mimeTypes"

query_infor = """SELECT distinct j.script_url, j.symbol, j.operation, j.value, j.arguments, v.site_url 
        FROM javascript as j 
        JOIN site_visits as v ON j.visit_id = v.visit_id 
        WHERE j.symbol LIKE '%window%' ORDER BY v.site_url """

df_informative = pd.read_sql_query(query_infor, con)
df_informative

Unnamed: 0,script_url,symbol,operation,value,arguments,site_url
0,https://googleads.g.doubleclick.net/pagead/htm...,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko...,,https://browserleaks.com/
1,https://googleads.g.doubleclick.net/pagead/htm...,window.name,get,google_esf,,https://browserleaks.com/
2,https://pagead2.googlesyndication.com/pagead/j...,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko...,,https://browserleaks.com/
3,https://pagead2.googlesyndication.com/pagead/j...,window.localStorage,get,{},,https://browserleaks.com/
4,https://pagead2.googlesyndication.com/pagead/j...,window.localStorage,get,"{""goog_pem_mod"":""755""}",,https://browserleaks.com/
5,https://pagead2.googlesyndication.com/pagead/j...,window.localStorage,get,"{""goog_pem_mod"":""755"",""google_experiment_mod34...",,https://browserleaks.com/
6,https://pagead2.googlesyndication.com/pagead/j...,window.localStorage,get,"{""google_experiment_mod36"":""577"",""goog_pem_mod...",,https://browserleaks.com/
7,https://pagead2.googlesyndication.com/pagead/j...,window.localStorage,get,"{""google_experiment_mod36"":""577"",""goog_pem_mod...",,https://browserleaks.com/
8,https://pagead2.googlesyndication.com/pagead/j...,window.document.cookie,get,,,https://browserleaks.com/
9,https://www.google-analytics.com/analytics.js,window.document.referrer,get,,,https://browserleaks.com/


In [20]:
# Agrupos por script y hacemos un recuento 
df_informative_groupby = df_informative.groupby(by='site_url')
df_inf_group = [pd.DataFrame(group) for _, group in df_informative_groupby]
df_inf_group

[                                           script_url  \
 0   https://googleads.g.doubleclick.net/pagead/htm...   
 1   https://googleads.g.doubleclick.net/pagead/htm...   
 2   https://pagead2.googlesyndication.com/pagead/j...   
 3   https://pagead2.googlesyndication.com/pagead/j...   
 4   https://pagead2.googlesyndication.com/pagead/j...   
 5   https://pagead2.googlesyndication.com/pagead/j...   
 6   https://pagead2.googlesyndication.com/pagead/j...   
 7   https://pagead2.googlesyndication.com/pagead/j...   
 8   https://pagead2.googlesyndication.com/pagead/j...   
 9       https://www.google-analytics.com/analytics.js   
 10      https://www.google-analytics.com/analytics.js   
 11      https://www.google-analytics.com/analytics.js   
 12      https://www.google-analytics.com/analytics.js   
 13      https://www.google-analytics.com/analytics.js   
 14      https://www.google-analytics.com/analytics.js   
 15      https://www.google-analytics.com/analytics.js   
 16      https

In [21]:
sites = set()
row_dict = {}
for df in df_inf_group:
#     navplug_count = 0
#     navmim_count  = 0
#     info_ob_count = 0
    for index, row in df.iterrows(): 
        count_dict = {'site_url':"", 'navplug_count':0, 'navmim_count':0, 'info_ob_count':0}
        # Comprobaciones 
        site_url = row['site_url'] + ' ' + row['script_url']
        if site_url in row_dict:    
            if navplug in row['symbol']: 
                row_dict[site_url]['navplug_count'] +=1
            elif navmim in row['symbol']:
                row_dict[site_url]['navmim_count'] +=1
            if row['symbol'] in info_ob: 
                row_dict[site_url]['info_ob_count'] +=1
        else: 
            # Asignamos la url
            count_dict['site_url'] = row['site_url'] 
            if navplug in row['symbol']: 
                count_dict['navplug_count'] +=1
            elif navmim in row['symbol']:
                count_dict['navmim_count'] +=1
            if row['symbol'] in info_ob: 
                count_dict['info_ob_count'] +=1
            row_dict[site_url] = count_dict 
row_dict

{'https://browserleaks.com/ https://googleads.g.doubleclick.net/pagead/html/r20220412/r20190131/zrt_lookup.html': {'site_url': 'https://browserleaks.com/',
  'navplug_count': 0,
  'navmim_count': 0,
  'info_ob_count': 1},
 'https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js': {'site_url': 'https://browserleaks.com/',
  'navplug_count': 0,
  'navmim_count': 0,
  'info_ob_count': 1},
 'https://browserleaks.com/ https://www.google-analytics.com/analytics.js': {'site_url': 'https://browserleaks.com/',
  'navplug_count': 1,
  'navmim_count': 0,
  'info_ob_count': 3},
 'https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/managed/js/adsense/m202204040101/show_ads_impl_with_ama_fy2019.js?client=ca-pub-4594829972910700&plah=browserleaks.com': {'site_url': 'https://browserleaks.com/',
  'navplug_count': 0,
  'navmim_count': 0,
  'info_ob_count': 4},
 'https://browserleaks.com/ https://browserleaks.com/js/default.js?v=28880151': {'site_url'

In [22]:
# detección de webRTC 
rtc_data= "RTCPeerConnection.createDataChannel"
rtc_offer= "RTCPeerConnection.createOffer"
rtc_cand= "RTCPeerConnection.onicecandidate"

query_webrtc = """SELECT distinct j.script_url, j.symbol, j.operation, j.value, j.arguments, v.site_url 
        FROM javascript as j 
        JOIN site_visits as v ON j.visit_id = v.visit_id 
        WHERE j.symbol LIKE '%RTCPeerConnection%' ORDER BY v.site_url """

df_webrtc = pd.read_sql_query(query_webrtc, con)
df_webrtc_groupby = df_webrtc.groupby(by='site_url')
df_webrtc_group = [pd.DataFrame(group) for _, group in df_webrtc_groupby]
df_webrtc_group

[]

In [23]:
webrtc_dict = {}
for df in df_webrtc_group:
    for index, row in df.iterrows(): 
        count_dict = {'rtc_data':0, 'rtc_offer':0, 'rtc_cand':0}
        # Comprobaciones 
        site_url = row['site_url'] + ' ' + row['script_url']
        if site_url in webrtc_dict:    
            if rtc_data in row['symbol']: 
                webrtc_dict[site_url]['rtc_data'] +=1
            if rtc_offer in row['symbol']:
                webrtc_dict[site_url]['rtc_offer'] +=1
            if rtc_cand in row['symbol']:
                webrtc_dict[site_url]['rtc_cand'] +=1
        else: 
            count_dict['site_url'] = row['site_url'] 
            if rtc_data in row['symbol']: 
                count_dict['rtc_data'] +=1
            if rtc_offer in row['symbol']:
                count_dict['rtc_offer'] +=1
            if rtc_cand in row['symbol']: 
                count_dict['rtc_cand'] +=1
            webrtc_dict[site_url] = count_dict 
webrtc_dict

{}

In [24]:
# Detección de canvas Font
canvas_font= "CanvasRenderingContext2D.font"
canvas_text= "CanvasRenderingContext2D.measureText"
query_canvasFont = """SELECT distinct j.script_url, j.symbol, j.operation, j.value, j.arguments, v.site_url 
        FROM javascript as j 
        JOIN site_visits as v ON j.visit_id = v.visit_id 
        WHERE j.symbol LIKE '%Canvas%' ORDER BY v.site_url """

df_canvasFont = pd.read_sql_query(query_canvasFont, con)
df_canvasFont_groupby = df_canvasFont.groupby(by='site_url')
df_canvasFont_group = [pd.DataFrame(group) for _, group in df_canvasFont_groupby]
df_canvasFont_group

[                                          script_url  \
 0  https://pagead2.googlesyndication.com/bg/l2o4c...   
 1  https://pagead2.googlesyndication.com/bg/l2o4c...   
 
                          symbol operation value               arguments  \
 0  HTMLCanvasElement.getContext      call                     ["webgl"]   
 1  HTMLCanvasElement.getContext      call        ["experimental-webgl"]   
 
                     site_url  
 0  https://browserleaks.com/  
 1  https://browserleaks.com/  ]

In [25]:
canvasFont_dict = {}
for df in df_canvasFont_group:
    for index, row in df.iterrows(): 
        count_dict = {'canvas_font':0, 'canvas_text':0}
        # Comprobaciones 
        site_url = row['site_url'] + ' ' + row['script_url']
        if site_url in canvasFont_dict:    
            if canvas_font in row['symbol']: 
                canvasFont_dict[site_url]['canvas_font'] +=1
            if canvas_text in row['symbol']:
                canvasFont_dict[site_url]['canvas_text'] +=1
        else: 
            count_dict['site_url'] = row['site_url'] 
            if canvas_font in row['symbol']: 
                count_dict['canvas_font'] +=1
            if canvas_text in row['symbol']:
                count_dict['canvas_text'] +=1
            canvasFont_dict[site_url] = count_dict 
canvasFont_dict

{'https://browserleaks.com/ https://pagead2.googlesyndication.com/bg/l2o4cWLNalU19nN7vA12WZhb1qS4KDqIWPmZT-glBuk.js': {'canvas_font': 0,
  'canvas_text': 0,
  'site_url': 'https://browserleaks.com/'}}

In [26]:
# Detección de canvas 
canvas_h = "HTMLCanvasElement.height"
canvas_w = "HTMLCanvasElement.width"
canvas_cf= "CanvasRenderingContext2D.fillStyle"
canvas_cs= "CanvasRenderingContext2D.strokeStyle"
canvas_ff= "CanvasRenderingContext2D.fillText"
canvas_fs= "CanvasRenderingContext2D.strokeText"
canvas_save= "CanvasRenderingContext2D.save"
canvas_rest= "CanvasRenderingContext2D.restore"
canvas_el= "HTMLCanvasElement.addEventListener"
canvas_data= "HTMLCanvasElement.toDataURL"
canvas_img= "CanvasRenderingContext2D.getImageData"
MIN_CANVAS_TEXT_LEN = 10
MIN_CANVAS_IMAGE_WIDTH = 16
MIN_CANVAS_IMAGE_HEIGHT = 16

query_canvas = """SELECT distinct j.script_url, j.symbol, j.operation, j.value, j.arguments, v.site_url 
        FROM javascript as j 
        JOIN site_visits as v ON j.visit_id = v.visit_id 
        WHERE j.symbol LIKE '%Canvas%' ORDER BY v.site_url """

df_canvas = pd.read_sql_query(query_canvas, con)
df_canvas_groupby = df_canvas.groupby(by='site_url')
df_canvas_group = [pd.DataFrame(group) for _, group in df_canvas_groupby]
df_canvas_group

[                                          script_url  \
 0  https://pagead2.googlesyndication.com/bg/l2o4c...   
 1  https://pagead2.googlesyndication.com/bg/l2o4c...   
 
                          symbol operation value               arguments  \
 0  HTMLCanvasElement.getContext      call                     ["webgl"]   
 1  HTMLCanvasElement.getContext      call        ["experimental-webgl"]   
 
                     site_url  
 0  https://browserleaks.com/  
 1  https://browserleaks.com/  ]

In [27]:
"""
1. Valor de canvas_h y canvas_w sea mayor o igual que 16
2. fillstyle y strokestyle tienen el mismo valor o total de fillstyle es mayor que 1
3. fillText y (len(set(arg[arg.find('0":"')+4:arg.find(',"1"')-1])))>= 10
4. strokeText y (len(set(arg[arg.find('0":"')+4:arg.find(',"1"')-1])))>= 10
5. Existencia de canvas_save, canvas_rest y canvas_el
6. Existencia de toDataURL
7. Existencia de canvas_img y el tamaño
"""

canvas_dict = {}
for df in df_canvas_group:
    for index, row in df.iterrows(): 
        count_dict = {'canvas_h':0, 'canvas_w':0, 'canvas_cf':0, 'canvas_cs':0, 
                     'canvas_ff':[], 'canvas_fs':[], 'canvas_save': 0, 
                     'canvas_rest':0, 'canvas_el':0, 'canvas_data':0, 'canvas_img':[]}
        # Comprobaciones 
        site_url = row['site_url'] + ' ' + row['script_url']
        if site_url in canvas_dict:    
            if canvas_h in row['symbol']: 
                 if float(row['value']) > float(canvas_dict[site_url]['canvas_h']):
                    canvas_dict[site_url]['canvas_h'] = row['value']
                    
            if canvas_w in row['symbol']: 
                 if float(row['value']) > float(canvas_dict[site_url]['canvas_w']):
                    canvas_dict[site_url]['canvas_w'] = row['value']
                    
            if canvas_cf in row['symbol']:
                canvas_dict[site_url]['canvas_cf'] = row['value']
                
            if canvas_cs in row['symbol']:
                canvas_dict[site_url]['canvas_cs'] = row['value']
                
            if canvas_ff in row['symbol']: 
                canvas_dict[site_url]['canvas_ff'].append(row['arguments'])
            elif canvas_fs in row['symbol']:
                canvas_dict[site_url]['canvas_fs'].append(row['arguments'])
                
            if canvas_save in row['symbol']:
                canvas_dict[site_url]['canvas_save'] +=1
            
            if canvas_rest in row['symbol']:
                canvas_dict[site_url]['canvas_rest'] +=1
            
            if canvas_el in row['symbol']:
                canvas_dict[site_url]['canvas_el'] +=1
                
            if canvas_data in row['symbol']:
                canvas_dict[site_url]['canvas_data'] +=1
                
            if canvas_img in row['symbol']:
                canvas_dict[site_url]['canvas_img'].append(row['arguments'])
        else: 
            count_dict['site_url'] = row['site_url'] 
            if canvas_h in row['symbol']: 
                count_dict['canvas_h'] = row['value']
                    
            if canvas_w in row['symbol']: 
                count_dict['canvas_w'] = row['value']
                    
            if canvas_cf in row['symbol']:
                count_dict['canvas_cf'] = row['value']
                
            if canvas_cs in row['symbol']:
                count_dict['canvas_cs'] = row['value']
                
            if canvas_ff in row['symbol']:
                count_dict['canvas_ff'].append(row['arguments'])       
            elif canvas_fs in  row['symbol']:
                count_dict['canvas_fs'].append(row['arguments'])
                
            if canvas_save in row['symbol']:
                count_dict['canvas_save'] +=1
            
            if canvas_rest in row['symbol']:
                count_dict['canvas_rest'] +=1
            
            if canvas_el in row['symbol']:
                count_dict['canvas_el'] +=1
                
            if canvas_data in row['symbol']:
                count_dict['canvas_data'] +=1
                
            if canvas_img in row['symbol']:
                count_dict['canvas_img'].append(row['arguments'])
                
            canvas_dict[site_url] = count_dict
canvas_dict

{'https://browserleaks.com/ https://pagead2.googlesyndication.com/bg/l2o4cWLNalU19nN7vA12WZhb1qS4KDqIWPmZT-glBuk.js': {'canvas_h': 0,
  'canvas_w': 0,
  'canvas_cf': 0,
  'canvas_cs': 0,
  'canvas_ff': [],
  'canvas_fs': [],
  'canvas_save': 0,
  'canvas_rest': 0,
  'canvas_el': 0,
  'canvas_data': 0,
  'canvas_img': [],
  'site_url': 'https://browserleaks.com/'}}

In [28]:
webrtc_df = pd.DataFrame(webrtc_dict).T
inf_df = pd.DataFrame(row_dict).T
canvasFont_df = pd.DataFrame(canvasFont_dict).T
canvas_df = pd.DataFrame(canvas_dict).T

print(len(webrtc_df))
print(len(inf_df))
print(len(canvasFont_df))
print(len(canvas_df))

0
13
1
1


In [29]:
def checkInformative(navplug, navmim, info_ob): 
    if navplug > 5:
        return True
    if navmim > 3: 
        return True 
    if info_ob > 15: 
        return True 
    return False
# inf_df[inf_df['tracking']==True]
inf_df['tracking'] = inf_df.apply(lambda x: checkInformative(x['navplug_count'],x['navmim_count'],x['info_ob_count']), axis=1)
inf_df

Unnamed: 0,site_url,navplug_count,navmim_count,info_ob_count,tracking
https://browserleaks.com/ https://googleads.g.doubleclick.net/pagead/html/r20220412/r20190131/zrt_lookup.html,https://browserleaks.com/,0,0,1,False
https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js,https://browserleaks.com/,0,0,1,False
https://browserleaks.com/ https://www.google-analytics.com/analytics.js,https://browserleaks.com/,1,0,3,False
https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/managed/js/adsense/m202204040101/show_ads_impl_with_ama_fy2019.js?client=ca-pub-4594829972910700&plah=browserleaks.com,https://browserleaks.com/,0,0,4,False
https://browserleaks.com/ https://browserleaks.com/js/default.js?v=28880151,https://browserleaks.com/,0,0,1,False
https://browserleaks.com/ https://cdn.ampproject.org/rtv/012203150226000/amp4ads-v0.mjs,https://browserleaks.com/,0,0,2,False
https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/managed/js/adsense/m202204040101/reactive_library_fy2019.js,https://browserleaks.com/,0,0,1,False
https://browserleaks.com/ https://googleads.g.doubleclick.net/pagead/html/r20220412/r20110914/zrt_lookup.html?fsb=1#RS-0-&adk=1812271808&client=ca-pub-4594829972910700&fa=8&ifi=3&uci=a!3&xpc=OTn9BM41gj&p=https%3A//browserleaks.com,https://browserleaks.com/,0,0,1,False
https://browserleaks.com/ https://www.googletagservices.com/activeview/js/current/rx_lidar.js?cache=r20110914,https://browserleaks.com/,0,0,1,False
https://browserleaks.com/ https://www.gstatic.com/mysidia/fb084ba56019ecef1e967c41e75d05fd.js?tag=mysidia_one_click_handler_one_afma_2019,https://browserleaks.com/,0,0,1,False


In [30]:
inf_df[inf_df['tracking']==True]

Unnamed: 0,site_url,navplug_count,navmim_count,info_ob_count,tracking


In [31]:
def checkWebRTC(rtc_data, rtc_offer, rtc_cand):
    if rtc_data > 0 and rtc_offer > 0 and rtc_cand > 0:
        return True 
    return False 

webrtc_df['tracking'] = webrtc_df.apply(lambda x: checkWebRTC(x['rtc_data'],x['rtc_offer'],x['rtc_cand']), axis=1)
webrtc_df

IndexError: index 0 is out of bounds for axis 0 with size 0

Empty DataFrame
Columns: [tracking]
Index: []

In [32]:
def checkCanvasFont(canvas_font, canvas_text):
    if canvas_font >= 50 or canvas_text >= 50:
        return True
    return False

canvasFont_df['tracking'] = canvasFont_df.apply(lambda x: checkCanvasFont(x['canvas_font'],x['canvas_text']), axis=1)
canvasFont_df

Unnamed: 0,canvas_font,canvas_text,site_url,tracking
https://browserleaks.com/ https://pagead2.googlesyndication.com/bg/l2o4cWLNalU19nN7vA12WZhb1qS4KDqIWPmZT-glBuk.js,0,0,https://browserleaks.com/,False


In [33]:
canvas_df['canvas_cf_values'] = canvas_df['canvas_cf'].apply(lambda x: True if (x != 0) else False)
canvas_df['canvas_cs_values'] = canvas_df['canvas_cs'].apply(lambda x: True if (x != 0) else False)

In [34]:
def check_canvas_ff(canvas_ff):
    if len(canvas_ff) != 0: 
        for ff in canvas_ff: 
            if len(set(ff[ff.find('0":"')+4:ff.find(',"1"')-1]))>= 10:
                return True 
    return False

def check_canvas_fs(canvas_fs):
    if len(canvas_ff) != 0: 
        for fs in canvas_fs: 
            if len(set(fs[fs.find('0":"')+4:fs.find(',"1"')-1]))>= 10:
                return True 
    return False

def check_canvas_img(canvas_img): 
    for img in canvas_img: 
        if float(img[img.find('"2":')+4:img.find(',"3"')]) >= 16:
            if float(img[img.find('"3":')+4:img.find('}')]) >= 16:
                return True
    return False

canvas_df['canvas_ff_values'] = canvas_df['canvas_ff'].apply(lambda x: check_canvas_ff(x))
canvas_df['canvas_fs_values'] = canvas_df['canvas_fs'].apply(lambda x: check_canvas_fs(x))
canvas_df['canvas_img_values'] = canvas_df['canvas_img'].apply(lambda x: check_canvas_img(x))


In [35]:
# canvas_df
def checkCanvas(canvas_h, canvas_w, canvas_cf_values, canvas_cs_values, canvas_ff_values, canvas_fs_values, 
                canvas_save, canvas_rest, canvas_el, canvas_data, canvas_img_values):
    if float(canvas_h) >= 16 and float(canvas_w) >= 16: 
        return True 
    
    if canvas_cf_values and canvas_cs_values: 
        return True 
    
    if canvas_ff_values or canvas_fs_values or canvas_img_values:
        return True

    if canvas_save > 0 or canvas_rest > 0 or canvas_el > 0 or canvas_data > 0: 
        return True 
    
    return False

canvas_df['tracking'] = canvas_df.apply(lambda x: checkCanvas(x['canvas_h'],x['canvas_w'],x['canvas_cf'], x['canvas_cs'],x['canvas_ff'], x['canvas_fs'], x['canvas_save'], x['canvas_rest'], x['canvas_el'], x['canvas_data'], x['canvas_img']), axis=1)


In [36]:
canvas_df[canvas_df['tracking']==True]

Unnamed: 0,canvas_cf,canvas_cs,canvas_data,canvas_el,canvas_ff,canvas_fs,canvas_h,canvas_img,canvas_rest,canvas_save,canvas_w,site_url,canvas_cf_values,canvas_cs_values,canvas_ff_values,canvas_fs_values,canvas_img_values,tracking


## Prueba con modelos de predicción 

In [37]:
import joblib
canvas_model = joblib.load("./models/canvas_model.joblib")

In [38]:
columns = ["canvas_h", "canvas_w", "canvas_cf_values", "canvas_cs_values", "canvas_ff_values", "canvas_fs_values", 
                "canvas_save", "canvas_rest", "canvas_el", "canvas_data", "canvas_img_values"]
canvas = canvas_df[columns]
canvas

Unnamed: 0,canvas_h,canvas_w,canvas_cf_values,canvas_cs_values,canvas_ff_values,canvas_fs_values,canvas_save,canvas_rest,canvas_el,canvas_data,canvas_img_values
https://browserleaks.com/ https://pagead2.googlesyndication.com/bg/l2o4cWLNalU19nN7vA12WZhb1qS4KDqIWPmZT-glBuk.js,0,0,False,False,False,False,0,0,0,0,False


In [39]:
canvas_model.predict(canvas)

array([False])

In [40]:
canvasFont_model = joblib.load("./models/canvas_font.joblib")
columns_font = ["canvas_font", "canvas_text"]
canvas_font = canvasFont_df[columns_font]
canvas_font

Unnamed: 0,canvas_font,canvas_text
https://browserleaks.com/ https://pagead2.googlesyndication.com/bg/l2o4cWLNalU19nN7vA12WZhb1qS4KDqIWPmZT-glBuk.js,0,0


In [41]:
canvasFont_model.predict(canvas_font)

array([False])

In [45]:
inf_object_model = joblib.load("./models/inf_model.joblib")
columns_inf = ["navplug_count", "navmim_count", "info_ob_count"]
inf_object = inf_df[columns_inf]
inf_object

Unnamed: 0,navplug_count,navmim_count,info_ob_count
https://browserleaks.com/ https://googleads.g.doubleclick.net/pagead/html/r20220412/r20190131/zrt_lookup.html,0,0,1
https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js,0,0,1
https://browserleaks.com/ https://www.google-analytics.com/analytics.js,1,0,3
https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/managed/js/adsense/m202204040101/show_ads_impl_with_ama_fy2019.js?client=ca-pub-4594829972910700&plah=browserleaks.com,0,0,4
https://browserleaks.com/ https://browserleaks.com/js/default.js?v=28880151,0,0,1
https://browserleaks.com/ https://cdn.ampproject.org/rtv/012203150226000/amp4ads-v0.mjs,0,0,2
https://browserleaks.com/ https://pagead2.googlesyndication.com/pagead/managed/js/adsense/m202204040101/reactive_library_fy2019.js,0,0,1
https://browserleaks.com/ https://googleads.g.doubleclick.net/pagead/html/r20220412/r20110914/zrt_lookup.html?fsb=1#RS-0-&adk=1812271808&client=ca-pub-4594829972910700&fa=8&ifi=3&uci=a!3&xpc=OTn9BM41gj&p=https%3A//browserleaks.com,0,0,1
https://browserleaks.com/ https://www.googletagservices.com/activeview/js/current/rx_lidar.js?cache=r20110914,0,0,1
https://browserleaks.com/ https://www.gstatic.com/mysidia/fb084ba56019ecef1e967c41e75d05fd.js?tag=mysidia_one_click_handler_one_afma_2019,0,0,1


In [46]:
inf_object_model.predict(inf_object)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [None]:
webrtc_model = joblib.load("./models/webrtc_model.joblib")
columns_webrtc = ["rtc_data", "rtc_offer", "rtc_cand"]
webrtc = webrtc_df[columns_webrtc]
webrtc

In [None]:
webrtc_model.predict(webrtc)