In [1]:
import os
import sqlite3
import zlib
from multiprocessing import Pool as ThreadPool

import jsbeautifier
import pandas as pd

import common as cm
import utilities

In [2]:
# Root directory of the crawl output. The LevelDB content store, the SQLite
# crawl database and every file this notebook writes are resolved relative to it.
# Previous (local machine) location, kept for reference:
# base_directory = '/Users/uiqbal/Documents/work/data/crawl_20K/'
base_directory = '/mnt/data_6tb/macbook/crawl_20K/'

In [3]:
# Connect to content database
ldb = cm.get_leveldb(os.path.join(base_directory, 'content.ldb'))

In [4]:
con = sqlite3.connect(os.path.join(base_directory, 'crawl-data.sqlite'))
cur = con.cursor()
cm.optimize_db(cur)

In [5]:
cm.create_visit_id_index(cur, "http_responses")
cm.create_visit_id_index(cur, "http_requests")
cm.create_visit_id_index(cur, "javascript")
cur.execute("CREATE INDEX IF NOT EXISTS javascript_script_url ON javascript(script_url);")
con.commit()

CREATE INDEX http_responses
Finished adding visit_id index to the table http_responses
CREATE INDEX http_requests
Finished adding visit_id index to the table http_requests
CREATE INDEX javascript
Finished adding visit_id index to the table javascript


In [6]:
js_id_urls = pd.read_sql('SELECT distinct visit_id, script_url FROM javascript', con=con)

In [8]:
def get_url_content_hash(url, sqlite_cur, visit_id=None):
    """Look up the content hash recorded for the response to ``url``.

    Parameters
    ----------
    url : str
        response URL to look up in the ``http_responses`` table
    sqlite_cur : sqlite3.Cursor
        cursor for crawl database
    visit_id : int
        (optional) restrict the lookup to this page visit

    Returns
    -------
    str
        the first matching ``content_hash``, or ``''`` when there is no
        matching row or the stored hash is NULL / blank
    """
    if visit_id is None:
        query = "SELECT content_hash FROM http_responses WHERE url = ? LIMIT 1;"
        params = (url,)
    else:
        query = ("SELECT content_hash FROM http_responses WHERE "
                 "visit_id = ? AND url = ? LIMIT 1;")
        params = (visit_id, url)
    sqlite_cur.execute(query, params)

    row = sqlite_cur.fetchone()
    if row is None or len(row) == 0:
        return ''
    stored_hash = row[0]
    # A NULL or whitespace-only hash is treated the same as a missing row.
    if stored_hash is None or stored_hash.strip() == '':
        return ''
    return stored_hash

In [9]:
js_id_urls['content_hash'] = js_id_urls.apply(
    axis=1,
    func=lambda x: get_url_content_hash(x['script_url'], cur, visit_id=x['visit_id'])
)

In [10]:
def get_content_policy_type(url, sqlite_cur, visit_id=None):
    """Look up the content policy type recorded for the request to ``url``.

    Parameters
    ----------
    url : str
        request URL to look up in the ``http_requests`` table
    sqlite_cur : sqlite3.Cursor
        cursor for crawl database
    visit_id : int
        (optional) restrict the lookup to this page visit

    Returns
    -------
    the ``content_policy_type`` value of the first matching row, or ``-1``
    when no row matches
    """
    if visit_id is None:
        query = "SELECT content_policy_type FROM http_requests WHERE url = ? LIMIT 1;"
        params = (url,)
    else:
        query = ("SELECT content_policy_type FROM http_requests WHERE "
                 "visit_id = ? AND url = ? LIMIT 1;")
        params = (visit_id, url)
    sqlite_cur.execute(query, params)

    row = sqlite_cur.fetchone()
    # -1 is the sentinel for "no request row found".
    if row is None or len(row) == 0:
        return -1
    return row[0]

In [11]:
js_id_urls['content_policy_type'] = js_id_urls.apply(
    axis=1,
    func=lambda x: get_content_policy_type(x['script_url'], cur, visit_id=x['visit_id'])
)

In [12]:
print(js_id_urls.shape)
print(js_id_urls.columns)
print('No content hash: ', (js_id_urls['content_hash'].values == '').sum())
print('No script url: ', (js_id_urls['script_url'].values == '').sum())
# print('No visit id: ', (js_id_urls['visit_id'].values == None).sum())
print('No content policy type: ', (js_id_urls['content_policy_type'].values == -1).sum())
# print(js_id_urls.isnull().sum(axis = 0))
# print(list(js_id_urls['content_policy_type'].unique()))

(417846, 4)
Index(['visit_id', 'script_url', 'content_hash', 'content_policy_type'], dtype='object')
No content hash:  14087
No script url:  11316
No content policy type:  13943


In [57]:
print(js_id_urls[(js_id_urls['content_hash'] != '') & (js_id_urls['content_policy_type'] == -1)])
# print(js_id_urls)

Empty DataFrame
Columns: [visit_id, script_url, content_hash, content_policy_type, eval_function_presence, id]
Index: []


In [116]:
js_data = pd.read_sql('SELECT distinct visit_id, script_url, top_level_url, symbol, \
                      arguments, value, script_line, script_col FROM javascript where script_url == "" \
                      and script_line != "http" and script_line != "https" \
                      and symbol != "Node.nodeType" ', con=con)

      visit_id script_url                   top_level_url  \
0           55                       https://sinoptik.ua/   
1          394                 https://www.goal.com/en-us   
2          394                 https://www.goal.com/en-us   
3          394                 https://www.goal.com/en-us   
4          394                 https://www.goal.com/en-us   
5          394                 https://www.goal.com/en-us   
6          394                 https://www.goal.com/en-us   
7          394                 https://www.goal.com/en-us   
8          394                 https://www.goal.com/en-us   
9          394                 https://www.goal.com/en-us   
10         394                 https://www.goal.com/en-us   
11         394                 https://www.goal.com/en-us   
12         401                     https://saglik.gov.tr/   
13         546             https://www.mismarcadores.com/   
14         546             https://www.mismarcadores.com/   
15         546          

In [117]:
print(js_data.script_col)

0                                                        
1       perform/performid/dist/performid.js"]/e.export...
2       perform/performid/dist/performid.js"]/e.export...
3       perform/performid/dist/performid.js"]/e.export...
4       perform/performid/dist/performid.js"]/e.export...
5       perform/performid/dist/performid.js"]/e.export...
6       perform/performid/dist/performid.js"]/e.export...
7       perform/performid/dist/performid.js"]/e.export...
8       perform/performid/dist/performid.js"]/e.export...
9       perform/performid/dist/performid.js"]/e.export...
10      perform/performid/dist/performid.js"]/e.export...
11      perform/performid/dist/performid.js"]/e.export...
12                                                       
13                                                       
14                                                       
15                                                       
16                                                       
17            

In [118]:
print(list(js_id_urls['content_policy_type'].unique()))

[2, 6, -1, 7, 5, 11, 3, 20, 4]


In [105]:
def get_url_content(db, content_hash, compression='snappy', beautify=True):
    """Fetch a stored body by hash and beautify it or scan it for eval/Function.

    Parameters
    ----------
    db : object with a ``get(key: bytes)`` method
        content store keyed by the UTF-8 encoded content hash
    content_hash : str
        hash of the body to fetch; ``None`` or ``''`` yields ``None``
    compression : str
        one of ``'snappy'``, ``'none'``, ``'gzip'``. Only ``'gzip'`` triggers
        decompression here; for the other two the bytes returned by ``db.get``
        are used unchanged.
    beautify : bool
        if True return the beautified source text; if False return a bool
        telling whether the body contains ``eval(`` or ``Function(``

    Returns
    -------
    str, bool or None
        ``None`` when the hash is empty, not found, the compression type is
        unsupported, or gzip decompression fails.
    """
    if content_hash is None or content_hash == '':
        return
    content = db.get(bytes(content_hash, encoding='UTF-8'))
    if content is None:
        print("ERROR: content hash: %s NOT FOUND" % content_hash)
        return
    supported = ['snappy', 'none', 'gzip']
    if compression not in supported:
        print("Unsupported compression type %s. Only %s "
              "are the supported options." % (compression, str(supported)))
        return
    if compression == 'gzip':
        # Catch only zlib.error: a broad `except Exception` previously hid the
        # NameError caused by zlib not being imported, so gzip never worked.
        try:
            # wbits = MAX_WBITS | 16 expects a gzip header and trailer.
            content = zlib.decompress(content, zlib.MAX_WBITS | 16)
        except zlib.error:
            try:
                # Fall back to a plain zlib stream.
                content = zlib.decompress(content)
            except zlib.error:
                print("Failed to decompress gzipped content...")
                return
    if beautify:
        # Decode before beautifying, matching get_url_content_2.
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        return jsbeautifier.beautify(content)
    # Plain substring check on the raw bytes; it also matches names that merely
    # end in "eval(" or "Function(".
    return b'eval(' in content or b'Function(' in content

In [14]:
js_id_urls['eval_function_presence'] = js_id_urls.apply(
    axis=1,
    func=lambda x: get_url_content(ldb, x['content_hash'], beautify=False)
)

In [15]:
# Count scripts by scan result. Rows whose body could not be fetched hold None
# and are counted in neither line.
print('No eval: ', (js_id_urls['eval_function_presence'].values == False).sum())
print('Eval: ', (js_id_urls['eval_function_presence'].values == True).sum())

Eval:  258537
Eval:  145222


In [16]:
# add index
js_id_urls['id'] = js_id_urls.index + 1
print(js_id_urls.columns)

Index(['visit_id', 'script_url', 'content_hash', 'content_policy_type',
       'eval_function_presence', 'id'],
      dtype='object')


In [24]:
js_id_urls.content_policy_type.value_counts()

 2     335656
 7      51673
 6      16323
-1      13943
 11       155
 5         71
 20        16
 3          8
 4          1
Name: content_policy_type, dtype: int64

In [112]:
def get_url_content_2(db, content_hash, compression='snappy', beautify=True):
    """Fetch a stored body by hash, returning beautified text or raw bytes.

    Parameters
    ----------
    db : object with a ``get(key: bytes)`` method
        content store keyed by the UTF-8 encoded content hash
    content_hash : str
        hash of the body to fetch; ``None`` or ``''`` yields ``None``
    compression : str
        one of ``'snappy'``, ``'none'``, ``'gzip'``. Only ``'gzip'`` triggers
        decompression here; for the other two the bytes returned by ``db.get``
        are used unchanged.
    beautify : bool
        if True decode the body as UTF-8 and return the beautified source;
        if False return the body bytes as they are

    Returns
    -------
    str, bytes or None
        ``None`` when the hash is empty, not found, the compression type is
        unsupported, or gzip decompression fails.
    """
    if content_hash is None or content_hash == '':
        return
    content = db.get(bytes(content_hash, encoding='UTF-8'))
    if content is None:
        print("ERROR: content hash: %s NOT FOUND" % content_hash)
        return
    supported = ['snappy', 'none', 'gzip']
    if compression not in supported:
        print("Unsupported compression type %s. Only %s "
              "are the supported options." % (compression, str(supported)))
        return
    if compression == 'gzip':
        # Catch only zlib.error: a broad `except Exception` previously hid the
        # NameError caused by zlib not being imported, so gzip never worked.
        try:
            # wbits = MAX_WBITS | 16 expects a gzip header and trailer.
            content = zlib.decompress(content, zlib.MAX_WBITS | 16)
        except zlib.error:
            try:
                # Fall back to a plain zlib stream.
                content = zlib.decompress(content)
            except zlib.error:
                print("Failed to decompress gzipped content...")
                return
    if beautify:
        return jsbeautifier.beautify(content.decode("utf-8"))
    return content

In [65]:
# Print the beautified body of the first script whose content policy type is 4
# and that has a stored content hash. The loop variable is named script_hash so
# the builtin hash() is not shadowed in the notebook namespace.
type_4_with_content = (js_id_urls['content_policy_type'].values == 4) & (js_id_urls['content_hash'].values != '')
for script_hash in list(js_id_urls[type_4_with_content]['content_hash']):
    print(get_url_content_2(ldb, script_hash))
    break

In [111]:
# Directory that receives the dumped eval-containing scripts and their pages.
evaled_scripts_directory = os.path.join(base_directory, 'evaled_scripts')

# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race; missing parent directories are created as well.
os.makedirs(evaled_scripts_directory, exist_ok=True)
    
def write_content(file_addr, content):
    """Write the bytes ``content`` to ``file_addr``, replacing any existing file."""
    with open(file_addr, mode='wb') as sink:
        sink.write(content)
        
def dump_html_js(content, file_name, js_or_html='js'):
    """Store ``content`` under ``evaled_scripts_directory``.

    With ``js_or_html='html'`` the bytes are written to ``<file_name>.html``.
    With ``js_or_html='js'`` they are written to ``<file_name>.js`` and a small
    HTML page that loads that script through a ``<script src=...>`` tag is
    written to ``<file_name>.html``. Any other value writes nothing.
    """
    if js_or_html not in ('html', 'js'):
        return

    html_path = os.path.join(evaled_scripts_directory, file_name + '.html')
    if js_or_html == 'html':
        write_content(html_path, content)
        return

    write_content(os.path.join(evaled_scripts_directory, file_name + '.js'), content)
    script_name = file_name + '.js'
    # The script name is substituted twice: page title and script src.
    html_markup = """
<!doctype html>
<html>
    <head>
    <title>Test page for script %s </title>
          <script type="text/javascript" src="%s"></script>
     </head>
     <body>
     </body>
 </html>

"""
    wrapper_page = html_markup % (script_name, script_name)
    write_content(html_path, wrapper_page.encode())
    
    

In [110]:
def append_file(f_name, content_list):
    """Append the strings in ``content_list`` to ``f_name`` as comma-separated values.

    The file is created if it does not exist. When it already holds data, a
    comma is written first so that repeated calls produce one well-formed
    comma-separated list; without it the last entry of one call and the first
    entry of the next would run together.
    """
    joined = ','.join(content_list)
    # Only emit the separator when there is something on both sides of it.
    needs_separator = bool(joined) and os.path.exists(f_name) and os.path.getsize(f_name) > 0
    with open(f_name, 'a') as out_file:
        if needs_separator:
            out_file.write(',')
        out_file.write(joined)
        
# Dump every distinct eval-containing body to disk and record the name of the
# HTML page written for it.
count_js = 0
count_html = 0
content_list = []

for _, script in js_id_urls.drop_duplicates('content_hash').iterrows():
    # Skip rows without stored content and rows not flagged for eval(/Function(.
    if script['content_hash'] == '' or script['eval_function_presence'] != True:
        continue

    policy_type = script['content_policy_type']
    file_stem = str(script['id']) + '_' + script['content_hash']
    if policy_type in (7, 6):
        # Types 7 and 6 are stored as an HTML page as-is.
        dump_html_js(get_url_content_2(ldb, script['content_hash'], beautify=False), file_stem, js_or_html='html')
        count_html += 1
    elif policy_type in (2, 11, 5):
        # Types 2, 11 and 5 are stored as a .js file plus an HTML wrapper page.
        dump_html_js(get_url_content_2(ldb, script['content_hash'], beautify=False), file_stem, js_or_html='js')
        count_js += 1
    else:
        continue
    # Both branches produce <stem>.html, so that is the name recorded.
    content_list.append(file_stem + '.html')

print('Total files with eval (scripts, iframes) : ( ', str(count_js) + ' , ' + str(count_html) + ' )')
append_file(os.path.join(base_directory, 'packed_files_names.txt'), content_list)
js_id_urls.to_json(os.path.join(base_directory, 'js_id_urls_hash_content_type_mapping.json'))

Total files with eval (scripts, iframes) : (  53861 , 3080 )


In [99]:
# content_list = []
# # only keep unique for processing evals
# for index, row in js_id_urls.drop_duplicates().iterrows():
#     if row['content_hash'] != '' and row['eval_function_presence'] == True:
#         if row['content_policy_type'] == 7 or row['content_policy_type'] == 6:
#             content_list.append(str(row['id']) + '.html')
#         elif row['content_policy_type'] == 2 or row['content_policy_type'] == 11 or row['content_policy_type'] == 5:
#             content_list.append(str(row['id']) + '.html')
            
# append_file(os.path.join(base_directory, 'packed_files_names.txt'), content_list)
# js_id_urls.to_json(os.path.join(base_directory, 'js_id_urls_hash_content_type_mapping.json'))

In [14]:
# def find_eval_or_Function(content):
#     if 'eval(' in content or 'Function(' in content:
#         return True
#     return False
# pool = ThreadPool(processes=7)
# results = pool.map(find_eval_or_Function, js_id_urls['content'])
# js_id_urls['eval_function_presence'] = [x for x in results]

In [3]:
def read_df(file_name):
    """Load the JSON file ``file_name`` into a pandas DataFrame."""
    return pd.read_json(file_name)

def write_content(file_addr, content):
    """Write the bytes ``content`` to ``file_addr``, replacing any existing file."""
    with open(file_addr, mode='wb') as sink:
        sink.write(content)

# Directory that receives scripts whose body contains neither eval( nor Function(.
non_evaled_scripts_directory = os.path.join(base_directory, 'non_evaled_scripts')
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race; missing parent directories are created as well.
os.makedirs(non_evaled_scripts_directory, exist_ok=True)
    

In [4]:
js_id_hash_file = read_df(os.path.join(base_directory, 'js_id_urls_hash_content_type_mapping.json'))

In [5]:
js_id_hash_file.columns

Index(['visit_id', 'script_url', 'content_hash', 'content_policy_type',
       'eval_function_presence', 'id'],
      dtype='object')

In [6]:
def write_file(f_name, content_list):
    """Append every string in ``content_list`` to ``f_name``, one per line."""
    with open(f_name, mode='a') as sink:
        sink.writelines(entry + '\n' for entry in content_list)
        
# Collect the page names of distinct bodies with content policy type 7 or 6
# that were scanned and found to contain no eval(/Function(.
count = 0
content_list = []
for _, script in js_id_hash_file.drop_duplicates('content_hash').iterrows():
    if script['content_hash'] == '':
        continue
    if script['eval_function_presence'] != False:
        continue
    if script['content_policy_type'] not in (7, 6):
        continue

    count += 1
    content_list.append(str(script['id']) + '_' + script['content_hash'] + '.html')
    if count % 10000 == 0:
        print('Processed so far: ', count)

write_file(os.path.join(base_directory, 'iframe_file_names.txt'), content_list)
print('Total files without eval: ', str(count))

Total files without eval:  3080


In [7]:
def read_df(file_name):
    """Load the JSON file ``file_name`` into a pandas DataFrame."""
    return pd.read_json(file_name)

already_read = read_df(os.path.join(base_directory, 'js_id_urls_hash_content_type_mapping.json'))
already_processed = set(already_read['content_hash'].tolist())

In [8]:
print(len(already_processed))

151192


# Compute crawl statistics

In [9]:
already_read.columns

Index(['content_hash', 'content_policy_type', 'eval_function_presence', 'id',
       'script_url', 'visit_id'],
      dtype='object')

In [73]:
# Row counts by scan result; the two lines are labelled differently so the
# output can be read without looking at the code.
print('No eval: ', (already_read['eval_function_presence'].values == False).sum())
print('Eval: ', (already_read['eval_function_presence'].values == True).sum())

# Named eval_mask rather than `filter` so the builtin filter() stays usable.
eval_mask = already_read["eval_function_presence"] == True
# Mask out non-eval rows, group by content hash and count the groups: each
# printed number is the count of distinct content hashes among eval rows.
print(already_read.where(eval_mask, inplace=False).groupby('content_hash').count().count())

# already_read.groupby('content_hash').apply(filter, inplace = False).count()

Eval:  258537
Eval:  145222
content_policy_type       56941
eval_function_presence    56941
id                        56941
script_url                56941
visit_id                  56941
dtype: int64


In [37]:
# Boolean mask selecting the rows whose eval_function_presence equals 1.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of
# the kernel session.
filter = already_read["eval_function_presence"]==1
# Mask already_read IN PLACE: rows where the mask is False are replaced by
# missing values (DataFrame.where default). Every later cell that reads
# already_read sees the masked frame, so re-running earlier cells afterwards
# gives different results. NOTE(review): consider assigning to a new name.
already_read.where(filter, inplace = True) 

content_hash                  1
content_policy_type           8
eval_function_presence        0
id                        14087
script_url                 2214
visit_id                  12761
dtype: int64

# Process scripts that were not executed

In [18]:
all_hashes = read_df(os.path.join(base_directory, 'all_hashes_url_top-10K_ran-10K.json'))

In [19]:
all_hashes['content_hash'].nunique()

254790

In [20]:
all_hashes['url'].nunique()

357726

In [21]:
cond = all_hashes['content_hash'].isin(already_read['content_hash']) == True
all_hashes.drop(all_hashes[cond].index, inplace = True)

In [22]:
all_hashes['content_hash'].nunique()
# already_read['content_hash'].nunique()

136685

In [23]:
all_hashes['url'].nunique()

168917

In [16]:
all_hashes

## Check whether the hashed URLs are present in the `javascript` table

In [18]:
def get_presence(script_url, sqlite_cur, visit_id=None):
    """Tell whether ``script_url`` occurs in the ``javascript`` table.

    Parameters
    ----------
    script_url : str
        script URL to look for
    sqlite_cur : sqlite3.Cursor
        cursor for crawl database
    visit_id : int
        (optional) restrict the lookup to this page visit

    Returns
    -------
    bool
        True when at least one matching row exists, False otherwise
    """
    if visit_id is None:
        query = "SELECT script_url FROM javascript WHERE script_url = ? LIMIT 1;"
        params = (script_url,)
    else:
        query = ("SELECT script_url FROM javascript WHERE "
                 "visit_id = ? AND script_url = ? LIMIT 1;")
        params = (visit_id, script_url)
    sqlite_cur.execute(query, params)

    match = sqlite_cur.fetchone()
    return not (match is None or len(match) == 0)

In [15]:
all_hashes['presence'] = all_hashes.apply(
    axis=1,
    func=lambda x: get_presence(x['url'], cur, visit_id=x['visit_id'])
)

In [14]:
cond = all_hashes['url'].isin(js_id_urls['script_url']) == True
all_hashes.drop(all_hashes[cond].index, inplace = True)

In [106]:
all_hashes['eval_function_presence'] = all_hashes.apply(
    axis=1,
    func=lambda x: get_url_content(ldb, x['content_hash'], beautify=False)
)

In [119]:
# js_id_urls received ids index + 1 over 417846 rows, i.e. 1..417846 (assuming
# its default 0-based index). Ids for these additional scripts therefore start
# at 417847; starting at 417846 would reuse the last js_id_urls id.
# NOTE: DataFrame.insert raises ValueError if an 'id' column already exists,
# so this cell cannot be re-run on the same frame.
all_hashes.insert(0, 'id', range(417847, 417847 + len(all_hashes)))

In [128]:
def append_file(f_name, content_list):
    """Append the strings in ``content_list`` to ``f_name`` as comma-separated values.

    The file is created if it does not exist. When it already holds data, a
    comma is written first so that repeated calls produce one well-formed
    comma-separated list; without it the last entry of one call and the first
    entry of the next would run together.
    """
    joined = ','.join(content_list)
    # Only emit the separator when there is something on both sides of it.
    needs_separator = bool(joined) and os.path.exists(f_name) and os.path.getsize(f_name) > 0
    with open(f_name, 'a') as out_file:
        if needs_separator:
            out_file.write(',')
        out_file.write(joined)
        
def write_content(file_addr, content):
    """Write the bytes ``content`` to ``file_addr``, replacing any existing file."""
    with open(file_addr, mode='wb') as sink:
        sink.write(content)
        
# Directory that receives scripts whose body contains neither eval( nor Function(.
non_evaled_scripts_directory = os.path.join(base_directory, 'non_evaled_scripts')
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race; missing parent directories are created as well.
os.makedirs(non_evaled_scripts_directory, exist_ok=True)
    

In [132]:
# Dump each distinct body from all_hashes. Eval-containing ones go to the
# evaled-scripts directory as .js plus wrapper page; the others are written
# unchanged into non_evaled_scripts_directory.
count_eval = 0
count_non_eval = 0
content_list = []
non_evaled_content_list = []

for _, script in all_hashes.drop_duplicates('content_hash').iterrows():
    if script['content_hash'] == '':
        continue

    if script['eval_function_presence'] == True:
        file_stem = str(script['id']) + '_' + script['content_hash']
        dump_html_js(get_url_content_2(ldb, script['content_hash'], beautify=False), file_stem, js_or_html='js')
        content_list.append(file_stem + '.html')
        count_eval += 1
        if count_eval % 10000 == 0:
            print('Processed so far: ', count_eval)
    elif script['eval_function_presence'] == False:
        file_stem = str(script['id']) + '_' + script['content_hash']
        target_path = os.path.join(non_evaled_scripts_directory, file_stem)
        write_content(target_path, get_url_content_2(ldb, script['content_hash'], beautify=False))
        non_evaled_content_list.append(file_stem)
        count_non_eval += 1
        if count_non_eval % 10000 == 0:
            print('Processed so far: ', count_non_eval)
    # Rows whose scan result is neither True nor False are left out.

print('Total files with eval, non_eval : ', count_eval, count_non_eval)
append_file(os.path.join(base_directory, 'packed_files_names.txt'), content_list)
append_file(os.path.join(base_directory, 'non_evaled_files_names.txt'), non_evaled_content_list)
all_hashes.to_json(os.path.join(base_directory, 'not_executed_js_id_urls_hash_mapping.json'))

Processed so far:  10000
Processed so far:  20000
Processed so far:  30000
Processed so far:  40000
Processed so far:  50000
Processed so far:  60000
Processed so far:  70000
Processed so far:  80000
Processed so far:  10000
Processed so far:  90000
Processed so far:  100000
Processed so far:  110000
Processed so far:  120000
Total files with eval, non_eval :  14700 121985


In [126]:
all_hashes.shape

(211235, 5)

In [7]:
temp = read_df('labels/2019-06-23_top_10k_random_10k_stateless-ground_truth.json')

In [140]:
temp.shape

(5437, 8)

In [153]:
# temp.where(temp.content_hash == None).count()
(temp['content_hash'].values == None).sum()

300

In [154]:
temp['content_hash'].nunique()

956

In [182]:
# for hash in list(temp[(temp['content_policy_type'].values == 4) & (temp['content_hash'].values == None)]['content_hash']):
#     print(get_url_content_2(ldb, hash))
#     break

temp_list = temp[temp['content_hash'].values == None]['script_url'].unique()

In [189]:
temp_set = set()
for index, row in temp.iterrows():
    if row['script_url'] in temp_list and row['content_hash'] != None:
        temp_set.add(row['script_url'])

In [190]:
len(temp_set)

163

In [183]:
temp[temp['content_hash'].values == None]['script_url'].nunique()

168

In [185]:
count = 0 
for index, row in temp.iterrows():
    if row['content_hash'] == None:
        count += 1
        
print(count)

300


In [195]:
for i in temp_list:
    if i.strip() not in temp_set:
        print(i.strip())

(program):2
dna.min.js

webpack:///../node_modules/fingerprintjs2/fingerprint2.js?
dna-test.min.js


In [2]:
print('hello')

hello


In [9]:
temp[temp.script_url == '/index.js']

Unnamed: 0,content_hash,is_audio,is_canvas,is_canvas_font,is_webrtc,script_url,top_level_url,visit_id


In [28]:
temp[temp['content_hash'].values == None]['script_url'].unique()

array(['https://c.adsco.re/',
       'https://www.spectrum.com/static/6a6d3d1b77e185aaf5e4dc3e9065d02',
       'https://www.cibc.com/public/a42a80573231e662bfcf1176a8c1f',
       'https://www.academy.com/assets/0efdabb1142fd64881345184d251',
       'http://c.adsco.re/',
       'https://www.jdsports.co.uk/resources/c1b010f854236babd5d8a29fccf23f',
       'https://www.zalando.fr/resources/5ec1b2235b180f5d17cbb60fe8f294',
       'https://www.desjardins.com/static/8f64f77494c229d13ce1b2e62d40305',
       'https://www.navyfederal.org/resources/93d02c85f6148b791193a3e097941a',
       'https://www.otomoto.pl/resources/f7b2f3bb71224a67360493d44a9903',
       'https://www.emirates.com/resources/ee9ea7b762160cc99a0d7ad6c6bd99',
       'https://wizzair.com/public/4924df5c722758564e47174466d8e',
       'https://www.tcsion.com/assets/4b001aa61701202035a4ff2be56d',
       'https://www.kobo.com/static/cb916be95261810023fac2b4ef93e3f',
       'https://www.casasbahia.com.br/resources/af16793fb1156ba632

In [37]:
temp['script_url'].str.contains('/dufhidufhweiufhuwiefh.js', na=False, regex=False).any()

False

In [35]:
temp['content_hash'].str.contains('script_hash', na=False, regex=False).any()

False

In [49]:
temp = read_df('/mnt/data_6tb/macbook/crawl_20K/js_id_urls_hash_content_type_mapping.json')

In [50]:
temp

Unnamed: 0,content_hash,content_policy_type,eval_function_presence,id,script_url,visit_id
0,5f48fc77cac90c4778fa24ec9c57f37d,2,1.0,1,https://code.jquery.com/jquery-3.2.1.slim.min.js,2
1,bffc6023835e717c0348c41583e56eba,2,0.0,2,http://hajoopteg.com/feed/assets/default/js/fo...,2
10,d6784348cabbc2b6b0db7aa1a6c7f1d5,6,0.0,11,https://www.linuxquestions.org/,3
100,133700ce91867800e5b6489adc2e644a,7,0.0,101,https://pagead2.googlesyndication.com/pagead/s...,4
1000,e23415c704caa3067181e404a7d47e95,2,1.0,1001,https://api.useinsider.com/js/squery.min.js,36
10000,c8e0d2176bb426a32c4c6a89c3e60390,2,1.0,10001,https://g.alicdn.com/shop/shop-caja/0.1.0/r300...,425
100000,73e6254903375e432d34c87deaec234a,2,1.0,100001,https://www.cgg.gov.in/slider/jquery.js,4454
100001,0aa9e28b4a0be2a105e18f0e718e12cf,2,1.0,100002,https://www.cgg.gov.in/core/themes/edsbootstra...,4454
100002,5a03f97cc479b9f5d7efdaccec31bc17,2,0.0,100003,https://www.cgg.gov.in/helper/js/wp-embed.min....,4454
100003,95e5ffd5de759d81ed7cd57c2affd429,2,1.0,100004,https://www.cgg.gov.in/core/themes/edsbootstra...,4454


In [53]:
temp[temp['content_hash'].values == '21424a27c98324d45752bc21cecdbfb5']['script_url']

102675    https://cdn.upbit.com/vendors-chunk-fa7c5c5c56...
Name: script_url, dtype: object

In [54]:
df = read_df('/mnt/drive/firefox/parser/labels/2019-06-23_top_10k_random_10k_stateless-ground_truth.json')

In [55]:
df['content_hash'].str.contains('78437e88602a3c5674db24cff9ae1481', na=False).any()

False