In [1]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

## Intro
This file loads and persists data from the OPP-115 corpus, courtesy of UsablePrivacy.org.

### Some basic db setup



In [64]:
dbname = 'beforeiagree_db'
username = 'peterostendorp'

#Create engine
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

#create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

postgres://peterostendorp@localhost/beforeiagree_db
True


### Sites table setup
Read in sites, including those not covered by the OPP-115 corpus.

In [184]:
sites = pd.read_csv('../OPP-115/documentation/websites_covered_opp115.csv',index_col=3,parse_dates=[4])
sites.head()

Unnamed: 0_level_0,Site UID,Site URL,Site Human-Readable Name,Site Check Date,In 115 Set?,Comments,Sectoral Data,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,1,theatlantic.com,The Atlantic,2016-02-08,Yes,"Alexa Rank: 975 (Global), 289 (US)","Arts: Literature: Authors: A: Alcott, Louisa M...","Arts: Literature: Authors: M: Merwin, W. S.: W...",Arts: Literature: World Literature: American: ...,"Arts: Music: Composition: Composers: T: Toch, ...",...,Regional: North America: United States: Tennes...,Regional: North America: United States: Vermon...,Regional: North America: United States: Washin...,Regional: Oceania: Australia: New South Wales:...,Science: Agriculture: Crop Plants: Coffee,Science: Astronomy: Solar System: Dwarf Planet...,Science: Biology: Microbiology: Virology,Science: Earth Sciences: Atmospheric Sciences:...,Science: Social Sciences: Economics: Economic ...,Science: Social Sciences: Economics: People: G...
21,2,imdb.com,IMDb,2016-02-08,Yes,"Alexa Rank: 49 (Global), 27 (US)",Kids and Teens: Entertainment: Animation: Movies,Regional: Europe: United Kingdom: Arts and Ent...,Arts: Performing Arts: Magic: Magicians: Siegf...,Arts: Television: Programs: Science Fiction an...,...,"Arts: People: S: Sagal, Jean and Liz","Arts: People: W: Wilde, Lyn and Lee",Arts: Performing Arts: Magic: Magicians: Penn ...,Arts: Performing Arts: Puppetry: Muppets: Movi...,Arts: Performing Arts: Stunts: Players,Arts: Performing Arts: Theatre: Awards: Americ...,Arts: Television: Programs: Action and Adventu...,Arts: Television: Programs: Action and Adventu...,Arts: Television: Programs: Children's: Beetle...,Arts: Television: Programs: Comedy: Sitcoms: W...
26,3,nytimes.com,New York Times,2016-02-08,Yes,"Alexa Rank: 101 (Global), 22 (US)",Society: Issues: Warfare and Conflict: Specifi...,Science: Social Sciences: Psychology: Evolutio...,Society: Issues: Warfare and Conflict: Specifi...,Computers: E-Books: News and Media,...,Science: Technology: Space: Missions: Manned: ...,Society: Activism: Media: Culture Jamming: Spo...,Society: Crime: Sex Offenses: Child Pornograph...,Society: History: By Topic: Maritime: Ships: O...,"Society: History: Historians: Parker, Geoffrey",Society: Issues: Business: Allegedly Unethical...,Society: Issues: Education: Standardized Testing,Society: Issues: Environment: Light Pollution:...,Society: Issues: Environment: News and Media: ...,Society: Issues: Environment: News and Media: ...
32,4,theverge.com,The Verge,2016-02-08,Yes,"Alexa Rank: 525 (Global), 230 (US)",Home: Consumer Information: Computers and Inte...,Home: Consumer Information: Electronics: Weblogs,,,...,,,,,,,,,,
33,5,nbc.com,NBC Universal,2016-02-08,Yes,"Alexa Rank: 1548 (Global), 426 (US)",Kids and Teens: Entertainment: Television: Rea...,Arts: Television: Networks: NBC,Arts: Television: Programs: Action and Adventu...,Arts: Television: Programs: Comedy: Sitcoms: 3...,...,,,,,,,,,,


In [185]:
#Change interpretation of 115 set membership column
sites['In 115 Set?']=sites['In 115 Set?'].apply(lambda yn: True if yn == 'Yes' else False)
sites.head()

Unnamed: 0_level_0,Site UID,Site URL,Site Human-Readable Name,Site Check Date,In 115 Set?,Comments,Sectoral Data,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,1,theatlantic.com,The Atlantic,2016-02-08,True,"Alexa Rank: 975 (Global), 289 (US)","Arts: Literature: Authors: A: Alcott, Louisa M...","Arts: Literature: Authors: M: Merwin, W. S.: W...",Arts: Literature: World Literature: American: ...,"Arts: Music: Composition: Composers: T: Toch, ...",...,Regional: North America: United States: Tennes...,Regional: North America: United States: Vermon...,Regional: North America: United States: Washin...,Regional: Oceania: Australia: New South Wales:...,Science: Agriculture: Crop Plants: Coffee,Science: Astronomy: Solar System: Dwarf Planet...,Science: Biology: Microbiology: Virology,Science: Earth Sciences: Atmospheric Sciences:...,Science: Social Sciences: Economics: Economic ...,Science: Social Sciences: Economics: People: G...
21,2,imdb.com,IMDb,2016-02-08,True,"Alexa Rank: 49 (Global), 27 (US)",Kids and Teens: Entertainment: Animation: Movies,Regional: Europe: United Kingdom: Arts and Ent...,Arts: Performing Arts: Magic: Magicians: Siegf...,Arts: Television: Programs: Science Fiction an...,...,"Arts: People: S: Sagal, Jean and Liz","Arts: People: W: Wilde, Lyn and Lee",Arts: Performing Arts: Magic: Magicians: Penn ...,Arts: Performing Arts: Puppetry: Muppets: Movi...,Arts: Performing Arts: Stunts: Players,Arts: Performing Arts: Theatre: Awards: Americ...,Arts: Television: Programs: Action and Adventu...,Arts: Television: Programs: Action and Adventu...,Arts: Television: Programs: Children's: Beetle...,Arts: Television: Programs: Comedy: Sitcoms: W...
26,3,nytimes.com,New York Times,2016-02-08,True,"Alexa Rank: 101 (Global), 22 (US)",Society: Issues: Warfare and Conflict: Specifi...,Science: Social Sciences: Psychology: Evolutio...,Society: Issues: Warfare and Conflict: Specifi...,Computers: E-Books: News and Media,...,Science: Technology: Space: Missions: Manned: ...,Society: Activism: Media: Culture Jamming: Spo...,Society: Crime: Sex Offenses: Child Pornograph...,Society: History: By Topic: Maritime: Ships: O...,"Society: History: Historians: Parker, Geoffrey",Society: Issues: Business: Allegedly Unethical...,Society: Issues: Education: Standardized Testing,Society: Issues: Environment: Light Pollution:...,Society: Issues: Environment: News and Media: ...,Society: Issues: Environment: News and Media: ...
32,4,theverge.com,The Verge,2016-02-08,True,"Alexa Rank: 525 (Global), 230 (US)",Home: Consumer Information: Computers and Inte...,Home: Consumer Information: Electronics: Weblogs,,,...,,,,,,,,,,
33,5,nbc.com,NBC Universal,2016-02-08,True,"Alexa Rank: 1548 (Global), 426 (US)",Kids and Teens: Entertainment: Television: Rea...,Arts: Television: Networks: NBC,Arts: Television: Programs: Action and Adventu...,Arts: Television: Programs: Comedy: Sitcoms: 3...,...,,,,,,,,,,


In [186]:
#Reinterpret the categories for the sites. Find the primary category and take the mode across all columns
sectors = sites[sites.columns[6:]]
sectors = sectors.applymap(lambda s: str(s).split(':')[0])
sectors.head()

Unnamed: 0_level_0,Sectoral Data,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,...,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,Arts,Arts,Arts,Arts,Arts,Arts,Regional,Regional,Science,Society,...,Regional,Regional,Regional,Regional,Science,Science,Science,Science,Science,Science
21,Kids and Teens,Regional,Arts,Arts,Arts,Arts,Arts,Arts,Regional,Sports,...,Arts,Arts,Arts,Arts,Arts,Arts,Arts,Arts,Arts,Arts
26,Society,Science,Society,Computers,Arts,Arts,Arts,Society,Regional,Society,...,Science,Society,Society,Society,Society,Society,Society,Society,Society,Society
32,Home,Home,,,,,,,,,...,,,,,,,,,,
33,Kids and Teens,Arts,Arts,Arts,Arts,Arts,Arts,Arts,Arts,Arts,...,,,,,,,,,,


In [187]:
s = []
for i in range(0,sectors.shape[0]):
    
    sec = sectors.iloc[i][sectors.iloc[i]!='nan'].mode()
    sec = sec.iloc[0] if len(sec) > 0 else 'None'
    s.append(sec)

In [188]:
s

['Arts',
 'Arts',
 'Arts',
 'Home',
 'Arts',
 'Arts',
 'Home',
 'Home',
 'Home',
 'Kids and Teens',
 'Regional',
 'Computers',
 'World',
 'Arts',
 'World',
 'News',
 'Sports',
 'Regional',
 'Arts',
 'Health',
 'Games',
 'Arts',
 'Games',
 'World',
 'Regional',
 'Business',
 'Business',
 'World',
 'Arts',
 'Regional',
 'Arts',
 'Games',
 'News',
 'Business',
 'Health',
 'Home',
 'Regional',
 'Reference',
 'Regional',
 'Regional',
 'Health',
 'Regional',
 'Business',
 'Regional',
 'Regional',
 'World',
 'Shopping',
 'Games',
 'Reference',
 'Business',
 'Business',
 'Regional',
 'Regional',
 'Recreation',
 'Games',
 'Home',
 'Home',
 'Kids and Teens',
 'Science',
 'Shopping',
 'Recreation',
 'Science',
 'Society',
 'Regional',
 'Health',
 'Health',
 'Health',
 'None',
 'Science',
 'Science',
 'Regional',
 'Computers',
 'Kids and Teens',
 'Games',
 'Science',
 'Science',
 'Home',
 'Regional',
 'Arts',
 'Kids and Teens',
 'Regional',
 'Recreation',
 'Regional',
 'Recreation',
 'Regional',
 

In [189]:
#Drop and append
sites = sites.drop(sites.columns[6:], axis=1)
sites['Sector'] = s

In [190]:
sites.head()

Unnamed: 0_level_0,Site UID,Site URL,Site Human-Readable Name,Site Check Date,In 115 Set?,Comments,Sector
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20,1,theatlantic.com,The Atlantic,2016-02-08,True,"Alexa Rank: 975 (Global), 289 (US)",Arts
21,2,imdb.com,IMDb,2016-02-08,True,"Alexa Rank: 49 (Global), 27 (US)",Arts
26,3,nytimes.com,New York Times,2016-02-08,True,"Alexa Rank: 101 (Global), 22 (US)",Arts
32,4,theverge.com,The Verge,2016-02-08,True,"Alexa Rank: 525 (Global), 230 (US)",Home
33,5,nbc.com,NBC Universal,2016-02-08,True,"Alexa Rank: 1548 (Global), 426 (US)",Arts


In [191]:
#Grab the policies table
policies = pd.read_csv('../OPP-115/documentation/policies_opp115.csv',index_col=0,parse_dates=[2,3])
policies = policies.drop('Unnamed: 4',axis=1)
policies.head()

Unnamed: 0_level_0,Policy URL,Policy collection date,Policy last updated date
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,theatlantic.com/privacy-policy/,2015-07-02,2015-01-01
21,imdb.com/privacy,2015-07-02,2014-12-05
26,nytimes.com/privacy,2015-07-08,2015-06-10
32,voxmedia.com/privacy-policy,2015-07-02,2014-05-01
33,nbcuniversal.com/privacy/full-privacy-policy,2015-07-02,2015-01-14


In [192]:
#Grab the original policy text from html and append to this table
from os import listdir
import re
from lxml import etree, html
base_dir = '../OPP-115/original_policies/'
files = [f for f in listdir(base_dir) if f.endswith('.txt')]

#Check matching policy ids
# pids = sites.index.tolist()
# for f in files:
#     pid = np.int(f.split('_')[0])
#     if pid not in pids:
#         print('MISSING '+str(pid))

In [193]:
txt = []
pids = []
for f in files:
    with open(base_dir + f, 'r') as pg:
        #print('Reading: ' + f)
        page = pg.read()
        txt.append(page)
        pid = np.int(f.split('_')[0])
        pids.append(pid)

In [194]:
#Create dataframe of texts and join it to policies frame
txts = pd.DataFrame({'policy_text': txt},index=pids)
txts.index.name = 'Policy UID'
policies = pd.merge(policies,txts,left_index=True,right_index=True,how='outer')

In [195]:
policies.head()

Unnamed: 0_level_0,Policy URL,Policy collection date,Policy last updated date,policy_text
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,theatlantic.com/privacy-policy/,2015-07-02,2015-01-01,"Privacy Policy\nEffective: January 1, 2015 ..."
21,imdb.com/privacy,2015-07-02,2014-12-05,"IMDb Privacy Notice\n\nLast Updated, December ..."
26,nytimes.com/privacy,2015-07-08,2015-06-10,"Privacy Policy\nLast Updated on June 10, 2015\..."
32,voxmedia.com/privacy-policy,2015-07-02,2014-05-01,Your privacy is important to us and we have pr...
33,nbcuniversal.com/privacy/full-privacy-policy,2015-07-02,2015-01-14,Full Privacy Policy\nENGLISH\n\nLast Updated: ...


In [196]:
#Now merge the sites and policies tables
sites = pd.merge(sites,policies,left_index=True,right_index=True,how='outer')
sites.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 20 to 1713
Data columns (total 11 columns):
Site UID                    192 non-null int64
Site URL                    192 non-null object
Site Human-Readable Name    192 non-null object
Site Check Date             192 non-null datetime64[ns]
In 115 Set?                 192 non-null bool
Comments                    192 non-null object
Sector                      192 non-null object
Policy URL                  192 non-null object
Policy collection date      192 non-null datetime64[ns]
Policy last updated date    159 non-null datetime64[ns]
policy_text                 190 non-null object
dtypes: bool(1), datetime64[ns](3), int64(1), object(6)
memory usage: 16.7+ KB


In [202]:
sites.index.nunique()

115

In [203]:
#Persist to sites table in beforeiagree_db
sites.to_sql('sites', engine, if_exists='replace')

### Annotations table

Each CSV file in annotations/ contains the annotation data for one privacy policy. Each policy has been annotated by three workers, and data from all three workers is present in the same file. These files do not represent any efforts to consolidate information from multiple annotators.

These CSV files use the Excel dialect. Each row contains the information for a single data practice. The columns are:
1. annotation ID (a globally unique identifier for a data practice)
2. batch ID (name of a batch in the annotation tool; often indicates who the annotators were)
3. annotator ID
4. policy ID (this corresponds to the numeric prefixes in the policy filename, as found in other directories)
5. segment ID (the zero-indexed, sequential identifier of the policy segment; e.g., the first segment in a policy's text is segment zero)
6. category name
7. attribute-value pairs (represented as JSON)
8. policy URL
9. date

The attribute-value dictionary (column 7/G), at its highest level, consists of keys (strings that correspond to attribute names) that map to nested dictionaries. The nested dictionaries have keys that specify the selected text, its location in the segment, and the value associated with the attribute.

**Consolidation** - In addition, we're using the consolidated annotations at threshold value of 0.5. Additional guidance on annotation consolidation:

Since multiple annotators worked on each privacy policy, they produced some redundant annotations. Each of the three subdirectories in consolidation/ contains the results of the consolidation algorithm with a different convergence threshold, as described in the paper. The annotations are in CSV files in the same format as described in the _Annotations_ section of this document.

In the consolidation CSV files, the data practices that were created by consolidation (i.e., by merging together 2-3 data practices, each from a different annotator) have a 'C' prefix at the beginning of their annotation IDs. All others are "singlet" data practices, identified by one annotator and too distinct for consolidation.

In [3]:
#Locate the files
from os import listdir
import re
ann_dir = '../OPP-115/consolidation/threshold-0.5-overlap-similarity/'
files = [f for f in listdir(ann_dir) if f.endswith('.csv')]

In [4]:
#Set up name conventions and data conversions for import
import numpy as np
names = """annotation_id 
        batch_id 
        annotator_id 
        policy_id 
        segment_id 
        category_name 
        attributes_value_pairs 
        date 
        policy_url""".split()

types = {'annotation_id': str,
        'batch_id': str,
        'annotator_id': np.int,
        'policy_id': np.int,
        'segment_id': np.int,
        'category_name': str}

ann_list = []
pids = []
for f in files:
    df = pd.read_csv(ann_dir + f,header=None,names=names,na_values={'date': 'Not specified'},
                 parse_dates=[7],index_col=0)
    ann_list.append(df)
    pids.append(np.int(f.split('_')[0]))

In [5]:
annotations = pd.concat(ann_list,axis=0,keys=pids,names=['Policy UID','annotation_id'])

In [6]:
annotations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,batch_id,annotator_id,policy_id,segment_id,category_name,attributes_value_pairs,date,policy_url
Policy UID,annotation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1017,C4885,test_category_labeling_highlight_fordham_aaaaa,121,3905,0,Other,"{""Other Type"": {""selectedText"": ""Sci-News.com ...",NaT,http://www.sci-news.com/privacy-policy.html
1017,20234,test_category_labeling_highlight_fordham_ddddd,118,3905,0,Policy Change,"{""Change Type"": {""selectedText"": ""Please note ...",NaT,http://www.sci-news.com/privacy-policy.html
1017,20324,test_category_labeling_highlight_fordham_aaaaa,121,3905,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati...",NaT,http://www.sci-news.com/privacy-policy.html
1017,20325,test_category_labeling_highlight_fordham_aaaaa,121,3905,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati...",NaT,http://www.sci-news.com/privacy-policy.html
1017,20590,test_category_labeling_highlight_fordham_ccccc,117,3905,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""informat...",NaT,http://www.sci-news.com/privacy-policy.html


In [28]:
attr_values = pd.DataFrame(data=None,columns=['annotation_id','start_idx','end_idx','attribute','text','value'])
template = dict.fromkeys('startIndexInSegment endIndexInSegment selectedText value'.split())

import json
import time

In [53]:
#BATCH PROCESSING OF ATTRIBUTE-VALUE PAIRS: THIS NEEDS TO BE DONE MANUALLY!
attr_values1 = pd.DataFrame(data=None,columns=['annotation_id','start_idx','end_idx','attribute','text','value'])
t0 = time.time()
print('Starting at ' + str(t0))
for i in range(16001,annotations.shape[0]):    #Note manual adjustment of batch size here...
    
    attr_val = annotations['attributes_value_pairs'].iloc[i]
    ann_id = annotations.index.values[i][-1]
    
    obj = json.loads(attr_val)
    keys = list(obj.keys())
    
    for k in keys:
        obj2 = dict(template, **obj[k]) #Ensures at a minimum we get the empty template data.
        df = pd.DataFrame({'annotation_id': [ann_id],
                           'start_idx': [obj2['startIndexInSegment']],
                           'end_idx': [obj2['endIndexInSegment']],
                           'attribute': [k],
                           'text': [obj2['selectedText']],
                           'value': [obj2['value']]})
        attr_values1 = attr_values1.append(df,ignore_index=True)
t1 = time.time()
print('Finished at ' + str(t1))
print('Total elapsed time ' + str(t1-t0))
attr_values = attr_values.append(attr_values1,ignore_index=True)
attr_values.info()

Starting at 1505169998.423711
Finished at 1505170030.09333
Total elapsed time 31.669618844985962
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115911 entries, 0 to 115910
Data columns (total 6 columns):
annotation_id    115911 non-null object
attribute        115911 non-null object
end_idx          115911 non-null object
start_idx        115911 non-null object
text             80684 non-null object
value            115911 non-null object
dtypes: object(6)
memory usage: 5.3+ MB


In [59]:
#Set up indices
attr_values.index.name = 'text_selection_id'
attr_values = attr_values.set_index(['annotation_id'],
                                    append=True).reorder_levels(['annotation_id','text_selection_id'])

In [62]:
attr_values = attr_values.rename(columns={'text':'text_selection'})

In [65]:
#Persist all of this crap to a new table. Phew!
annotations.to_sql('annotations', engine, if_exists='replace')
attr_values.to_sql('text_selections', engine, if_exists='replace')

### Pretty annotations table

['1017_sci-news.com.csv',
 '1028_redorbit.com.csv',
 '1034_aol.com.csv',
 '1050_honda.com.csv',
 '105_amazon.com.csv',
 '1070_wnep.com.csv',
 '1083_highgearmedia.com.csv',
 '1089_freep.com.csv',
 '1099_enthusiastnetwork.com.csv',
 '1106_allstate.com.csv',
 '1164_acbj.com.csv',
 '1205_opensecrets.org.csv',
 '1206_dcccd.edu.csv',
 '1221_gwdocs.com.csv',
 '1224_austincc.edu.csv',
 '1252_cincymuseum.org.csv',
 '1259_fool.com.csv',
 '1261_zacks.com.csv',
 '1264_citizen.org.csv',
 '1300_bankofamerica.com.csv',
 '1306_chasepaymentech.com.csv',
 '133_fortune.com.csv',
 '135_instagram.com.csv',
 '1360_thehill.com.csv',
 '1361_yahoo.com.csv',
 '1419_miaminewtimes.com.csv',
 '144_style.com.csv',
 '1468_rockstargames.com.csv',
 '1470_steampowered.com.csv',
 '1498_ticketmaster.com.csv',
 '1510_jibjab.com.csv',
 '1539_geocaching.com.csv',
 '1545_taylorswift.com.csv',
 '1582_msn.com.csv',
 '1610_post-gazette.com.csv',
 '1618_sltrib.com.csv',
 '1636_sidearmsports.com.csv',
 '1637_dailyillini.com.csv',