In [9]:
# warcio is used to iterate over the records of a WARC file
# boilerpipe is used to extract the text from HTML pages

from warcio.archiveiterator import ArchiveIterator
import io, time, json
from boilerpipe.extract import Extractor
import datetime
from collections import defaultdict
import numpy as np
from scipy import stats
import json

In [2]:
#RUN EVERYTHING
# Walk every record of one WARC file, extract the text of each HTML response
# with boilerpipe, and keep counters describing what could not be parsed.
#
# Names left in the notebook namespace for the following cells:
#   data_file        - 'date' / 'filename' copied from the warcinfo record
#   web_pages        - list of dicts, one per successfully parsed HTML page
#   not_parsed_types - reason (or MIME type) -> number of skipped responses
#   parsed_pages / not_parsed_pages - running totals
a = datetime.datetime.now()
print (a)

with open('warc_commoncrawl_example_2', 'rb') as stream:

    #Initialize data structures used to store the parsed webpages
    data_file = {}
    web_pages = []
    not_parsed_types = defaultdict(int)
    # NOTE: despite its name this is the 0-based index of the record inside
    # the file (it is incremented by one per record), not a byte offset.
    warc_offset = 0
    parsed_pages = 0
    not_parsed_pages = 0

    #Iterate the .WARC file
    for record in ArchiveIterator(stream):

        #there is only one 'warcinfo' per file
        #represents the metadata of that file
        if record.rec_type == 'warcinfo':

            #warc_warcinfo header
            tmp = dict(record.rec_headers.headers)
            data_file['date'] = tmp['WARC-Date']
            data_file['filename'] = tmp['WARC-Filename']

        #represents the HTTP response.
        #This represents the crawled data
        elif record.rec_type == 'response':

            #warc_response header
            tmp = dict(record.rec_headers.headers)
            record_dict = {'warc_offset': warc_offset,
                           'length': int(tmp['Content-Length']),
                           'ip': tmp['WARC-IP-Address'],
                           'URI': tmp['WARC-Target-URI'],
                           'Payload-Digest': tmp['WARC-Payload-Digest']
                           }

            #there are several different types of data,
            #we only parse HTML and decide that using the HTTP header 'Content-Type'
            content_type = record.http_headers.get_header('Content-Type')

            # Media types are case-insensitive, so compare in lower case. This
            # keeps the check consistent with the lower-cased bookkeeping in
            # the non-HTML branch below.
            if content_type is not None and content_type.lower().startswith('text/html'):

                #Extract text from HTML, no removing of punctuation/numbers.
                #no lower case conversion, etc. is done
                try:
                    all_crawl = record.raw_stream.read()

                    #validate if the HTML webpage has content
                    if all_crawl:
                        extractor = Extractor(extractor='KeepEverythingExtractor',
                                              kMin=20,
                                              html=all_crawl)
                        parsed_text = extractor.getText()
                        if len(parsed_text) > 0:
                            record_dict['text'] = parsed_text
                            web_pages.append(record_dict)
                            parsed_pages += 1
                        else:
                            #parsed data is empty
                            not_parsed_types['ZERO_DATA'] += 1
                            not_parsed_pages += 1
                    else:
                        #entire HTML payload is empty (None or b'')
                        not_parsed_pages += 1
                        not_parsed_types['ZERO_DATA'] += 1

                except UnicodeDecodeError:
                    #the payload could not be decoded; skip this page
                    not_parsed_pages += 1
                    not_parsed_types['WRONG_ENCODING'] += 1

                except TypeError:
                    #unexpected failure while extracting; skip this page
                    not_parsed_pages += 1
                    not_parsed_types['UNKNOWN'] += 1

            else:
                not_parsed_pages += 1
                if content_type:
                    #count by bare MIME type, dropping parameters such as charset
                    mime_type = content_type.lower().split(';')[0]
                    not_parsed_types[mime_type] += 1
                else:
                    #response without a usable Content-Type header
                    not_parsed_types['WRONG_HTTP_RESPONSE'] += 1

        elif record.rec_type == 'resource':
            print  ("WARC file with type RESOURCE find")

        elif record.rec_type == 'revisit':
            print  ("WARC file with type REVISIT find")

        elif record.rec_type == 'conversion':
            print  ("WARC file with type CONVERSION find")

        #progress report every 500 records (of any type)
        warc_offset += 1
        if warc_offset % 500 == 0:
            print ("OFFSET: ",warc_offset," PARSED: ",parsed_pages,
                  "NOT_PARSED: ", not_parsed_pages)

print ("THE END")
print ("Total parsed pages: ",parsed_pages,
                  "\nTotal not parsed: ", not_parsed_pages)
for key,value in not_parsed_types.items():
    print ("\t",key,": ",value)

#elapsed wall-clock time of the whole run
b = datetime.datetime.now()
print (b)
print (b-a)

###CHECK for the error

2018-03-02 01:00:45.411298
Boilerpipe chose wrong encode selection, going to skip this
****************************
Don't know what this HTTP response has:
/t/tHTTP/1.1 200 OK
Server: Microsoft-IIS/8.0
p3p: CP=BUS CUR CONo FIN IVDo ONL OUR PHY SAMo TELo
Date: Fri, 05 Feb 2016 21:55:55 GMT
Connection: close
Content-Length: 0

****************************
****************************
Don't know what this HTTP response has:
/t/tHTTP/1.1 200 OK
Server: Microsoft-IIS/8.0
p3p: CP=BUS CUR CONo FIN IVDo ONL OUR PHY SAMo TELo
Date: Fri, 05 Feb 2016 22:06:08 GMT
Connection: close
Content-Length: 0

****************************
****************************
Don't know what this HTTP response has:
/t/tHTTP/1.1 200 OK
Server: Microsoft-IIS/8.0
p3p: CP=BUS CUR CONo FIN IVDo ONL OUR PHY SAMo TELo
Date: Fri, 05 Feb 2016 22:16:30 GMT
Connection: close
Content-Length: 0

****************************
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going

Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  19500  PARSED:  5979 NOT_PARSED:  445
OFFSET:  20000  PARSED:  6141 NOT_PARSED:  448
OFFSET:  20500  PARSED:  6303 NOT_PARSED:  451
Couldn't find anything to parse, found None object or empty string
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  21000  PARSED:  6457 NOT_PARSED:  461
OFFSET:  21500  PARSED:  6613 NOT_PARSED:  469
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Couldn't find anything to parse, found None object or empty string
OFFSET:  22000  PARSED:  6771 NOT_PARSED:  474
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  22500  PARSED:  6930 NOT_PARSED:  480
OFFSET:  23000  PARSED:  7093 NOT_PARSED:  482
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  2

Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  39500  PARSED:  12247 NOT_PARSED:  756
OFFSET:  40000  PARSED:  12409 NOT_PARSED:  759
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  40500  PARSED:  12569 NOT_PARSED:  765
OFFSET:  41000  PARSED:  12732 NOT_PARSED:  767
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn'

Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  58000  PARSED:  18142 NOT_PARSED:  911
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  58500  PARSED:  18297 NOT_PARSED:  916
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  59000  PARSED:  18453 NOT_PARSED:  924
OFFSET:  59500  PARSED:  18612 NOT_PARSED:  928
****************************
Don't know what this HTTP response has:
/t/tHTTP/1.1 200 OK
ETag: "6866-205-443131913a4c0"
Content-Length: 517
Last-Modified: Sun, 06 Jan 2008 19:53:31 GMT
Connection: close
Server: Apache/2.2.15 (Red Hat)
Vary: Accept-Encoding
Date: Fri, 05 Feb 2016 22:29:21 GMT
Accept-Ranges: bytes

****************************
****************************
Don't know what

OFFSET:  84500  PARSED:  26628 NOT_PARSED:  1102
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  85000  PARSED:  26789 NOT_PARSED:  1106
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  85500  PARSED:  26947 NOT_PARSED:  1112
OFFSET:  86000  PARSED:  27112 NOT_PARSED:  1112
OFFSET:  86500  PARSED:  27277 NOT_PARSED:  1114
Couldn't find anything to parse, found None object or empty string
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  87000  PARSED:  27437 NOT_PARSED:  1119
OFFSET:  87500  PARSED:  27601 NOT_PARSED:  1121
Boilerpipe chose wrong encode selection, going to skip this
Unkown error appear, going to skip this
OFFSET:  88000  PARSED:  27763 NOT_PARSED:  1124
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string


OFFSET:  103500  PARSED:  32692 NOT_PARSED:  1284
****************************
Don't know what this HTTP response has:
/t/tHTTP/1.1 200 OK
Server: Microsoft-IIS/8.5
Date: Fri, 05 Feb 2016 23:13:26 GMT
Connection: close
Content-Length: 0
Set-Cookie: BIGipServercgis_pol_web_pool-http=!aEpfG77DZSun8DjVVZwnwxKzRjoc1ROXwibbO31gu3OfpXiipun4egVfononiIr7olhwG3zjikvd738=; path=/

****************************
****************************
Don't know what this HTTP response has:
/t/tHTTP/1.1 200 OK
Server: Microsoft-IIS/8.5
Date: Fri, 05 Feb 2016 22:20:04 GMT
Connection: close
Content-Length: 0
Set-Cookie: BIGipServercgis_pol_web_pool-http=!7xwQ7DNJZ3PJvW7VVZwnwxKzRjoc1bC2MYgE7XWDqX2JbKsTLE0Bd2XFhmG7o9OeTQaZwmE5nuyX9TA=; path=/

****************************
OFFSET:  104000  PARSED:  32849 NOT_PARSED:  1290
OFFSET:  104500  PARSED:  33014 NOT_PARSED:  1292
OFFSET:  105000  PARSED:  33176 NOT_PARSED:  1297
Couldn't find anything to parse, found None object or empty string
OFFSET:  105500  PARSED:  3

OFFSET:  124500  PARSED:  39431 NOT_PARSED:  1431
OFFSET:  125000  PARSED:  39595 NOT_PARSED:  1431
Couldn't find anything to parse, found None object or empty string
OFFSET:  125500  PARSED:  39759 NOT_PARSED:  1434
OFFSET:  126000  PARSED:  39908 NOT_PARSED:  1442
Couldn't find anything to parse, found None object or empty string
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Couldn't find anything to parse, found None object or empty string
OFFSET:  126500  PARSED:  40067 NOT_PARSED:  1448
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
OFFSET:  127000  PARSED:  40229 NOT_PARSED:  1453
OFFSET:  127500  PARSED:  40390 NOT_PARSED:  1454
C

Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  145000  PARSED:  45982 NOT_PARSED:  1584
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
Boilerpipe chose wrong encode selection, going to skip this
OFFSET:  145500  PARSED:  46137 NOT_PARSED:  1589
OFFSET:  146000  PARSED:  46299 NOT_PARSED:  1590
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
Couldn't find anything to parse, found None object or empty string
OFFSET:

In [10]:
#Get stats
# Report how many pages were parsed, then describe the distribution of the
# length of each page's 'text' value.
print("Total number of parsed files: ",len(web_pages))
len_text = [len(page['text']) for page in web_pages]
a = np.asarray(len_text)
stats.describe(a)

Total number of parsed files:  47374


DescribeResult(nobs=47374, minmax=(2, 997491), mean=6232.388293156584, variance=165824154.05565083, skewness=26.430153979182524, kurtosis=1367.6593689238523)

In [7]:
#write it down to a file
# Serialize web_pages exactly once. Calling json.dumps() and then json.dump()
# on the resulting string double-encodes the data: the file would contain a
# single JSON string literal (with every quote escaped) instead of a JSON
# array, and readers would have to decode it twice.
with open("parsed_data2.txt",'w') as out_file:
    json.dump(web_pages, out_file)

In [14]:
# Scratch check: splitting a Content-Type value on ';' separates the bare
# MIME type from its parameters (note the leading space kept on the parameter).
a = 'image/jpeg'
c = 'application/atom+xml; charset=utf-8'
c.split(';')

['application/atom+xml', ' charset=utf-8']