# Notebook 

## check the NZ file directory

```
$ ls -l
total 258624
-rw-r-----@   1 robert  staff  132413440 Sep 29 17:52 nz2.tar
drwxr-xr-x@ 171 robert  staff       5814 Sep 29 17:56 nz2_merged

$ du -csh ./*
126M	./nz2.tar
127M	./nz2_merged
253M	total

$ tree nz2_merged/
nz2_merged/
├── 0_data
├── 0_index
├── 10_data
├── 10_index
├── 11_data
├── 11_index

...

├── 9_data
├── 9_index
├── indexfile_list.txt
├── indexlist.inf
└── stat.txt

$ ls -l | grep -v '_index' | grep -v '_data'
total 259120
-rw-r--r--@ 1 robert  staff     1152 Oct 23  2005 indexfile_list.txt
-rw-r--r--@ 1 robert  staff  4738263 Oct 23  2005 indexlist.inf
-rw-r--r--@ 1 robert  staff       76 Oct 23  2005 stat.txt

$ cat stat.txt
total processed data: 866684658 bytes!
total time cost: 3761429279 seconds!

$ head indexfile_list.txt
nz2_merged/0
nz2_merged/1
nz2_merged/2
nz2_merged/3
nz2_merged/4
nz2_merged/5
nz2_merged/6
nz2_merged/7
nz2_merged/8
nz2_merged/9

$ head indexlist.inf
0010024.e-xpert.co.nz/SITE_Default/SITE_ema/search.asp 1 0 2161 203.110.28.35 80 ok
1000-plastic-plant-markers.leto.co.nz/ 2 0 12660 70.85.28.124 80 ok
100-hot-jokes.exportexpress.co.nz/ 1 2161 12988 70.85.28.182 80 ok
1031-tax-exchange-milwaukee.clfnz.org.nz/ 2 12660 13541 64.237.47.161 80 ok
12-lcd-monitor.deutschebotschaftwellington.co.nz/ 1 15149 14465 62.4.83.208 80 ok
17-lcd-monitor-1080p.earthhealers.co.nz/ 1 29614 15634 66.55.151.28 80 ok
17-monitor-screen-touch.earthhealers.co.nz/ 2 26201 13273 66.55.151.28 80 ok
1933-pontiac.bluestaroffice.co.nz/ 0 0 11353 62.4.83.208 80 ok
1937-ford.netday.net.nz/ 2 39474 13797 70.85.152.219 80 ok
1937-ford-truck.wideawake.co.nz/ 0 11353 12685 66.55.139.99 80 ok
```

### Decomposition of indexlist.inf

```
1000-plastic-plant-markers.leto.co.nz/ 2 0 12660 70.85.28.124 80 ok
|                                      | | |     |            |  |
|                                      | | |     |            |  + status
|                                      | | |     |            + port
|                                      | | |     + IP address
|                                      | | + length
|                                      | + offset
|                                      + ??
+ URL for this page

```


## Get the list of index files

In [95]:
# the filelist is in 'indexfile_list.txt'

import os

# Directory that holds the unpacked nz2 data set.
nz2_basedir = '/Users/robert/Downloads/wse/nz2_merged'

# 'indexfile_list.txt' lists one entry per line in the form 'nz2_merged/<id>'.
indexfile_list = os.path.join(nz2_basedir, 'indexfile_list.txt')

# Keep only the '<id>' part of every entry. Blank lines are skipped so that an
# empty line in the list file cannot raise an IndexError.
index_ids = []
with open(indexfile_list) as fdi:
    for line in fdi:
        entry = line.strip()
        if entry:
            index_ids.append(entry.split('/')[1])

# Every id maps to an '<id>_index' / '<id>_data' pair inside nz2_basedir.
# The loop variable is deliberately not named `id`: under Python 2 a list
# comprehension variable leaks into the enclosing scope and would shadow the
# builtin id() for the rest of the notebook session.
index_file_list = [os.path.join(nz2_basedir, "%s_index" % index_id) for index_id in index_ids]
data_file_list = [os.path.join(nz2_basedir, "%s_data" % index_id) for index_id in index_ids]

print(len(index_file_list))
print(len(data_file_list))

83
83


## Decompress the index file

In [180]:
"""
$ file 8_data
8_data: gzip compressed data, from Unix

$ file 8_index
8_index: gzip compressed data, was "tmpindex", from Unix, last modified: Sun Oct 23 19:14:22 2005
"""

class PageIndex(object):
    """One record of an nz2 index file.

    The attributes mirror the seven whitespace-separated columns of an
    index line:

    url  -- URL of the page
    ids  -- second column; its meaning has not been worked out yet
    off  -- offset column
    len  -- length column
    ip   -- IP address
    port -- port
    st   -- status (e.g. 'ok')
    """

    def __init__(self, url, ids, offset, length, ip, port, st):
        # Values are stored exactly as passed in; no conversion happens here.
        self.url, self.ids = url, ids
        self.off, self.len = offset, length
        self.ip, self.port, self.st = ip, port, st

import gzip

# Parsed index records, one PageIndex per line of the index file(s) read below.
page_index = []

# Restrict the exploration to the first index file for now.
first_index_files = index_file_list[:1]
for indexfile in first_index_files:
    print(indexfile)
    with gzip.open(indexfile) as fdi:
        for line in fdi:
            # Seven whitespace-separated columns are expected per line; any
            # other column count makes the unpacking raise ValueError.
            url, ids, offset, length, ip, port, st = line.rstrip().split()
            entry = PageIndex(url, int(ids), int(offset), int(length),
                              ip, int(port), st)
            page_index.append(entry)

print(len(page_index))

/Users/robert/Downloads/wse/nz2_merged/0_index
727


## Check the page index

In [181]:
"""
Explore the data format of the nz index using the records in page_index.
The meaning of the columns is only approximately known at this point.
"""

# distinct values of the 2nd column, whose meaning is still unknown
ids = {index_entry.ids for index_entry in page_index}
print(list(ids))

# distinct hosting ports
port = {index_entry.port for index_entry in page_index}
print(list(port))

# distinct access statuses
st = {index_entry.st for index_entry in page_index}
print(list(st))

# For each value of the 2nd column (0, 1, 2) print "offset length offset+length"
# for every matching record, with a dashed line between the groups.
# In the captured output each offset equals the previous offset + length,
# i.e. the offsets are contiguous within one group.
for position, ids_value in enumerate((0, 1, 2)):
    if position:
        print('---------')
    for index_entry in page_index:
        if index_entry.ids == ids_value:
            print('%s %s %s' % (index_entry.off, index_entry.len,
                                index_entry.off + index_entry.len))

# presumably the last line printed for each of the three groups
"""
3540071 19799 3559870
---------
3435323 9578 3444901
---------
3465942 4664 3470606
"""

# sum of the three end offsets listed above
print(3559870 + 3444901 + 3470606)
# NOTE: `data` is not assigned in this cell; it must already exist in the
# session (the data-file cells assign it from the decompressed 0_data file).
print(len(data))

# so the three parts together make up the whole ~10 MB data file

print(2**21)

[0, 1, 2]
[80]
['ok']
0 11353 11353
11353 12685 24038
24038 10930 34968
34968 12514 47482
47482 13192 60674
60674 13091 73765
73765 12744 86509
86509 10113 96622
96622 10216 106838
106838 11100 117938
117938 10629 128567
128567 14177 142744
142744 11283 154027
154027 13555 167582
167582 9984 177566
177566 11173 188739
188739 8869 197608
197608 9941 207549
207549 11654 219203
219203 11207 230410
230410 11925 242335
242335 10986 253321
253321 12753 266074
266074 11371 277445
277445 11044 288489
288489 13989 302478
302478 12393 314871
314871 9451 324322
324322 10608 334930
334930 11739 346669
346669 12741 359410
359410 10640 370050
370050 11641 381691
381691 10616 392307
392307 9087 401394
401394 15294 416688
416688 10460 427148
427148 11547 438695
438695 11935 450630
450630 11069 461699
461699 11586 473285
473285 15614 488899
488899 11566 500465
500465 13164 513629
513629 12732 526361
526361 11348 537709
537709 10086 547795
547795 5470 553265
553265 22391 575656
575656 19338 594994
59499

## What do the values (0, 1, 2) in the second column mean?

In [182]:
## explore the 0,1,2 in the index, what does that mean?

import gzip

# Decompress the whole first data file into memory.
datafile = data_file_list[0]
print(datafile)

with gzip.open(datafile) as fdd:
    data = fdd.read()

print(len(data))

# First ten bytes of the decompressed data.
print(data[:10])
print('----------------')

# Ten bytes starting at 3559870 + 3444901 -- presumably where the ids == 2
# part would begin if the three ids groups were laid out back to back.
# The captured output there is not an HTTP header, so that layout is unconfirmed.
probe_start = 3559870 + 3444901
print(data[probe_start:probe_start + 10])

/Users/robert/Downloads/wse/nz2_merged/0_data
10475377
HTTP/1.1 2
----------------
ion value=


## Decompress the data file

In [183]:
import gzip

# Decompress the whole first data file into memory.
datafile = data_file_list[0]
print(datafile)

with gzip.open(datafile) as fdd:
    data = fdd.read()

# Disabled debugging output, kept for reference:
# print len(data)
# print data[0:100]
# print '-------------------'
# print data[2161:2161+100]
# print '-------------------'
# print data[15149:15149+100]
# print '-------------------'
# print data[29614:29614+100]

# I cannot find the pattern in the data file

# Count the newline-separated lines of the decompressed data.
lines = data.split('\n')
print(len(lines))
#for line in lines:
#    if line.startswith('HTTP/1'):
#        print line

# Collect the offset of every occurrence of 'HTTP/1' in the data.
# NOTE(review): the first offsets printed (0, 2161, 14821, 27809, ...) equal the
# running sum of the length column of the first indexlist.inf lines
# (2161, 12660, 12988, ...). That suggests the pages are stored back to back in
# index order -- confirm against the full index. 'HTTP/1' may also occur inside
# a page body, so not every match is necessarily a page start.
import re
http_marker = re.compile('HTTP/1')
page_starts = [hit.start() for hit in http_marker.finditer(data)]
print(page_starts)
print(len(page_starts))

/Users/robert/Downloads/wse/nz2_merged/0_data
101925
[0, 2161, 14821, 27809, 41350, 55815, 71449, 84722, 96075, 109872, 122557, 133487, 143602, 156972, 168468, 180982, 194174, 204810, 217901, 232827, 243836, 256580, 266693, 278037, 289419, 299635, 310225, 321359, 331233, 341659, 353117, 364217, 374846, 384688, 397144, 411321, 421611, 432894, 446449, 454966, 469118, 479506, 488818, 498802, 509975, 518844, 528785, 540439, 549169, 561585, 576488, 587695, 599620, 610606, 621573, 634326, 645697, 656063, 666209, 677253, 688715, 699622, 710821, 722798, 733895, 747249, 757503, 766994, 780983, 794016, 804081, 816474, 827193, 839010, 849997, 860190, 872774, 886782, 898994, 909433, 919687, 930726, 940177, 950785, 962689, 972328, 982031, 991600, 1003339, 1012588, 1025329, 1035969, 1047610, 1060094, 1070710, 1085296, 1098663, 1107750, 1123044, 1134089, 1145694, 1156933, 1169399, 1180827, 1189902, 1200362, 1215200, 1226747, 1239080, 1250369, 1264590, 1276525, 1289619, 1299886, 1310955, 1320616, 1329