In [None]:
import pandas as pd
import csv

# Load the IP address ranges as a dataframe.
# The context manager closes the file as soon as every row has been read
# (the DataFrame constructor consumes the reader inside the block), and
# newline='' is how the csv module expects its input files to be opened.
with open('ip_list.csv', newline='') as ip_database:
    ip_reader = csv.reader(ip_database)
    ip_df = pd.DataFrame(ip_reader)

# Set the column names from the first (header) row
ip_df.columns = ip_df.iloc[0]
# NOTE(review): drop() returns a new frame and the result is not assigned, so
# this line only displays the frame without its header row; ip_df itself still
# contains that row. The cell that builds ip_list removes the header entry
# itself (del ip_list[0]), so assign the result here only if that removal is
# changed at the same time.
ip_df.drop(ip_df.index[0])

In [None]:
def split(s):
    """Split ``s`` at every comma that is not enclosed in square brackets.

    Commas inside ``[...]`` (at any nesting depth) are kept as part of the
    current piece, so a bracketed pair survives as a single item. Pieces are
    returned verbatim: surrounding whitespace and quotes are not stripped.

    Args:
        s: The string to split.

    Returns:
        A list of substrings; an empty input yields ``[""]``.
    """
    pieces = []
    buffer = []
    depth = 0
    # A sentinel comma flushes the final piece without a special case after the loop
    for ch in s + ",":
        if ch == "," and depth == 0:
            pieces.append("".join(buffer))
            buffer = []
            continue
        if ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
        buffer.append(ch)
    return pieces

In [None]:
import netaddr

# Create a list of ip_ranges.
# Each cell of the 'ip_range' column is a string; [1:-1] drops its first and
# last character (presumably the enclosing brackets of a stringified list --
# TODO confirm against ip_list.csv) before split() breaks it at top-level commas.
ip_list_bunched = []

for i in list(ip_df['ip_range']):
    ip_list_bunched.append(split(i[1:-1]))

# Flatten the list
ip_list = [item for sublist in ip_list_bunched for item in sublist]

# Remove the heading: ip_df still contains the header row as its first row,
# so the first flattened item is the mangled column title, not an address.
del ip_list[0]

# Convert every entry to CIDR networks
ip_networks = []
for entry in ip_list:
    if '[' in entry:
        # A bracketed entry is a [start, end] pair: split on whitespace, then
        # remove brackets/commas/spaces and the surrounding inverted commas
        parts = entry.split()
        startip = parts[0].strip('[], ')[1:-1]
        endip = parts[1].strip('[], ')[1:-1]

        # iprange_to_cidrs returns every CIDR block needed to cover the range.
        # Keep all of them: taking only the first block silently drops the
        # rest of any range that is not exactly one CIDR.
        ip_networks.extend(netaddr.iprange_to_cidrs(startip, endip))
    else:
        # Strip the leading and trailing inverted commas
        ip_networks.append(netaddr.IPNetwork(entry.strip('\'\" ')))

# Merge adjacent CIDR
ip_list = netaddr.cidr_merge(ip_networks)

# Create an equivalent set to check
ip_set = netaddr.IPSet(ip_list)

In [None]:
# Data from https://dumps.wikimedia.org/enwiki/latest/

In [None]:
import xml.etree.ElementTree as etree
import codecs
import time
import os

# Directory that holds the Wikipedia dump and receives the output CSV.
# The default is the original hard-coded local folder; set the WIKI_DATA_DIR
# environment variable to run the notebook on another machine without editing it.
PATH_WIKI_XML = os.environ.get(
    'WIKI_DATA_DIR',
    '/Users/BDD/Documents/05 Yale/01 Study/04 Fall, 2020/04 GLBL 849 Big Data and Global Policies/03 Final Project/01 Code/02-Processing_edits_data',
)
# Name of the XML dump to read (inside PATH_WIKI_XML)
FILENAME_WIKI = 'enwiki-latest-stub-meta-history1.xml'
# Name of the CSV of matching revisions to write (inside PATH_WIKI_XML)
FILENAME_REVISIONS = 'revisions1.csv'
# Text encoding used when writing the output CSV
ENCODING = "utf-8"

In [None]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS``.

    Hours are not padded and may exceed 23; minutes and seconds are
    zero-padded to two digits. Fractional seconds are truncated.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        The formatted duration string.
    """
    hours = int(sec_elapsed / (60 * 60))
    minutes = int((sec_elapsed % (60 * 60)) / 60)
    seconds = int(sec_elapsed % 60)
    return f"{hours}:{minutes:>02}:{seconds:>02}"

def strip_tag_name(t):
    """Return an XML tag name without its ``{namespace}`` prefix.

    Args:
        t: A tag string, e.g. ``"{http://example.org/ns}page"`` or ``"page"``.

    Returns:
        Everything after the last ``"}"``; the input unchanged when it
        contains no ``"}"``.
    """
    # Work on the argument itself: reading the global ``elem`` here made the
    # function ignore its parameter and fail outside the parse loop.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

In [None]:
# Full paths of the XML dump to read and of the revisions CSV to write
pathWikiXML, pathRevisions = (
    os.path.join(PATH_WIKI_XML, file_name)
    for file_name in (FILENAME_WIKI, FILENAME_REVISIONS)
)

# Running totals: revisions seen so far, and revisions written to the CSV
totalCount, recordedCount = 0, 0
title = None
# Reference point for the elapsed-time figures printed during processing
start_time = time.time()

In [None]:
# Stream the XML dump and copy every revision whose contributor IP address is
# in ip_set to the revisions CSV.
#
# 'start' events are used only to track where we are in the document and to
# reset fields; element text is read on 'end' events, because iterparse does
# not guarantee that text is populated when a 'start' event is emitted.
with codecs.open(pathRevisions, "w", ENCODING) as revisionsFH:
    revisionsWriter = csv.writer(revisionsFH, quoting=csv.QUOTE_MINIMAL)

    revisionsWriter.writerow(['id', 'title', 'redirect', 'ns', 'rev_id', 'timestamp', 'ip', 'username', 'comment', 'byt'])

    context = etree.iterparse(pathWikiXML, events=('start', 'end'))
    # The first event is the 'start' of the document root. Keep a handle on the
    # root so finished pages can be detached from it and memory stays bounded.
    root = next(context)[1]

    # Defined up front so the flags exist even before the first <page> is seen
    inrevision = False
    incontributor = False

    for event, elem in context:
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                # New page: reset every page-level and revision-level field
                id = -1
                title = ''
                redirect = ''
                ns = -1
                rev_id = -1
                timestamp = ''
                ip = None
                username = ''
                comment = ''
                byt = -1
                inrevision = False
                incontributor = False

            elif tname == 'revision':
                # From here on an <id> is no longer the page id
                inrevision = True
                incontributor = False
                # Reset per-revision fields so values from the previous
                # revision of the same page (notably a stale ip) cannot leak
                # into this one.
                rev_id = -1
                timestamp = ''
                ip = None
                username = ''
                comment = ''
                byt = -1

            elif tname == 'contributor':
                # Do not pick up on contributor id's
                incontributor = True

            # The element is not fully parsed yet: do not read or clear it
            continue

        # ---- 'end' events: the element is complete ----
        if tname == 'title':
            title = elem.text
        elif tname == 'id':
            if elem.text is not None:
                if not inrevision:
                    id = int(elem.text)
                elif not incontributor:
                    rev_id = int(elem.text)
        elif tname == 'redirect':
            redirect = elem.get('title', '')
        elif tname == 'ns':
            if elem.text is not None:
                ns = int(elem.text)
        elif tname == 'timestamp' and inrevision:
            timestamp = elem.text
        elif tname == 'ip' and inrevision:
            ip = elem.text
        elif tname == 'username' and inrevision:
            username = elem.text
        elif tname == 'comment' and inrevision:
            comment = elem.text
        elif tname == 'text' and inrevision:
            byt = elem.get('bytes', '')

        elif tname == 'revision':
            totalCount += 1

            # Only values short enough to be a dotted IPv4 address are checked
            if ip is not None and len(ip) <= 15:
                if netaddr.IPAddress(ip) in ip_set:
                    revisionsWriter.writerow([id, title, redirect, ns, rev_id, timestamp, ip, username, comment, byt])
                    recordedCount += 1

            if totalCount > 1 and (totalCount % 100000) == 0:
                print("{:,}, {:,}, {}".format(totalCount, recordedCount, hms_string(time.time() - start_time)))

        elif tname == 'page':
            # Every revision of this page has been handled; detach finished
            # elements from the root so they can be garbage-collected.
            root.clear()

        # Release the element's content now that it has been consumed
        elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")