## Merging inlinks and outlinks data from multiple threads

### Pre-Requisites
 - Have a folder called 'inlinks' and 'outlinks' which has 1 json file from each thread of the crawler
 - Create a folder called 'final_index'

In [7]:
import os
import json
import re
import string

In [8]:
INDEX_OUTPUT_FOLDER='./index_op'

In [9]:
# Folder paths
INLINKS_FOLDER='./inlinks'
OUTLINKS_FOLDER='./outlinks'

In [10]:
# Patterns
DOCNO_PATTERN = r"<DOCNO>(.*?)<\\DOCNO>"
TEXT_PATTERN = r"<TEXT>(.*?)<\\TEXT>"
HEAD_PATTERN = r"<HEAD>(.*?)<\\HEAD>"
DOC_PATTERN = r"<DOC>(.*?)<\\DOC>"

In [11]:
inlinks_set={}
for inlink_file in os.listdir(INLINKS_FOLDER):
    curr_path=os.path.join(os.getcwd(),f'{INLINKS_FOLDER}/{inlink_file}')
    with open(curr_path,'r') as curr_file:
        inlink_data=json.load(curr_file)
        for url,inlink_list in inlink_data.items():
            if url in inlinks_set:
                for link in inlink_list:
                    if inlinks_set[url].count(link) == 0:
                        inlinks_set[url].append(link)
            else:
                inlinks_set[url]=inlink_list

with open(f'{INDEX_OUTPUT_FOLDER}/links/merged_inlinks.json','w') as final_inlinks_file:
    json.dump(inlinks_set,final_inlinks_file,indent=2)
    print("Merging process complete for inlinks")

Merging process complete for inlinks


In [12]:
outlinks_set={}
for outlink_file in os.listdir(OUTLINKS_FOLDER):
    curr_path=os.path.join(os.getcwd(),f'{OUTLINKS_FOLDER}/{outlink_file}')
    with open(curr_path,'r') as curr_file:
        outlink_data=json.load(curr_file)
        for url,outlink_list in outlink_data.items():
            if url in outlinks_set:
                for link in outlink_list:
                    if outlinks_set[url].count(link) == 0:
                        outlinks_set[url].append(link)
            else:
                outlinks_set[url]=outlink_list

with open(f'{INDEX_OUTPUT_FOLDER}/links/merged_outlinks.json','w') as final_outlinks_file:
    json.dump(outlinks_set,final_outlinks_file,indent=2)
    print("Merging process complete for outlinks")

Merging process complete for outlinks


## Processing Web Content

### Pre-Requisites
 - Have a folder called 'webdata' that has all the files from all the threads that the crawler created 

In [13]:
WEBDATA_FOLDER='./webdata'

In [14]:
from nltk.stem import PorterStemmer
import string

swPath = "./stoplist.txt"

with open(swPath) as file:
    stopwords = file.readlines()
    for index, stopword in enumerate(stopwords):
        stopwords[index] = stopword.split("\n")[0]
        
# Adding punctuations in the stopwords list
punctuations = list(string.punctuation)

# Extra
extraPunc = ["``", "'s'", "'", "''"]
[punctuations.append(el) for el in extraPunc]

for p in punctuations:
    stopwords.append(p)
        
print(f'Total number of stopwords: {len(stopwords)}')

ps = PorterStemmer()

def inStopWords(word):
    return word.lower() in stopwords
    
def processWord(word):
    return ps.stem(word)

Total number of stopwords: 531


In [15]:
from nltk.tokenize import word_tokenize

def number(word):
    if word[0].isdigit() or word.isdigit():
        return True
    
    try:
        float_value = float(word)
        return True
    except ValueError:
        return False
    
# Helper method for processing text
def processText(text):
    words = word_tokenize(text)
    processedText = []
    
    for index, word in enumerate(words):
        if not number(word):
            processedText.append(processWord(word))
            
    return " ".join(processedText)

In [16]:
file_id=1
empty_inlinks=0
empty_outlinks=0
empty_content=0


webdata_files = os.listdir(WEBDATA_FOLDER)

print(f'Total files: {len(webdata_files)}')

for index, webdata_file in enumerate(webdata_files):
    print(f'On index: {index}')
    
    index_data={}
    curr_web_file=os.path.join(os.getcwd(),f'{WEBDATA_FOLDER}/{webdata_file}')
    with open(curr_web_file,'r',encoding='utf-8') as curr_file:
        curr_web_data=curr_file.read()
        documents=re.findall(DOC_PATTERN,curr_web_data,re.DOTALL)
        for document in documents:
            curr_index_info={}
            doc_url=re.findall(DOCNO_PATTERN,document,re.DOTALL)[0]
            doc_text=re.findall(TEXT_PATTERN,document,re.DOTALL)
            doc_text='\n'.join(doc_text)
            head_text=re.findall(HEAD_PATTERN,document,re.DOTALL)
            head_text='\n'.join(head_text)
            content_text = processText(doc_text) 
            if "not found" in head_text.lower() or content_text == "":
                empty_content+=1
                continue
            
            curr_index_info["content"]=content_text
            curr_index_info["title"]=head_text
            curr_index_info["inlinks"]=inlinks_set.get(doc_url,[])
            curr_index_info["outlinks"]=outlinks_set.get(doc_url,[])
            if(len(curr_index_info["inlinks"])==0):
                empty_inlinks+=1
            if(len(curr_index_info["outlinks"])==0):
                empty_outlinks+=1
            index_data[doc_url]=curr_index_info
        with open(f'{INDEX_OUTPUT_FOLDER}/index_file_{file_id}.json','w') as index_file:
            json.dump(index_data,index_file,indent=2)
        file_id+=1
        
print(f"Total empty inlinks : {empty_inlinks}")
print(f"Total empty outlinks : {empty_outlinks}")
print(f"Total empty content files : {empty_content}")

Total files: 49
On index: 0
On index: 1
On index: 2
On index: 3
On index: 4
On index: 5
On index: 6
On index: 7
On index: 8
On index: 9
On index: 10
On index: 11
On index: 12
On index: 13
On index: 14
On index: 15
On index: 16
On index: 17
On index: 18
On index: 19
On index: 20
On index: 21
On index: 22
On index: 23
On index: 24
On index: 25
On index: 26
On index: 27
On index: 28
On index: 29
On index: 30
On index: 31
On index: 32
On index: 33
On index: 34
On index: 35
On index: 36
On index: 37
On index: 38
On index: 39
On index: 40
On index: 41
On index: 42
On index: 43
On index: 44
On index: 45
On index: 46
On index: 47
On index: 48
Total empty inlinks : 0
Total empty outlinks : 44
Total empty content files : 7163


## Merging all index files

In [None]:
import os
import json
from collections import defaultdict

# Initialize the merged data structure
merged_data = defaultdict(lambda: {
    "content": "",
    "title": "",
    "inlinks": set(),
    "outlinks": set(),
})

for json_file in os.listdir(INDEX_OUTPUT_FOLDER):
    # Construct the full path to the current file
    curr_path = os.path.join(INDEX_OUTPUT_FOLDER, json_file)

    if "json" not in curr_path or os.path.isdir(curr_path):
            continue
        
    # Open and read the JSON file
    with open(curr_path, 'r') as file:
        file_data = json.load(file)
        print(f"Working with file: {curr_path}")
        
        # Merge each URL's data
        for url, data in file_data.items():        
            merged_data[url]["content"] = merged_data[url]["content"] or data.get("content", "")
            merged_data[url]["title"] = merged_data[url]["title"] or data.get("title", "")
            
            # Merge inlinks and outlinks using set
            merged_data[url]["inlinks"].update(data.get("inlinks", []))
            merged_data[url]["outlinks"].update(data.get("outlinks", []))

# Convert sets back to lists for the final output
for url in merged_data:
    merged_data[url]["inlinks"] = list(merged_data[url]["inlinks"])
    merged_data[url]["outlinks"] = list(merged_data[url]["outlinks"])

# Save the merged data to a file or use it as needed
output_path = "final_index.json"
with open(output_path, 'w') as outfile:
    json.dump(merged_data, outfile, indent=4)

print(f"Merged JSON saved to {output_path}")

Working with file: ./index_op/index_file_25.json
Working with file: ./index_op/index_file_33.json
Working with file: ./index_op/index_file_48.json
Working with file: ./index_op/index_file_29.json
Working with file: ./index_op/index_file_7.json
Working with file: ./index_op/index_file_44.json
Working with file: ./index_op/index_file_13.json
Working with file: ./index_op/index_file_12.json
Working with file: ./index_op/index_file_45.json
Working with file: ./index_op/index_file_6.json
Working with file: ./index_op/index_file_28.json
Working with file: ./index_op/index_file_49.json
Working with file: ./index_op/index_file_32.json
Working with file: ./index_op/index_file_24.json
Working with file: ./index_op/index_file_39.json
Working with file: ./index_op/index_file_1.json
Working with file: ./index_op/index_file_42.json
Working with file: ./index_op/index_file_15.json
Working with file: ./index_op/index_file_23.json
Working with file: ./index_op/index_file_35.json
Working with file: ./in