In [1]:
import os
import psutil
import json
import shelve
from HelperClass import InvertedIndex, Token
from helper import tokenizeHtml

# Root folder that is searched (recursively) for JSON files.
rootDir = 'D:\\RJ\\UCI\\Ralph School\\2023 Spring\\CS 121\\Assignments\\.vscode\\Res\\Test'

# Collect the full path of every file under rootDir whose name ends in ".json".
jsonFiles = []
for root, dirs, files in os.walk(rootDir):
    jsonFiles.extend(
        os.path.join(root, name) for name in files if name.endswith(".json")
    )

# Keep only the first five paths for this run.
del jsonFiles[5:]

In [2]:
def writeDoc(docId:str, url:str):
    """Store ``url`` in the 'DevShelve/Url' shelf under the key ``str(docId)``.

    The shelf is opened with flag 'c' (created when missing) and is closed
    again before the function returns.
    """
    key = str(docId)
    with shelve.open('DevShelve/Url', flag='c') as urlShelf:
        urlShelf[key] = url

def getDocNum():
    """Return the 'totalDoc' counter from the 'DevShelve/Url' shelf as an int.

    If the counter has never been stored, it is first written to the shelf
    as 0, so the call returns 0.
    """
    with shelve.open('DevShelve/Url', flag='c') as urlShelf:
        # setdefault stores 0 in the shelf when the key is absent.
        return int(urlShelf.setdefault('totalDoc', 0))
    
def storeDocNum(totalDoc:int):
    """Write ``totalDoc`` to the 'DevShelve/Url' shelf under the key 'totalDoc'.

    The shelf is opened with flag 'c' (created when missing) and always
    closed afterwards.
    """
    urlShelf = shelve.open('DevShelve/Url', flag='c')
    try:
        urlShelf['totalDoc'] = totalDoc
    finally:
        urlShelf.close()

def readDocShelve(key):
    """Return the value stored under ``key`` in the 'DevShelve/Url' shelf.

    Raises:
        KeyError: if ``key`` is not present in the shelf.
    """
    urlShelf = shelve.open('DevShelve/Url', flag='c')
    try:
        return urlShelf[key]
    finally:
        urlShelf.close()

def getDocUrl(docId:int):
    """Return the URL stored for ``docId`` in the 'DevShelve/Url' shelf.

    The lookup key is ``str(docId)``. Returns ``None`` when no entry exists
    for that id.
    """
    with shelve.open('DevShelve/Url', flag='c') as urlShelf:
        return urlShelf.get(str(docId))
        
# Opening JSON file
def getJsonData(filePath):
    """Load a JSON file and return its 'url' and 'content' values.

    Args:
        filePath: Path of a JSON file whose top-level object has the keys
            'url' and 'content'.

    Returns:
        A ``(url, htmlContent)`` tuple taken from those two keys.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
        KeyError: if 'url' or 'content' is missing from the object.
    """
    # The context manager closes the file even when json.load raises; the
    # previous explicit close() was skipped on any error.
    with open(filePath) as jFile:
        data = json.load(jFile)
    return (data['url'], data['content'])

def isValidJsonSize(file, maxRamFraction=0.75):
    """Tell whether ``file`` can be loaded without exceeding the RAM budget.

    The file's size on disk is added to the RAM currently in use and the sum
    is compared with ``maxRamFraction`` of the total RAM.

    Args:
        file: Path of the file to check.
        maxRamFraction: Fraction of total RAM that used memory plus the file
            size must stay below. Defaults to 0.75.

    Returns:
        True when file size + used RAM is below the budget, False otherwise.
    """
    fileBytes = os.path.getsize(file)
    # One snapshot so ``used`` and ``total`` describe the same moment.
    mem = psutil.virtual_memory()
    # Everything is compared in bytes. The earlier version divided used RAM by
    # 10**9 but the file size and total RAM by 1024**3, which overstated used
    # RAM relative to the budget.
    return fileBytes + mem.used < mem.total * maxRamFraction

def isMemoryFull(limit=75):
    """Report whether system memory usage has reached ``limit`` percent.

    Args:
        limit: Usage percentage at or above which memory counts as full.
            Defaults to 75.

    Returns:
        True (after printing the measured percentage) when usage is at or
        above ``limit``; False otherwise.
    """
    # Sample once so the printed value is the one that was compared; two
    # separate samples could disagree.
    usedPercent = psutil.virtual_memory().percent
    if usedPercent >= limit:
        print(usedPercent)
        return True
    return False

def writeData(invIndex, dList, docId):
    """Flush the in-memory index and the pending (docId, url) pairs to disk.

    Args:
        invIndex: Index object providing ``write(dirName)`` and ``clear()``;
            it is written to 'DevShelve' and then cleared.
        dList: List of ``(docId, url)`` items not yet persisted. Each url is
            stored in the 'DevShelve/Url' shelf under ``str(docId)``; the
            list is emptied in place afterwards.
        docId: Value stored as the shelf's 'totalDoc' counter.
    """
    invIndex.write('DevShelve')
    invIndex.clear()
    # Open the URL shelf once for the whole batch instead of once per
    # document plus once more for the counter.
    with shelve.open('DevShelve/Url', 'c') as shelf:
        for dItem in dList:
            shelf[str(dItem[0])] = dItem[1]
        shelf['totalDoc'] = docId
    dList.clear()
    print("----Wrote Data to File----")

In [3]:
invIndex = InvertedIndex() #Create inverted index to hold tokens from parser
docId = 0
dList = []  # (docId, url) pairs parsed since the last flush to disk
for jFile in jsonFiles:
    docId += 1
    if not isValidJsonSize(jFile):
        # Flush what has been parsed so far (this also clears the in-memory
        # index) before trying to load this file. The current file has not
        # been parsed yet, so the persisted count is docId - 1; storing docId
        # here could make the final check below skip the last flush.
        writeData(invIndex, dList, docId - 1)

        # Still too large after the flush: don't load the file, and give its
        # id back so the next file reuses it.
        if not isValidJsonSize(jFile):
            docId -= 1
            continue

    url, htmlContent = getJsonData(jFile)

    #Check if already parsed
    if url == getDocUrl(docId):
        print(f'Parsed Already -- {docId}:{url}')
        continue

    # Cleans and parses HTML content into tokens then adds it to Inverted index
    tokenizeHtml(docId=docId, invIndex=invIndex, htmlContent=htmlContent)
    dList.append((docId,url))
    print(docId, url)

    if isMemoryFull():
        print('memFull')
        writeData(invIndex, dList, docId)

# Final flush: write when documents are still pending in memory, or when the
# stored document count disagrees with the count reached in this run.
if dList or docId != getDocNum():
    writeData(invIndex, dList, docId)

1 https://www.ics.uci.edu/~irani/s15-6D/ClassNotes/23_GeneratingPermsSubsets.html
2 https://grape.ics.uci.edu/wiki/public/wiki/cs222-2019-fall-git?version=4
3 http://archive.ics.uci.edu/ml/datasets/Gas+sensor+array+exposed+to+turbulent+gas+mixtures
4 http://alderis.ics.uci.edu/links.html
5 https://aiclub.ics.uci.edu/
----Wrote Data to File----


In [4]:
# Spot-check the persisted index: look up the token 'the' and display it.
theTok = Token('the')
# presumably loads this token's stored entries from 'DevShelve' — confirm in HelperClass
theTok.readShelve('DevShelve')
# Bare expression so the notebook renders the token as the cell output.
theTok

Token: the
	DocId: 2, Freq: 38
		Pos: {1027, 1161, 267, 656, 914, 1045, 534, 790, 152, 921, 1050, 158, 34, 547, 808, 298, 683, 54, 186, 958, 831, 73, 844, 848, 984, 90, 988, 736, 97, 992, 739, 874, 749, 111, 125, 120, 1021, 639}
		Weights: {'normal': {1027, 1161, 267, 656, 914, 1045, 534, 790, 152, 921, 1050, 158, 34, 547, 808, 298, 683, 54, 186, 958, 831, 73, 844, 848, 984, 90, 988, 736, 97, 992, 739, 874, 749, 111, 125, 120, 1021, 639}}
	DocId: 3, Freq: 66
		Pos: {769, 902, 263, 780, 141, 272, 657, 277, 923, 668, 283, 672, 290, 676, 549, 679, 428, 686, 689, 818, 692, 309, 313, 441, 570, 444, 700, 575, 193, 833, 836, 198, 454, 328, 202, 75, 332, 458, 716, 719, 80, 337, 466, 723, 848, 972, 728, 350, 478, 866, 739, 229, 742, 869, 360, 998, 620, 748, 1006, 624, 252, 374, 887, 378, 764, 890}
		Weights: {'normal': {769, 902, 263, 780, 141, 272, 657, 277, 923, 668, 283, 672, 290, 676, 549, 679, 428, 686, 689, 818, 692, 309, 313, 441, 570, 444, 700, 575, 193, 833, 836, 198, 454, 328, 202, 75

In [5]:
# writeData empties dList after persisting it, so this displays [] after a flush.
dList

[]

In [6]:
# writeData clears the in-memory index after writing it, so no tokens remain here.
invIndex

Total Tokens: 0