# Process as text

Using NLP pipelines is not feasible because those are language-dependent, and establishing the language of the content of a file is not trivial.

This notebook tries to process these files in chunks, extracting text from them.

In [6]:
def chunk_file(file_path: str, size: int):
    """Lazily read a text file in groups of lines.

    Each group is built from up to ``size`` consecutive ``readline()`` calls
    (a single call when ``size`` is 1 or less); the final group may be
    shorter when the file runs out.

    Args:
        file_path: Path of the text file, opened with the default text mode
            and encoding.
        size: Number of lines to read per group.

    Yields:
        list[str]: The group's text split with ``str.splitlines()``, i.e.
        without line terminators. Because ``splitlines()`` also breaks on
        other line-boundary characters, a group can hold more entries than
        lines were read.
    """
    with open(file_path) as handle:
        while True:
            first_line = handle.readline()
            if not first_line:
                # An empty string from readline() means end of file.
                break

            pieces = [first_line]
            # Past EOF readline() keeps returning "", which adds nothing.
            pieces.extend(handle.readline() for _ in range(size - 1))

            yield "".join(pieces).splitlines()

In [7]:
from text_preprocessing import preprocess_text
from text_preprocessing import (
    to_lower,
    remove_email,
    remove_url,
    remove_punctuation,
    lemmatize_word,
)

# Processing steps handed to preprocess_text() for every extracted token
# (see the printing cell at the end of the notebook).
# NOTE(review): lemmatize_word is presumably language-specific -- confirm it
# is acceptable given the language-agnostic goal stated in the introduction.
preprocess_functions = [
    to_lower,
    remove_email,
    remove_url,
    remove_punctuation,
    lemmatize_word,
]

In [8]:
from collections import Counter
from statistics import mean, stdev


def remove_frequent_items(text: str) -> list:
    """Return the distinct tokens of ``text`` that are not unusually frequent.

    The text is split on whitespace and every distinct token is counted. A
    token is dropped when its count reaches or exceeds the mean count plus
    two sample standard deviations.

    Degenerate inputs are handled explicitly instead of raising
    ``statistics.StatisticsError``:

    * empty or whitespace-only text yields an empty list;
    * when there is a single distinct token, or all tokens occur equally
      often, there is no spread and therefore no outlier, so every token is
      kept.

    Args:
        text: Whitespace-separated text to analyse.

    Returns:
        The retained distinct tokens, ordered from most to least frequent
        (ties keep first-occurrence order, as given by
        ``Counter.most_common``).
    """
    counter = Counter(text.split())
    if not counter:
        return []

    frequencies = list(counter.values())
    ranked = counter.most_common()

    # stdev() needs at least two data points; a lone token has no spread.
    spread = stdev(frequencies) if len(frequencies) > 1 else 0.0
    if spread == 0:
        # Every token is equally frequent, so none stands out; a strict
        # "< mean" comparison would wrongly discard all of them.
        return [token for token, _ in ranked]

    # Cut-off: two standard deviations above the mean frequency. This is an
    # outlier threshold, not a fixed percentage of the tokens.
    upper_threshold = mean(frequencies) + (2 * spread)

    return [token for token, frequency in ranked if frequency < upper_threshold]

In [9]:
# Input file to analyse. Alternative input (previously commented out):
# "datasets/13368/all-geonames.rdf"
file_path = "red/867/rows.rdf"

# Number of lines read from the file per chunk.
chunk_size = 10000

# Tokens retained from every chunk, accumulated in file order.
items = []

for chunk in chunk_file(file_path, chunk_size):
    # Re-join the chunk's lines so the whole chunk is tokenised as one text.
    line = " ".join(chunk)
    items.extend(remove_frequent_items(line))

In [10]:
# Pass every retained token through preprocess_text() with the configured
# steps and print each result on its own line.
for e in items:
    print(preprocess_text(e, preprocess_functions))

dsofmyla311appdownloads0dsofmyla311appdownloadsdsbasetsrcszkh
rdfrdf
xmlnsrdf
xmlnsrdfs
xmlnssocrata
xmlnsdcat
xmlnsods
xmlnsdcterm
xmlnsgeo
xmlnsskos
xmlnsfoaf
xmlnsdsbase
xmlnsds
rdfabout
socratarowidrowjvdrdwbmm854socratarowid
dsdate20130101t000000dsdate
dsdatenamejan13dsdatename
rdfabout
socratarowidrowb23utvey9mfisocratarowid
dsdate20130201t000000dsdate
dsdatenamefeb13dsdatename
rdfabout
socratarowidrowb2kx35knm27asocratarowid
dsdate20130301t000000dsdate
dsdatenamemar13dsdatename
dsofmyla311appdownloads898dsofmyla311appdownloadsdsbasetsrcszkh
rdfabout
socratarowidrowr8eiyfkff4ensocratarowid
dsdate20130401t000000dsdate
dsdatenameapr13dsdatename
dsofmyla311appdownloads9592dsofmyla311appdownloadsdsbasetsrcszkh
rdfabout
socratarowidrowjtqmqmt2h49psocratarowid
dsdate20130501t000000dsdate
dsdatenamemay13dsdatename
dsofmyla311appdownloads820dsofmyla311appdownloadsdsbasetsrcszkh
rdfabout
socratarowidrow52rirph5f3cmsocratarowid
dsdate20130601t000000dsdate
dsdatenamejun13dsdatename
dsofmyla