In [1]:
import csv
import sys

import sparknlp
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.training import CoNLL

import web_scrapping
import preprocessing_html

# Open the CSV file and load its contents into a list of URLs

In [28]:
# Collect the first column of every row of the store-pages CSV into a flat list.
# newline="" leaves line-ending handling to the csv module, as its documentation
# recommends; blank rows are skipped so row[0] cannot raise IndexError.
with open("resources/furniture stores pages.csv", "r", newline="") as file:
    csv_contents = [row[0] for row in csv.reader(file) if row]

# The first column also contains a "max(page)" entry (presumably the column
# header — confirm); drop it so only URLs remain. Guarded so a file without that
# entry does not abort the cell.
if "max(page)" in csv_contents:
    csv_contents.remove("max(page)")

print(len(csv_contents))

704


# Fetch HTML content for each URL

In [29]:
# Fetch every URL in csv_contents, in order, and keep a [url, html] pair for each
# page where the fetch helper returned something truthy.
fetch_results = (
    (page_url, web_scrapping.fetch_html_content(page_url))
    for page_url in csv_contents
)
html_contents = [
    [page_url, page_html]
    for page_url, page_html in fetch_results
    if page_html
]

print(f"Fetched HTML content from {len(html_contents)} URLs")

Error fetching content from https://furniturefetish.com.au/products/oslo-office-chair-white: HTTPSConnectionPool(host='furniturefetish.com.au', port=443): Max retries exceeded with url: /products/oslo-office-chair-white (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027B80C28370>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
Failed to fetch content from https://hemisphereliving.com.au/products/. Status code:403
Failed to fetch content from https://home-buy.com.au/products/bridger-pendant-larger-lamp-metal-brass. Status code:404
Error fetching content from https://beckurbanfurniture.com.au/products/page/2/: HTTPSConnectionPool(host='beckurbanfurniture.com.au', port=443): Max retries exceeded with url: /products/page/2/ (Caused by SSLError(SSLCertVerificationE

# Save the fetched HTML content to a file so later runs can skip the download

In [30]:
# Cache the [url, html] pairs on disk, one CSV row per page.
# NOTE(review): the reloaded sample printed later in this notebook starts with
# b'...' and shows literal \n escapes, which suggests the page bodies are bytes
# and are stored as their repr — confirm whether that is intended.
with open("resources/html_content.csv", "w", newline="") as cache_file:
    page_writer = csv.writer(cache_file)
    for page_row in html_contents:
        page_writer.writerow(page_row)

# Raise the CSV field size limit, because the HTML content is larger than the default limit the reader accepts


In [12]:
# Raise the csv module's per-field size limit as far as this platform accepts.
# Start from sys.maxsize and shrink by a factor of 10 for as long as
# csv.field_size_limit() rejects the value with OverflowError.
max_int = sys.maxsize

while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        # Integer floor division keeps max_int an exact int instead of routing
        # it through a float as int(max_int / 10) would.
        max_int //= 10

# Load the fetched HTML content back from the file for faster subsequent processing

In [13]:
# Reload the cached rows from resources/html_content.csv; each row becomes one
# list in html_contents.
# newline="" leaves line-ending handling to the csv module, as its documentation
# recommends, so newlines embedded in quoted fields are read back unchanged.
with open("resources/html_content.csv", "r", newline="") as f:
    html_contents = list(csv.reader(f))

In [7]:
# Peek at the first cached page: its URL, then the first 1000 characters of its body.
first_page = html_contents[0]
print(f"url:{first_page[0]}\n")
print(first_page[1][:1000])

url:https://www.factorybuys.com.au/products/euro-top-mattress-king

b'<!doctype html>\n<html class="no-js" lang="en">\n  <head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1">\n    <meta name="theme-color" content="">\n    <meta name="google-site-verification" content="S4-p-M4ar3SxPdQrcXVG_IF3iM3C8cRx_FTaPLF4xjk" />\n    \n      <link rel="canonical" href="https://www.factorybuys.com.au/products/euro-top-mattress-king" />\n    \n    \n    <link rel="preconnect" href="https://cdn.shopify.com" crossorigin><link rel="icon" type="image/png" href="//www.factorybuys.com.au/cdn/shop/files/FB-Favicon_1.png?crop=center&height=32&v=1659852040&width=32"><link rel="preconnect" href="https://fonts.shopifycdn.com" crossorigin><title>\n      Factory Buys 32cm Euro Top Mattress - King\n</title>\n\n    \n      <meta name="description" content="Pamper yourself with Factory Buys

# Preprocessing HTML content by removing unnecessary tags

In [8]:
# Replace each page's raw HTML (second element of the pair) with the output of
# the preprocessing helper, reporting progress after every page.
# Note: this overwrites html_contents in place, so re-running the cell feeds
# already-preprocessed content back into the helper.
total_pages = len(html_contents)
for position, page in enumerate(html_contents, start=1):
    page[1] = preprocessing_html.preprocess_html(page[1])
    print(f"Preprocessed HTML content for analysis {position}/{total_pages}")

Preprocessed HTML content for analysis 1/302
Preprocessed HTML content for analysis 2/302
Preprocessed HTML content for analysis 3/302
Preprocessed HTML content for analysis 4/302
Preprocessed HTML content for analysis 5/302
Preprocessed HTML content for analysis 6/302
Preprocessed HTML content for analysis 7/302
Preprocessed HTML content for analysis 8/302
Preprocessed HTML content for analysis 9/302
Preprocessed HTML content for analysis 10/302
Preprocessed HTML content for analysis 11/302
Preprocessed HTML content for analysis 12/302
Preprocessed HTML content for analysis 13/302
Preprocessed HTML content for analysis 14/302
Preprocessed HTML content for analysis 15/302
Preprocessed HTML content for analysis 16/302
Preprocessed HTML content for analysis 17/302
Preprocessed HTML content for analysis 18/302
Preprocessed HTML content for analysis 19/302
Preprocessed HTML content for analysis 20/302
Preprocessed HTML content for analysis 21/302
Preprocessed HTML content for analysis 22/3

In [9]:
# Print the preprocessed content of the first page, one element per line.
first_page_sentences = html_contents[0][1]
for text_line in first_page_sentences:
    print(text_line)


b'
Skip to content
FREE SHIPPINGALL MATTRESSES*
Buy Now Pay Later Available!
Fast Shipping Australia Wide!
Continue Shopping
Skip to product information
Open media 1 in modal
Open media 2 in modal
Open media 3 in modal
Open media 4 in modal
Open media 5 in modal
Open media 6 in modal
Open media 7 in modal
Open media 8 in modal
1
 / 
of
9
This item ships for FREE in Australia
Description
Pamper yourself with Factory Buys Euro Top Mattress that will give you a soothing sleep night after night.Incorporating the cutting-edge technology, the lavish construction addresses all elements that will offer you a comfortable and luxurious experience - Euro top padding with high-density foam and an independent coil system - made to provide you with an undisturbed sleep all night long.
Product Features
Plush Euro Top padding 5 zoned pocket springs - Different levels of support Independent coil system - Excellent support and sublime comfort 6 working turn coil system High-density foam Medium firmness 

In [11]:
# Show which URL the first entry of html_contents belongs to.
first_page_url = html_contents[0][0]
print(f"url:{first_page_url}")

url:https://www.factorybuys.com.au/products/euro-top-mattress-king


# Converting preprocessed html content into CoNLL format for testing

In [10]:
import re

# Write the first page's preprocessed lines as an unlabeled CoNLL-style file:
# one "<token> _ O O" row per token, with a blank line after every input line.
conll_filename = "resources/test_data_from_html.conll"

# A token is a run of word characters, or any single other non-space character.
token_pattern = re.compile(r'\w+|\S')

# The encoding is pinned so the file does not depend on the platform's default
# code page, which can fail on characters outside that code page.
with open(conll_filename, "w", newline="", encoding="utf-8") as f:
    for line in html_contents[0][1]:
        tokens = token_pattern.findall(line)
        if not tokens:
            # Empty or whitespace-only line: nothing to tag, so skip it rather
            # than emitting a stray pair of blank lines.
            continue
        f.writelines(f"{token} _ O O\n" for token in tokens)
        f.write("\n")

print(f"url:{html_contents[0][0]}")


url:https://www.factorybuys.com.au/products/euro-top-mattress-king


# Initializing spark session

In [11]:

# Build a local Spark session with the Spark NLP JAR resolved via spark.jars.packages.
# The JAR version is taken from the installed Python package so the two always
# match; the coordinate used to be hard-coded to 5.0.2 while this cell's recorded
# output reports Python package version 4.4.4.
spark_nlp_package = f"com.johnsnowlabs.nlp:spark-nlp_2.12:{sparknlp.version()}"

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", spark_nlp_package)\
    .getOrCreate()
sc = spark.sparkContext

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  4.4.4
Apache Spark version:  3.2.3


In [None]:
# Alternative start-up through Spark NLP's own helper, requesting GPU support.
# NOTE(review): this rebinds `spark`, which the manual SparkSession.builder cell
# above also assigns — presumably only one of the two cells is meant to be run; confirm.
spark = sparknlp.start(gpu=True)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

# Reading the CoNLL-formatted training dataset

In [12]:
# Parse the CoNLL training file into a Spark DataFrame.
# (training_data.printSchema() prints the column layout if needed.)
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, "resources/training_data.conll")

In [13]:
# Preview the first five rows of the training DataFrame.
training_data.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 b '|[{document, 0, 2,...|[{document, 0, 2,...|[{token, 0, 0, b,...|[{pos, 0, 0, _, {...|[{named_entity, 0...|
|     Skip to content|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 3, Sk...|[{pos, 0, 3, _, {...|[{named_entity, 0...|
|FREE SHIPPINGALL ...|[{document, 0, 28...|[{document, 0, 28...|[{token, 0, 3, FR...|[{pos, 0, 3, _, {...|[{named_entity, 0...|
|Buy Now Pay Later...|[{document, 0, 28...|[{document, 0, 28...|[{token, 0, 2, Bu...|[{pos, 0, 2, _, {...|[{named_entity, 0...|
|Fast Shipping Aus...|[{document, 0, 29...|[{document, 0, 29...|[{token, 0, 3, Fa...|[{pos, 0, 3, _, {..

In [14]:
# Display the first training row in full (untruncated) as the cell result.
first_training_row = training_data.head()
first_training_row

Row(text="b '", document=[Row(annotatorType='document', begin=0, end=2, result="b '", metadata={'training': 'true'}, embeddings=[])], sentence=[Row(annotatorType='document', begin=0, end=2, result="b '", metadata={'sentence': '0'}, embeddings=[])], token=[Row(annotatorType='token', begin=0, end=0, result='b', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=2, end=2, result="'", metadata={'sentence': '0'}, embeddings=[])], pos=[Row(annotatorType='pos', begin=0, end=0, result='_', metadata={'sentence': '0', 'word': 'b'}, embeddings=[]), Row(annotatorType='pos', begin=2, end=2, result='_', metadata={'sentence': '0', 'word': "'"}, embeddings=[])], label=[Row(annotatorType='named_entity', begin=0, end=0, result='O', metadata={'sentence': '0', 'word': 'b'}, embeddings=[]), Row(annotatorType='named_entity', begin=2, end=2, result='O', metadata={'sentence': '0', 'word': "'"}, embeddings=[])])

In [15]:
import pyspark.sql.functions as F

# Count how often each label occurs in the training set: pair every token with
# its label, flatten to one row per pair, then group by label.
training_pairs = training_data.select(
    F.explode(
        F.arrays_zip(training_data.token.result, training_data.label.result)
    ).alias("cols")
)
training_label_counts = (
    training_pairs
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy('ground_truth')
    .count()
    .orderBy('count', ascending=False)
)
training_label_counts.show(100, truncate=False)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |2879 |
|PROD        |69   |
+------------+-----+



In [16]:
# Embeddings stage: reads the "sentence" and "token" columns and writes "bert".
# NOTE(review): a *cased* checkpoint is loaded but case sensitivity is switched
# off — confirm this combination is intentional.
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

bert_base_cased download started this may take some time.
Approximate size to download 384.9 MB
[OK!]


In [19]:
# NER trainer stage: consumes sentence/token/bert columns, learns from "label",
# and writes its predictions to "ner".
nerTagger = (
    NerDLApproach()
    # Column wiring
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # Training schedule
    .setMaxEpochs(50)
    .setRandomSeed(0)
    .setValidationSplit(0.2)
    # Logging and output options
    .setVerbose(1)
    .setEvaluationLogExtended(False)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
)

In [18]:
# Load the CoNLL file written from the first page and show its label
# distribution.
test_data = CoNLL().readDataset(spark, 'resources/test_data_from_html.conll')

test_pairs = test_data.select(
    F.explode(
        F.arrays_zip(test_data.token.result, test_data.label.result)
    ).alias("cols")
)
test_label_counts = (
    test_pairs
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy('ground_truth')
    .count()
    .orderBy('count', ascending=False)
)
test_label_counts.show(100, truncate=False)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |1143 |
+------------+-----+



In [20]:
# Chain the embeddings stage and the NER trainer, then fit on the training data.
pipeline_stages = [bert, nerTagger]
ner_pipeline = Pipeline(stages=pipeline_stages)
ner_model = ner_pipeline.fit(dataset=training_data)

In [21]:
# Run the fitted pipeline over the test set and preview the resulting rows
# (show() defaults spelled out: 20 rows, truncated cells).
preds = ner_model.transform(dataset=test_data)
preds.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 b '|[{document, 0, 2,...|[{document, 0, 2,...|[{token, 0, 0, b,...|[{pos, 0, 0, _, {...|[{named_entity, 0...|[{word_embeddings...|[{named_entity, 0...|
|     Skip to content|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 3, Sk...|[{pos, 0, 3, _, {...|[{named_entity, 0...|[{word_embeddings...|[{named_entity, 0...|
|FREE SHIPPINGALL ...|[{document, 0, 28...|[{document, 0, 28...|[{token, 0, 3, FR...|[{pos, 0, 3, _, {...|[{named_entity, 0...|[{word_embeddings...|[{

In [22]:
# Show each row's token strings next to the predicted tags, untruncated.
token_tag_columns = preds.select("token.result", "ner.result")
token_tag_columns.show(n=10000, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# import pyspark.sql.functions as F
#
# preds.select(F.explode(F.arrays_zip()))

In [23]:
# Persist the trained NER stage, which is the last stage of the fitted pipeline.
# overwrite() lets the cell be re-run: without it, save() fails once the target
# directory already exists. Any model previously stored at this path is replaced.
trained_ner_stage = ner_model.stages[-1]
trained_ner_stage.write().overwrite().save('NER_bert_50epochs')