In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from copy import deepcopy

In [2]:
def is_valid_html(html_string):
    """Report whether ``html_string`` contains at least one HTML tag.

    The string is parsed with BeautifulSoup's ``html.parser``; it counts as
    valid as soon as the parser yields any tag. Strings that yield no tag are
    printed so they show up in the cell output for inspection.

    Args:
        html_string: Markup to inspect.

    Returns:
        bool: True when at least one tag was found, False otherwise.
    """
    parsed = BeautifulSoup(html_string, 'html.parser')
    if parsed.find_all():
        return True
    # Surface the offending input before reporting the failure.
    print(html_string)
    return False

In [3]:
def get_tag_names(html_string):
    """Return the name of every tag found in ``html_string``.

    Args:
        html_string: Markup parsed with BeautifulSoup's ``html.parser``.

    Returns:
        list: Tag names in the order ``find_all()`` yields them; repeated
        tags appear repeatedly.
    """
    names = []
    for element in BeautifulSoup(html_string, 'html.parser').find_all():
        names.append(element.name)
    return names

In [4]:
def get_tags(html_string):
    """Return every tag object found in ``html_string``.

    Args:
        html_string: Markup parsed with BeautifulSoup's ``html.parser``.

    Returns:
        list: The tag objects in the order ``find_all()`` yields them.
    """
    parsed = BeautifulSoup(html_string, 'html.parser')
    # Copy into a plain list rather than handing back the parser's result object.
    return list(parsed.find_all())

In [5]:
def get_attrs(html_tags):
    """Collect the distinct attribute names used across ``html_tags``.

    Args:
        html_tags: Iterable of objects exposing an ``attrs`` mapping.

    Returns:
        list: Attribute names in order of first appearance, without repeats.
    """
    every_name = (name for tag in html_tags for name in tag.attrs.keys())
    # dict keys keep insertion order, so this drops repeats while preserving
    # the first-seen ordering.
    return list(dict.fromkeys(every_name))


In [6]:
def get_attr_values(html_tags):
    """Collect the distinct attribute values used across ``html_tags``.

    Args:
        html_tags: Iterable of objects exposing an ``attrs`` mapping.

    Returns:
        list: Attribute values in order of first appearance, without repeats.
    """
    distinct_values = []
    for tag in html_tags:
        for value in tag.attrs.values():
            # Membership is checked by equality on the list, so values that
            # are not hashable (e.g. lists) are supported too.
            if value in distinct_values:
                continue
            distinct_values.append(value)
    return distinct_values

In [7]:
def has_required_attr_vals(html_attr_vals):
    """Report whether the value ``"description"`` is among ``html_attr_vals``.

    Args:
        html_attr_vals: Collection of attribute values to search.

    Returns:
        bool: True when ``"description"`` is present, False otherwise.
    """
    return "description" in html_attr_vals

In [20]:
def _reformat_seo_text(cprcode, text):
    """Strip ``cprcode`` from ``text`` and turn its plain labels into HTML.

    Args:
        cprcode: Product code to remove; it is stringified before matching.
        text: SEO text to rewrite.

    Returns:
        tuple: ``(new_text, title_found)`` where ``title_found`` tells whether
        a ``"Title: "`` label was converted into a ``<title>`` tag.
    """
    code = str(cprcode)
    # Remove the code left-padded with zeros to 7 characters first, then the
    # bare code, so neither spelling survives in the text.
    new_text = text.replace(code.rjust(7, "0"), "").replace(code, "")

    title_found = "Title: " in new_text
    if title_found:
        new_text = new_text.replace("Title: ", "<title>")

    if "Description: " in new_text:
        meta_open = '<meta name="description" content="'
        if title_found:
            # Only close a <title> that was actually opened above.
            meta_open = '</title>' + meta_open
        new_text = new_text.replace("Description:", meta_open)
        # The content attribute was opened, so it must always be terminated,
        # whether or not a title preceded it.
        new_text = new_text + '">'
    elif title_found:
        new_text = new_text + '</title>'
    return new_text, title_found


def remove_cprcode_from_description(df1):
    """Return a copy of ``df1`` with ``seoOutputText`` cleaned and HTML-ified.

    For each row, the row's ``cprcode`` (both zero-padded to 7 characters and
    bare) is removed from ``seoOutputText``; a ``"Title: "`` label becomes a
    ``<title>`` element and a ``"Description: "`` label becomes a
    ``<meta name="description" ...>`` tag. Rows whose title label was
    converted are printed.

    Rows are processed by position, so a ``cprcode`` that occurs on several
    rows is handled row by row. Cells that are not strings (e.g. NaN) are
    left untouched.

    Args:
        df1: DataFrame with ``cprcode`` and ``seoOutputText`` columns. It is
            not modified.

    Returns:
        pandas.DataFrame: The rewritten copy.

    Raises:
        KeyError: If either required column is missing.
    """
    df = df1.copy()
    text_col = df.columns.get_loc("seoOutputText")
    rows = zip(df["cprcode"].tolist(), df["seoOutputText"].tolist())
    for position, (cprcode, text) in enumerate(rows):
        if not isinstance(text, str):
            # Nothing to rewrite for missing values; keep the cell as it is.
            continue
        new_text, title_found = _reformat_seo_text(cprcode, text)
        df.iat[position, text_col] = new_text
        if title_found:
            print(new_text)
    return df

In [21]:
# Load the first export shard. Missing values are replaced with empty strings,
# matching how the batch loop over all shards reads its input, so that NaN
# cells never reach the string/HTML helpers applied to seoOutputText.
df = pd.read_csv(f"dynamodb_export_full_output_{0}.csv").fillna("")
df = remove_cprcode_from_description(df)

<title>Buy Cadbury Caramel Filled Chocolate Eggs 48g from Villa Market | IntoGroceryMeta </title><meta name="description" content=" Get the best Cadbury Caramel Filled Chocolate Eggs 48g from Villa Market. Our selection of Confectionery & Snacks, Chocolate, Sharing & Gifts, Dry Grocery, Grocery, Candies, Chocolate, and Marshmallows will make your shopping experience a breeze. Buy now and enjoy the best prices.">
<title>Buy Weilong Yam Chips Mixed Flavours 270g - Villa MarketMeta Tag: Weilong, Yam Chips Mixed Flavours, Confectionery & Snacks, Crisps & Snacks, Potato Chips, Dry Grocery, Grocery, Cookies & Snacks, Savoury Snacks</title><meta name="description" content=" Get the best of both worlds with Weilong Yam Chips Mixed Flavours. This delicious snack is made with a mix of yam and potato chips, giving you a unique and tasty snack. Perfect for sharing with friends and family, these chips are sure to be a hit. Enjoy the crunchy texture and the mix of flavours, including sweet, salty, a

In [22]:
# Show seoOutputText for the first ten rows.
df["seoOutputText"].head(10)

0    <title>KONJAC LINGUINI - Moku Konjac Flat Nood...
1    <title>Stute Apricot Jam 430g - Low Sugar & Su...
2    <title>COLGATE TOTAL CHARCOAL DEEP CLEAN 150G ...
3    <title>DOUGLAS-CT1725-AMBER FOX Doll | Villa M...
4    <title>Herr's Medium Salsa Dip 454g - Villa Ma...
5    <title> Sprite Lemon Lime Flavour No Sugar 500...
6    <title>NIVEA DEO DRY COMFORT STICK 40ML | Vill...
7    <title>OGX Conditioner Biotin & Collagen 385ml...
8    <title>Vicenzi Matilde Lemon Cream 150g | Vill...
9    <title>Sloane's Bacon Back 200g - Villa Market...
Name: seoOutputText, dtype: object

In [23]:

# Columns derived directly from the SEO markup.
df["valid_html"] = df["seoOutputText"].apply(is_valid_html)
df["tag_names"] = df["seoOutputText"].apply(get_tag_names)
df["tags"] = df["seoOutputText"].apply(get_tags)
# Columns derived from the parsed tags; these must follow the "tags" column.
df["attributes"] = df["tags"].apply(get_attrs)
df["attr_values"] = df["tags"].apply(get_attr_values)
# Final check, computed from the collected attribute values.
df["has_required_attr_values"] = df["attr_values"].apply(has_required_attr_vals)

In [24]:
# Show the parsed tag names for the first ten rows.
df["tag_names"].head(10)

0          [title, meta]
1    [title, meta, meta]
2    [title, meta, meta]
3    [title, meta, meta]
4    [title, meta, meta]
5    [title, meta, meta]
6    [title, meta, meta]
7    [title, meta, meta]
8                [title]
9    [title, meta, meta]
Name: tag_names, dtype: object

In [25]:
# Show the parsed tag objects for the first ten rows.
df["tags"].head(10)

0    [[KONJAC LINGUINI - Moku Konjac Flat Noodle 16...
1    [[Stute Apricot Jam 430g - Low Sugar & Sugar F...
2    [[COLGATE TOTAL CHARCOAL DEEP CLEAN 150G | Vil...
3    [[DOUGLAS-CT1725-AMBER FOX Doll | Villa Market...
4    [[Herr's Medium Salsa Dip 454g - Villa Market]...
5    [[ Sprite Lemon Lime Flavour No Sugar 500ml - ...
6    [[NIVEA DEO DRY COMFORT STICK 40ML | Villa Mar...
7    [[OGX Conditioner Biotin & Collagen 385ml - Vi...
8    [[Vicenzi Matilde Lemon Cream 150g | Villa Mar...
9    [[Sloane's Bacon Back 200g - Villa Market], []...
Name: tags, dtype: object

In [26]:
# Show the distinct attribute names for the first ten rows.
df["attributes"].head(10)

0    [name, content]
1    [name, content]
2    [name, content]
3    [name, content]
4    [name, content]
5    [name, content]
6    [name, content]
7    [name, content]
8                 []
9    [name, content]
Name: attributes, dtype: object

In [27]:
# Show the distinct attribute values for the first ten rows.
df["attr_values"].head(10)

0    [description, KONJAC LINGUINI is a Moku Konjac...
1    [description, Stute Foods' No Sugar Added Apri...
2    [description, COLGATE TOTAL CHARCOAL DEEP CLEA...
3    [description, Introducing the DOUGLAS-CT1725-A...
4    [description, Herr's Medium Salsa Dip 454g is ...
5    [description,  Sprite Lemon Lime Flavour No Su...
6    [description, Villa Market offers NIVEA DEO DR...
7    [description, OGX Conditioner Biotin & Collage...
8                                                   []
9    [description, Sloane's Bacon Back 200g is a de...
Name: attr_values, dtype: object

In [28]:
# Show the required-attribute check result for the first ten rows.
df["has_required_attr_values"].head(10)

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8    False
9     True
Name: has_required_attr_values, dtype: bool

In [31]:
# Running totals across all export shards.
invalid_htmls = 0
doesnt_have_required_attrs = 0

# For each of the ten shards: load it (blank out missing values), rewrite the
# SEO text, derive the validation columns, tally the failures and save the
# enriched frame.
for i in range(10):
    df = remove_cprcode_from_description(
        pd.read_csv(f"dynamodb_export_full_output_{i}.csv").fillna("")
    )

    df["valid_html"] = df["seoOutputText"].apply(is_valid_html)
    df["tag_names"] = df["seoOutputText"].apply(get_tag_names)
    df["tags"] = df["seoOutputText"].apply(get_tags)
    df["attributes"] = df["tags"].apply(get_attrs)
    df["attr_values"] = df["tags"].apply(get_attr_values)
    df["has_required_attr_values"] = df["attr_values"].apply(has_required_attr_vals)

    # Count the rows that failed each check in this shard.
    invalid_htmls += len(df.loc[df["valid_html"] == False])
    doesnt_have_required_attrs += len(df.loc[df["has_required_attr_values"] == False])

    df.to_csv(f"reformatted_seo_output{i}.csv", index=False)


<title>Buy Cadbury Caramel Filled Chocolate Eggs 48g from Villa Market | IntoGroceryMeta </title><meta name="description" content=" Get the best Cadbury Caramel Filled Chocolate Eggs 48g from Villa Market. Our selection of Confectionery & Snacks, Chocolate, Sharing & Gifts, Dry Grocery, Grocery, Candies, Chocolate, and Marshmallows will make your shopping experience a breeze. Buy now and enjoy the best prices.">
<title>Buy Weilong Yam Chips Mixed Flavours 270g - Villa MarketMeta Tag: Weilong, Yam Chips Mixed Flavours, Confectionery & Snacks, Crisps & Snacks, Potato Chips, Dry Grocery, Grocery, Cookies & Snacks, Savoury Snacks</title><meta name="description" content=" Get the best of both worlds with Weilong Yam Chips Mixed Flavours. This delicious snack is made with a mix of yam and potato chips, giving you a unique and tasty snack. Perfect for sharing with friends and family, these chips are sure to be a hit. Enjoy the crunchy texture and the mix of flavours, including sweet, salty, a

In [32]:
# Report the totals accumulated over all shards, one per line.
for label, total in (
    ("invalid_htmls: ", invalid_htmls),
    ("doesnt_have_required_attrs: ", doesnt_have_required_attrs),
):
    print(label, total)

invalid_htmls:  8
doesnt_have_required_attrs:  2633
