In [81]:
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver as wbdv

# Data Extraction 

## Extracting all designers

In [189]:
all_designers_url="https://ap0cene.com/pages/ap0cene-designers-a-z"

In [190]:
# Fetch the A-Z designer listing. The timeout keeps the cell from hanging forever and
# raise_for_status() fails fast instead of parsing an HTTP error page as if it were the listing.
all_designers_page=requests.get(all_designers_url, timeout=30)
all_designers_page.raise_for_status()
soup=bs(all_designers_page.content, "html.parser")

In [191]:
# Collect every tag whose href points at a collection page.
# A raw string avoids the invalid "\/" escape sequences, and "\." matches a literal dot in the domain.
# NOTE(review): the [2:] slice skips the first two matches - presumably links that are not
# designers; confirm against the live page markup.
all_designers_soup=soup.find_all(href=re.compile(r"https://ap0cene\.com/collections.+"))[2:]

In [192]:

# Build the designer table in one step from the scraped tags:
# the tag text is the designer name and its href is the designer's collection page.
df=pd.DataFrame({
    'designer_name':[node.text for node in all_designers_soup],
    'designer_page':[node["href"] for node in all_designers_soup],
})

# dropping invalid page
# NOTE(review): 21 is a hard-coded row label; it will point at a different row if the
# listing page changes - confirm before re-running.
df=df.drop(index=21)

df.head(25)

Unnamed: 0,designer_name,designer_page
0,A Beautiful Orgasm,https://ap0cene.com/collections/a-beautiful-or...
1,Aephotika,https://ap0cene.com/collections/aephotika
2,Aiyana Monae,https://ap0cene.com/collections/aiyana-monae
3,Alexandra Zhukova,https://ap0cene.com/collections/alexandra-zhukova
4,Alisa Cayoo,https://ap0cene.com/collections/alisa-cayoo
5,Andagain,https://ap0cene.com/collections/andagain
6,Antispacetravel,https://ap0cene.com/collections/antispacetravel
7,Aporeei,https://ap0cene.com/collections/aporeei
8,Arezou,https://ap0cene.com/collections/arezou
9,Asterisk,https://ap0cene.com/collections/asterisk


In [200]:
#finding missing designer info

(df=='').sum()

designer_name    0
designer_page    0
dtype: int64

In [201]:
designers_with_name_missing=df[df['designer_name']=='']

In [202]:
designers_with_name_missing

Unnamed: 0,designer_name,designer_page


In [203]:
df.loc[0]["designer_name"]

'A Beautiful Orgasm'

In [197]:
#extract designer name from their personal page

def extract_designer_name(ind):
    """Fetch a designer's page and return the name shown in its heading.

    Parameters
    ----------
    ind : label
        Row label in the global ``df``; the URL is read from that row's
        ``designer_page`` column.

    Returns
    -------
    str
        Text of the ``<h1 class="font-heading text-lg">`` element on the page.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page contains no matching heading.
    """
    # One .loc[row, column] lookup instead of chained indexing.
    url=df.loc[ind, "designer_page"]

    r=requests.get(url, timeout=30)
    r.raise_for_status()
    soup=bs(r.content, "html.parser")
    heading=soup.find("h1", class_="font-heading text-lg")
    if heading is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than failing with an AttributeError on None.text.
        raise ValueError(f"no designer heading found on {url}")

    return heading.text
    

In [198]:
#imputing missing names

# Best-effort imputation: a failure for one designer is reported and the loop moves on.
for ind in designers_with_name_missing.index:
    try:
        # Assign through a single .loc[row, column] call. The chained form
        # df.loc[ind]["designer_name"] = ... may write to a temporary copy of the row,
        # so it is not guaranteed to update df.
        df.loc[ind, "designer_name"]=extract_designer_name(ind)
    except Exception as exc:
        # Show the reason as well as the row label so failures can be diagnosed.
        print(ind,"problem",repr(exc))

In [204]:
designers_with_name_missing.index

Int64Index([], dtype='int64')

In [205]:
#test again
(df=='').sum()

designer_name    0
designer_page    0
dtype: int64

In [218]:
#extracting designer description from their personal page
def extract_designer_description(url):
    """Return the description text found on a designer page.

    Parameters
    ----------
    url : str
        Address of the designer's page.

    Returns
    -------
    str
        Text of the ``<div class="rte mt-4">`` element, or the placeholder
        ``"A designer."`` when the page has no such element.

    Raises
    ------
    requests.RequestException
        If the request itself fails or times out.
    """
    r=requests.get(url, timeout=30)
    soup=bs(r.content, "html.parser")
    description_node=soup.find("div", class_="rte mt-4")
    # find() returns None when the page has no description block; check for that
    # explicitly instead of catching every exception.
    if description_node is None:
        return "A designer."
    return description_node.text

In [219]:
df["description"]=df["designer_page"].apply(extract_designer_description)

In [220]:
## Where are the problems?
## Upon inspection, the problem was that some of these designers have no description.
# Should I give them one?
# Absolutely not, because readers can find more info in the product descriptions.
# Thank you!


df[(df["description"]=="A designer.")]

Unnamed: 0,designer_name,designer_page,description
45,Jae Kim,https://ap0cene.com/collections/jae-kim,A designer.
52,Kerne.Milk,https://ap0cene.com/collections/vendors?q=Kern...,A designer.
58,Maran,https://ap0cene.com/collections/maran,A designer.
85,Shulian,https://ap0cene.com/collections/vendors?q=shulian,A designer.
92,Spirit Guide 9,https://ap0cene.com/collections/spirit-guide-9,A designer.
93,Stomachofyourdeadsoulmate,https://ap0cene.com/collections/stomachofyourd...,A designer.


In [230]:
# Remove newline characters from every description with the vectorised string accessor.
df['description']=df['description'].str.replace('\n','',regex=False)
# df_designer is a second name bound to the same DataFrame object, not a copy.
df_designer=df

In [232]:
df_designer.head(2)

Unnamed: 0,designer_name,designer_page,description
0,A Beautiful Orgasm,https://ap0cene.com/collections/a-beautiful-or...,Lucky Truong is a self-taught designer based i...
1,Aephotika,https://ap0cene.com/collections/aephotika,"As a designer, Katya views fashion as a form o..."


# Product Extraction

In [2]:
url='https://ap0cene.com/collections/yue-qing-wei-1'

In [235]:
#scrape a designer's collection page and return its product names and URL slugs

def designer_page_scrap(url):
    """Scrape one collection page and return its product names and slugs.

    Parameters
    ----------
    url : str
        Address of the collection page to fetch.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(names, items)``: ``names`` holds the ``alt`` text of every matching
        product image; ``items`` holds the same text lower-cased with
        whitespace-separated words joined by ``-``.

    Raises
    ------
    requests.RequestException
        If the request fails or times out.
    KeyError
        If a matching image has no ``alt`` attribute.
    """
    # Passed to find_all as one multi-word class_ string, exactly as before, so the
    # set of matched images is unchanged.
    product_image_class="responsive-image block absolute top-0 left-0 w-full h-full lazyload transition-opacity duration-200 ease-in-out w-full max-w-full h-auto"

    # The timeout stops one unresponsive page from hanging a whole .apply() run.
    r=requests.get(url, timeout=30)
    soup=bs(r.content, "html.parser")
    names=[img['alt'] for img in soup.find_all("img", class_=product_image_class)]
    items=['-'.join(name.lower().split()) for name in names]

    return names,items
        

In [251]:
scrapped=df_designer['designer_page'].apply(designer_page_scrap)

In [261]:
[scrapped.apply(lambda x:x[0]),scrapped.apply(lambda x:x[1])]

[0      [Infinity Pants, Spider Top, Cool Girl Denim, ...
 1      [Realm Beyond, Calcified Heart Necklace, Fresh...
 2      [Bucket Hat, Bonnet, Mosaic Crochet Halter, Be...
 3      [Machine Knitted Grunge Top, Hairy Pants, Wrap...
 4      [Transparent Piercing Necklace, Nice Girl Knit...
                              ...                        
 109    [Crater Dress, Yo+ Bag, Flood Top, Pollution C...
 110    [Artemis Wool Hat, Undine Jester Headpiece, Na...
 111    [Cut Out Lace Sleeve, Chestnut Cut Out Corset ...
 112    [Blob Chrome Nails, Organic Unknown, Hybrid, C...
 113    [Angelic Devil Jersey Top, Teddy Boots Tan, Fl...
 Name: designer_page, Length: 113, dtype: object,
 0      [infinity-pants, spider-top, cool-girl-denim, ...
 1      [realm-beyond, calcified-heart-necklace, fresh...
 2      [bucket-hat, bonnet, mosaic-crochet-halter, be...
 3      [machine-knitted-grunge-top, hairy-pants, wrap...
 4      [transparent-piercing-necklace, nice-girl-knit...
                      

In [269]:
df_product=pd.DataFrame()

In [273]:
# Each element of `scrapped` is a (names, items) pair; unpack the pairs into two columns
# in one pass while keeping the original row labels.
df_product=pd.DataFrame(scrapped.tolist(),index=scrapped.index,columns=['names','items'])

In [274]:
df_product

Unnamed: 0,names,items
0,"[Infinity Pants, Spider Top, Cool Girl Denim, ...","[infinity-pants, spider-top, cool-girl-denim, ..."
1,"[Realm Beyond, Calcified Heart Necklace, Fresh...","[realm-beyond, calcified-heart-necklace, fresh..."
2,"[Bucket Hat, Bonnet, Mosaic Crochet Halter, Be...","[bucket-hat, bonnet, mosaic-crochet-halter, be..."
3,"[Machine Knitted Grunge Top, Hairy Pants, Wrap...","[machine-knitted-grunge-top, hairy-pants, wrap..."
4,"[Transparent Piercing Necklace, Nice Girl Knit...","[transparent-piercing-necklace, nice-girl-knit..."
...,...,...
109,"[Crater Dress, Yo+ Bag, Flood Top, Pollution C...","[crater-dress, yo+-bag, flood-top, pollution-c..."
110,"[Artemis Wool Hat, Undine Jester Headpiece, Na...","[artemis-wool-hat, undine-jester-headpiece, na..."
111,"[Cut Out Lace Sleeve, Chestnut Cut Out Corset ...","[cut-out-lace-sleeve, chestnut-cut-out-corset-..."
112,"[Blob Chrome Nails, Organic Unknown, Hybrid, C...","[blob-chrome-nails, organic-unknown, hybrid, c..."


In [240]:
df_product['names']
df

(['Infinity Pants', 'Spider Top', 'Cool Girl Denim', 'Vermicelli Top'],
 ['infinity-pants', 'spider-top', 'cool-girl-denim', 'vermicelli-top'])

In [241]:
df_designer['designer_page'][0]

'https://ap0cene.com/collections/a-beautiful-orgasm'

In [4]:
names=designer_page_scrap(url)[0]

In [5]:
items=designer_page_scrap(url)[1]

In [6]:
names

['Cut Out Lace Sleeve',
 'Chestnut Cut Out Corset Top',
 'Ocean Wave Cut Out Mini Dress',
 'Cut Out Lace Tights',
 'Violet Knit Dress']

In [7]:
items

['cut-out-lace-sleeve',
 'chestnut-cut-out-corset-top',
 'ocean-wave-cut-out-mini-dress',
 'cut-out-lace-tights',
 'violet-knit-dress']

In [8]:
#scrapping the website using selenium webdriver 

def scrap_image(items,names,collection_url='https://ap0cene.com/collections/yue-qing-wei-1'):
    """Collect image URLs from the product pages of one collection.

    Each product page is opened in Chrome, scrolled to the bottom, and parsed;
    every ``<img>`` whose ``alt`` equals the product name contributes its
    ``src`` to the result.

    Parameters
    ----------
    items : sequence of str
        Product slugs; each is appended to ``{collection_url}/products/``.
    names : sequence of str
        Product names, index-aligned with ``items``; used to match ``alt``.
    collection_url : str, optional
        Base URL of the collection. Defaults to the collection that was
        previously hard-coded, so existing calls behave the same.

    Returns
    -------
    list of str
        Image URLs, each built as ``'http:' + src``.
    """
    images=[]
    # One browser session serves every product page. The previous version started a new
    # Chrome instance per item and never closed any of them.
    driver = wbdv.Chrome()
    try:
        for x,item in enumerate(items):
            driver.get(f'{collection_url}/products/{item}')
            time.sleep(2)

            # Scroll to the bottom, then wait before reading the page source.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            html = bs(driver.page_source,'html.parser')

            for img in html.find_all('img',alt=f'{names[x]}'):
                src = img.get("src")
                # Skip images with no usable src instead of swallowing every exception.
                if not src:
                    continue
                images.append(f'http:{src}')
    finally:
        # Always release the browser, even if a page load or parse fails.
        driver.quit()
    return images

In [9]:
images=scrap_image(items,names)

In [10]:
images

['http://cdn.shopify.com/s/files/1/0542/1803/1277/products/kJ7OcwB19I_2048x2048.jpg?v=1648057231',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/XChyWB7wYF_2048x2048.jpg?v=1648057231',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/yiFg2oXMmk_2048x2048.jpg?v=1648057231',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/bEGnAsQmmA_2048x2048.jpg?v=1647045765',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/3NaMNkriya_2048x2048.jpg?v=1647045765',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/uFYtv1ZWmZ_2048x2048.jpg?v=1648065987',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/flVuTOBq2r_2048x2048.jpg?v=1648065969',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/mgKU3s3ok5_2048x2048.jpg?v=1648065969',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/e7Z83YVjZP_2048x2048.jpg?v=1648065969',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/xeinVZckST_2048x2048.jpg?v=1648065969',
 'http://c

In [11]:
#save the images locally from the url

def save_images(images, out_dir='images'):
    """Download each image URL and store it as ``image_<position>.jpg``.

    Parameters
    ----------
    images : iterable of str
        Image URLs; the position in the iterable becomes the file number.
    out_dir : str or path-like, optional
        Directory that receives the files; created if it does not exist.
        Defaults to ``'images'``, the location used previously.

    Returns
    -------
    None

    Raises
    ------
    requests.RequestException
        If a download fails, times out, or returns an HTTP error status.
    """
    target_dir = Path(out_dir)
    # Create the directory up front so the first write cannot fail on a missing folder.
    target_dir.mkdir(parents=True, exist_ok=True)

    for x,image in enumerate(images):
        response = requests.get(image, timeout=30)
        # Do not store an HTTP error body under a .jpg name.
        response.raise_for_status()
        (target_dir / f'image_{x}.jpg').write_bytes(response.content)

    print('all images saved locally')

In [12]:
save_images(images)

all images saved locally


In [13]:
r = requests.get(images[0], stream=True).content

In [14]:
with open('images/image_test.jpg', 'wb') as file: 
    file.write(r)

In [15]:
!ls 

README.md     ap0cene.ipynb [1m[36mimages[m[m


In [16]:
# Reuse the save_images helper defined earlier in the notebook instead of repeating its
# download loop inline; the helper also prints the completion message.
save_images(images)

all images saved locally


In [17]:
images

['http://cdn.shopify.com/s/files/1/0542/1803/1277/products/kJ7OcwB19I_2048x2048.jpg?v=1648057231',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/XChyWB7wYF_2048x2048.jpg?v=1648057231',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/yiFg2oXMmk_2048x2048.jpg?v=1648057231',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/bEGnAsQmmA_2048x2048.jpg?v=1647045765',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/3NaMNkriya_2048x2048.jpg?v=1647045765',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/uFYtv1ZWmZ_2048x2048.jpg?v=1648065987',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/flVuTOBq2r_2048x2048.jpg?v=1648065969',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/mgKU3s3ok5_2048x2048.jpg?v=1648065969',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/e7Z83YVjZP_2048x2048.jpg?v=1648065969',
 'http://cdn.shopify.com/s/files/1/0542/1803/1277/products/xeinVZckST_2048x2048.jpg?v=1648065969',
 'http://c