# Scraping Poshmark for Recent Postings
### _Date of Scrape: May 28, 2020_

### Imports

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import threading

### Poshmark - Men

In [2]:
base_url = 'https://poshmark.com/category/Men'
res = get(base_url)
print(res.status_code)

200


In [4]:
def get_poshmark(url):
    """
    Returns BeautifulSoup object is status_code == 200
    else, returns status code
    """
    res = get(base_url)
    if res.status_code == 200:
        return BeautifulSoup(res.text)
    else:
        return res.status_code

In [5]:
res = get_poshmark(base_url)
res

<Response [200]>

In [6]:
soup = BeautifulSoup(res.text)

# Create a list off all div containers that have a class of 'tile'
# These are the boxes that represent a single item when browsing the site
tiles = soup.find_all('div', class_ = 'tile')
print(len(tiles))

48


Poshmark definitely has more than 48 pieces for sale on their site. The reason why we have only captured 48 is because that is the amount of tiles that are initially loaded to the page. This is done to alleviate the initial load time of Poshmark.

In short, the current scrape will be limited to the 48 most recent postings. 

For now, we will worry about being able to put values into a Pandas dataframe. Getting more observations will be a problem to tackle for tomorrow.

#### Extracting Values
https://www.tablesgenerator.com/markdown_tables#

Want to also scrape the time it was posted, but this requires me to go into the Page URL to grab that info. Same case for color. Another day.

|   Column  | dtype |                  Example Values                  |       Description      |
|:---------:|:-----:|:------------------------------------------------:|:----------------------:|
|   Title   |  str  | Men's New Balance Revlite 1550 Army Green size 9 |  Title given by seller |
| Seller    | str   | example_username                                 | Username of seller     |
|   Price   |  int  |                        $25                       | Asking price by seller |
|    Size   |  str  |                                                  |                        |
|   Brand   |  str  |                                                  |                        |
|  Page URL |  str  |                                                  |                        |
| Image URL |  str  |                                                  |                        |

In [7]:
# For the sake of demonstration for myself, I will be extracting values from a single tile
ex_tile = tiles[0]
print(ex_tile.prettify())

<div class="tile col-x12 col-l6 col-s8">
 <div class="card card--small">
  <a class="tile__covershot" data-et-element-type="image" data-et-name="listing" data-et-prop-listing_id="5e0eacefbbf076b9baa0e724" data-et-prop-location="listing_tile" data-et-prop-unit_position="0" href="/listing/BURBERRY-Classic-Fit-Polo-Shirt-5e0eacefbbf076b9baa0e724">
   <div class="img__container img__container--square">
    <img alt="BURBERRY Classic Fit Polo Shirt" data-src="https://di2ponv0v5otw.cloudfront.net/posts/2020/01/02/5e0eacefbbf076b9baa0e724/s_5e0eacff2cc51539b35ab013.jpg" src="https://di2ponv0v5otw.cloudfront.net/posts/2020/01/02/5e0eacefbbf076b9baa0e724/s_5e0eacff2cc51539b35ab013.jpg"/>
   </div>
   <!-- -->
  </a>
  <div class="item__details">
   <div class="title__condition__container">
    <a class="tile__title tc--b" data-et-element-type="link" data-et-name="listing" data-et-prop-listing_id="5e0eacefbbf076b9baa0e724" data-et-prop-location="listing_tile" data-et-prop-unit_position="0" href=

In [8]:
print(ex_tile.find_all('a', attrs = {
    'data-et-element-type': 'image'
})[0].prettify())

<a class="tile__covershot" data-et-element-type="image" data-et-name="listing" data-et-prop-listing_id="5e0eacefbbf076b9baa0e724" data-et-prop-location="listing_tile" data-et-prop-unit_position="0" href="/listing/BURBERRY-Classic-Fit-Polo-Shirt-5e0eacefbbf076b9baa0e724">
 <div class="img__container img__container--square">
  <img alt="BURBERRY Classic Fit Polo Shirt" data-src="https://di2ponv0v5otw.cloudfront.net/posts/2020/01/02/5e0eacefbbf076b9baa0e724/s_5e0eacff2cc51539b35ab013.jpg" src="https://di2ponv0v5otw.cloudfront.net/posts/2020/01/02/5e0eacefbbf076b9baa0e724/s_5e0eacff2cc51539b35ab013.jpg"/>
 </div>
 <!-- -->
</a>



##### Title and Page URL

Since the title of the post and the link to post is in the same tag, we will grab both in the same cell below.

In [9]:
# Find the first instance of an a tag with the 'tile__title' class.
# Then strip all leading and trailing whitespace from the resulting text.
ex_title_pageurl = ex_tile.find('a', class_ = 'tile__title')
ex_title = ex_title_pageurl.get_text(strip = True)
ex_page_url = ex_title_pageurl.get('href')
print(f'title: {ex_title}')
print(f'page url: https://www.poshmark.com{ex_page_url}')

title: BURBERRY Classic Fit Polo Shirt
page url: https://www.poshmark.com/listing/BURBERRY-Classic-Fit-Polo-Shirt-5e0eacefbbf076b9baa0e724


In [10]:
def get_title(tile):
    try:
        title = tile.find('a', class_ = 'tile__title').get_text(strip = True)
        return title
    except:
        return None

get_title(ex_tile)

'BURBERRY Classic Fit Polo Shirt'

In [11]:
def get_item_page_url(tile):
    "Get's the URL of the item. Is preceeded by 'www.poshmark.com'"
    try:
        return tile.find('a', class_ = 'tile__title').get('href')
    except:
        None
        
get_item_page_url(ex_tile)

'/listing/BURBERRY-Classic-Fit-Polo-Shirt-5e0eacefbbf076b9baa0e724'

##### Seller

In [12]:
ex_seller = ex_tile.find('span', class_ = 'tc--g m--l--1').get_text(strip = True)
print(ex_seller)

cola729


In [13]:
def get_seller(tile):
    try:
        return tile.find('span', class_ = 'tc--g m--l--1').get_text(strip = True)
    except:
        return None

get_seller(ex_tile)

'cola729'

##### Price

In [14]:
ex_price = ex_tile.find('span', class_ = 'fw--bold').get_text(strip = True)
print(ex_price)

$64


In [15]:
def get_price(tile):
    try:
        # Skip the dollar sign and return the rest of the string as an int
        return int(tile.find('span', class_ = 'fw--bold').get_text(strip = True)[1:])
    except:
        None

get_price(ex_tile)

64

##### Size

In [16]:
ex_size = ex_tile.find('a', class_ = \
                       'tile__details__pipe__size').get_text(strip = True).strip('Size: ')
print(ex_size)

L


In [17]:
def get_size(tile):
    try:
        return tile.find('a', class_ = \
                         'tile__details__pipe__size').get_text(strip = True).strip('Size: ')
    except:
        return None

get_size(ex_tile)

'L'

##### Brand

In [18]:
# laziness at it's finest
try:
    ex_brand = ex_tile.find('a', class_ = 'tile__details__pipe__brand').get_text(strip = True)
    print(ex_brand)
except:
    pass

Burberry


In [19]:
def get_brand(tile):
    try:
        return tile.find('a', class_ = 'tile__details__pipe__brand').get_text(strip = True)
    except:
        return None

get_brand(ex_tile)

'Burberry'

##### Image URL

In [20]:
ex_img = ex_tile.find('img').get('data-src')
print(ex_img)

https://di2ponv0v5otw.cloudfront.net/posts/2020/01/02/5e0eacefbbf076b9baa0e724/s_5e0eacff2cc51539b35ab013.jpg


In [22]:
def get_img(tile):
    try:
        return tile.find('img').get('data-src')
    except:
        return None

# Check if the result of the function is the same
# as the link above via copy and paste
get_img(ex_tile)

'https://di2ponv0v5otw.cloudfront.net/posts/2020/01/02/5e0eacefbbf076b9baa0e724/s_5e0eacff2cc51539b35ab013.jpg'

### Checking If the Functions Work on Other Tiles For Consistency

In [23]:
ex_tile_1 = tiles[1]
print(ex_tile_1.prettify())

<div class="tile col-x12 col-l6 col-s8">
 <div class="card card--small">
  <a class="tile__covershot" data-et-element-type="image" data-et-name="listing" data-et-prop-listing_id="5e6af2dde974fb00fe86d02f" data-et-prop-location="listing_tile" data-et-prop-unit_position="1" href="/listing/Levis-iconic-501-button-fly-jeans-Size-34-x-30-5e6af2dde974fb00fe86d02f">
   <div class="img__container img__container--square">
    <img alt="Levi's iconic 501 button fly jeans. Size 34 x 30" data-src="https://di2ponv0v5otw.cloudfront.net/posts/2020/03/12/5e6af2dde974fb00fe86d02f/s_5e6af53f22bd7c10f97bd430.jpg" src="https://di2ponv0v5otw.cloudfront.net/posts/2020/03/12/5e6af2dde974fb00fe86d02f/s_5e6af53f22bd7c10f97bd430.jpg"/>
   </div>
   <!-- -->
  </a>
  <div class="item__details">
   <div class="title__condition__container">
    <a class="tile__title tc--b" data-et-element-type="link" data-et-name="listing" data-et-prop-listing_id="5e6af2dde974fb00fe86d02f" data-et-prop-location="listing_tile" data

In [24]:
get_title(ex_tile_1)

"Levi's iconic 501 button fly jeans. Size 34 x 30"

In [25]:
get_seller(ex_tile_1)

'by_the_shore'

In [26]:
get_price(ex_tile_1)

34

In [27]:
get_size(ex_tile_1)

'34'

In [28]:
get_brand(ex_tile_1)

"Levi's"

In [29]:
print(get_img(ex_tile_1))

https://di2ponv0v5otw.cloudfront.net/posts/2020/03/12/5e6af2dde974fb00fe86d02f/s_5e6af53f22bd7c10f97bd430.jpg


That's enough testing for me! I'm sure I'm going to encounter a problem as I loop through the list of 46 other entries, but let's go for it!

#### Populating a DataFrame with 48 Poshmark Items

In [28]:
titles, sellers, prices, sizes, brands, p_urls, i_urls = [], [], [], [], [], [], []
for tile in tiles:
    titles.append(get_title(tile))
    sellers.append(get_seller(tile))
    prices.append(get_price(tile))
    sizes.append(get_size(tile))
    brands.append(get_brand(tile))
    p_urls.append(get_item_page_url(tile))
    i_urls.append(get_img(tile))
df = pd.DataFrame({
    'title': titles,
    'seller': sellers,
    'price': prices,
    'size': sizes,
    'brand': brands,
    'page_url': p_urls,
    'img_url': i_urls
})

df.head()

Unnamed: 0,title,seller,price,size,brand,page_url,img_url
0,Men’s Jean Shorts,everydaytrend,35,32,Desert Dunes,/listing/Mens-Jean-Shorts-5db5e6f9de696a9ecd6a...,https://di2ponv0v5otw.cloudfront.net/posts/201...
1,Rhode Island is for Haters T,reberebequita,25,,,/listing/Rhode-Island-is-for-Haters-T-589a6ec6...,https://di2ponv0v5otw.cloudfront.net/posts/201...
2,NWT Michael Kors Men’s Smartwatch,sosi0204,299,O,Michael Kors,/listing/NWT-Michael-Kors-Mens-Smartwatch-5e4a...,https://di2ponv0v5otw.cloudfront.net/posts/202...
3,Air Nike Diamond Turf ‘Football’ Size 10,aj_clayton,59,10,Nike,/listing/Air-Nike-Diamond-Turf-Football-Size-1...,https://di2ponv0v5otw.cloudfront.net/posts/202...
4,SIGNUM CHECKERED BUTTON UP SHORT SLEEVE SHIRT,itsonbackwards,25,XL,Signum,/listing/SIGNUM-CHECKERED-BUTTON-UP-SHORT-SLEE...,https://di2ponv0v5otw.cloudfront.net/posts/201...


In [None]:
get_poshmark()

In [65]:
df.dtypes

title       object
seller      object
price        int64
size        object
brand       object
page_url    object
img_url     object
dtype: object

Sizes are all over the place. Are the XL shirts? Size 11... shoes, hats, pants? Will need to see if we can categorize these item pieces as we scrape.

#### Additional Info
We need to navigate to each item's dedicated page to get information such as posting datetime, type of apparel, and colors.

Ideally, this should be done as the above scrape happens. So let's make two threads. One that scrapes the tiles and another that scrapes every dedicated page.

In [66]:
def go_to_item_url(df, index):
    url = 'https://www.poshmark.com' + df['page_url'].iloc[index]
    print(url)
    return url

In [67]:
go_to_page(df, 1)

https://www.poshmark.com/listing/Gerry-Mens-Bearwood-Workwear-Puffer-Jacket-Slate-5e6e77eb80afe1f67943cfee


'https://www.poshmark.com/listing/Gerry-Mens-Bearwood-Workwear-Puffer-Jacket-Slate-5e6e77eb80afe1f67943cfee'

In [30]:
def scrape_poshmark(url = 'https://www.poshmark.com/category/Men'):
    # getting access to url
    res = get_poshmark(url)
    soup = BeautifulSoup(res.text)
    
    # getting all div containers for tiles
    tiles = soup.find_all('div', class_ = 'tile')
    
    titles, sellers, prices, sizes, brands, p_urls, i_urls = [], [], [], [], [], [], []
    for i, tile in enumerate(tiles[:1]):
        titles.append(get_title(tile))
        sellers.append(get_seller(tile))
        prices.append(get_price(tile))
        sizes.append(get_size(tile))
        brands.append(get_brand(tile))
        p_urls.append(get_item_page_url(tile))
        i_urls.append(get_img(tile))
#         curr_item = go_to_page(p_urls[-1])

#         item_res = get_poshmark(curr_item)
#         item_soup = BeautifulSoup(item_res.text)
    df = pd.DataFrame({
        'title': titles,
        'seller': sellers,
        'price': prices,
        'size': sizes,
        'brand': brands,
        'page_url': p_urls,
        'img_url': i_urls
    })
    return df
df = scrape_poshmark()
df

Unnamed: 0,title,seller,price,size,brand,page_url,img_url
0,AE Shirt,glitzgal15,5,L,American Eagle Outfitters,/listing/AE-Shirt-5bc3883fa31c338b3c50b69d,https://di2ponv0v5otw.cloudfront.net/posts/201...


In [82]:
def go_to_item_page_url(index, df = df):
    "Print and return URL of item"
    p_url = df['page_url'].iloc[index]
    print('https://www.poshmark.com' + p_url)
    return 'https://www.poshmark.com' + p_url

In [81]:
print_page()

https://www.poshmark.com/listing/PRPS-Jeans-Mens-32-x-28-Demon-Selvedge-Button-Fly-5e61a799fe19c7fcc81c83b7


In [83]:
get_poshmark()

'https://www.poshmark.com/listing/PRPS-Jeans-Mens-32-x-28-Demon-Selvedge-Button-Fly-5e61a799fe19c7fcc81c83b7'

In [75]:
print('https://www.poshmark.com' + df['page_url'].iloc[0])

https://www.poshmark.com/listing/PRPS-Jeans-Mens-32-x-28-Demon-Selvedge-Button-Fly-5e61a799fe19c7fcc81c83b7


In [70]:
print(item_soup.prettify())

<!DOCTYPE html>
<html data-vue-meta="%7B%22lang%22:%7B%221%22:%22en%22%7D,%22xml:lang%22:%7B%221%22:%22en%22%7D,%22xmlns%22:%7B%221%22:%22http://www.w3.org/1999/xhtml%22%7D,%22data-vue-meta-server-rendered%22:%7B%221%22:true%7D%7D" data-vue-meta-server-rendered="true" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <!-- NREUM: (0) -->
  <title>
   Men on Poshmark
  </title>
  <meta charset="utf-8" data-vue-meta="1"/>
  <meta content="IE=edge,chrome=1" data-vue-meta="1" http-equiv="X-UA-Compatible"/>
  <meta content="YzU6jJ3yJ0My_t6b2CmL_z-yjo9gN2QUO2MmQHjAgM0" data-vue-meta="1" name="google-site-verification"/>
  <meta content="09278921fdde" data-vue-meta="1" name="bitly-verification"/>
  <meta content="width=device-width, initial-scale=1.0" data-vue-meta="1" name="viewport"/>
  <meta content="en" data-vue-meta="1" http-equiv="Content-Language"/>
  <meta content="Poshmark" data-vue-meta="1" property="og:site_name"/>
  <meta content="#ffffff" data-vue-meta="1" na