In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [2]:
# Open a PyMongo client against the MongoDB server at localhost:27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [3]:
# Handles for the database and the collection the scraped listings go into
# (dictionary-style access is equivalent to attribute-style access in PyMongo)
db = client['craigslist_db']
collection = db['items']

In [4]:
# URL of page to be scraped
url = 'https://newjersey.craigslist.org/search/sss?sort=rel&query=guitar'

# Retrieve page with the requests module.
# requests has no default timeout, so without one this cell could hang
# indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing
# an error page and ending up with zero results further down.
response.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all('li', class_='result-row')

# Loop through returned results
for result in results:
    # A listing that lacks one of the expected elements raises during the
    # lookups below (a missing tag comes back as None, a missing attribute
    # raises KeyError). Report it and move on to the next listing.
    try:
        # Identify and return title of listing
        title = result.find('a', class_='result-title').text
        # Identify and return price of listing
        price = result.a.span.text
        # Identify and return link to listing
        link = result.a['href']
    except (AttributeError, KeyError, TypeError) as e:
        print(e)
        continue

    # Run only if title, price, and link are available
    if not (title and price and link):
        continue

    # Print results
    print('-------------')
    print(title)
    print(price)
    print(link)

    # Dictionary to be stored as a MongoDB document
    post = {
        'title': title,
        'price': price,
        'url': link
    }

    # Upsert keyed on the listing URL so that re-running this cell updates
    # existing documents instead of inserting a duplicate of every listing
    # (duplicates would inflate the counts and means computed later on).
    try:
        collection.update_one({'url': link}, {'$set': post}, upsert=True)
    except pymongo.errors.PyMongoError as e:
        # Keep the scrape going, but make the database failure visible
        print(e)

-------------
(PRICE DROP) Blackstar HT-20R MkII 20W 1x12 Tube Combo Guitar Amp
$420
https://newjersey.craigslist.org/msg/d/whippany-price-drop-blackstar-ht-20r/7509693928.html
-------------
2017 GIBSON Memphis ES-335 Block Figured Cherry Semi-Hollow Guitar
$3,800
https://newjersey.craigslist.org/msg/d/whippany-2017-gibson-memphis-es-335/7518067066.html
-------------
Ibanez Acoustic-Electric Guitar + Hardshell Case
$200
https://newjersey.craigslist.org/msg/d/wayne-ibanez-acoustic-electric-guitar/7520224879.html
-------------
PRS Electric Guitar
$400
https://newjersey.craigslist.org/msg/d/montvale-prs-electric-guitar/7520193412.html
-------------
Washburn 6-String Acoustic Guitar w/Gig Bag - BRAND NEW
$120
https://newjersey.craigslist.org/msg/d/elizabeth-washburn-string-acoustic/7520163132.html
-------------
Vintage Ampeg 12" Guitar Speaker
$55
https://newjersey.craigslist.org/msg/d/allamuchy-vintage-ampeg-12-guitar/7511934150.html
-------------
Gibson 12 string guitar, B-45
$1,200
http

In [6]:
# Read every document back out of the collection and print them one by one
listings = db.items.find({})

for document in listings:
    print(document)

{'_id': ObjectId('62f57de1a5ecf7221bcbcbe7'), 'title': '(PRICE DROP) Blackstar HT-20R MkII 20W 1x12 Tube Combo Guitar Amp', 'price': '$420', 'url': 'https://newjersey.craigslist.org/msg/d/whippany-price-drop-blackstar-ht-20r/7509693928.html'}
{'_id': ObjectId('62f57de1a5ecf7221bcbcbe8'), 'title': '2017 GIBSON Memphis ES-335 Block Figured Cherry Semi-Hollow Guitar', 'price': '$3,800', 'url': 'https://newjersey.craigslist.org/msg/d/whippany-2017-gibson-memphis-es-335/7518067066.html'}
{'_id': ObjectId('62f57de1a5ecf7221bcbcbe9'), 'title': 'Ibanez Acoustic-Electric Guitar + Hardshell Case', 'price': '$200', 'url': 'https://newjersey.craigslist.org/msg/d/wayne-ibanez-acoustic-electric-guitar/7520224879.html'}
{'_id': ObjectId('62f57de1a5ecf7221bcbcbea'), 'title': 'PRS Electric Guitar', 'price': '$400', 'url': 'https://newjersey.craigslist.org/msg/d/montvale-prs-electric-guitar/7520193412.html'}
{'_id': ObjectId('62f57de1a5ecf7221bcbcbeb'), 'title': 'Washburn 6-String Acoustic Guitar w/Gig 

In [7]:
# pymongo passes queries through using MongoDB's own query syntax.
# The second argument to find() is a projection: 1 keeps a field, 0 drops it,
# so this returns only the "price" field of each document.
# More info here: https://www.mongodb.com/docs/manual/tutorial/project-fields-from-query-results/

price_only = {'price': 1, '_id': 0}
prices = db.items.find(filter={}, projection=price_only)

for price_doc in prices:
    print(price_doc)

{'price': '$420'}
{'price': '$3,800'}
{'price': '$200'}
{'price': '$400'}
{'price': '$120'}
{'price': '$55'}
{'price': '$1,200'}
{'price': '$380'}
{'price': '$500'}
{'price': '$45'}
{'price': '$30'}
{'price': '$280'}
{'price': '$50'}
{'price': '$500'}
{'price': '$100'}
{'price': '$200'}
{'price': '$170'}
{'price': '$75'}
{'price': '$80'}
{'price': '$200'}
{'price': '$150'}
{'price': '$175'}
{'price': '$125'}
{'price': '$75'}
{'price': '$449'}
{'price': '$385'}
{'price': '$385'}
{'price': '$1,700'}
{'price': '$275'}
{'price': '$15'}
{'price': '$5'}
{'price': '$20'}
{'price': '$975'}
{'price': '$1,500'}
{'price': '$15'}
{'price': '$195'}
{'price': '$65'}
{'price': '$30'}
{'price': '$3'}
{'price': '$40'}
{'price': '$175'}
{'price': '$100'}
{'price': '$40'}
{'price': '$45'}
{'price': '$35'}
{'price': '$50'}
{'price': '$5'}
{'price': '$75'}
{'price': '$50'}
{'price': '$60'}
{'price': '$240'}
{'price': '$850'}
{'price': '$200'}
{'price': '$65'}
{'price': '$250'}
{'price': '$700'}
{'price': '

In [8]:
# Each row returned by the projection is a one-entry dictionary,
# so the bare value can be read by looking up the 'price' key.

price_only = {'price': 1, '_id': 0}
prices = db.items.find(filter={}, projection=price_only)

for price_doc in prices:
    print(price_doc['price'])

$420
$3,800
$200
$400
$120
$55
$1,200
$380
$500
$45
$30
$280
$50
$500
$100
$200
$170
$75
$80
$200
$150
$175
$125
$75
$449
$385
$385
$1,700
$275
$15
$5
$20
$975
$1,500
$15
$195
$65
$30
$3
$40
$175
$100
$40
$45
$35
$50
$5
$75
$50
$60
$240
$850
$200
$65
$250
$700
$60
$275
$50
$395
$295
$175
$950
$125
$2,500
$525
$80
$3,000
$300
$120
$50
$0
$200
$150
$440
$135
$560
$700
$490
$0
$2,650
$35
$225
$150
$90
$1,700
$2,499
$999
$1,449
$125
$90
$599
$10
$25
$100
$50
$20
$20
$20
$75
$179
$0
$275
$250
$125
$375
$235
$75
$200
$600
$250
$300
$150
$130


In [9]:
# Gather the title and price of every listing into a single dictionary,
# keyed by the listing's position in the query results.
# Prices are kept exactly as stored (strings such as '$1,200').

listings = db.items.find({}, {'title': 1, 'price': 1, '_id': 0})

data_dict = {
    position: {'title': doc['title'], 'price': doc['price']}
    for position, doc in enumerate(listings)
}

data_dict

{0: {'title': '(PRICE DROP) Blackstar HT-20R MkII 20W 1x12 Tube Combo Guitar Amp',
  'price': '$420'},
 1: {'title': '2017 GIBSON Memphis ES-335 Block Figured Cherry Semi-Hollow Guitar',
  'price': '$3,800'},
 2: {'title': 'Ibanez Acoustic-Electric Guitar + Hardshell Case',
  'price': '$200'},
 3: {'title': 'PRS Electric Guitar', 'price': '$400'},
 4: {'title': 'Washburn 6-String Acoustic Guitar w/Gig Bag - BRAND NEW',
  'price': '$120'},
 5: {'title': 'Vintage Ampeg 12" Guitar Speaker', 'price': '$55'},
 6: {'title': 'Gibson 12 string guitar, B-45', 'price': '$1,200'},
 7: {'title': 'Seagull Coastline Spruce Dreadnought Acoustic Guitar',
  'price': '$380'},
 8: {'title': 'X Guitar by Alesis', 'price': '$500'},
 9: {'title': 'Blue Tweed Dreadnaught Guitar Bag', 'price': '$45'},
 10: {'title': 'VOX amPlug 2 Bass Guitar Headphone Amp', 'price': '$30'},
 11: {'title': 'Yamaha RBX 170 Red Metallic Electric Bass Guitar',
  'price': '$280'},
 12: {'title': 'Zoom 505 II Guitar Multi F/X Pedal

In [10]:
# We could also put these results in a pandas DataFrame.
import pandas as pd

def remove_chars(txt):
    """Return ``txt`` with every dollar sign and comma removed."""
    # The third argument of str.maketrans lists the characters to delete
    return txt.translate(str.maketrans('', '', '$,'))

# data_dict maps row number -> {'title': ..., 'price': ...}; the DataFrame
# constructor turns the outer keys into columns, so transpose to get one row
# per listing.
listings_df = pd.DataFrame(data_dict).transpose()   # Note the transposition--take out the transpose() and see what it looks like.

# Convert the price strings (e.g. '$1,200') to floats.
# Assign the whole column rather than using .loc[:, 'price'] = ...:
# on pandas >= 2.0 the .loc form writes the values in place and leaves the
# column as object dtype, whereas plain column assignment replaces it with a
# real float64 column.
listings_df['price'] = listings_df['price'].apply(lambda x: float(remove_chars(x)))

listings_df.head(10)


Unnamed: 0,title,price
0,(PRICE DROP) Blackstar HT-20R MkII 20W 1x12 Tu...,420.0
1,2017 GIBSON Memphis ES-335 Block Figured Cherr...,3800.0
2,Ibanez Acoustic-Electric Guitar + Hardshell Case,200.0
3,PRS Electric Guitar,400.0
4,Washburn 6-String Acoustic Guitar w/Gig Bag - ...,120.0
5,"Vintage Ampeg 12"" Guitar Speaker",55.0
6,"Gibson 12 string guitar, B-45",1200.0
7,Seagull Coastline Spruce Dreadnought Acoustic ...,380.0
8,X Guitar by Alesis,500.0
9,Blue Tweed Dreadnaught Guitar Bag,45.0


In [11]:
# Remember pd.cut to bin continuous variables?
# Let's put the prices in some arbitrary bins and get some summary stats to
#   see how many listings there are within those price bins and what the average
#   price is in each of those bins.
# Note: pd.cut intervals are open on the left by default, so a listing priced
#   at exactly $0 does not fall into the first bin, (0, 100].

bins = [0, 100, 500, 1000, 5000, 10000]

# Aggregate only the numeric price column: on pandas >= 2.0, asking for the
# mean of the string 'title' column raises a TypeError instead of silently
# dropping that column.
listing_prices = listings_df[['price']].astype(float)

# observed=False keeps empty bins in the result (and states explicitly the
# behaviour that newer pandas versions warn about when left implicit).
listing_prices.groupby(pd.cut(listing_prices['price'], bins), observed=False).agg(['count', 'mean'])

Unnamed: 0_level_0,price,price
Unnamed: 0_level_1,count,mean
price,Unnamed: 1_level_2,Unnamed: 2_level_2
"(0, 100]",42,49.47619
"(100, 500]",49,253.938776
"(500, 1000]",10,745.8
"(1000, 5000]",10,2199.8
"(5000, 10000]",0,
