# Libraries

In [67]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import re

%matplotlib inline

pd.set_option("display.max_colwidth",None)

# Code

In [85]:
#creating html session
s = HTMLSession()

#empty list to store the data
data = []
sr_no = -1

#taking input from user to get keyword 
keyword = "Zimbabwe 100 Trillion"
#input('Enter your keyword here: ')

#gathering urls for extracting data from first six pages
urls = ['https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&_ipg=240&_pgn={}&rt=nc'.format(keyword, x) for x in range(1,11)]

for url in urls:
    
    page = s.get(url.strip())
    soup = BeautifulSoup(page.text, "html.parser")

    # Find all of the elements with the class "s-item__info.clearfix"
    items = soup.find_all(class_ = "s-item__info clearfix")

    # Loop through the items and extract the data and append it to the data list
    for item in items:
            
        title = item.find(class_="s-item__title").get_text()
        price = item.find(class_="s-item__price").get_text()
        #condition = item.find(class_ = "SECONDARY_INFO").get_text()
        sr_no = sr_no+1
        
        try:
            seller_info = item.find(class_ = "s-item__seller-info-text").get_text()[8:]
        except:
            seller_info = "N/A"
        
        data.append([sr_no,title, price, seller_info])

# DataFrame

In [86]:
df = pd.DataFrame(data, columns=["Sr_No","Title","Price","Seller Info"])
df.drop(0,axis=0,inplace=True)
df.head()

Unnamed: 0,Sr_No,Title,Price,Seller Info
1,1,"100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 - USED Condition, FAST SHIP, COA",$129.95,"bles-currency (3,041) 99.9%"
2,2,"Zimbabwe 100 Trillion Banknote 1 Note AA/2008, P-91 UNC Authenticity Guaranteed!",$128.99,"ions (6,303) 99.8%"
3,3,Zimbabwe 100 Trillion Dollars 2008 AA P-91 Banknote New UNC Zim Currency w/COA,$217.17,"nandcard (1,083) 100%"
4,4,Zimbabwe 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA genuine Sale FEW LEFT,$41.00,"ions (6,303) 99.8%"
5,5,Zimbabwe $100 Trillion Dollars Gold Bill Banknote Money Collection Certificate,$2.99,"10 (49,126) 99.8%"


# Data Preparation

**Creating a list of words in the title**

df["Title"] = df["Title"].str.upper()
df["List of words"] = df["Title"].str.split(' ')
df.head()

In [87]:
df["Title"] = df["Title"].str.upper()
df.dtypes

Sr_No           int64
Title          object
Price          object
Seller Info    object
dtype: object

**Removing the rows that had prices in Range format**

In [88]:
mask = df['Price'].str.len() > 11

print(mask.value_counts())

df = df.drop(index=df[mask].index)
df.head()

False    2352
True       10
Name: Price, dtype: int64


Unnamed: 0,Sr_No,Title,Price,Seller Info
1,1,"100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 - USED CONDITION, FAST SHIP, COA",$129.95,"bles-currency (3,041) 99.9%"
2,2,"ZIMBABWE 100 TRILLION BANKNOTE 1 NOTE AA/2008, P-91 UNC AUTHENTICITY GUARANTEED!",$128.99,"ions (6,303) 99.8%"
3,3,ZIMBABWE 100 TRILLION DOLLARS 2008 AA P-91 BANKNOTE NEW UNC ZIM CURRENCY W/COA,$217.17,"nandcard (1,083) 100%"
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA GENUINE SALE FEW LEFT,$41.00,"ions (6,303) 99.8%"
5,5,ZIMBABWE $100 TRILLION DOLLARS GOLD BILL BANKNOTE MONEY COLLECTION CERTIFICATE,$2.99,"10 (49,126) 99.8%"


**Converting the datatype of price from string to numeric**

In [89]:
df['Price'] = df['Price'].str.replace('$', '')
df['Price'] = df['Price'].str.replace(',', '')
df['Price'] = pd.to_numeric(df["Price"])

  df['Price'] = df['Price'].str.replace('$', '')


**Sorting the data in descending order**

In [99]:
df_desc = df.sort_values("Price",ascending=False)
df_desc.head()

Unnamed: 0,Sr_No,Title,Price,Seller Info
119,119,TT PK 91 2008 ZIMBABWE 100 TRILLION DOLLARS 1000 GEM NOTES A REAL ORIGINAL BRICK,199500.0,"raditions.com (37,645) 100%"
107,107,ZIMBABWE 100 TRILLION DOLLAR P-91 X 100 2008 BUNDLE UNC LARGEST DENOMINATION LOT,99999.99,"rencyandcoin (34,634) 100%"
992,992,"100 CONSECUTIVE 2008 100 TRILLION DOLLARS RESERVE BANK OF ZIMBABWE, AA P-91 UNC",34999.95,"1988 (22,077) 99.9%"
655,655,"PACK OF (100) 2008 100 TRILLION DOLLARS RESERVE BANK OF ZIMBABWE, AA P-91 UNC",29999.95,"1988 (22,077) 99.9%"
142,142,ZIMBABWE 100 BILLION DOLLARS AGRO CHEQUE P64 UNC BUNDLE 100 TRILLION P91 BEARER*,26000.0,"(4,851) 100%"


In [91]:
df2 = df[(df['Price'] >= 50) & (df['Price'] <= 300)]

In [100]:
df2.sort_values("Price",ascending=False).head()

Unnamed: 0,Sr_No,Title,Price,Seller Info
1255,1255,"** 2 - ZIMBABWE BANKNOTE, $100 TRILLION DOLLARS, AA SERIES, 2008 - AUTHENTIC **",300.0,national (16) 100%
875,875,10 X ZIMBABWE 10 TRILLION DOLLARS CIRCULATED AA/2008 / $100 TRILLION SERIES,300.0,"efinds (14,666) 100%"
1476,1476,100 TRILLION ZIMBABWE NOTE PCGS 64 AUTHENTICATED GENUINE NEW UNCIRCULATED.,300.0,an_ever_4 (2) 100%
1927,1927,FANCY SERIAL NUMBER 1 100 TRILLION DOLLAR BILL BIRTHDAY ZIMBABWE NOTE AA1998817,300.0,"urcash (1,272) 96.9%"
1070,1070,"** 2 - ZIMBABWE BANKNOTE, $100 TRILLION DOLLARS, AA SERIES, 2008 - AUTHENTIC **",300.0,national (16) 100%


In [97]:
df2.shape

(704, 4)

In [112]:
pd.options.display.max_rows = None
common = pd.Series(' '.join(df['Title']).split()).value_counts()
common

ZIMBABWE                          2216
TRILLION                          1928
100                               1780
DOLLARS                           1096
BANKNOTE                           983
2008                               828
DOLLAR                             730
UNC                                498
GOLD                               485
AA                                 446
BILLION                            397
50                                 396
SERIES                             368
10                                 352
COLLECTION                         349
X                                  342
BANKNOTES                          274
$100                               268
COA                                265
ONE                                241
HUNDRED                            241
FOIL                               238
SET                                238
GIFT                               235
AUTHENTIC                          234
BILL                     

In [102]:
#Identify uncommon words
#uncommon =  pd.Series(' '.join(df['Title']).split()).value_counts()[-25:]
#uncommon

In [108]:
# trim title - leading and trailing spaces
df['Title'] = df['Title'].str.strip()

In [109]:
df.head()

Unnamed: 0,Sr_No,Title,Price,Seller Info
1,1,"100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 - USED CONDITION, FAST SHIP, COA",129.95,"bles-currency (3,041) 99.9%"
2,2,"ZIMBABWE 100 TRILLION BANKNOTE 1 NOTE AA/2008, P-91 UNC AUTHENTICITY GUARANTEED!",128.99,"ions (6,303) 99.8%"
3,3,ZIMBABWE 100 TRILLION DOLLARS 2008 AA P-91 BANKNOTE NEW UNC ZIM CURRENCY W/COA,217.17,"nandcard (1,083) 100%"
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA GENUINE SALE FEW LEFT,41.0,"ions (6,303) 99.8%"
5,5,ZIMBABWE $100 TRILLION DOLLARS GOLD BILL BANKNOTE MONEY COLLECTION CERTIFICATE,2.99,"10 (49,126) 99.8%"


In [124]:
condition_filter = ["UNC","CIRCULATED","UNCIRCULATED","NEW","DAMAGED","USED","NON-CIRCULATING"]

def check_word(sentence):
    for word in condition_filter:
        if word in sentence:
            return word
    return None

df['Condition'] = df["Title"].apply(check_word)
df.head(50)

Unnamed: 0,Sr_No,Title,Price,Seller Info,Condition
1,1,"100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 - USED CONDITION, FAST SHIP, COA",129.95,"bles-currency (3,041) 99.9%",USED
2,2,"ZIMBABWE 100 TRILLION BANKNOTE 1 NOTE AA/2008, P-91 UNC AUTHENTICITY GUARANTEED!",128.99,"ions (6,303) 99.8%",UNC
3,3,ZIMBABWE 100 TRILLION DOLLARS 2008 AA P-91 BANKNOTE NEW UNC ZIM CURRENCY W/COA,217.17,"nandcard (1,083) 100%",UNC
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA GENUINE SALE FEW LEFT,41.0,"ions (6,303) 99.8%",UNC
5,5,ZIMBABWE $100 TRILLION DOLLARS GOLD BILL BANKNOTE MONEY COLLECTION CERTIFICATE,2.99,"10 (49,126) 99.8%",
6,6,"1 ZIMBABWE 10 TRILLION, AA/2008, P-88, CIRCULATED 100 TRILLION SERIES COA USA",13.99,"ions (6,303) 99.8%",CIRCULATED
7,7,"50 TRILLION DOLLARS ZIMBABWE, 2008 AA COA UNC 100 SERIES UV AUTHENTIC",45.99,"ign (25,851) 100%",UNC
8,8,"✔ 100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 UNCIRCULATED, FAST SHIP, COA",129.95,"bles-currency (3,041) 99.9%",UNC
9,9,"ZIM ZIMBABWE 50 TRILLION AA/2008 UNC 100% COA GENUINE 2 BANKNOTES, BUNDLE, USA",82.01,"ions (6,303) 99.8%",UNC
10,10,ZIM ZIMBABWE 100 TRILLION 1 BANKNOTE NOTE AA 2008 P-91 UNC UV AUTHENTIC COA,124.99,"ign (25,851) 100%",UNC


Unnamed: 0,Sr_No,Title,Price,Seller Info,Condition
1,1,"100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 - USED CONDITION, FAST SHIP, COA",129.95,"bles-currency (3,041) 99.9%",USED
2,2,"ZIMBABWE 100 TRILLION BANKNOTE 1 NOTE AA/2008, P-91 UNC AUTHENTICITY GUARANTEED!",128.99,"ions (6,303) 99.8%",UNC
3,3,ZIMBABWE 100 TRILLION DOLLARS 2008 AA P-91 BANKNOTE NEW UNC ZIM CURRENCY W/COA,217.17,"nandcard (1,083) 100%",UNC
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA GENUINE SALE FEW LEFT,41.0,"ions (6,303) 99.8%",UNC
5,5,ZIMBABWE $100 TRILLION DOLLARS GOLD BILL BANKNOTE MONEY COLLECTION CERTIFICATE,2.99,"10 (49,126) 99.8%",


# Exporting as CSV

In [12]:
df.to_csv(f'{keyword}.csv', index=False)

In [13]:
df.shape

(2361, 5)

1. new/unc/uncirculated
2. used/circulated/damaged

- uppercase title
- trim title - leading and trailing spaces
- zimbabwe 100 trillion
- for 1 banknote