In [231]:
# installing neccesary libraries

# pip install requests_html
# pip install bs4
# pip install pandas
# pip install re
# pip install seaborn
# pip install matplotlib

# Libraries

In [232]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import re

%matplotlib inline

pd.set_option("display.max_colwidth",None)

# Webscraping Code

In [233]:
#creating html session
s = HTMLSession()

#empty list to store the data
data = []
sr_no = -1

#taking input from user to get keyword 
keyword = "Zimbabwe 100 Trillion"
#input('Enter your keyword here: ')

#gathering urls for extracting data from first six pages
urls = ['https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&_ipg=240&_pgn={}&rt=nc'.format(keyword, x) for x in range(1,2)]

for url in urls:
    page = s.get(url.strip())
    soup = BeautifulSoup(page.text, "html.parser")

    # Find all of the elements with the class "s-item__info.clearfix"
    items = soup.find_all(class_ = "s-item__info clearfix")

    # Loop through the items and extract the data and append it to the data list
    for item in items:
        title = item.find(class_="s-item__title").get_text()
        price = item.find(class_="s-item__price").get_text()
        #condition = item.find(class_ = "SECONDARY_INFO").get_text()
        sr_no = sr_no+1
        
        try:
            seller_info = item.find(class_="s-item__seller-info").get_text()
        except:
            seller_info = "N/A"
    
        data.append([sr_no,title, price, seller_info])


# DataFrame

In [234]:
df = pd.DataFrame(data, columns=["Sr_No","Title","Price","Seller Info"])
df.drop(0,axis=0,inplace=True)
df.head(120)

Unnamed: 0,Sr_No,Title,Price,Seller Info
1,1,"Zimbabwe 100 Trillion Banknote 1 Note AA/2008, P-91 UNC Authenticity Guaranteed!",$128.99,"100trillions (7,384) 99.8%"
2,2,"100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 - USED Condition, FAST SHIP, COA",$129.95,"collectibles-currency (3,291) 99.9%"
3,3,Zimbabwe 100 Trillion Dollars 2008 AA P-91 Banknote New UNC Zim Currency w/COA,$217.17,"oceancoinandcard (1,106) 100%"
4,4,Zimbabwe 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA genuine Sale FEW LEFT,$41.99,"100trillions (7,384) 99.8%"
5,5,10X Zimbabwe 100 Trillion Dollar Banknote Non-circulating World Collection Bill,$18.79,"arthehan (2,294) 99.3%"
...,...,...,...,...
116,116,New Listing20Pcs Zimbabwe 100 Trillion Dollar Banknote Non-circulating Collection Gift,$13.99,dainini (170) 96.2%
117,117,10× $100 One Hundred Trillion Dollar Zimbabwe Silver Blue Banknote Set /w Roc WR,$16.99,"usa-videogameparts (14,282) 98.6%"
118,118,New Listing10Pcs Zimbabwe $100 Trillion Dollars Silver Banknote Novelty Non-Currency US,$15.88,dainini (170) 96.2%
119,119,TT PK 91 2007 ZIMBABWE 100 TRILLION AND 1 DOLLAR SET OF 2 EXTREMES PMG 67 EPQ,$87.00,"trustedtraditions.com (37,840) 100%"


# Data Preparation

**Converting to uppercase**

df["Title"] = df["Title"].str.upper()
df["List of words"] = df["Title"].str.split(' ')
df.head()

In [235]:
df["Title"] = df["Title"].str.upper()
df.dtypes

Sr_No           int64
Title          object
Price          object
Seller Info    object
dtype: object

**Removing the rows that had prices in Range format**

In [236]:
mask = df['Price'].str.len() > 11

print(mask.value_counts())

df = df.drop(index=df[mask].index)

False    240
Name: Price, dtype: int64


**Converting the datatype of price from string to numeric**

In [237]:
df['Price'] = df['Price'].str.replace('$', '')
df['Price'] = df['Price'].str.replace(',', '')
df['Price'] = pd.to_numeric(df["Price"])

  df['Price'] = df['Price'].str.replace('$', '')


**Sorting the data in descending order**

In [238]:
df_desc = df.sort_values("Price",ascending=False)

**Data between price range of 50 and 300**

In [239]:
df2 = df[(df['Price'] >= 50) & (df['Price'] <= 300)]

In [240]:
df2_sorted = df2.sort_values("Price",ascending=False)

In [241]:
df2.shape

(81, 4)

**Common words**

In [242]:
#pd.options.display.max_rows = None
common = pd.Series(' '.join(df['Title']).split()).value_counts()
common[0:5]

ZIMBABWE    227
TRILLION    202
100         200
2008         97
DOLLARS      94
dtype: int64

In [243]:
# trim title - leading and trailing spaces
df['Title'] = df['Title'].str.strip()

**Remove unnecesary special characters**

In [244]:
def clean_sentence(sentence):
    return re.sub(r'[^a-zA-Z0-9\s\%]+', '', sentence)

df['Title'] = df['Title'].apply(clean_sentence)

**Condition Filter**

In [245]:
# making a new column for condition of the prod

condition_filter = ["UNC","CIRCULATED","UNCIRCULATED","NEW","DAMAGED","USED","NON-CIRCULATING"]

def check_word(sentence):
    for word in condition_filter:
        if word in sentence:
            return word
    return None

df['Condition'] = df["Title"].apply(check_word)
df.head()

Unnamed: 0,Sr_No,Title,Price,Seller Info,Condition
1,1,ZIMBABWE 100 TRILLION BANKNOTE 1 NOTE AA2008 P91 UNC AUTHENTICITY GUARANTEED,128.99,"100trillions (7,384) 99.8%",UNC
2,2,100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 USED CONDITION FAST SHIP COA,129.95,"collectibles-currency (3,291) 99.9%",USED
3,3,ZIMBABWE 100 TRILLION DOLLARS 2008 AA P91 BANKNOTE NEW UNC ZIM CURRENCY WCOA,217.17,"oceancoinandcard (1,106) 100%",UNC
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA2008 UNC P90 100% COA GENUINE SALE FEW LEFT,41.99,"100trillions (7,384) 99.8%",UNC
5,5,10X ZIMBABWE 100 TRILLION DOLLAR BANKNOTE NONCIRCULATING WORLD COLLECTION BILL,18.79,"arthehan (2,294) 99.3%",


In [246]:
combinations= ["NON CURRENCY","X","GIFT","10X","10 TRILLION","20 TRILLION","50 TRILLION","PMG","PCGS","RADAR", "ERROR","MISSING",
               "MILLION","BILLIONS","BILLION","MILLIONS","NONCURRENCY","SET","PIECES","LOT","CONSECUTIVE","REPLACEMENT",
               "BUNDLE","BRICK","FOIL","NON-CURRENCY","PCS","SHOP ON EBAY","SOUVENIR","SLIVER","GOLD"]

def check_combinationsNA(sentence):
    for word in combinations:
        if word in sentence:
            return False
    return True

df_copy = df[df["Title"].apply(check_combinationsNA)]

#because a single 100 trillion banknote can range between 50 to 700
df_copy = df_copy[(df["Price"] >= 50) & (df["Price"] <= 700)]

  df_copy = df_copy[(df["Price"] >= 50) & (df["Price"] <= 700)]


In [247]:
df = df_copy.sort_values("Price",ascending=False)

# Exporting as CSV

In [248]:
df.to_excel(f'{"Zimbabwe 100 Trillion"}.xlsx', sheet_name="100T", index=False)

In [249]:
#df.to_csv("Zimbabwe 100 Trillion",index=False)

In [250]:
df

Unnamed: 0,Sr_No,Title,Price,Seller Info,Condition
83,83,SEQUENTIAL 2008 P91 100 TRILLION DOLLAR ZIMBABWE BANKNOTES,250.0,jamu_92 (22) 87.5%,
3,3,ZIMBABWE 100 TRILLION DOLLARS 2008 AA P91 BANKNOTE NEW UNC ZIM CURRENCY WCOA,217.17,"oceancoinandcard (1,106) 100%",UNC
101,101,AUTHENTIC 100 TRILLION ZIM BOND DOLLARS 2008 ZIMBABWE 2008 AA UNC WITH COA,188.95,"banknotecorner (2,070) 100%",UNC
55,55,100 TRILLION DOLLAR AUTHENTIC ZIM ZIMBABWE 2008 AA BOND COA CERTIFICATE UV PASS,188.95,"banknotecorner (2,070) 100%",
139,139,UNC 2008 100 TRILLION DOLLARS ZIMBABWE BANKNOTE P91 LARGEST DENOM NOTE CURRENCY,138.95,"mbarrcoins (441,366) 100%",UNC
64,64,2008 100 TRILLION DOLLARS ZIMBABWE BANKNOTE AA P91 GEM UNC NOTE CURRENCY,138.9,"mbarrcoins (441,366) 100%",UNC
7,7,ZIMBABWE 100 TRILLION DOLLAR BILL AA2008 UNCIRCULATED 100% COA GENUINE,129.99,"currencyadventure (4,896) 100%",UNC
222,222,ZIMBABWE 100 TRILLION BANKNOTE 1 NOTE AA2008 P91 UNC AUTHENTICITY GUARANTEED,129.95,"rigiddesign (26,075) 100%",UNC
22,22,100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 UNCIRCULATED FAST SHIP COA,129.95,"collectibles-currency (3,291) 99.9%",UNC
2,2,100 TRILLION DOLLAR ZIMBABWE AA 2008 SERIES P91 USED CONDITION FAST SHIP COA,129.95,"collectibles-currency (3,291) 99.9%",USED


1. new/unc/uncirculated
2. used/circulated/damaged

- uppercase title
- trim title - leading and trailing spaces
- zimbabwe 100 trillion
- for 1 banknote