In [89]:
# installing neccesary libraries

# pip install requests_html
# pip install bs4
# pip install pandas
# pip install re
# pip install seaborn
# pip install matplotlib

# Libraries

In [90]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import re

%matplotlib inline

pd.set_option("display.max_colwidth",None)

# Webscraping Code

In [91]:
#creating html session
s = HTMLSession()

#empty list to store the data
data = []
sr_no = -1

#taking input from user to get keyword 
keyword = "Zimbabwe 50 Trillion"
#input('Enter your keyword here: ')

#gathering urls for extracting data from first six pages
urls = ['https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&_ipg=240&_pgn={}&rt=nc'.format(keyword, x) for x in range(1,11)]

for url in urls:
    
    page = s.get(url.strip())
    soup = BeautifulSoup(page.text, "html.parser")

    # Find all of the elements with the class "s-item__info.clearfix"
    items = soup.find_all(class_ = "s-item__info clearfix")

    # Loop through the items and extract the data and append it to the data list
    for item in items:
            
        title = item.find(class_="s-item__title").get_text()
        price = item.find(class_="s-item__price").get_text()
        #condition = item.find(class_ = "SECONDARY_INFO").get_text()
        sr_no = sr_no+1
        
        try:
            seller_info = item.find(class_ = "s-item__seller-info").get_text()
        except:
            seller_info = "N/A"
        
        data.append([sr_no,title, price, seller_info])

# DataFrame

In [92]:
df = pd.DataFrame(data, columns=["Sr_No","Title","Price","Seller Info"])
df.drop(0,axis=0,inplace=True)
df.head()

Unnamed: 0,Sr_No,Title,Price,Seller Info
1,1,Zimbabwe 50 TRILLION DOLLAR BILL AA/2008 uncirculated 100% COA genuine 1 bill,$43.00,"100trillions (7,384) 99.8%"
2,2,"2008 50 TRILLION DOLLARS ZIM ZIMBABWE AA SERIES P90 CIRCULATED, FAST SHIP",$49.55,"collectibles-currency (3,291) 99.9%"
3,3,"ZIM Zimbabwe 50 TRILLION AA/2008 UNC 100% COA genuine 2 BANKNOTES, bundle, USA",$80.00,"100trillions (7,384) 99.8%"
4,4,Zimbabwe 50 TRILLION DOLLAR BILL AA/2008 uncirculated 100% COA genuine 10 bills,$400.00,"100trillions (7,384) 99.8%"
5,5,Zimbabwe 50 TRILLION DOLLAR BILL AA/2008 UNC P-90 100% COA genuine Sale FEW LEFT,$41.99,"100trillions (7,384) 99.8%"


# Data Preparation

**converting to uppercase**

df["Title"] = df["Title"].str.upper()
df["List of words"] = df["Title"].str.split(' ')
df.head()

In [93]:
df["Title"] = df["Title"].str.upper()
df.dtypes

Sr_No           int64
Title          object
Price          object
Seller Info    object
dtype: object

**Removing the rows that had prices in Range format**

In [94]:
mask = df['Price'].str.len() > 11

print(mask.value_counts())

df = df.drop(index=df[mask].index)

False    1821
True       12
Name: Price, dtype: int64


**Converting the datatype of price from string to numeric**

In [95]:
df['Price'] = df['Price'].str.replace('$', '')
df['Price'] = df['Price'].str.replace(',', '')
df['Price'] = pd.to_numeric(df["Price"])

  df['Price'] = df['Price'].str.replace('$', '')


**Sorting the data in descending order**

In [96]:
df_desc = df.sort_values("Price",ascending=False)

**data between price range of 50 and 300**

In [97]:
df2 = df[(df['Price'] >= 50) & (df['Price'] <= 300)]

In [98]:
df2_sorted = df2.sort_values("Price",ascending=False)

In [99]:
df2.shape

(1062, 4)

**Common words**

In [100]:
#pd.options.display.max_rows = None
common = pd.Series(' '.join(df['Title']).split()).value_counts()
common[0:5]

ZIMBABWE    1727
50          1657
TRILLION    1410
DOLLAR       822
UNC          782
dtype: int64

In [101]:
# trim title - leading and trailing spaces
df['Title'] = df['Title'].str.strip()

**Remove unnecesary special characters**

In [102]:
def clean_sentence(sentence):
    return re.sub(r'[^a-zA-Z0-9\s\%]+', '', sentence)

df['Title'] = df['Title'].apply(clean_sentence)

**Condition Filter**

In [103]:
# making a new column for condition of the prod

condition_filter = ["UNC","CIRCULATED","UNCIRCULATED","NEW","DAMAGED","USED","NON-CIRCULATING"]

def check_word(sentence):
    for word in condition_filter:
        if word in sentence:
            return word
    return None

df['Condition'] = df["Title"].apply(check_word)
df.head()

Unnamed: 0,Sr_No,Title,Price,Seller Info,Condition
1,1,ZIMBABWE 50 TRILLION DOLLAR BILL AA2008 UNCIRCULATED 100% COA GENUINE 1 BILL,43.0,"100trillions (7,384) 99.8%",UNC
2,2,2008 50 TRILLION DOLLARS ZIM ZIMBABWE AA SERIES P90 CIRCULATED FAST SHIP,49.55,"collectibles-currency (3,291) 99.9%",CIRCULATED
3,3,ZIM ZIMBABWE 50 TRILLION AA2008 UNC 100% COA GENUINE 2 BANKNOTES BUNDLE USA,80.0,"100trillions (7,384) 99.8%",UNC
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA2008 UNCIRCULATED 100% COA GENUINE 10 BILLS,400.0,"100trillions (7,384) 99.8%",UNC
5,5,ZIMBABWE 50 TRILLION DOLLAR BILL AA2008 UNC P90 100% COA GENUINE SALE FEW LEFT,41.99,"100trillions (7,384) 99.8%",UNC


In [104]:
combinations= ["NON CURRENCY","X","GIFT","10X","10 TRILLION","20 TRILLION","100 TRILLION","PMG","PCGS","RADAR", "ERROR","MISSING",
               "MILLION","BILLIONS","BILLION","MILLIONS","NONCURRENCY","SET","PIECES","LOT","CONSECUTIVE", "REPLACEMENT",
               "BUNDLE","BRICK","FOIL","NON-CURRENCY","PCS","SHOP ON EBAY","SOUVENIR","SLIVER","GOLD","ONE HUNDRED TRILLION","100 HUNDRED TRILLION"]

def check_combinationsNA(sentence):
    for word in combinations:
        if word in sentence:
            return False
    return True

df_copy = df[df["Title"].apply(check_combinationsNA)]

#because a single 50 TRILLION bank note exist between the price range of 50 and 700
df_copy = df_copy[(df["Price"] >= 50) & (df["Price"] <= 700)]


  df_copy = df_copy[(df["Price"] >= 50) & (df["Price"] <= 700)]


In [105]:
df = df_copy.sort_values("Price",ascending=False)

# Exporting as CSV

In [106]:
#df.to_csv(f'{"Zimbabwe 50 Trillion"}.csv', index=False)

In [107]:
df.to_excel(f'{"Zimbabwe 50 Trillion"}.xlsx', sheet_name="50T", index=False)

In [108]:
df

Unnamed: 0,Sr_No,Title,Price,Seller Info,Condition
705,705,ZIMBABWE 2008 50 TRILLION DOLLAR 8 BANKNOTES AA UNC TOTAL 400 TRILLION ZIM,549.95,vegibud038m (850) 100%,UNC
821,821,SAME END SERIAL NUMBER ZIMBABWE 20 50 TRILLION DOLLARS 2008 UNC 100% VINTAGE Z,435.08,chubbycatcoins-tmnpre89 (474) 99%,UNC
4,4,ZIMBABWE 50 TRILLION DOLLAR BILL AA2008 UNCIRCULATED 100% COA GENUINE 10 BILLS,400.00,"100trillions (7,384) 99.8%",UNC
677,677,ZIMBABWE 50 TRILLION DOLLARS BANKNOTES,399.00,jacktram (473) 100%,
262,262,10 NOTE ZIMBABWE 50 TRILLION DOLLARS 2008 P88 AA 100 % AUTHENTIC UV UNC COA,389.95,"j.expressions (2,160) 100%",UNC
...,...,...,...,...,...
965,965,50 TRILLION DOLLARS ZIMBABWE BANKNOTE,53.26,"world_paper_money_emporium (6,321) 99.6%",
1546,1546,50 TRILLION DOLLARS ZIMBABWE BANKNOTE,53.26,"world_paper_money_emporium (6,321) 99.6%",
533,533,50 TRILLION DOLLARS ZIMBABWE 2008 AA COA UNC AUTHENTIC BANKNOTE,51.12,mr.coincanada (529) 100%,UNC
81,81,2008 50 TRILLION DOLLARS ZIMBABWE BANKNOTE AA 100 SERIES,50.00,2urgse (594) 100%,


1. new/unc/uncirculated
2. used/circulated/damaged

- uppercase title
- trim title - leading and trailing spaces
- zimbabwe 100 trillion
- for 1 banknote