## Scrape, Clean, Alert on Roman Coin Sales 

This script scrapes a user-determined number of pages from ebay, cleans the data and extracts features, saves the data at two intervals in two csv's (for later error testing and model refitting),
requests a max-price from the user (budget for coin price) from the user, then uses a pickled random forest regressor model (fitted in the 'Roman Coin Model Building' script) to predict price and emails the user a list of 5 coin URLs with the largest positive difference between predicted/current price.

*NOTE*: This unified script is an amalgamation of my previous cleaning/eda/webscraping scripts, that show more of my logic in construction.

In [1]:
import datetime
from ebaysdk.finding import Connection as Finding
from ebaysdk.exception import ConnectionError
from tqdm import tqdm 
import numpy as np
import pandas as pd
import requests
import re
import smtplib
import pickle
from email_pw import pw

In [2]:
#must be executed from same directory where 
#the ebay.yaml file is located for config_file (or change the path)

solutions = []

num_pages = input('How many pages would you like to scrape? Enter an integer between 1 and 100: ')

for page in tqdm(range(1, int(num_pages))):
    try:
        api = Finding(config_file='./ebay.yaml')
        response = api.execute('findItemsAdvanced', {'keywords': 'Roman Coin',
                                 'paginationInput': {'pageNumber': page}})
        coin = response.dict()
        solutions.append(coin)

    except ConnectionError as e:
        print(e)

How many pages would you like to scrape? Enter an integer between 1 and 100: 15


100%|██████████| 14/14 [00:21<00:00,  1.56s/it]


In [3]:
#The data comes buried within dictionaries, at varying levels, 
#the following 4 cells are just extracting it all and then merging into 
#a unified dataframe. 

df = pd.DataFrame(solutions)

new = []
for item in df['searchResult']:
    new.append(item)

df2 = pd.DataFrame(new)

In [4]:
coins = []
for item in df2['item']:
    for listing in item:
        coins.append(listing)

roman = pd.DataFrame(coins)

In [5]:
primary_cat = []
for val in roman['primaryCategory']:
    primary_cat.append(val)
    
primary = pd.DataFrame(primary_cat)

In [6]:
#Extracting features from dictionaries...
prices = []

for price in roman['sellingStatus']:
    prices.append(price)
    
price_df = pd.DataFrame(prices)
price_df['currencyid_conv'] = [val['_currencyId'] for 
                               val in price_df['convertedCurrentPrice']]

price_df['value_conv'] = [val['value'] for 
                               val in price_df['convertedCurrentPrice']]

price_df['current_price'] = [val['value'] for 
                               val in price_df['currentPrice']]

price_df['current_currency'] = [val['_currencyId'] for 
                               val in price_df['currentPrice']]

In [7]:
#these are now redundant
price_df = price_df.drop(['convertedCurrentPrice', 'currentPrice'], axis=1)

In [8]:
#merging
roman_coins = pd.concat([roman, primary, price_df], axis=1)

In [9]:
#saving the raw data, don't mind the inefficient name for my directory..
file_name = './old cleaned:used datasets /uncleaned_data_' + re.sub('-', '_', str(datetime.datetime.now())[5:10])
roman_coins.to_csv(file_name)

In [10]:
#reloading like this as a convenience measure to auto-change some data types
roman_coins = pd.read_csv(file_name, index_col=0)

In [11]:
#listingInfo has lots of useful information and I want to add these variables separately
listings = roman_coins['listingInfo'].map(eval).apply(pd.Series)
#merging the new columns onto the dataframe
roman_coins = pd.concat([roman_coins, listings], axis=1)
#listings column now redundant
roman_coins = roman_coins.drop('listingInfo', axis=1)

In [12]:
#PayPal is dominant category, so I am just swapping the 1/0
roman_coins['paymentMethod'] = [0 if x == 'PayPal' else 1 for 
                                x  in roman_coins['paymentMethod']]

In [13]:
#again swapping to have 1 be the interesting feature in dummy variable
roman_coins['returnsNotAccepted'] = roman_coins['returnsAccepted'].astype(int) * -1 + 1

In [14]:
shipping = roman_coins['shippingInfo'].map(eval).apply(pd.Series)

In [15]:
#Extracting shipping information from dictionaries and merging
ship_cost_dicts = []
shipping_currency = []

for val in shipping['shippingServiceCost']:
    try:
        currency = val['_currencyId']
        cost = val['value']
        ship_cost_dicts.append(cost)
        shipping_currency.append(currency)
    
    except:
        
        ship_cost_dicts.append(np.nan)
        shipping_currency.append(np.nan)

shipping['shipping_cost'] = ship_cost_dicts
shipping['shipping_cost_currency'] = shipping_currency

#filling missing values (usually not many) with mean values
shipping['handlingTime'] = shipping['handlingTime'].astype(float)
shipping['handlingTime'] = shipping['handlingTime'].fillna(
                                    shipping['handlingTime'].mean())

mapper = {'false' : 0, 'true': 1}
shipping['expeditedShipping'] = shipping['expeditedShipping'].map(mapper)

In [16]:
roman_coins = pd.concat([roman_coins, shipping], axis=1)

In [17]:
#converting some features 
roman_coins['topRatedListing'] = roman_coins['topRatedListing'].astype(int)

roman_coins['URLTitle'] = [val[24:-13] for val in roman_coins['viewItemURL']]

roman_coins['autoPay'] = roman_coins['autoPay'].astype(int)

roman_coins['categoryId'] = roman_coins['categoryId'].astype(object)

In [18]:
#making datetimes
roman_coins['startTime'] = pd.to_datetime(roman_coins['startTime'])
roman_coins['endTime'] = pd.to_datetime(roman_coins['endTime'])

In [19]:
#this step cleans time left and makes it a timedelta
#The except shouldn't be an issue, but everyonce in a while, a coin finishes
#during scrape and is marked completed. 

times = []
try:
    for val in roman_coins['timeLeft']:
        x = re.sub(r'[HM]+', ':', val)
        time = re.sub(r'[SPT]', '', x)
        new_time = pd.Timedelta(time)
        times.append(new_time)
except KeyError:
    print('You are running completed sales, this step unnecessary')

roman_coins['timeLeft'] = times

In [20]:
roman_coins['watchCount'] = roman_coins['watchCount'].astype(float)
roman_coins['shipping_cost'] = roman_coins['shipping_cost'].astype(float)

In [21]:
#If pulling smaller amounts of coins (less than 3 or 4 pages), there is a small probability 
#of not having 'BuyItNowPrices', account for this with try/except statements. 
try: 
    buyitnow = []
    for val in roman_coins['buyItNowPrice']:
        try:
            price = val['value']
            buyitnow.append(price)
        except:
            buyitnow.append(np.nan)
    
    roman_coins['buyItNowPrice'] = buyitnow
    roman_coins['buyItNowPrice'] = roman_coins['buyItNowPrice'].astype(float)

    roman_coins['buyItNowPrice'] = buyitnow
    roman_coins['buyItNowPrice'] = roman_coins['buyItNowPrice'].astype(float)

except:
    print('Warning: no BuyItNowPrice, inputting 0')
    roman_coins['buyItNowPrice'] = 0

In [22]:
mapper = {'false' : 0, 'true': 1}
roman_coins['bestOfferEnabled'] = roman_coins['bestOfferEnabled'].map(mapper)
roman_coins['buyItNowAvailable'] = roman_coins['buyItNowAvailable'].map(mapper)

In [23]:
#eliminating duplicate ItemIds (keeping first (most recent, unlikely to be an issue))
roman_coins = roman_coins.drop(list(roman_coins[roman_coins.duplicated(
                            subset='itemId', keep='first')].index), axis = 0)

In [24]:
#turning timeLeft into an float variable representing number of hours remaining
roman_coins['timeLeft'] = roman_coins['timeLeft'] / pd.Timedelta(hours=1)


#turning hours into categorical variables
roman_coins['timeLeft'] = pd.cut(roman_coins['timeLeft'], 
               [0, 1, 6, 24, 48, 168, roman_coins['timeLeft'].max()], 
                labels=['less_than_hour_left', 'less_than_6hours_left', 
                  'less_than_day_left', 'less_than_2day_left', 'less_than_week_left',
                  'more_than_week_left'])

roman_coins['timeLeft'] = roman_coins['timeLeft'].astype(object).fillna('completed_no_time_left')

roman_coins = pd.concat([roman_coins, pd.get_dummies(roman_coins['timeLeft']).drop('more_than_week_left', 
                                                        axis=1)], axis=1)
roman_coins = roman_coins.drop('timeLeft', axis=1)

In [25]:
#dropping roman coin jewelry categories

roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 137843].index), axis=0)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 164343].index), axis=0)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 548].index), axis=0)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 164341].index), axis=0)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 45157].index), axis=0)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 29223].index), axis=0)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['categoryId'] == 13993].index), axis=0)

#dropping additional categories likely not to be coins
drop_cols = roman_coins['categoryId'].value_counts().reset_index()
val_to_drop = list(drop_cols.loc[drop_cols['categoryId'] < 12]['index'])
roman_coins = roman_coins.drop(roman_coins[roman_coins['categoryId'].isin(val_to_drop)].index, axis=0)

In [26]:
#dropping sales not in USD
roman_coins = roman_coins.drop(list(roman_coins[roman_coins['current_currency'] != 'USD'].index), axis=0)

In [27]:
roman_coins['seller_in_us'] = roman_coins['country'].apply(lambda x: 1 if x == 'US' else 0)

In [28]:
roman_coins['startTime'] = pd.to_datetime(roman_coins['startTime'])
roman_coins['endTime'] = pd.to_datetime(roman_coins['endTime'])

In [29]:
roman_coins['shippingType'] = roman_coins['shippingType'].apply(lambda x: 'International' if x in ['CalculatedDomesticFlatInternational',
                                                                                'FlatDomesticCalculatedInternational'] else x)

In [30]:
#pulling individual emperors from titles

#splitting and rejoining URLtitles
roman_coins['URLTitle'] = roman_coins['URLTitle'].str.split('-').apply(lambda x: " ".join(x))
roman_coins['URLTitle'] = roman_coins['URLTitle'].str.lower()

roman_emperors = ['Augustus', 'Tiberius', 'Caligula', 'Claudius', 'Nero', 'Galba', 'Otho', 'Aulus Vitellius', 'Vespasian', 'Titus', 
                  'Domitian', 'Nerva', 'Trajan', 'Hadrian', 'Antoninus Pius', 'Marcus Aurelius', 'Lucius Verus', 'Commodus', 'Publius Helvius Pertinax', 
                  'Marcus Didius Severus Julianus', 'Septimius Severus', 'Caracalla', 'Publius Septimius Geta', 'Macrinus', 'Elagabalus', 'Severus Alexander', 
                  'Maximinus', 'Gordian I', 'Gordian II', 'Pupienus Maximus', 'Balbinus', 'Gordian III', 'Philip', 'Decius', 'Hostilian', 'Gallus', 'Aemilian', 'Valerian', 'Gallienus', 
                  'Claudius II Gothicus', 'Quintillus', 'Aurelian', 'Tacitus', 'Florian', 'Probus', 'Carus', 'Numerian', 'Carinus', 'Diocletian', 'Maximian', 'Constantius I',
                  'Galerius',  'Constantine I ', 'Galerius Valerius Maximinus', 'Licinius', 'Constantine II ', 'Constantius II', 'Constans I', 'Gallus Caesar', 'Julian', 'Jovian', 
                  'Valentinian I', 'Valens', 'Gratian', 'Valentinian II', 'Theodosius I', 'Arcadius', 'Magnus Maximus', 'Honorius', 'Theodosius II', 'Constantius III', 'Valentinian III',
                  'Marcian', 'Petronius Maximus', 'Avitus', 'Majorian', 'Anthemius', 'Olybrius', 'Glycerius', 'Julius Nepos', 'Romulus Augustulus', 'Leo I', 'Leo II', 'Zeno']

roman_emperors = [x.lower() for x in roman_emperors]

emperors = []

for title in roman_coins['URLTitle']:
    if 'lot' in title: 
        emperors.append('LOT')
    else:
        emps = re.findall(r"(?=("+'|'.join(roman_emperors)+r"))",title)
        if emps:
            emperors.append(emps)
        else: 
            emperors.append('unknown')

roman_coins['emperor_in_title'] = emperors

In [31]:
#Eliminating additional non-coin sales
posters = []

for num, val in enumerate(roman_coins['URLTitle']):
    if 'poster' in val:
        posters.append(num)

roman_coins = roman_coins.drop(posters, axis=0).reset_index(drop=True)

In [32]:
#Identifying whether there are one or more emperors in the title (i.e. is seller selling two or more coins)
#creating separate columns for dummy variable creation later
singles = []
for num, val in enumerate(roman_coins['emperor_in_title']):
    if val != 'unknown':
        if len(val) > 1:
            singles.append('multiple_emps_in_title')
        elif len(val) == 1:
            singles.append(val[0])
    else:
        singles.append('unknown')
            
roman_coins['emp_prep_dummies'] = singles

In [33]:
def emperor_cleaner(col):
    """eliminating duplicate emperor names within same listing, making them strings"""   
    if col == 'unknown' or col == 'LOT':
        return col
    else:
        if len(col) > 1:
            emp = list(set(col))
            clean = ' '.join(emp)
            return clean
        elif len(col) == 1:
            emp = col[0]
            return emp

roman_coins['emperor_in_title'] = roman_coins['emperor_in_title'].apply(emperor_cleaner)

In [34]:
#Extracting coin material from titles 
metals = ['gold', 'silver', 'bronze', 'orichalcum', 'copper']

metal = []
for title in roman_coins['URLTitle']:
    coin = re.findall(r"(?=("+'|'.join(metals)+r"))",title)
    if coin:
        metal.append(coin)
    else: 
        metal.append(['unknown'])
        
roman_coins['coin_metal'] = metal

In [35]:
def metal_cleaner(col):
    """Removes metal values from lists"""
    try:
        multi = ' '.join(col)
        return multi
    except:
        single = col[0]
        return single

roman_coins['coin_metal'] = roman_coins['coin_metal'].apply(metal_cleaner)

In [36]:
#checking one last time to eliminate non-coin sales
jewelery = []

for num, val in enumerate(roman_coins['URLTitle']):
    if 'earring' in val:
        jewelery.append(num)
    elif 'pendant' in val:
        jewelery.append(num)
    elif 'bracelet'in val:
        jewelery.append(num)
    elif 'necklace' in val:
        jewelery.append(num)

roman_coins = roman_coins.drop(jewelery, axis=0).reset_index(drop=True)

In [37]:
#Adding dummies for gold and silver
roman_coins['gold'] = roman_coins['coin_metal'].apply(lambda x: 1 if x == 'gold' else 0)
roman_coins['silver'] = roman_coins['coin_metal'].apply(lambda x: 1 if x == 'silver' else 0 )

In [38]:
#Creating dummy variable for lots (i.e. more than 1 coin for sale)
roman_coins['is_a_lot'] = roman_coins['emperor_in_title'].apply(lambda x: 1 if x == 'LOT' else 0)

In [39]:
#uncleaned is a common indicator of low quality, creating dummy

uncleaned = []
for val in roman_coins['URLTitle']:
    has = re.search('uncleaned', val)
    if has:
        uncleaned.append(1)
    else:
        uncleaned.append(0)

roman_coins['uncleaned'] = uncleaned

In [40]:
#Creating total sale time feature, putting it into days
roman_coins['total_sale_time'] = roman_coins['endTime'] - roman_coins['startTime']
roman_coins['total_sale_time'] = roman_coins.total_sale_time.dt.days

#dropping lots that have negative total time values (likely put up and immediately 
#taken down)
roman_coins = roman_coins.drop(list(roman_coins.loc[roman_coins['total_sale_time'] < 0].index), axis=0)


#Creating categorical bins for duration
roman_coins['total_sale_time'] = pd.cut(roman_coins['total_sale_time'], 
           [0, 7, 30, 365, roman_coins['total_sale_time'].max()], 
          labels=['Less_than_week', 'Less_than_month', 'less_than_year', 'more_than_year'])

roman_coins = pd.concat([roman_coins, pd.get_dummies(roman_coins['total_sale_time']).drop('less_than_year', 
                                                               axis=1)], axis=1 )
roman_coins = roman_coins.drop('total_sale_time', axis=1)

In [41]:
#Queens (zip: 11374) seems to house a professional high-volume seller, creating dummy
#41% of volume eminates from this zip
roman_coins['Queens_seller'] = roman_coins['postalCode'].apply(lambda x: 1 if x == '11374' else 0)

In [42]:
#creating dummies based on 'quality' indications in the title
def low_quality(title):
    word = re.findall(r'low|lower', title)
    if word:
        return 1
    else:
        return 0

def high_quality(title):
    word = re.findall(r'high|higher|premium|rare', title)
    if word:
        return 1
    else:
        return 0
    
roman_coins['high_quality_words'] = roman_coins['URLTitle'].apply(high_quality)
roman_coins['low_quality_words'] = roman_coins['URLTitle'].apply(low_quality)

In [43]:
#turning shipping type into dummy variables
roman_coins = pd.concat([roman_coins, pd.get_dummies(roman_coins['shippingType'])], axis=1)


In [44]:
#getting listing type dummies, 
roman_coins = pd.concat([roman_coins, pd.get_dummies(roman_coins['listingType'])], axis=1)

#dropping start/endTime
roman_coins = roman_coins.drop(['endTime', 'startTime'], axis=1)

#dropping categoryId
roman_coins = roman_coins.drop('categoryId', axis=1)

In [45]:
#creating two more dummies for coin metals before dropping
roman_coins['bronze'] = roman_coins['coin_metal'].apply(lambda x: 1 if x == 'bronze' else 0)
roman_coins['copper'] = roman_coins['coin_metal'].apply(lambda x: 1 if x == 'copper' else 0)
roman_coins = roman_coins.drop('coin_metal', axis=1)

In [46]:
#filling NaNs with zeros for numeric cats
roman_coins['bidCount'] = roman_coins['bidCount'].fillna(0)
roman_coins['watchCount'] = roman_coins['watchCount'].fillna(0)

In [47]:
#adding the emperor dummy variables and concatting to main df
roman_coins = pd.concat([roman_coins, pd.get_dummies(roman_coins['emp_prep_dummies']).drop(
    'unknown', axis=1)], axis=1)

roman_coins = roman_coins.drop('emp_prep_dummies', axis=1)
roman_coins = roman_coins.drop('emperor_in_title', axis=1)

In [48]:
#saving data to refit model later
file_name = './old cleaned:used datasets /to_fit_to_model_later_' + re.sub('-', '_', str(datetime.datetime.now())[5:10])
roman_coins.to_csv(file_name)

In [49]:
#Columns from the model
model_cols= ['autoPay', 'bestOfferEnabled', 'bidCount', 'buyItNowAvailable',
       'expeditedShipping', 'handlingTime', 'paymentMethod',
       'returnsNotAccepted', 'topRatedListing', 'watchCount',
       'less_than_6hours_left', 'less_than_day_left', 'seller_in_us', 'gold',
       'silver', 'is_a_lot', 'uncleaned', 'Less_than_week', 'Less_than_month',
       'more_than_year', 'Queens_seller', 'high_quality_words',
       'low_quality_words', 'Calculated', 'Flat', 'Free', 'Auction',
       'FixedPrice', 'bronze', 'copper', 'arcadius', 'augustus', 'caligula',
       'caracalla', 'commodus', 'constantine i ', 'constantine ii ',
       'constantius i', 'diocletian', 'domitian', 'elagabalus', 'florian',
       'galerius', 'gallus', 'gratian', 'hadrian', 'honorius', 'hostilian',
       'julian', 'lucius verus', 'macrinus', 'magnus maximus', 'marcian',
       'marcus aurelius', 'maximian', 'maximinus', 'multiple_emps_in_title',
       'nero', 'nerva', 'otho', 'philip', 'probus', 'quintillus',
       'septimius severus', 'severus alexander', 'tiberius', 'titus', 'trajan',
       'valens', 'valentinian i', 'vespasian', 'zeno']

In [50]:
#dropping outliers and coins priced below 3 dollars (not part of market of interest)
roman_coins = roman_coins.drop(list(roman_coins.loc[(roman_coins['current_price'] < 3)| 
                          (roman_coins['current_price']> 1000)].index), axis=0).reset_index(drop=True)


In [51]:
#saving useful columns for later
add_later = roman_coins[['URLTitle', 'itemId', 'viewItemURL', 'current_price']]
roman_coins = roman_coins.drop(['URLTitle', 'itemId', 'viewItemURL', 'current_price'], axis=1)

#dropping columns not present in model
for col in roman_coins.columns:
    if col not in model_cols:
        roman_coins.drop(col, axis=1, inplace=True)

In [52]:
#Introducing the model
with open('final_model.pickle', 'rb') as file_handle:
         random_forest = pickle.load(file_handle)

In [55]:
#infrequently an Emperor is not present in sample, need to add 0 values for these. 
missing_cols = []
for val in model_cols:
    if val not in roman_coins.columns:
        print("This column was not present in the dataset, filling with 0:", val)
        missing_cols.append(val)
        
if missing_cols:
    for val in missing_cols:
        roman_coins[val] = 0

This column was not present in the dataset, filling with 0: current_price
This column was not present in the dataset, filling with 0: caligula
This column was not present in the dataset, filling with 0: marcian
This column was not present in the dataset, filling with 0: otho


In [56]:
X = roman_coins.values

In [57]:
predictions = random_forest.predict(X)

In [58]:
add_later['predictions'] = np.exp(predictions)

In [59]:
add_later['difference'] = add_later['predictions'] - add_later['current_price']

In [60]:
#Setting a budget- i.e. max current price

budget = int(input('What is your maximum spending amount? (give as integer USD)'))
add_later = add_later.loc[add_later['current_price'] < budget]

What is your maximum spending amount? (give as integer USD)180


In [61]:
#5 largest deviations to send
TEXT = ''

for val in add_later.sort_values(by='difference', ascending=False)['viewItemURL'].head().values:
    TEXT += val + '\n' +'\n'
    
SUBJECT = 'Roman Coins below normal price'
message = 'Subject: {}\n\n{}'.format(SUBJECT, TEXT)

In [62]:
#Sending top 5 links to user defined address

server = smtplib.SMTP('smtp.mail.yahoo.com', 587)
server.starttls()
server.login('roman_coins_swl@yahoo.com', pw)
 
email_to_send = input('What email address would you like to recieve the coins at?')    
    
server.sendmail('roman_coins_swl@yahoo.com', email_to_send, message)
server.quit()

What email address would you like to recieve the coins at?slevin886@gmail.com


(221, b'Service Closing transmission')