In [None]:
import ast
import datetime
import re

import numpy as np
import pandas as pd
import requests
from ebaysdk.exception import ConnectionError
from ebaysdk.finding import Connection as Finding
from tqdm import tqdm

In [None]:
# Must be executed from the directory where ebay.yaml is located (or change the path).

solutions = []

# The connection does not depend on the page number, so build it once
# instead of re-reading the config file on every iteration.
api = Finding(config_file='./ebay.yaml')

for page in tqdm(range(1, 100)):
    try:
        response = api.execute('findItemsAdvanced', {
            'keywords': 'Roman Coin',
            'paginationInput': {'pageNumber': page},
        })
        solutions.append(response.dict())
    except ConnectionError as e:
        # Best effort: report the failed page and carry on with the next one.
        print(e)

In [None]:
# One row per API response, then one row per 'searchResult' record.
df = pd.DataFrame(solutions)

new = df['searchResult'].tolist()

df2 = pd.DataFrame(new)

In [None]:
# Flatten the per-page 'item' lists into a single list of listings.
coins = [listing for page_items in df2['item'] for listing in page_items]

roman = pd.DataFrame(coins)

In [None]:
# Expand the 'primaryCategory' records into their own frame.
primary_cat = roman['primaryCategory'].tolist()

primary = pd.DataFrame(primary_cat)

In [None]:
# Expand the 'sellingStatus' records into their own frame.
prices = roman['sellingStatus'].tolist()
price_df = pd.DataFrame(prices)

# (new column, source column, key read from each source record).
# The order of this list fixes the order in which the columns are added.
price_fields = [
    ('currencyid_conv', 'convertedCurrentPrice', '_currencyId'),
    ('value_conv', 'convertedCurrentPrice', 'value'),
    ('current_price', 'currentPrice', 'value'),
    ('current_currency', 'currentPrice', '_currencyId'),
]
for target, source, key in price_fields:
    price_df[target] = [record[key] for record in price_df[source]]

In [None]:
# The nested price records have been unpacked into flat columns; discard them.
price_df = price_df.drop(columns=['convertedCurrentPrice', 'currentPrice'])

In [None]:
# Put listings, category fields and price fields side by side.
roman_coins = pd.concat((roman, primary, price_df), axis='columns')

In [None]:
# Columns that are discarded at this stage; the last two were already
# expanded into flat columns via the primary / price_df frames.
unused_columns = [
    'condition', 'discountPriceInfo', 'galleryPlusPictureURL',
    'galleryURL', 'globalId', 'isMultiVariationListing',
    'sellingStatus', 'primaryCategory',
]
roman_coins = roman_coins.drop(columns=unused_columns)

In [None]:
# Expand each 'listingInfo' record into its own columns.
# Values may already be dicts, or may be their string repr
# (NOTE(review): presumably after a CSV round-trip — confirm). Strings are
# parsed with ast.literal_eval, which only accepts literals, instead of eval,
# which would execute arbitrary text and raises TypeError on non-strings.
listings = (
    roman_coins['listingInfo']
    .map(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)
    .apply(pd.Series)
)
roman_coins = pd.concat([roman_coins, listings], axis=1)
roman_coins = roman_coins.drop('listingInfo', axis=1)

In [None]:
# 0 when the payment method is exactly 'PayPal', 1 for anything else.
roman_coins['paymentMethod'] = roman_coins['paymentMethod'].map(
    lambda method: 0 if method == 'PayPal' else 1)

In [None]:
# Invert the returnsAccepted flag: 1 -> 0 and 0 -> 1.
# NOTE(review): astype(int) assumes the column already holds booleans or
# numbers rather than the strings 'true'/'false' — confirm.
roman_coins['returnsNotAccepted'] = roman_coins['returnsAccepted'].astype(int) * -1 + 1

# The original call was missing the comma before axis=1 (a SyntaxError).
roman_coins = roman_coins.drop(['returnsAccepted', 'secondaryCategory'], axis=1)

In [None]:
# Expand each 'shippingInfo' record into its own columns.
# Strings are parsed with ast.literal_eval (literals only) rather than eval;
# values that are already dicts are passed through unchanged.
shipping = (
    roman_coins['shippingInfo']
    .map(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)
    .apply(pd.Series)
)

In [None]:
# Split each shippingServiceCost record into its amount and its currency.
ship_cost_dicts = []
shipping_currency = []

for val in shipping['shippingServiceCost']:
    try:
        currency = val['_currencyId']
        cost = val['value']
    except (TypeError, KeyError):
        # TypeError: the value is not subscriptable by key (e.g. NaN);
        # KeyError: the record lacks one of the two fields.
        # Anything else is unexpected and is allowed to surface.
        currency = np.nan
        cost = np.nan
    ship_cost_dicts.append(cost)
    shipping_currency.append(currency)

shipping['shipping_cost'] = ship_cost_dicts
shipping['shipping_cost_currency'] = shipping_currency

shipping = shipping.drop('shippingServiceCost', axis=1)

# Missing handling times are filled with the column mean.
shipping['handlingTime'] = shipping['handlingTime'].astype(float)
shipping['handlingTime'] = shipping['handlingTime'].fillna(
    shipping['handlingTime'].mean())

# 'true'/'false' strings become 1/0; any other value maps to NaN.
mapper = {'false': 0, 'true': 1}
shipping['expeditedShipping'] = shipping['expeditedShipping'].map(mapper)

shipping = shipping.drop(['oneDayShippingAvailable', 'shipToLocations',
                          'shipping_cost_currency'], axis=1)



In [None]:
# Attach the flattened shipping columns and discard the nested source column.
roman_coins = (
    pd.concat([roman_coins, shipping], axis=1)
    .drop(columns='shippingInfo')
)

In [None]:
# Discard columns that are not needed further.
roman_coins = roman_coins.drop(
    columns=['subtitle', 'categoryName', 'gift', 'sellingState'])

# Cast the flag columns to integers.
for flag_col in ('topRatedListing', 'autoPay'):
    roman_coins[flag_col] = roman_coins[flag_col].astype(int)

# Keep the part of each item URL between the first 24 and the last 13
# characters. NOTE(review): the offsets presumably strip a fixed URL prefix
# and a trailing item id — confirm against real URLs.
roman_coins['URLTitle'] = roman_coins['viewItemURL'].map(lambda url: url[24:-13])

roman_coins['categoryId'] = roman_coins['categoryId'].astype(object)

In [None]:
# Parse the listing start/end timestamps.
for time_col in ('startTime', 'endTime'):
    roman_coins[time_col] = pd.to_datetime(roman_coins[time_col])

In [None]:
# Convert the timeLeft duration strings into pandas Timedeltas.
times = []
if 'timeLeft' in roman_coins.columns:
    for val in roman_coins['timeLeft']:
        # Turn the H/M designators into ':' separators, then strip the
        # S/P/T designators, before handing the text to pd.Timedelta.
        with_colons = re.sub(r'[HM]+', ':', val)
        stripped = re.sub(r'[SPT]', '', with_colons)
        times.append(pd.Timedelta(stripped))
else:
    print('You are running completed sales, this step unnecessary')
    # Previously `times` stayed empty here and the assignment below failed on
    # a length mismatch; use an all-missing column instead.
    times = [pd.NaT] * len(roman_coins)

# Explicit dtype so an all-NaT column is still a timedelta column.
roman_coins['timeLeft'] = pd.Series(times, index=roman_coins.index,
                                    dtype='timedelta64[ns]')

In [None]:
# Cast the numeric columns to float.
for numeric_col in ('watchCount', 'shipping_cost'):
    roman_coins[numeric_col] = roman_coins[numeric_col].astype(float)

In [None]:
# Pull the 'value' field out of each buyItNowPrice record.
buyitnow = []
for val in roman_coins['buyItNowPrice']:
    try:
        buyitnow.append(val['value'])
    except (TypeError, KeyError):
        # TypeError: value is not subscriptable by key (e.g. NaN);
        # KeyError: record has no 'value'. Other errors are not hidden.
        buyitnow.append(np.nan)

roman_coins['buyItNowPrice'] = buyitnow
roman_coins['buyItNowPrice'] = roman_coins['buyItNowPrice'].astype(float)

roman_coins = roman_coins.drop('convertedBuyItNowPrice', axis=1)

In [None]:
# 'true'/'false' strings become 1/0; any other value maps to NaN.
mapper = {'false': 0, 'true': 1}
for flag_col in ('bestOfferEnabled', 'buyItNowAvailable'):
    roman_coins[flag_col] = roman_coins[flag_col].map(mapper)

In [None]:
# Keep only the first row seen for each itemId.
roman_coins = roman_coins.drop_duplicates(subset='itemId', keep='first')

In [None]:
# Continue on an independent copy so roman_coins itself stays untouched.
df = roman_coins.copy(deep=True)

In [None]:
# Turn timeLeft into a float: the number of hours remaining.
df['timeLeft'] = df['timeLeft'] / pd.Timedelta(hours=1)

# Bucket the hours into categories. The last edge is open-ended (np.inf)
# rather than the column maximum: pd.cut needs increasing bin edges, so a
# maximum of 168 hours or less (or NaN) made the original call fail.
df['timeLeft'] = pd.cut(df['timeLeft'],
                        [0, 1, 6, 24, 48, 168, np.inf],
                        labels=['less_than_hour_left', 'less_than_6hours_left',
                                'less_than_day_left', 'less_than_2day_left',
                                'less_than_week_left', 'more_than_week_left'])

# Rows that fall in no bucket (including missing timeLeft) get their own label.
df['timeLeft'] = df['timeLeft'].astype(object).fillna('completed_no_time_left')

# 'more_than_week_left' is the dropped reference level; errors='ignore'
# because the dummy column only exists when some row carries that label.
time_left_dummies = pd.get_dummies(df['timeLeft']).drop(
    columns='more_than_week_left', errors='ignore')
df = pd.concat([df, time_left_dummies], axis=1)
df = df.drop('timeLeft', axis=1)

In [None]:
# Dropping roman coin jewelry categories.
# IDs are compared as text so the filter works whether categoryId holds
# ints or strings.
# NOTE(review): float-typed IDs would render as '548.0' and not match — confirm dtype.
jewelry_category_ids = {'137843', '164343', '548', '164341',
                        '45157', '29223', '13993'}
is_jewelry_category = df['categoryId'].astype(str).isin(jewelry_category_ids)
df = df.drop(index=df.index[is_jewelry_category.to_numpy(dtype=bool)])

# Dropping additional categories likely not to be coins: those with fewer
# than 12 listings. The column names are set explicitly ('index' = category,
# 'categoryId' = its count) because the names produced by
# value_counts().reset_index() changed in pandas 2.0.
drop_cols = (
    df['categoryId']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='categoryId')
)
val_to_drop = list(drop_cols.loc[drop_cols['categoryId'] < 12, 'index'])
df = df.drop(df[df['categoryId'].isin(val_to_drop)].index, axis=0)

In [None]:
# Remove every sale whose current_currency is not 'USD'.
non_usd_rows = df.index[df['current_currency'] != 'USD']
df = df.drop(index=non_usd_rows)

# Columns discarded here:
#   current_currency / currencyid_conv - all USD after the filter above
#   title         - URLTitle is slightly more effective and nearly identical
#   buyItNowPrice - a dummy variable already exists
df = df.drop(columns=['current_currency', 'currencyid_conv',
                      'title', 'buyItNowPrice'])

In [None]:
# 1 when the listing country is exactly 'US', otherwise 0.
df['seller_in_us'] = df['country'].map(lambda country: int(country == 'US'))
df = df.drop(columns='country')

In [None]:
# Make sure both timestamps are datetimes.
for time_col in ('startTime', 'endTime'):
    df[time_col] = pd.to_datetime(df[time_col])

In [None]:
# Fold the two mixed domestic/international shipping types into a single
# 'International' label; every other value is left as it is.
mixed_shipping_types = ('CalculatedDomesticFlatInternational',
                        'FlatDomesticCalculatedInternational')
df['shippingType'] = df['shippingType'].map(
    lambda kind: 'International' if kind in mixed_shipping_types else kind)

In [None]:
# Pulling individual emperors from titles.

# URL titles are hyphen-separated: re-join them with spaces and lower-case them.
df['URLTitle'] = df['URLTitle'].str.split('-').apply(lambda x: " ".join(x))
df['URLTitle'] = df['URLTitle'].str.lower()

roman_emperors = [
    'Augustus', 'Tiberius', 'Caligula', 'Claudius', 'Nero', 'Galba', 'Otho',
    'Aulus Vitellius', 'Vespasian', 'Titus', 'Domitian', 'Nerva', 'Trajan',
    'Hadrian', 'Antoninus Pius', 'Marcus Aurelius', 'Lucius Verus', 'Commodus',
    'Publius Helvius Pertinax', 'Marcus Didius Severus Julianus',
    'Septimius Severus', 'Caracalla', 'Publius Septimius Geta', 'Macrinus',
    'Elagabalus', 'Severus Alexander', 'Maximinus', 'Gordian I', 'Gordian II',
    'Pupienus Maximus', 'Balbinus', 'Gordian III', 'Philip', 'Decius',
    'Hostilian', 'Gallus', 'Aemilian', 'Valerian', 'Gallienus',
    'Claudius II Gothicus', 'Quintillus', 'Aurelian', 'Tacitus', 'Florian',
    'Probus', 'Carus', 'Numerian', 'Carinus', 'Diocletian', 'Maximian',
    'Constantius I', 'Galerius', 'Constantine I ',
    'Galerius Valerius Maximinus', 'Licinius', 'Constantine II ',
    'Constantius II', 'Constans I', 'Gallus Caesar', 'Julian', 'Jovian',
    'Valentinian I', 'Valens', 'Gratian', 'Valentinian II', 'Theodosius I',
    'Arcadius', 'Magnus Maximus', 'Honorius', 'Theodosius II',
    'Constantius III', 'Valentinian III', 'Marcian', 'Petronius Maximus',
    'Avitus', 'Majorian', 'Anthemius', 'Olybrius', 'Glycerius', 'Julius Nepos',
    'Romulus Augustulus', 'Leo I', 'Leo II', 'Zeno',
]

roman_emperors = [x.lower() for x in roman_emperors]

# Regex alternation takes the first alternative that matches at a position,
# so with the list order 'gordian i' always beat 'gordian ii'/'gordian iii'
# (likewise 'claudius' vs 'claudius ii gothicus', 'leo i' vs 'leo ii', ...).
# Trying the longest names first lets the most specific name win. The
# look-ahead is kept so matches starting at different positions are all
# reported. The pattern is compiled once, not rebuilt for every title.
emperor_alternatives = '|'.join(
    re.escape(name) for name in sorted(roman_emperors, key=len, reverse=True))
emperor_pattern = re.compile(r"(?=(" + emperor_alternatives + r"))")

emperors = []

for title in df['URLTitle']:
    # NOTE(review): plain substring test, so words that merely contain 'lot'
    # are also flagged as lots — confirm this is acceptable.
    if 'lot' in title:
        emperors.append('LOT')
    else:
        emps = emperor_pattern.findall(title)
        if emps:
            emperors.append(emps)
        else:
            emperors.append('unknown')

# Each entry is 'LOT', 'unknown', or a list of the matched emperor names.
df['emperor_in_title'] = emperors

In [None]:
# Eliminating additional non-coin sales.
# Collect index *labels*, not positions: DataFrame.drop works by label, and
# earlier row drops leave gaps in the index, so enumerate() positions would
# remove the wrong rows or raise KeyError.
is_poster = df['URLTitle'].map(lambda title: 'poster' in title)
posters = list(df.index[is_poster.to_numpy(dtype=bool)])

df = df.drop(posters, axis=0).reset_index(drop=True)

In [None]:
# Label each listing by how many emperors its title mentions (i.e. is the
# seller offering two or more coins); the labels feed the dummy variables
# created later.
# Gotcha: the 'LOT' marker is a three-character string, so the length test
# below places it in the 'multiple_emps_in_title' group.
singles = []
for val in df['emperor_in_title']:
    if val == 'unknown':
        singles.append('unknown')
    elif len(val) > 1:
        singles.append('multiple_emps_in_title')
    elif len(val) == 1:
        singles.append(val[0])

df['emp_prep_dummies'] = singles

In [None]:
def emperor_cleaner(col):
    """Collapse one listing's emperor matches into a single string.

    Parameters
    ----------
    col : str or list of str
        Either one of the marker strings 'unknown' / 'LOT', or the list of
        emperor names matched in the listing title.

    Returns
    -------
    str
        The marker unchanged, or the distinct names joined by single spaces
        in first-seen order. An empty list yields 'unknown'.
    """
    if col == 'unknown' or col == 'LOT':
        return col
    if len(col) == 0:
        # Previously fell through and returned None.
        return 'unknown'
    # dict.fromkeys removes duplicates while keeping first-seen order; going
    # through a set made the joined string vary from run to run.
    return ' '.join(dict.fromkeys(col))

# Run the cleaner over every listing's emperor matches.
df['emperor_in_title'] = df['emperor_in_title'].map(emperor_cleaner)

In [None]:
# Extracting coin material from titles.
metals = ['gold', 'silver', 'bronze', 'orichalcum', 'copper']
metal_pattern = r"(?=(" + '|'.join(metals) + r"))"

# One list of matched metals per title; ['unknown'] when nothing matches.
metal = [re.findall(metal_pattern, title) or ['unknown']
         for title in df['URLTitle']]

df['coin_metal'] = metal

In [None]:
def metal_cleaner(col):
    """Turn a list of matched metal names into a single string.

    Parameters
    ----------
    col : list
        Metal names found in one title (or ['unknown']).

    Returns
    -------
    The distinct names joined by single spaces in first-seen order. If the
    entries cannot be joined as strings, the first entry is returned as is.
    """
    try:
        # De-duplicate first: a title naming the same metal twice used to
        # produce e.g. 'silver silver', which never equals 'silver'.
        return ' '.join(dict.fromkeys(col))
    except TypeError:
        # Entries that are not strings (or not hashable) cannot be joined.
        return col[0]

# Flatten every per-title metal list into one string.
df['coin_metal'] = df['coin_metal'].map(metal_cleaner)

In [None]:
# Checking one last time to eliminate non-coin sales.
# Rows are identified by index label rather than by position, so the drop
# stays correct even if the index is not a fresh 0..n-1 range.
jewelry_terms = ('earring', 'pendant', 'bracelet', 'necklace')
is_jewelry = df['URLTitle'].map(
    lambda title: any(term in title for term in jewelry_terms))
jewelery = list(df.index[is_jewelry.to_numpy(dtype=bool)])

df = df.drop(jewelery, axis=0).reset_index(drop=True)

In [None]:
# Dummies for gold and silver: 1 only when coin_metal equals the name exactly.
for metal_name in ('gold', 'silver'):
    df[metal_name] = df['coin_metal'].map(
        lambda found, target=metal_name: int(found == target))

In [None]:
# Dummy for lots (i.e. more than 1 coin for sale): 1 when the title was marked 'LOT'.
df['is_a_lot'] = df['emperor_in_title'].map(lambda marker: int(marker == 'LOT'))

In [None]:
# 'uncleaned' is a common indicator of low quality: 1 when the word
# appears anywhere in the title, otherwise 0.
uncleaned = [1 if re.search('uncleaned', val) else 0 for val in df['URLTitle']]

df['uncleaned'] = uncleaned

In [None]:
# Total sale time in whole days.
df['total_sale_time'] = df['endTime'] - df['startTime']
df['total_sale_time'] = df.total_sale_time.dt.days

# Dropping listings with negative total time (likely put up and immediately
# taken down).
df = df.drop(list(df.loc[df['total_sale_time'] < 0].index), axis=0)

# Categorical bins for duration.
#  - The last edge is np.inf rather than the column maximum: pd.cut needs
#    increasing bin edges, so a maximum of 365 days or less made it fail.
#  - include_lowest=True puts 0-day listings into 'Less_than_week'; they
#    previously fell outside every bin.
df['total_sale_time'] = pd.cut(
    df['total_sale_time'],
    [0, 7, 30, 365, np.inf],
    labels=['Less_than_week', 'Less_than_month', 'less_than_year', 'more_than_year'],
    include_lowest=True)

# 'less_than_year' is the dropped reference level.
sale_time_dummies = pd.get_dummies(df['total_sale_time']).drop(
    'less_than_year', axis=1)
df = pd.concat([df, sale_time_dummies], axis=1)
df = df.drop('total_sale_time', axis=1)

In [None]:
# Queens (zip: 11374) seems to house a professional high-volume seller
# (41% of volume emanates from this zip), so it gets its own dummy.
df['Queens_seller'] = df['postalCode'].map(lambda code: int(code == '11374'))
df = df.drop(columns='postalCode')

In [None]:
#creating dummies based on 'quality' indications in the title
def low_quality(title):
    """Return 1 if the title contains a word starting with 'low', else 0.

    The leading word boundary keeps 'low', 'lower' and 'lowest' but stops
    the pattern from firing inside unrelated words such as 'yellow'.
    """
    return 1 if re.search(r'\blow', title) else 0

def high_quality(title):
    """Return 1 if the title contains a word starting with 'high', 'premium' or 'rare', else 0.

    The leading word boundary keeps forms such as 'higher' and 'rarest' but
    stops the pattern from firing inside unrelated words such as 'thigh'.
    """
    return 1 if re.search(r'\b(?:high|premium|rare)', title) else 0
    
# Flag titles that contain high- / low-quality wording.
df['high_quality_words'] = df['URLTitle'].map(high_quality)
df['low_quality_words'] = df['URLTitle'].map(low_quality)

In [None]:
# Shipping type as dummy variables; 'International' and 'FreePickup' are left out.
shipping_dummies = pd.get_dummies(df['shippingType']).drop(
    columns=['International', 'FreePickup'])
df = pd.concat([df, shipping_dummies], axis=1)

# Columns discarded here:
#   shipping_cost, shippingType
#   location, productId
#   value_conv - perfectly correlated with current_price
df = df.drop(columns=['shipping_cost', 'shippingType',
                      'location', 'productId', 'value_conv'])

In [None]:
# Listing type as dummy variables; 'AuctionWithBIN' and 'StoreInventory' are left out.
listing_dummies = pd.get_dummies(df['listingType']).drop(
    columns=['AuctionWithBIN', 'StoreInventory'])
df = pd.concat([df, listing_dummies], axis=1)

# The listing type, the start/end timestamps and the category id are dropped.
df = df.drop(columns=['listingType', 'endTime', 'startTime', 'categoryId'])

In [None]:
# Two more metal dummies (exact match on coin_metal) before the column is dropped.
for metal_name in ('bronze', 'copper'):
    df[metal_name] = df['coin_metal'].map(
        lambda found, target=metal_name: int(found == target))
df = df.drop(columns='coin_metal')

In [None]:
# Missing counts are filled with zero.
for count_col in ('bidCount', 'watchCount'):
    df[count_col] = df[count_col].fillna(0)

In [None]:
# Emperor dummy variables, with 'unknown' left out, appended to the main frame.
emperor_dummies = pd.get_dummies(df['emp_prep_dummies']).drop(columns='unknown')
df = pd.concat([df, emperor_dummies], axis=1)

# The two source columns are no longer needed.
df = df.drop(columns=['emp_prep_dummies', 'emperor_in_title'])

In [None]:
# TODO: the frame has been updated through cleaning; still need to add the listing URL
# ('viewItemURL') from the original scraper. SAVE it, and exclude it only at the very END.