This code can be used to clean ebay scrapes, following an initial parse in Roman_Coin_web_scrape- tested to work with search results for 'completedsales' and 'findItemsAdvanced'. Text in cells refers to rationale for action during initial cleaning. This is also (for now) just a primary cleaning, further cleaning is required in 'Roman_Coin_EDA'.

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
%matplotlib inline

In [25]:
#read in the relevant csv to clean here after scraping
roman_coins = pd.read_csv('./completed_sales_03_17', index_col=0)

In [26]:
#Only 10 observations have a conditionId 
roman_coins = roman_coins.drop('condition', axis=1)

In [27]:
#only one value here
roman_coins = roman_coins.drop('discountPriceInfo', axis=1)
#I already scraped photos and this column is incomplete anyways
roman_coins = roman_coins.drop('galleryPlusPictureURL', axis=1)
roman_coins = roman_coins.drop('galleryURL', axis=1)

In [28]:
#overwhelmingly ebay US (only interested in US anyways)
roman_coins = roman_coins.drop('globalId', axis=1)
#all false
roman_coins = roman_coins.drop('isMultiVariationListing', axis=1)

In [29]:
#listingInfo has lots of useful information and I want to add these variables separately
listings = roman_coins['listingInfo'].map(eval).apply(pd.Series)
#merging the new columns onto the dataframe
roman_coins = pd.concat([roman_coins, listings], axis=1)
#listings column now redundant
roman_coins = roman_coins.drop('listingInfo', axis=1)

In [30]:
#overwhelmingly PayPal, but I will convert this to a dummy variable for now, 0 just paypal, 1 paypal or other method
roman_coins['paymentMethod'] = [0 if x == 'PayPal' else 1 for 
                                x  in roman_coins['paymentMethod']]

In [31]:
#these variables are already converted to columns
roman_coins = roman_coins.drop('sellingStatus', axis=1)

#already have this variable elsewhere
roman_coins = roman_coins.drop('primaryCategory', axis=1)

In [32]:
#I want to swap swap the values here, to make true 0 and false 1, (1 is relevant here)
roman_coins['returnsNotAccepted'] = roman_coins['returnsAccepted'].astype(int) * -1 + 1
#dropping old columns
roman_coins = roman_coins.drop('returnsAccepted', axis=1)

In [33]:
#these are too varied to make a dummy or make as separate variables, dropping 
roman_coins = roman_coins.drop('secondaryCategory', axis=1)

In [34]:
#the shipping information is interesting to me and I am going to keep it, may affect cost/interest 
shipping = roman_coins['shippingInfo'].map(eval).apply(pd.Series)

In [35]:
#There are 287 null values in shippingServiceCost- they have 'calculated' shipping type
#I am going to extract relevant values for new columns

ship_cost_dicts = []
shipping_currency = []

for val in shipping['shippingServiceCost']:
    try:
        currency = val['_currencyId']
        cost = val['value']
        ship_cost_dicts.append(cost)
        shipping_currency.append(currency)
    
    except:
        
        ship_cost_dicts.append(np.nan)
        shipping_currency.append(np.nan)


#adding these columns
shipping['shipping_cost'] = ship_cost_dicts
shipping['shipping_cost_currency'] = shipping_currency

#dropping old column
shipping = shipping.drop('shippingServiceCost', axis=1)

In [36]:
#Converting NaN to mean column value in handlingTime
shipping['handlingTime'] = shipping['handlingTime'].astype(float)
shipping['handlingTime'] = shipping['handlingTime'].fillna(
                                    shipping['handlingTime'].mean())

In [37]:
#creating dummy variables for expedited shipping availability
mapper = {'false' : 0, 'true': 1}
shipping['expeditedShipping'] = shipping['expeditedShipping'].map(mapper)

#only 22 true values, dropping
shipping = shipping.drop('oneDayShippingAvailable', axis=1)

#all ship worldwide, dropping
shipping = shipping.drop('shipToLocations', axis=1)

#all USD or NAN, dropping 
shipping = shipping.drop('shipping_cost_currency', axis=1)

In [38]:
#will have to decide what to do with shipping types later on, will likely make dummies 
#merging shipping back into main DF 
roman_coins = pd.concat([roman_coins, shipping], axis=1)
#dropping shipping info
roman_coins = roman_coins.drop('shippingInfo', axis=1)

In [39]:
#only 34 subtitles, dropping
roman_coins = roman_coins.drop('subtitle', axis=1)

#converting to dummy variable 1 is toprated, 0 not toprated
roman_coins['topRatedListing'] = roman_coins['topRatedListing'].astype(int)

#Extracting the meaningful text from the item urls
roman_coins['URLTitle'] = [val[24:-13] for val in roman_coins['viewItemURL']]

#dropping itemURL from above
roman_coins = roman_coins.drop('viewItemURL', axis=1)

#category Id is sufficient, dropping
roman_coins = roman_coins.drop('categoryName', axis=1)

#all gift false, dropping
roman_coins = roman_coins.drop('gift', axis=1)

#all values are active, dropping
roman_coins = roman_coins.drop('sellingState', axis=1)

#autoPay as int dummy
roman_coins['autoPay'] = roman_coins['autoPay'].astype(int)

#turning categoryId into object variable
roman_coins['categoryId'] = roman_coins['categoryId'].astype(object)

In [40]:
#beginning to convert  time columns
roman_coins['startTime'] = pd.to_datetime(roman_coins['startTime'])
roman_coins['endTime'] = pd.to_datetime(roman_coins['endTime'])

In [42]:
#converting 'time left' to timedeltas 
#no time left for completed sales, do not run
times = []
try:
    for val in roman_coins['timeLeft']:
        x = re.sub(r'[HM]+', ':', val)
        time = re.sub(r'[SPT]', '', x)
        new_time = pd.Timedelta(time)
        times.append(new_time)
except KeyError:
    print('You are running completed sales, this step unnecessary')

roman_coins['timeLeft'] = times

You are running completed sales, this step unnecessary


ValueError: Length of values does not match length of index

In [43]:
#Converting watchcount/shipping_Cost to float
roman_coins['watchCount'] = roman_coins['watchCount'].astype(float)
roman_coins['shipping_cost'] = roman_coins['shipping_cost'].astype(float)

In [44]:
#converting column into prices from dicts
buyitnow = []
for val in roman_coins['buyItNowPrice']:
    try:
        price = val['value']
        buyitnow.append(price)
    except:
        buyitnow.append(np.nan)
        
roman_coins['buyItNowPrice'] = buyitnow
roman_coins['buyItNowPrice'] = roman_coins['buyItNowPrice'].astype(float)

#all in USD, dropping
roman_coins = roman_coins.drop('convertedBuyItNowPrice', axis=1)

In [45]:
#bestoffer means that the seller accepts a sale if the price is less than the buy it now price
mapper = {'false' : 0, 'true': 1}
roman_coins['bestOfferEnabled'] = roman_coins['bestOfferEnabled'].map(mapper)
#converting to dummy variable
roman_coins['buyItNowAvailable'] = roman_coins['buyItNowAvailable'].map(mapper)

In [46]:
#roman_coins.to_csv('./cleaned_csv_3_8')
#roman_coins.to_csv('./cleaned_csv_3_1')
#roman_coins.to_csv('./cleaned_completed_3_4')
#roman_coins.to_csv('./cleaned_csv_3_17')
#roman_coins.to_csv('./cleaned_completed_3_17')