This code can be used to clean eBay scrapes following an initial parse in 'Roman_Coin_web_scrape'; it has been tested with search results from 'completedsales' and 'findItemsAdvanced'. The text in each cell explains the rationale for the action taken during initial cleaning. This is (for now) only a primary cleaning pass; further cleaning is required in 'Roman_Coin_EDA'.

In [1]:
import ast
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load the scraped listings to be cleaned (point this at the CSV written after scraping).
# The first CSV column is used as the DataFrame index.
roman_coins = pd.read_csv(
    './roman_coins_03_08',
    index_col=0,
)

In [4]:
# Display the column labels of the freshly loaded frame; the cells below decide,
# column by column, what to keep, recode or drop.
roman_coins.columns

Index(['autoPay', 'condition', 'country', 'discountPriceInfo',
       'galleryPlusPictureURL', 'galleryURL', 'globalId',
       'isMultiVariationListing', 'itemId', 'listingInfo', 'location',
       'paymentMethod', 'postalCode', 'primaryCategory', 'returnsAccepted',
       'secondaryCategory', 'sellingStatus', 'shippingInfo', 'subtitle',
       'title', 'topRatedListing', 'viewItemURL', 'categoryId', 'categoryName',
       'bidCount', 'sellingState', 'timeLeft', 'currencyid_conv', 'value_conv',
       'current_price', 'current_currency'],
      dtype='object')

In [5]:
# Per the original inspection, only 10 observations carry a 'condition' value,
# so the column is removed.
# Note: the counts are computed but not echoed, because the call is not the
# last expression of the cell.
roman_coins['condition'].value_counts()
roman_coins = roman_coins.drop(columns='condition')

In [6]:
# 'discountPriceInfo' was found to hold a single value (counts computed, not echoed).
roman_coins['discountPriceInfo'].value_counts()

# Remove it together with the two gallery URL columns: the photos were already
# scraped separately and, per the original note, the URL data is incomplete anyway.
roman_coins = roman_coins.drop(
    columns=['discountPriceInfo', 'galleryPlusPictureURL', 'galleryURL']
)

In [7]:
# 'globalId' is overwhelmingly eBay US (9891 observations per the original count);
# the counts are computed here but not echoed.
roman_coins['globalId'].value_counts()

# 'isMultiVariationListing' was found to be all false, so both columns go in one call.
roman_coins = roman_coins.drop(columns=['globalId', 'isMultiVariationListing'])

In [8]:
# 'listingInfo' stores one stringified dict per row; its keys are useful as
# separate variables, so they are expanded into their own columns.

# Preview the first row (positional lookup, so it works whatever the index labels are)
print(roman_coins['listingInfo'].iloc[0])

# SECURITY: these strings come from scraped data. The previous eval() would run
# any Python expression embedded in them; ast.literal_eval only accepts literals
# (dicts, strings, numbers, ...) and raises on anything else.
listings = roman_coins['listingInfo'].map(ast.literal_eval).apply(pd.Series)

# Attach the expanded columns side by side with the existing ones
roman_coins = pd.concat([roman_coins, listings], axis=1)

# The raw string column is now redundant
roman_coins = roman_coins.drop('listingInfo', axis=1)

{'bestOfferEnabled': 'false', 'buyItNowAvailable': 'false', 'startTime': '2016-08-31T04:00:20.000Z', 'endTime': '2018-03-24T04:00:20.000Z', 'listingType': 'FixedPrice', 'gift': 'false', 'watchCount': '1062'}


In [9]:
# Payment is overwhelmingly PayPal, so for now the column becomes a dummy:
#   0 -> exactly 'PayPal'
#   1 -> anything else (PayPal plus another method, another method, or missing)
roman_coins['paymentMethod'] = (
    roman_coins['paymentMethod'].ne('PayPal').astype('int64')
)
print(roman_coins['paymentMethod'].value_counts())

0    9807
1      93
Name: paymentMethod, dtype: int64


In [10]:
# 'sellingStatus': its contents are already present as separate columns.
# 'primaryCategory': the same information is already available elsewhere in the frame.
roman_coins = roman_coins.drop(columns=['sellingStatus', 'primaryCategory'])

In [11]:
# Flip the flag so that "returns accepted" (True) becomes 0 and
# "returns not accepted" (False) becomes 1.
print(roman_coins['returnsAccepted'].value_counts())
roman_coins['returnsNotAccepted'] = 1 - roman_coins['returnsAccepted'].astype(int)

# The original column is no longer needed
roman_coins = roman_coins.drop(columns='returnsAccepted')

True     9290
False     610
Name: returnsAccepted, dtype: int64


In [12]:
# Secondary categories are too varied to turn into dummies or separate variables
roman_coins = roman_coins.drop(columns='secondaryCategory')

In [13]:
# Shipping information is kept because it affects the total cost to the buyer.

# Preview the first row (positional lookup, so it works whatever the index labels are)
print(roman_coins['shippingInfo'].iloc[0])

# SECURITY: these strings come from scraped data. The previous eval() would run
# any Python expression embedded in them; ast.literal_eval only accepts literals
# (including the nested cost dict) and raises on anything else.
shipping = roman_coins['shippingInfo'].map(ast.literal_eval).apply(pd.Series)

{'shippingServiceCost': {'_currencyId': 'USD', 'value': '3.95'}, 'shippingType': 'Flat', 'shipToLocations': 'Worldwide', 'expeditedShipping': 'false', 'oneDayShippingAvailable': 'false', 'handlingTime': '1'}


In [14]:
# Per the original inspection, 287 rows have no shippingServiceCost (they use a
# 'calculated' shipping type). The cost and currency are pulled out of the nested
# dict into flat columns; rows without a usable dict get NaN in both.

ship_cost_dicts = []    # despite the name, holds the extracted cost values (or NaN)
shipping_currency = []  # matching currency codes (or NaN)

for val in shipping['shippingServiceCost']:
    try:
        currency = val['_currencyId']
        cost = val['value']
    except (TypeError, KeyError):
        # TypeError: the entry is not subscriptable (e.g. NaN for a missing cost)
        # KeyError:  the dict lacks one of the two expected keys
        # Anything else is a genuine error and is no longer silently swallowed.
        currency = cost = np.nan
    ship_cost_dicts.append(cost)
    shipping_currency.append(currency)

# Add the flattened columns
shipping['shipping_cost'] = ship_cost_dicts
shipping['shipping_cost_currency'] = shipping_currency

# The nested dict column is now redundant
shipping = shipping.drop('shippingServiceCost', axis=1)

In [15]:
# Cast handlingTime to float, then replace missing entries with the column mean
# (the mean is taken over the non-missing values of the cast column).
shipping['handlingTime'] = (
    shipping['handlingTime']
    .astype(float)
    .pipe(lambda days: days.fillna(days.mean()))
)

In [16]:
# String flags -> 0/1 dummy; any value not present in the mapping becomes NaN
mapper = {'false': 0, 'true': 1}

# Recode expedited-shipping availability, then drop in one call:
#   oneDayShippingAvailable - only 22 true values
#   shipToLocations         - every listing ships worldwide
#   shipping_cost_currency  - always USD or NaN
shipping = shipping.assign(
    expeditedShipping=shipping['expeditedShipping'].map(mapper)
).drop(
    columns=['oneDayShippingAvailable', 'shipToLocations', 'shipping_cost_currency']
)

In [17]:
# How to encode the shipping types is decided later (most likely as dummies);
# for now just look at their distribution.
print(shipping['shippingType'].value_counts())

# Put the cleaned shipping columns next to the main frame and discard the raw
# 'shippingInfo' strings they were derived from.
roman_coins = (
    pd.concat([roman_coins, shipping], axis='columns')
    .drop(columns='shippingInfo')
)

Free                                   6344
Flat                                   2548
FlatDomesticCalculatedInternational     724
Calculated                              255
CalculatedDomesticFlatInternational      28
FreePickup                                1
Name: shippingType, dtype: int64


In [18]:
# Recode / derive columns first, then remove everything that is no longer needed.
roman_coins = roman_coins.assign(
    # 0/1 dummy: 1 = top rated, 0 = not top rated
    topRatedListing=roman_coins['topRatedListing'].astype(int),
    # autoPay as a 0/1 dummy
    autoPay=roman_coins['autoPay'].astype(int),
    # category ids are labels, not quantities, so store them as objects
    categoryId=roman_coins['categoryId'].astype(object),
    # Keep only the meaningful text of each item URL: skip the first 24 characters
    # and the last 13. NOTE(review): presumably a fixed-length site prefix and a
    # trailing item id -- confirm against the scraped URL format.
    URLTitle=[url[24:-13] for url in roman_coins['viewItemURL']],
).drop(
    columns=[
        'subtitle',      # only 34 subtitles present
        'viewItemURL',   # superseded by URLTitle
        'categoryName',  # categoryId is sufficient
        'gift',          # always false
        'sellingState',  # every value is active
    ]
)

In [19]:
# First step of the time-column conversion: parse listing start/end stamps
# into pandas datetimes.
roman_coins = roman_coins.assign(
    startTime=pd.to_datetime(roman_coins['startTime']),
    endTime=pd.to_datetime(roman_coins['endTime']),
)

In [20]:
# Convert 'timeLeft' to timedeltas.
# Completed-sales scrapes have no time left; this cell used to fail on them and
# had to be skipped by hand. It is now safe under "Run All": a missing column is
# skipped and missing values become NaT.

def _parse_time_left(raw):
    """Turn one raw 'timeLeft' string into a pandas Timedelta.

    Every run of 'H'/'M' markers is replaced by ':' and the 'S', 'P' and 'T'
    markers are removed; what remains is handed to ``pd.Timedelta``.
    NOTE(review): input is presumably an ISO-8601 style duration -- confirm.

    Returns ``pd.NaT`` when ``raw`` is missing.
    """
    if pd.isna(raw):
        return pd.NaT
    clock = re.sub(r'[HM]+', ':', raw)
    return pd.Timedelta(re.sub(r'[SPT]', '', clock))


if 'timeLeft' in roman_coins.columns:
    # to_timedelta keeps the dtype as timedelta even when every entry is NaT
    roman_coins['timeLeft'] = pd.to_timedelta(
        [_parse_time_left(raw) for raw in roman_coins['timeLeft']]
    )

In [21]:
# watchCount and shipping_cost become floats, cast in a single call
roman_coins = roman_coins.astype({'watchCount': float, 'shipping_cost': float})

In [22]:
# 'buyItNowPrice' holds a dict per row where a price exists; keep only its
# 'value' entry and use NaN where there is no usable dict.
buyitnow = []
for val in roman_coins['buyItNowPrice']:
    try:
        price = val['value']
    except (TypeError, KeyError):
        # TypeError: the entry is not subscriptable (e.g. NaN, no buy-it-now price)
        # KeyError:  the dict has no 'value' key
        # Anything else is a genuine error and is no longer silently swallowed.
        price = np.nan
    buyitnow.append(price)

roman_coins['buyItNowPrice'] = buyitnow
roman_coins['buyItNowPrice'] = roman_coins['buyItNowPrice'].astype(float)

# Every converted price is in USD, so the converted column adds nothing
roman_coins = roman_coins.drop('convertedBuyItNowPrice', axis=1)

In [23]:
# String flags -> 0/1 dummies; any value not present in the mapping becomes NaN.
mapper = {'false': 0, 'true': 1}

# bestOfferEnabled: per the original note, the seller will accept a sale at a
# price below the buy-it-now price.
# buyItNowAvailable: converted to a dummy in the same way.
roman_coins = roman_coins.assign(
    bestOfferEnabled=roman_coins['bestOfferEnabled'].map(mapper),
    buyItNowAvailable=roman_coins['buyItNowAvailable'].map(mapper),
)

In [25]:
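#Export of the cleaned frame is left disabled; uncomment exactly one line to write it out.
#(The file names presumably correspond to different scrape runs -- confirm before writing.)
#roman_coins.to_csv('./cleaned_csv_3_8')
#roman_coins.to_csv('./cleaned_csv_3_1')
#roman_coins.to_csv('./cleaned_completed_3_4')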