In [1]:
import ast
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load the listings file; its first column is used as the row index.
raw_listings_path = './roman_coins_3_1'
roman_coins = pd.read_csv(raw_listings_path, index_col=0)

In [3]:
#autoPay: the buyer pays automatically through PayPal, without first being redirected to the PayPal URL

In [4]:
# Twelve observations are sold with a charityId, meaning the proceeds go to
# charity. They all appear to be the same coin type from the same seller, so
# the column is removed.
roman_coins['charityId'].value_counts()
roman_coins = roman_coins.drop(columns=['charityId'])

In [5]:
# 'condition' (conditionId) is filled in for only 10 observations, so drop it.
roman_coins['condition'].value_counts()
roman_coins = roman_coins.drop(columns=['condition'])

In [6]:
# Listings are overwhelmingly from the US, but the column may be worth
# keeping for now. The tally is the cell's last expression so it is displayed.
country_counts = roman_coins['country'].value_counts()
country_counts

US    9720
BG      22
DE      20
AT      18
RS      18
BA      12
SK      11
CY      11
GB      10
YU      10
UA       8
CA       8
NZ       7
HR       6
LT       4
HU       4
RO       4
ES       3
GR       2
NO       2
Name: country, dtype: int64

In [7]:
# discountPriceInfo holds only one value.
roman_coins['discountPriceInfo'].value_counts()

# The photos were already scraped and the gallery URL columns are incomplete
# anyway, so all three columns are removed in a single pass.
roman_coins = roman_coins.drop(
    columns=['discountPriceInfo', 'galleryPlusPictureURL', 'galleryURL']
)

In [8]:
# globalId is overwhelmingly eBay US (9891 of obs.).
roman_coins['globalId'].value_counts()

# isMultiVariationListing is false for every row, so both columns go together.
roman_coins = roman_coins.drop(columns=['globalId', 'isMultiVariationListing'])

In [9]:
# listingInfo stores a dict serialized as text (see the printed sample); each
# key becomes its own column.
# .iloc[0] shows the first row by position, independent of the index labels.
print(roman_coins['listingInfo'].iloc[0])

# The text comes from a file on disk, so it is parsed with ast.literal_eval,
# which only accepts Python literals, instead of eval, which would execute
# any expression found in the column.
listings = roman_coins['listingInfo'].map(ast.literal_eval).apply(pd.Series)

# Merge the new columns onto the dataframe (row indexes line up because
# `listings` was derived from roman_coins itself).
roman_coins = pd.concat([roman_coins, listings], axis=1)

# The serialized column is now redundant.
roman_coins = roman_coins.drop('listingInfo', axis=1)

{'bestOfferEnabled': 'false', 'buyItNowAvailable': 'false', 'startTime': '2016-08-31T04:00:20.000Z', 'endTime': '2018-03-24T04:00:20.000Z', 'listingType': 'FixedPrice', 'gift': 'false', 'watchCount': '1050'}


In [10]:
# Overwhelmingly PayPal; encode as a dummy for now:
# 0 = exactly 'PayPal', 1 = anything else (PayPal plus/or another method).
paypal_only = roman_coins['paymentMethod'] == 'PayPal'
roman_coins['paymentMethod'] = (~paypal_only).astype('int64')
print(roman_coins['paymentMethod'].value_counts())

0    9812
1      88
Name: paymentMethod, dtype: int64


In [11]:
# sellingStatus has already been converted to columns, and primaryCategory is
# already available elsewhere, so neither is needed.
roman_coins = roman_coins.drop(columns=['sellingStatus', 'primaryCategory'])

In [12]:
# Flip the flag so that True becomes 0 and False becomes 1.
print(roman_coins['returnsAccepted'].value_counts())
accepted_as_int = roman_coins['returnsAccepted'].astype(int)
roman_coins['returnsNotAccepted'] = 1 - accepted_as_int

# The original column is no longer needed.
roman_coins = roman_coins.drop(columns=['returnsAccepted'])

True     9292
False     608
Name: returnsAccepted, dtype: int64


In [13]:
# secondaryCategory is too varied to turn into a dummy or into separate
# variables, so it is dropped.
roman_coins = roman_coins.drop(columns=['secondaryCategory'])

In [14]:
# Shipping information is kept because it affects cost.
# .iloc[0] shows the first row by position, independent of the index labels.
print(roman_coins['shippingInfo'].iloc[0])

# shippingInfo is a dict serialized as text (see the printed sample). It is
# parsed with ast.literal_eval, which only accepts Python literals, instead of
# eval, which would execute any expression found in the column.
shipping = roman_coins['shippingInfo'].map(ast.literal_eval).apply(pd.Series)

{'shippingServiceCost': {'_currencyId': 'USD', 'value': '3.95'}, 'shippingType': 'Flat', 'shipToLocations': 'Worldwide', 'expeditedShipping': 'false', 'oneDayShippingAvailable': 'false', 'handlingTime': '1'}


In [15]:
# There are 287 null values in shippingServiceCost - they have 'calculated'
# shipping type. Populated entries are dicts holding '_currencyId' and
# 'value'; both are pulled out into flat columns.

ship_cost_dicts = []      # extracted 'value' entries (NaN where unavailable)
shipping_currency = []    # extracted '_currencyId' entries (NaN where unavailable)

for val in shipping['shippingServiceCost']:
    try:
        # Read both keys before appending so the two lists stay the same
        # length even when one of the keys is missing.
        currency = val['_currencyId']
        cost = val['value']
    except (TypeError, KeyError):
        # TypeError: the entry is not subscriptable by key (e.g. a NaN float).
        # KeyError: the dict lacks one of the expected keys.
        # Any other exception is a real problem and is allowed to surface
        # rather than being silently turned into NaN.
        currency = np.nan
        cost = np.nan
    ship_cost_dicts.append(cost)
    shipping_currency.append(currency)

# Add the flattened columns.
shipping['shipping_cost'] = ship_cost_dicts
shipping['shipping_cost_currency'] = shipping_currency

# The nested column is now redundant.
shipping = shipping.drop('shippingServiceCost', axis=1)

In [25]:
# Cast handlingTime to float, then replace missing entries with the column mean.
handling_time = shipping['handlingTime'].astype(float)
shipping['handlingTime'] = handling_time.fillna(handling_time.mean())

In [26]:
# Encode expedited-shipping availability as a 0/1 dummy.
mapper = {'false': 0, 'true': 1}
shipping['expeditedShipping'] = shipping['expeditedShipping'].map(mapper)

# Columns removed together:
#   oneDayShippingAvailable - only 22 true values
#   shipToLocations         - all ship worldwide
#   shipping_cost_currency  - all USD or NaN
shipping = shipping.drop(
    columns=['oneDayShippingAvailable', 'shipToLocations', 'shipping_cost_currency']
)

In [27]:
# What to do with the shipping types is decided later on (likely dummies).
print(shipping['shippingType'].value_counts())

# Attach the shipping columns to the main frame, then discard the serialized
# shippingInfo column they were derived from.
roman_coins = pd.concat([roman_coins, shipping], axis=1).drop(
    columns=['shippingInfo']
)

Free                                   6062
Flat                                   2817
FlatDomesticCalculatedInternational     731
Calculated                              286
CalculatedDomesticFlatInternational       3
FreePickup                                1
Name: shippingType, dtype: int64


In [28]:
# Only 34 listings have a subtitle, so the column is dropped.
roman_coins = roman_coins.drop(columns=['subtitle'])

# topRatedListing as a dummy: 1 is top rated, 0 is not top rated.
top_rated_flag = roman_coins['topRatedListing'].astype(int)
roman_coins['topRatedListing'] = top_rated_flag

# Extract the meaningful text (the title segment) from the item URLs.
# The original fixed slice [24:-13] assumes every URL starts with a
# 24-character prefix and ends with a 13-character tail. Anchoring on the
# '/itm/' path segment and the trailing '/<digits>' gives the same text for
# URLs of that shape and still works when the prefix or tail has a different
# length.
# NOTE(review): assumes URLs look like '.../itm/<title>/<numeric id>' - confirm
# against the data.
item_urls = roman_coins['viewItemURL']
url_title = item_urls.str.extract(r'/itm/(.*)/\d+', expand=False)

# URLs that do not match the pattern fall back to the original fixed slice,
# so no row ends up worse off than before.
roman_coins['URLTitle'] = url_title.fillna(item_urls.str[24:-13])

# The raw URL column is no longer needed.
roman_coins = roman_coins.drop('viewItemURL', axis=1)

# Columns removed together:
#   categoryName - category Id is sufficient
#   gift         - all gift values are false
#   sellingState - all values are active
roman_coins = roman_coins.drop(columns=['categoryName', 'gift', 'sellingState'])

# autoPay becomes an int dummy; categoryId becomes an object variable.
roman_coins = roman_coins.astype({'autoPay': int, 'categoryId': object})

In [30]:
# Begin converting the time columns: parse both listing timestamps.
for time_col in ('startTime', 'endTime'):
    roman_coins[time_col] = pd.to_datetime(roman_coins[time_col])

In [31]:
# Convert 'time left' to timedeltas. Each raw string is rewritten in two
# passes: runs of 'H'/'M' become ':' separators, then every 'S', 'P' and 'T'
# is removed, and the remaining text is handed to pd.Timedelta.
times = [
    pd.Timedelta(re.sub(r'[SPT]', '', re.sub(r'[HM]+', ':', raw_left)))
    for raw_left in roman_coins['timeLeft']
]

roman_coins['timeLeft'] = times

In [37]:
# watchCount and shipping_cost are cast to float.
for numeric_col in ('watchCount', 'shipping_cost'):
    roman_coins[numeric_col] = roman_coins[numeric_col].astype(float)

In [41]:
# Checkpoint: list every column with its non-null count and dtype.
roman_coins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9900 entries, 0 to 9899
Data columns (total 29 columns):
autoPay                   9900 non-null int64
country                   9900 non-null object
itemId                    9900 non-null int64
location                  9900 non-null object
paymentMethod             9900 non-null int64
postalCode                9456 non-null object
title                     9900 non-null object
topRatedListing           9900 non-null int64
categoryId                9900 non-null object
bidCount                  636 non-null float64
timeLeft                  9900 non-null timedelta64[ns]
currencyid_conv           9900 non-null object
value_conv                9900 non-null float64
current_price             9900 non-null float64
current_currency          9900 non-null object
bestOfferEnabled          9900 non-null object
buyItNowAvailable         9900 non-null object
buyItNowPrice             49 non-null object
convertedBuyItNowPrice    49 non-null obje

In [45]:
# Show only the rows where convertedBuyItNowPrice is populated.
has_converted_price = roman_coins['convertedBuyItNowPrice'].notnull()
roman_coins.loc[has_converted_price, 'convertedBuyItNowPrice']

271       {'_currencyId': 'USD', 'value': '12.0'}
375       {'_currencyId': 'USD', 'value': '25.0'}
506      {'_currencyId': 'USD', 'value': '23.19'}
568      {'_currencyId': 'USD', 'value': '20.33'}
625       {'_currencyId': 'USD', 'value': '8.95'}
6117     {'_currencyId': 'USD', 'value': '695.0'}
6120     {'_currencyId': 'USD', 'value': '950.0'}
6130     {'_currencyId': 'USD', 'value': '495.0'}
6163     {'_currencyId': 'USD', 'value': '695.0'}
6184     {'_currencyId': 'USD', 'value': '395.0'}
6229     {'_currencyId': 'USD', 'value': '495.0'}
6296     {'_currencyId': 'USD', 'value': '275.0'}
6325     {'_currencyId': 'USD', 'value': '329.0'}
6329     {'_currencyId': 'USD', 'value': '269.0'}
6334     {'_currencyId': 'USD', 'value': '349.0'}
6365     {'_currencyId': 'USD', 'value': '195.0'}
6459     {'_currencyId': 'USD', 'value': '199.0'}
6465     {'_currencyId': 'USD', 'value': '199.0'}
6631     {'_currencyId': 'USD', 'value': '169.0'}
6632     {'_currencyId': 'USD', 'value': '329.0'}


In [52]:
# Convert the column from dicts into prices: populated entries are dicts with
# a 'value' key; everything else becomes NaN.
buyitnow = []
for val in roman_coins['buyItNowPrice']:
    try:
        buyitnow.append(val['value'])
    except (TypeError, KeyError):
        # TypeError: the entry is not subscriptable by key (e.g. a NaN float).
        # KeyError: the dict has no 'value' key.
        # Any other exception is a real problem and is allowed to surface
        # rather than being silently turned into NaN.
        buyitnow.append(np.nan)

# The extracted values are cast to float.
roman_coins['buyItNowPrice'] = buyitnow
roman_coins['buyItNowPrice'] = roman_coins['buyItNowPrice'].astype(float)

# All in USD, so the converted column is dropped.
roman_coins = roman_coins.drop('convertedBuyItNowPrice', axis=1)

In [58]:
# bestOfferEnabled means the seller accepts a sale at a price below the
# buy-it-now price. It and buyItNowAvailable are both turned into 0/1 dummy
# variables.
mapper = {'false': 0, 'true': 1}
for flag_col in ('bestOfferEnabled', 'buyItNowAvailable'):
    roman_coins[flag_col] = roman_coins[flag_col].map(mapper)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9900 entries, 0 to 9899
Data columns (total 28 columns):
autoPay               9900 non-null int64
country               9900 non-null object
itemId                9900 non-null int64
location              9900 non-null object
paymentMethod         9900 non-null int64
postalCode            9456 non-null object
title                 9900 non-null object
topRatedListing       9900 non-null int64
categoryId            9900 non-null object
bidCount              636 non-null float64
timeLeft              9900 non-null timedelta64[ns]
currencyid_conv       9900 non-null object
value_conv            9900 non-null float64
current_price         9900 non-null float64
current_currency      9900 non-null object
bestOfferEnabled      9900 non-null int64
buyItNowAvailable     9900 non-null int64
buyItNowPrice         49 non-null float64
endTime               9900 non-null datetime64[ns]
listingType           9900 non-null object
startTime            