In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Seed NumPy's global random number generator so that repeated fresh runs
# of the notebook produce the same results
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)


In [2]:
# Read the Cartier 3-day auction records from CSV into a DataFrame,
# then show the first rows as the cell output
auctions_csv = "Cartier+3-day+auctions.csv"
cartier = pd.read_csv(auctions_csv)
cartier.head()

Unnamed: 0,auctionid,bid,bidtime,bidder,bidderrate,openbid,price,Price_cat
0,1649726994,1000.0,0.191238,sandragian,10,500.0,2500.0,1
1,1649726994,656.0,0.454734,vickdan,200,500.0,2500.0,1
2,1649726994,777.0,0.454907,vickdan,200,500.0,2500.0,1
3,1649726994,888.0,0.455162,vickdan,200,500.0,2500.0,1
4,1649726994,1000.0,0.896366,19511969,0,500.0,2500.0,1


In [3]:
# Print a structural summary of the DataFrame: number of rows, column names,
# non-null counts per column, dtypes and memory usage
cartier.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   auctionid   250 non-null    int64  
 1   bid         250 non-null    float64
 2   bidtime     250 non-null    float64
 3   bidder      250 non-null    object 
 4   bidderrate  250 non-null    int64  
 5   openbid     250 non-null    float64
 6   price       250 non-null    float64
 7   Price_cat   250 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 15.8+ KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the data
numeric_summary = cartier.describe()
numeric_summary

Unnamed: 0,auctionid,bid,bidtime,bidderrate,openbid,price,Price_cat
count,250.0,250.0,250.0,250.0,250.0,250.0,250.0
mean,1644726000.0,422.63132,1.968105,20.664,165.72144,678.7798,0.256
std,3192030.0,455.408398,0.973897,119.055786,246.486937,606.148336,0.437297
min,1638894000.0,1.0,0.075671,0.0,1.0,26.0,0.0
25%,1643076000.0,150.9925,1.031583,0.0,99.0,326.0,0.0
50%,1643903000.0,251.98,2.31195,2.0,99.99,405.0,0.0
75%,1647871000.0,500.75,2.873565,13.5,149.99,1225.0,1.0
max,1649727000.0,2500.0,2.999965,1838.0,1200.0,2500.0,1.0


In [5]:
# Count the missing (NA) entries in each column; a column with no gaps reports 0
missing_counts = cartier.isna().sum()
missing_counts

auctionid     0
bid           0
bidtime       0
bidder        0
bidderrate    0
openbid       0
price         0
Price_cat     0
dtype: int64

In [6]:
# Collect the names of the categorical (object-dtype) columns into a plain list
object_columns = cartier.select_dtypes(include='object').columns
category_var_list = [column for column in object_columns]
category_var_list

['bidder']

In [7]:
# Print the distinct values of every categorical column so that typos or
# inconsistent spellings can be spotted by eye.
for cat in category_var_list:
    distinct_values = cartier[cat].unique()
    print(f"Category: {cat} Values: {distinct_values}")

Category: bidder Values: ['sandragian' 'vickdan' '19511969' 'mumm29usa' 'wworld@bignet.net'
 'drumzz' 'kht-max' 'daigle1122' 'cbcolqunoun' 'richbaby10@aol.com'
 'bigpoppalarock' 'rplfunding' 'kakruse' 'pressmer1' 'g123y@aol.com'
 'cmtk1' 'tonyfred1@aol.com' 'jas16100' 'fordlower' 'claxonn' 'cars4016'
 'tebbebd' 'olloqui' 'toryx' 'lou1965' 'saphi7171@amexol.net'
 'restdynamics' '61rolls' 'rcs19010' 'bellatmk' 'auntdotbids' 'chi-town7'
 'pereluzi00' 'barginbook' 'treasureprincess' 'beelprez' 'leakang'
 'jcobb74787' 'gracedivine' 'tverna' 'jimboysan' '2gd4u' 'bonerboy-24401'
 'vnvu009' 'phyllis120577' 'mybelladesigns@aol.com' 'rotepat'
 'harleyrusty' 'sb812' 'mesmorado' 'robcmjr@bellsouth.net' 'pensri'
 'yung-wen' 'mdhallin' 'trunkbath' 'lancearmer' 'bigdaddy67' 'jengrif'
 'akryzak' 'alexwestla' 'lass1004' 'shoecrazy' 'k.l.pine' 'jdaddle'
 'jaha803' 'gm492@columbia.edu' '6969.ca' 'kasika5' 'thom54' 'bdsr1'
 'babaranda' 'amysuewarner@hotmail.com' 'kat2911' 'bella@thegrid.net'
 'princess-gi

In [8]:
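# NOTE(review): the drop of `price` below is commented out, so `price` stays in
# the DataFrame and ends up among the predictors later on — confirm this is intended.
#cartier.drop(['price'], axis=1, inplace = True)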

In [9]:
# Replace the bidder names (strings) with integer codes.
# The encoder is first fitted on the column, then used to transform it.
labelencoder = LabelEncoder()
labelencoder.fit(cartier['bidder'])
cartier['bidder'] = labelencoder.transform(cartier['bidder'])


In [10]:
# Re-inspect the DataFrame after label encoding: `bidder` should now be an
# integer column. No column has been dropped at this point (the `price` drop
# above is commented out), so all 8 columns are still expected.
cartier.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   auctionid   250 non-null    int64  
 1   bid         250 non-null    float64
 2   bidtime     250 non-null    float64
 3   bidder      250 non-null    int32  
 4   bidderrate  250 non-null    int64  
 5   openbid     250 non-null    float64
 6   price       250 non-null    float64
 7   Price_cat   250 non-null    int64  
dtypes: float64(4), int32(1), int64(3)
memory usage: 14.8 KB


In [16]:
# Split the data into a training set (60%) and a held-out test set (40%).
# random_state is passed explicitly so that re-running this cell on its own
# always yields the same partition; relying only on the global
# np.random.seed(1) gives a different split on every re-execution of the cell.
train_df, test_df = train_test_split(cartier, test_size=0.4, random_state=1)

# Take independent copies: later cells assign standardized values back into
# these frames, which would otherwise raise SettingWithCopyWarning because
# train_test_split returns slices of `cartier`.
train_df, test_df = train_df.copy(), test_df.copy()

# To reduce repetition in later code, name the target column and the list of
# predictor columns (every column except the target).
# NOTE(review): `price` is still among the predictors; if `Price_cat` is derived
# from `price`, this leaks the target into the features — confirm and drop if so.
target = 'Price_cat'
predictors = list(cartier.columns)
predictors.remove(target)


In [18]:
# Standardize every predictor column with a StandardScaler whose statistics
# are learned from the training set only (so no test-set information is used).
scaler = preprocessing.StandardScaler()
cols_to_stdize = predictors

# Work on explicit copies so the column assignments below modify independent
# frames and do not raise SettingWithCopyWarning on the slices returned by
# train_test_split.
train_df = train_df.copy()
test_df = test_df.copy()

# Fit the scaler on the training predictors and transform them, then apply the
# same (already fitted) transformation to the test predictors.
train_df[cols_to_stdize] = scaler.fit_transform(train_df[cols_to_stdize])
test_df[cols_to_stdize] = scaler.transform(test_df[cols_to_stdize])

# Separate each frame into its predictors (DataFrame) and target (Series).
train_X = train_df[predictors]
train_y = train_df[target]
# Bug fix: test_X was previously taken from train_df, so any evaluation on
# (test_X, test_y) paired training features with test labels.
test_X = test_df[predictors]
test_y = test_df[target]

In [19]:
# Separate predictors (DataFrames) from the target (Series) for both splits
train_x, test_x = train_df[predictors], test_df[predictors]
train_y, test_y = train_df[target], test_df[target]

# Persist the full frames and their predictor/target parts as CSV files
# (row index omitted); the files are written in the order listed here
csv_exports = {
    './cartier_train_df_price.csv': train_df,
    './cartier_train_X_price.csv': train_x,
    './cartier_train_y_price.csv': train_y,
    './cartier_test_df_price.csv': test_df,
    './cartier_test_X_price.csv': test_x,
    './cartier_test_y_price.csv': test_y,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)