In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython .display import display
from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.float_format', lambda x:'%.5f' % x)
import numpy as np

In [4]:
#Set data types
types_dict_train = {'train_id':'int64', 'item_condition_id':'int8', 'price':'float64', 'shipping':'int8'}
types_dict_test = {'test_id':'int64', 'item_condition_id':'int8', 'shipping':'int8'}
train = pd.read_csv('train.tsv', delimiter='\t', low_memory=True, dtype=types_dict_train)
test = pd.read_csv('test.tsv', delimiter='\t', low_memory=True, dtype=types_dict_test)

In [5]:
#Data amount check
train.head()
test.head()
train.shape, test.shape

((1482535, 8), (693359, 7))

In [6]:
#Show the data details 
def display_all(df):
    with pd.option_context("display.max_rows", 1000):
        with pd.option_context("display.max_columns", 1000):
            display(df)
            
display_all(train.describe(include='all').transpose())

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
train_id,1482535.0,,,,741267.0,427971.135,0.0,370633.5,741267.0,1111900.5,1482534.0
name,1482535.0,1225273.0,Bundle,2232.0,,,,,,,
item_condition_id,1482535.0,,,,1.90738,0.90316,1.0,1.0,2.0,3.0,5.0
category_name,1476208.0,1287.0,"Women/Athletic Apparel/Pants, Tights, Leggings",60177.0,,,,,,,
brand_name,849853.0,4809.0,PINK,54088.0,,,,,,,
price,1482535.0,,,,26.73752,38.58607,0.0,10.0,17.0,29.0,2009.0
shipping,1482535.0,,,,0.44727,0.49721,0.0,0.0,0.0,1.0,1.0
item_description,1482531.0,1281426.0,No description yet,82489.0,,,,,,,


In [7]:
#To be category datatype
train.category_name = train.category_name.astype('category')
train.item_description = train.item_description.astype('category')
train.name = train.name.astype('category')
train.brand_name = train.brand_name.astype('category')

test.category_name = test.category_name.astype('category')
test.item_description = test.item_description.astype('category')
test.name = test.name.astype('category')
test.brand_name = test.brand_name.astype('category')

train.dtypes, test.dtypes

(train_id                int64
 name                 category
 item_condition_id        int8
 category_name        category
 brand_name           category
 price                 float64
 shipping                 int8
 item_description     category
 dtype: object, test_id                 int64
 name                 category
 item_condition_id        int8
 category_name        category
 brand_name           category
 shipping                 int8
 item_description     category
 dtype: object)

In [8]:
#Unique data amount
train.apply(lambda x: x.nunique())

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281426
dtype: int64

In [9]:
#Unique data amount
test.apply(lambda x: x.nunique())

test_id              693359
name                 601117
item_condition_id         5
category_name          1223
brand_name             3900
shipping                  2
item_description     609555
dtype: int64

In [10]:
#Null data check
train.isnull().sum(),train.isnull().sum()/train.shape[0]

(train_id                  0
 name                      0
 item_condition_id         0
 category_name          6327
 brand_name           632682
 price                     0
 shipping                  0
 item_description          4
 dtype: int64, train_id            0.00000
 name                0.00000
 item_condition_id   0.00000
 category_name       0.00427
 brand_name          0.42676
 price               0.00000
 shipping            0.00000
 item_description    0.00000
 dtype: float64)

In [11]:
test.isnull().sum(),test.isnull().sum()/test.shape[0]

(test_id                   0
 name                      0
 item_condition_id         0
 category_name          3058
 brand_name           295525
 shipping                  0
 item_description          0
 dtype: int64, test_id             0.00000
 name                0.00000
 item_condition_id   0.00000
 category_name       0.00441
 brand_name          0.42622
 shipping            0.00000
 item_description    0.00000
 dtype: float64)

In [25]:
train = train.rename(columns = {'train_id': id})
test = test.rename(columns = {'test_id': id})
train['is_train'] = 1
test['is_train'] = 0
#Combine train and test data
train_test_combine = pd.concat([train.drop(['price'], axis=1),test],axis=0)

train_test_combine= train_test_combine.rename(columns = {'<built-in_function_id>': id})
train_test_combine.head()

Unnamed: 0,<built-in function id>,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet,1
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,1
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,1
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...,1
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity,1


In [13]:
#Data type check
train_test_combine.shape

(2175894, 8)

In [14]:
train_test_combine.category_name = train_test_combine.category_name.astype('category')
train_test_combine.item_description = train_test_combine.item_description.astype('category')
train_test_combine.name = train_test_combine.name.astype('category')
train_test_combine.brand_name = train_test_combine.brand_name.astype('category')

# String to int by cat codes
train_test_combine.name = train_test_combine.name.cat.codes
train_test_combine.category_name = train_test_combine.category_name.cat.codes
train_test_combine.brand_name = train_test_combine.brand_name.cat.codes
train_test_combine.item_description = train_test_combine.item_description.cat.codes

train_test_combine.head()

Unnamed: 0,<built-in function id>,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
0,0,916335,3,829,-1,1,1172053,1
1,1,1292428,3,86,3889,0,1585539,1
2,2,131013,1,1277,4588,1,167133,1
3,3,802671,1,503,-1,1,1136643,1
4,4,65051,1,1204,-1,0,531909,1


In [15]:
train_test_combine.dtypes

<built-in function id>    int64
name                      int32
item_condition_id          int8
category_name             int16
brand_name                int16
shipping                   int8
item_description          int32
is_train                  int64
dtype: object

In [4]:
df_test = train_test_combine.loc[train_test_combine['is_train']== 0]
df_train = train_test_combine.loc[train_test_combine['is_train']== 1]

df_test = df_test.drop(['is_train'], axis=1)
df_train = df_train.drop(['is_train'], axis=1)

df_test.shape

NameError: name 'train_test_combine' is not defined

In [3]:
df_train.shape

NameError: name 'df_train' is not defined

In [18]:
df_train['price'] = train.price

df_train['price'] = df_train['price'].apply(lambda x: np.log(x) if x>0 else x)

df_train.head()

Unnamed: 0,<built-in function id>,name,item_condition_id,category_name,brand_name,shipping,item_description,price
0,0,916335,3,829,-1,1,1172053,2.30259
1,1,1292428,3,86,3889,0,1585539,3.95124
2,2,131013,1,1277,4588,1,167133,2.30259
3,3,802671,1,503,-1,1,1136643,3.55535
4,4,65051,1,1204,-1,0,531909,3.78419


In [19]:
#RandomForest
x_train, y_train = df_train.drop(['price'], axis=1), df_train.price

model = RandomForestRegressor(n_jobs=1, min_samples_leaf=5, n_estimators=200)
model.fit(x_train, y_train)

model.score(x_train,y_train)

0.7401111795718427

In [36]:
#test = test.rename(columns ="<built-in function id>" : 'id'})
#df_test.head()

df_test.columns = ["id",'name','item_condition_id','category_name','brand_name','shipping','item_description']
df_test.columns

Index(['id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'shipping', 'item_description'],
      dtype='object')

In [37]:
preds = model.predict(df_test)

np.exp(preds)

preds = pd.Series(np.exp(preds))

submit = pd.concat([df_test.id, preds], axis=1)
submit.columns = ['test.id', 'price']
submit.to_csv('submit_rf_base.csv', index=False)