In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

!apt-get install p7zip
!apt-get install p7zip-full 

!7za e '../input/mercari-price-suggestion-challenge/train.tsv.7z';
!7za e '../input/mercari-price-suggestion-challenge/test_stg2.tsv.zip';
!7za e '../input/mercari-price-suggestion-challenge/sample_submission.csv.7z';

In [None]:
import numpy as np
import pandas as pd

# -----------------------------------
# Load the training and test data
# -----------------------------------

# Explicit dtypes for the id / numeric columns of each file
types_dict_train = {
    'train_id': 'int64',
    'item_condition_id': 'int8',
    'price': 'float64',
    'shipping': 'int8',
}
types_dict_test = {
    'test_id': 'int64',
    'item_condition_id': 'int8',
    'shipping': 'int8',
}

# Read the tab-separated files into pandas DataFrames
train = pd.read_csv('./train.tsv', sep='\t', dtype=types_dict_train, low_memory=True)
test = pd.read_csv('./test_stg2.tsv', sep='\t', dtype=types_dict_test, low_memory=True)

In [None]:
## Inspect and fill missing values

# Give train and test a common id column name so they can be concatenated later.
# (rename is a no-op if the column is already called 'id', so the cell can be re-run)
train = train.rename(columns={'train_id': 'id'})
test = test.rename(columns={'test_id': 'id'})

# Columns that must never be used as model features
drop_train = ['id', 'is_train']
drop_test = ['id', 'is_train']

# Numeric feature columns
value_vars = ['item_condition_id', 'shipping']
# Text feature columns (label-encoded later)
category_vars = ['name', 'category_name', 'brand_name', 'item_description']

# Per-column count of missing values; a raw boolean frame would only show a
# truncated grid of True/False and hide where the gaps actually are.
print(train.isna().sum())
print(test.isna().sum())

# Show the training rows that have a missing value in each feature column
for col in value_vars + category_vars:
    print(col)
    print(train[train[col].isna()])

# Fill only the text columns with the placeholder. Filling the whole frame would
# also write the string 'Blank' into numeric columns such as price if they ever
# contained a gap, silently turning them into object dtype.
blank_fill = dict.fromkeys(category_vars, 'Blank')
train = train.fillna(blank_fill)
test = test.fillna(blank_fill)

print("Blank set")

# Confirm what is left after filling
print(train.isna().sum())

In [None]:
# Merge train and test once so the text columns are converted in a single pass
# (the category codes are then derived from the combined set of values).
# Flag column used to split the rows apart again afterwards
train['is_train'] = 1
test['is_train'] = 0

# Stack train (without the target column) on top of test
combine = pd.concat([train.drop(columns=['price']), test], axis=0)

# Label-encode each text column: cast to 'category', then keep the integer
# codes exposed by `.cat.codes`
for text_col in ('category_name', 'item_description', 'name', 'brand_name'):
    combine[text_col] = combine[text_col].astype('category').cat.codes

combine.head()
combine.dtypes


In [None]:
## Shape the data for modelling

# Split the combined frame back into its train and test rows
df_train = combine.loc[combine['is_train'] == 1]
df_test = combine.loc[combine['is_train'] == 0]

for frame in (df_train, df_test):
    print(frame.columns)

# Drop the columns that are not features
x_train = df_train.drop(columns=drop_train)
x_test = df_test.drop(columns=drop_test)


def _log_price(price):
    """Return log(price) for positive prices; return any other value unchanged."""
    if price > 0:
        return np.log(price)
    return price


# The target is a monetary amount, so apply a non-linear log transform (ref. p.127).
# NOTE(review): a price of 0 is left at 0, the same value log(1) produces, and the
# prediction cell inverts with np.exp - consider switching both to log1p/expm1 together.
y_train = train['price'].apply(_log_price)
print(x_train)


In [None]:
#モデルの学習
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Build and fit the model.
# A fixed random_state makes the forest (and therefore the submission)
# reproducible across "Restart & Run All"; without it every run differs.
RANDOM_SEED = 42
model = RandomForestRegressor(
    n_jobs=-1,
    min_samples_leaf=5,
    n_estimators=200,
    random_state=RANDOM_SEED,
)
model.fit(x_train, y_train)

# Score (R^2) on the same data the model was fitted on - an optimistic figure,
# not an estimate of performance on unseen data.
print(model.score(x_train, y_train))

In [None]:
# Show the feature columns used for fitting and for prediction
for features in (x_train, x_test):
    print(features.columns)

# Predict on the test features, then undo the log transform of the target
# with np.exp() so the predictions are on the original price scale
preds = np.exp(model.predict(x_test))


In [None]:
# Build the submission file
csvname = 'submission.csv'
print(csvname)

# Column order of the submission. The target column is lowercase 'price',
# matching the column name used in the training data.
submission_col = ['test_id', 'price']

# Take the ids from the same rows the predictions were made on (x_test is
# df_test minus the non-feature columns) instead of fabricating 0..n-1, so
# each prediction stays attached to its real test id.
submission = pd.DataFrame({
    'test_id': df_test['id'].to_numpy(),
    'price': preds,
})[submission_col]

submission.to_csv(csvname, index=False)
print(submission)