In [1]:
from pymongo import MongoClient
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
%matplotlib inline

In [2]:
# Open a MongoDB client with no arguments (pymongo's default connection
# settings apply) and select the 'PRIUS' collection of the 'CARSENSOR' database.
client = MongoClient()
db = client['CARSENSOR']
collection = db['PRIUS']

# find() with no filter yields a cursor over every document in the collection.
col = collection.find()
# Materialise the cursor into a DataFrame and drop every row that contains NaN.
df = pd.DataFrame(list(col)).dropna()

In [3]:
# Display one example document from the collection to illustrate its fields.
collection.find_one()

{'_id': ObjectId('5bc2240bb2bf24158484a663'),
 'url': 'https://www.carsensor.net/usedcar/detail/CU8456803215/index.html?TRCD=200002',
 'base_price': 2430000,
 'total_price': 2590000,
 'model_year': 2018,
 'distance': 0.001,
 'repare': 'なし',
 'inspection': '新車未登録',
 'info': '1.8 S セーフティ プラス 新車 LEDヘッド （ブラック）',
 'drive': '2WD',
 'recycle': 'リ済込',
 'legal_maintenance': '法定整備付',
 'warranty': '保証付：販売店保証 保証期間：5年 保証距離：100,000km',
 'one_owner': 0,
 'record_book': 1,
 'no_smoke': 1,
 'key': 'CU8456803215',
 'region': '愛知県',
 'keyless': 1,
 'smartkey': 1,
 'navi': 0,
 'TV': 0,
 'video': 0,
 'audio': 0,
 'player': 0,
 'monitor': 0,
 'ETC': 0,
 'sheat_air': 0,
 'sheat_heater': 0,
 'idling_stop': 1,
 'AS_sensor': 0,
 'cruise': 1,
 'ABS': 1,
 'ESC': 1,
 'anti_theft': 1,
 'auto_brake': 0,
 'parking_assist': 1,
 'airbag': '運転席/助手席/サイド/－',
 'headlight': 'LED',
 'camera': 0,
 'around_camera': 0,
 'aero': 0,
 'alumi_wheel': 0,
 'lowdown': 0,
 'liftup': 0,
 'cold_area': 0}

## データ読み込み

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Seed shared by the row shuffle and the train/test split. The shuffle must be
# seeded as well: the later cross_val_score(..., cv=10) calls take folds in row
# order, so an unseeded shuffle makes every downstream score irreproducible.
RANDOM_STATE = 30

# Raw string values that are collapsed to 0/1 flags, applied to the whole frame.
# NOTE(review): apart from 'LED' (headlight) and 'なし' (repare), which appear in
# the example document, the columns these strings occur in are not visible
# here -- confirm against the scraper.
VALUE_TO_FLAG = {
    # "front/side/back" position strings: any combination that includes "back" -> 1
    'フロント/サイド/バック': 1,
    'フロント/－/バック': 1,
    '－/サイド/バック': 1,
    '－/－/バック': 1,
    # media-type strings: CD / DVD -> 0, HDD / "memory etc." -> 1
    'CD': 0,
    'DVD': 0,
    'HDD': 1,
    'メモリー他': 1,
    # "yes" / "no"
    'あり': 1,
    'なし': 0,
    # LED / discharge lamp -> 1
    'LED': 1,
    'ディスチャージドランプ': 1,
}

col = collection.find()
# Drop rows that contain NaN.
df = pd.DataFrame(list(col)).dropna()
# Shuffle the rows (seeded so the order is stable across kernel restarts).
df = df.sample(frac=1, random_state=RANDOM_STATE)
# One scalar replace per entry, exactly as the original chain of calls did.
for raw_value, flag in VALUE_TO_FLAG.items():
    df = df.replace(raw_value, flag)
# Re-infer dtypes so that columns which now hold only numbers are numeric
# whether or not replace() downcasts object columns in the installed pandas.
df = df.infer_objects()

variable_list = [
    'distance',
    'model_year',
    'smartkey',
    'keyless',
    'record_book',
    'cruise',
    'parking_assist',
    'AS_sensor',
    'auto_brake',
    'repare',
    'camera',
    'navi',
    'ETC',
    'sheat_air',
    'sheat_heater',
    'ABS',
    'ESC',
    'anti_theft',
    'around_camera',
    'aero',
    'alumi_wheel',
    'lowdown',
    'liftup',
    'cold_area'
]

# The first two variables are standardised; the remaining ones are passed
# through get_dummies.
numeric_cols = variable_list[:2]
flag_cols = variable_list[2:]

x = df[variable_list]
y = df[['base_price']]

# get_dummies leaves numeric columns untouched and one-hot encodes any column
# that is still object/category typed.
x_dummies = pd.get_dummies(x[flag_cols])
x_ = x.drop(flag_cols, axis=1)

# Cast to float explicitly so StandardScaler does not emit a dtype-conversion
# warning for integer input.
# NOTE(review): the scaler is fit on all rows before the split below, so the
# test rows influence the scaling statistics.
x_std = StandardScaler().fit_transform(x_.astype('float64'))
x = np.hstack((x_std, x_dummies.values))
x_data_df = pd.DataFrame(x, columns=numeric_cols + list(x_dummies.columns))
# Rescale the target by 1/10000 (presumably to units of 10,000 yen -- confirm).
y_data_df = y / 10000
y = y_data_df.values.reshape(-1)


train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## 特徴選択(全データを使用、RandomForest)

In [5]:
# Fit a random forest on the full data set and rank features by importance.
rf = RandomForestRegressor(n_estimators=200, random_state=50)
rf.fit(x, y)

fi = rf.feature_importances_
# (importance, label) pairs, largest importance first.
result = sorted(zip(fi, x_data_df.columns), reverse=True)

# Feature labels in descending order of importance; later cells slice this list.
v = [label for _, label in result]
for importance, label in result:
    print("{0:15}  {1:0.6}".format(label, importance))

model_year       0.836925
distance         0.0789052
lowdown          0.0168161
aero             0.00976041
repare           0.00806706
cruise           0.00589656
auto_brake       0.00579045
record_book      0.0051595
sheat_heater     0.00469167
camera           0.00400534
ETC              0.00370045
anti_theft       0.00357377
navi             0.00314469
AS_sensor        0.00273345
ESC              0.00260764
parking_assist   0.00197162
cold_area        0.00177578
smartkey         0.00158297
alumi_wheel      0.00135012
keyless          0.000664804
ABS              0.000528251
sheat_air        0.000165102
around_camera    0.000112198
liftup           7.20092e-05


In [11]:
def adjusted_r2(r2_list, num_y, num_d):
    """Convert R^2 scores into adjusted R^2 scores.

    Each score is mapped through
    ``1 - (1 - r2) * (num_y - 1) / (num_y - num_d - 1)``.

    Parameters
    ----------
    r2_list : iterable of float
        R^2 values to adjust.
    num_y : int
        Sample count used as ``n`` in the formula.
    num_d : int
        Number of explanatory variables used as ``p`` in the formula.

    Returns
    -------
    list of float
        Adjusted scores, in the same order as ``r2_list``.
    """
    return [1 - (1 - r2) * (num_y - 1) / (num_y - num_d - 1) for r2 in r2_list]
#     1 - (1-clf.score(x_data, y_data))*(len(y_data)-1)/(len(y_data)-x_data.shape[1]-1)

In [13]:
# Train and evaluate each model with 10-fold cross validation.
# The evaluation metric is adjusted R^2, derived from the per-fold R^2 scores.
# Models compared: linear regression, random forest and SVR.

# Feature matrix restricted to the ten highest-ranked features in `v`.
x2 = x_data_df[v[:10]].values
linear_reg = linear_model.LinearRegression()
rf = RandomForestRegressor(n_estimators=200, random_state=50)
svr = SVR(gamma='scale')
# Integer tenth of the sample count, used as n in adjusted_r2.
num_y = y.shape[0] // 10

estimators = (linear_reg, rf, svr)
cv_options = dict(cv=10, scoring='r2')

# Per-fold R^2 on the 10-feature subset ("sub") and on every feature ("all").
lr_scores_sub, rf_scores_sub, svr_scores_sub = [
    cross_val_score(estimator, x2, y, **cv_options) for estimator in estimators
]
lr_scores_all, rf_scores_all, svr_scores_all = [
    cross_val_score(estimator, x, y, **cv_options) for estimator in estimators
]

# Adjust each fold's R^2 for the number of features the model was given.
lr_adr2_sub, rf_adr2_sub, svr_adr2_sub = [
    adjusted_r2(scores, num_y, x2.shape[1])
    for scores in (lr_scores_sub, rf_scores_sub, svr_scores_sub)
]
lr_adr2_all, rf_adr2_all, svr_adr2_all = [
    adjusted_r2(scores, num_y, x.shape[1])
    for scores in (lr_scores_all, rf_scores_all, svr_scores_all)
]

# One line per model/feature-set pair: mean and standard deviation over folds.
for a in (lr_adr2_sub, rf_adr2_sub, svr_adr2_sub,
          lr_adr2_all, rf_adr2_all, svr_adr2_all):
    print('mean:{0:.5}    std:{1:.5}'.format(np.mean(a), np.std(a)))

mean:0.87579    std:0.0078976
mean:0.90187    std:0.0049071
mean:0.89518    std:0.0054207
mean:0.87769    std:0.0073689
mean:0.90777    std:0.0063641
mean:0.89033    std:0.0057512


## statsmodelsによる回帰分析

In [17]:
# Ordinary least squares on every feature. add_constant supplies the intercept
# column (reported as 'const' in the summary).
x_sm = sm.add_constant(x_data_df)
model = sm.OLS(endog=y, exog=x_sm)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.883
Model:                            OLS   Adj. R-squared:                  0.882
Method:                 Least Squares   F-statistic:                     2526.
Date:                Mon, 26 Nov 2018   Prob (F-statistic):               0.00
Time:                        23:46:53   Log-Likelihood:                -36520.
No. Observations:                8070   AIC:                         7.309e+04
Df Residuals:                    8045   BIC:                         7.326e+04
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            142.7134      3.132     45.

In [18]:
# Ordinary least squares restricted to the ten highest-ranked features in `v`,
# again with an explicit intercept column.
x_sm_sub = sm.add_constant(x_data_df[v[:10]])
model = sm.OLS(endog=y, exog=x_sm_sub)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.878
Model:                            OLS   Adj. R-squared:                  0.878
Method:                 Least Squares   F-statistic:                     5791.
Date:                Mon, 26 Nov 2018   Prob (F-statistic):               0.00
Time:                        23:46:59   Log-Likelihood:                -36688.
No. Observations:                8070   AIC:                         7.340e+04
Df Residuals:                    8059   BIC:                         7.348e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          121.1046      0.553    219.193   