In [7]:
import pandas as pd
from numpy import *
from sklearn import *
# Seed the global RNG with a fixed value so re-runs start from the same state.
# NOTE(review): `random` is whatever the star imports above bound to that name
# (presumably numpy.random via `from numpy import *`) — confirm.
random.seed(20200108)

## Read Data

In [8]:
# Directory (relative to the working directory) holding the specification CSVs
# that the next cell reads.
file_path = 'Updated-CSV/specification'

In [9]:
# Load each retailer's specification table and discard its first CSV column
# (presumably a saved row index — confirm against the files).
data_hktv = (
    pd.read_csv(f"{file_path}/HKTVMALL_specification.csv")
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
data_suning = (
    pd.read_csv(f"{file_path}/Suning_specification.csv")
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

In [10]:
# Display the Suning table as loaded (rich repr of the whole frame).
data_suning

Unnamed: 0,index,name,ram,storage,brand_in_num,mean_rate,total_comment,price
0,1.058485e+10,APPLE IPHONE XS 64GB 手機 金色,0,64,1501,5.0,472.0,7409.0
1,1.067406e+10,HUAWEI MATE 20 X 手機 藍色,0,0,1351,5.0,505.0,4988.0
2,1.058485e+10,APPLE IPHONE XR 128GB 手機 黑色,0,128,1501,5.0,510.0,5699.0
3,1.058485e+10,APPLE IPHONE XS MAX 256GB 手機 金色,0,256,1501,5.0,509.0,9199.0
4,1.055580e+10,Apple iPad Pro 11 256GB Wi-Fi Space Gray MTXQ2...,0,256,1451,5.0,508.0,6799.0
...,...,...,...,...,...,...,...,...
331,1.135650e+10,APPLE IPHONE 11 64GB 手機 白色,0,64,1501,0.0,0.0,5999.0
332,1.135651e+10,APPLE IPHONE 11 64GB 手機 綠色,0,64,1501,0.0,0.0,5999.0
333,1.136269e+10,HUAWEI P30 PRO 8+256GB 手機 薰衣草,0,256,1351,0.0,0.0,5988.0
334,1.135651e+10,APPLE IPHONE 11 PRO 64GB 手機 太空灰,0,64,1501,0.0,0.0,8599.0


## Price Factors

In [11]:
# Stack both retailers' rows into one table for the price model.
# concat keeps each frame's own row labels (ignore_index is left at its
# default), so index values repeat across the two sources.
overall_df = pd.concat([data_hktv, data_suning])
overall_df.shape

(1339, 8)

In [12]:
# Regression target for the price model: the `price` column of the combined table.
Y_price = overall_df["price"]

In [13]:
# Predictors for the price model: every column except the two identifier
# columns and the target itself.
X_price = overall_df.drop(["index", "name", "price"], axis="columns")
X_price.shape

(1339, 5)

### Decision Tree Regressor

In [14]:
# Decision-tree regressor for price with a fixed random_state.
# NOTE(review): `tree` is expected to be the sklearn.tree submodule brought in
# by `from sklearn import *` — confirm.
dtr_price = tree.DecisionTreeRegressor(random_state=5201314)

In [15]:
# Fit on the full combined table (no train/test split is made here); the
# fitted estimator's repr is the cell output.
dtr_price.fit(X_price, Y_price)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=5201314, splitter='best')

In [16]:
# Per-feature importance scores from the fitted price tree; paired with
# X_price's column names in the next cell.
important_price = dtr_price.feature_importances_

In [17]:
# One-column table of feature importances for the price model, labelled by
# feature name and ordered from least to most important.
result_price = pd.DataFrame(
    {"Impact on price": important_price},
    index=pd.Index(X_price.columns.values, name="Spec"),
).sort_values(by="Impact on price")
result_price

Unnamed: 0_level_0,Impact on price
Spec,Unnamed: 1_level_1
mean_rate,0.007985
total_comment,0.032145
ram,0.03327
storage,0.225387
brand_in_num,0.701213


## Sales Factors

In [18]:
# Sales model uses the Suning table only.
# NOTE(review): `total_comment` is the target here, presumably as a proxy for
# sales volume — confirm.
X_sales = data_suning.drop(["index", "name", "total_comment"], axis="columns")
Y_sales = data_suning.loc[:, "total_comment"]
X_sales.shape

(336, 5)

In [19]:
# Decision-tree regressor for the sales target, using the same random_state as
# the price model.
dtr_sales = tree.DecisionTreeRegressor(random_state=5201314)

In [20]:
# Fit on every Suning row (no train/test split is made here); the fitted
# estimator's repr is the cell output.
dtr_sales.fit(X_sales, Y_sales)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=5201314, splitter='best')

In [21]:
# Per-feature importance scores from the fitted sales tree; paired with
# X_sales's column names in the next cell.
important_sales = dtr_sales.feature_importances_

In [22]:
# One-column table of feature importances for the sales model, labelled by
# feature name and ordered from least to most important.
result_sales = pd.DataFrame(
    {"Impact on sales": important_sales},
    index=pd.Index(X_sales.columns.values, name="Spec"),
).sort_values(by="Impact on sales")
result_sales

Unnamed: 0_level_0,Impact on sales
Spec,Unnamed: 1_level_1
storage,0.061594
ram,0.06845
brand_in_num,0.200978
mean_rate,0.20834
price,0.460638


## Plot the Trees

In [23]:
try:
    # Bundled `six` was removed in scikit-learn 0.23; kept for older installs.
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

In [24]:
def plot_a_tree(tree, name):
    """Render a fitted decision tree to a PNG file via Graphviz.

    Parameters
    ----------
    tree : fitted scikit-learn decision tree estimator
        Passed straight to ``export_graphviz``. Inside this function the
        parameter shadows any module-level name ``tree``.
    name : str
        Output path handed to ``write_png`` unchanged; no ``.png`` suffix is
        appended.

    Returns
    -------
    None
    """
    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no StringIO buffer (and no `sklearn.externals.six` import) is needed.
    dot_source = export_graphviz(
        tree,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = pydotplus.graph_from_dot_data(dot_source)
    graph.write_png(name)

In [26]:
# Write the price tree as a PNG to the path "price-tree" (the name carries no
# .png extension).
plot_a_tree(dtr_price, "price-tree")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.934731 to fit



In [27]:
# Write the sales tree as a PNG to the path "sales-tree" (the name carries no
# .png extension).
plot_a_tree(dtr_sales, "sales-tree")