# 1. 需求重要性

In [1]:
# 匯入套件
import numpy as np
import pandas as pd

from scipy.stats import pearsonr
from scipy.stats import kstest
from scipy.stats.contingency import association
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestRegressor

In [2]:
# 載入資料集 (final)
data = pd.read_excel("./Dataset/tb_cdri_work5_t3_final.xlsx")

In [4]:
# 載入資料集 (數據)
df = pd.read_excel("./Dataset/QFD數據_USB-C 充電器 60W 以上.xlsx")
df.rename(columns = {'ASIN':'asin'}, inplace = True)
df.drop(df.columns[10:], axis=1, inplace=True)

In [5]:
# 刪除不要的欄位
data.drop(columns = ["產業", "Brand", "p_no", "name",
                    "index", "opinion_items_word", "review", "title",
                    "rating","date","p_url","level1_no","level2_no"], inplace=True)

In [6]:
# 篩選為60W以上
sixty_above = data[data["Category"] == "USB-C充電器(60W以上)(USB-C Power adapter/charger Above 60W)"]

# 將emotion轉為 0:negative 1:positive
emotion_mapping = {'positive': 1, 'negative': 0}
sixty_above['emotion'] = sixty_above['emotion'].map(emotion_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sixty_above['emotion'] = sixty_above['emotion'].map(emotion_mapping)


In [7]:
# 針對主力商品 計算需求重要性
count_df = sixty_above["opinion_items"].value_counts().to_frame()
count_df = count_df.rename(columns={"opinion_items":"總評論數"})
count_df["重要性"] = count_df.apply(lambda r:r.rank(ascending=False)).astype(int)

In [8]:
# 計算平均
new_mean = sixty_above.groupby(by=["asin","opinion_items"])["emotion"].agg(["size","mean"]).reset_index()
new_mean.rename(columns = {"size":"產品評論總數","mean":"產品正評比例"}, inplace = True)

In [9]:
# 合併 df & new_mean datafeame
import warnings
warnings.filterwarnings("ignore")

final = pd.merge(df, new_mean, left_on= 'asin', right_on='asin')
final = final.drop(columns=["opinion_items"])

In [10]:
# 萃取 opinion_items 唯一值
items = list(sixty_above['opinion_items'].unique())

# 2. 計算相關性

In [11]:
feature_names = df.columns.tolist()[1:]

In [12]:
features = df.columns.size

Corr = np.empty((features-1, features-1))

for i in range(1, features):
    for j in range(1, features):
        A = df.iloc[:, i]
        B = df.iloc[:, j]
        if (df.iloc[:, i].dtype == 'float64') and (df.iloc[:, j].dtype == 'float64'):
            Corr[i-1, j-1] = pearsonr(A.values.T, B.values.T)[0]
        else:
            C = pd.crosstab(A, B).values       
            Corr[i-1, j-1] = association(C, method = 'cramer', correction = True)

corr = pd.DataFrame(Corr, index = feature_names, columns = feature_names)
corr

Unnamed: 0,尺寸(體積),重量,特徵,USB埠數量,總輸出功率,輸出,快充協議,認證,保固
尺寸(體積),1.0,0.9759,0.95477,0.987019,0.975345,0.976883,0.970848,1.0,1.0
重量,0.9759,1.0,0.931375,0.917362,0.915535,0.913362,0.912162,1.0,0.914195
特徵,0.95477,0.931375,1.0,0.915764,0.908538,0.927188,0.951734,0.957427,0.972774
USB埠數量,0.987019,0.917362,0.915764,1.0,0.616624,1.0,0.682474,0.839974,0.307352
總輸出功率,0.975345,0.915535,0.908538,0.616624,1.0,0.975553,0.657155,0.905232,0.532708
輸出,0.976883,0.913362,0.927188,1.0,0.975553,1.0,0.845812,0.924211,0.771006
快充協議,0.970848,0.912162,0.951734,0.682474,0.657155,0.845812,1.0,0.92582,0.637343
認證,1.0,1.0,0.957427,0.839974,0.905232,0.924211,0.92582,1.0,0.964365
保固,1.0,0.914195,0.972774,0.307352,0.532708,0.771006,0.637343,0.964365,1.0


# 3. QFD數據

In [14]:
# 載入資料集 (數據)
df = pd.read_excel("./Dataset/QFD數據_USB-C 充電器 60W 以上.xlsx")
df.rename(columns = {'ASIN':'asin'}, inplace = True)
df.drop(df.columns[10:], axis=1, inplace=True)

### 1.將重量：去掉單位 ounces

In [15]:
df["重量"] = df["重量"].str.split(" ", expand=True)[0]
df["總輸出功率"] = df["總輸出功率"].str.split("W", expand=True)[0]
df.head()

Unnamed: 0,asin,尺寸(體積),重量,特徵,USB埠數量,總輸出功率,輸出,快充協議,認證,保固
0,B09MV3M4GL,8.32136,6.1,Overcurrent Protection\nShort Circuit Protecti...,2C1A,65,C1 Max 65W\nC2 Max 25W\nA1 Max 15W,PD 3.0\nPDO\nPPS\nSFC,,0
1,B07PZSXL9J,8.01856,5.0,GaN\nFoldable Plug,1C1A,60,C1 Max 45W\nA1 Max 15W,PowerIQ 3.0,TUV certification,18
2,B07D64QLQ1,4.26899,3.42,Lightweight\nFoldable Plug\nCompact,1C,65,C1 Max 65W,PD,FCC\nSGS NA listed\nDOE Level VI,18
3,B097PTBB5V,20.475,10.8,Overcurrent Protection\nShort Circuit Protecti...,1C,100,C1 Max 100W,PD 3.0,,0
4,B07ZCGYP27,9.536406,4.8,Ultra Slim\nOvercharge Protection\nTemperature...,1C3A,65,C1 Max 65W\nA1 Max 20W\nA2 Max 20W\nA3 Max 20W,PowerIQ 3.0,,0


In [16]:
# 字串處理成數值
df["保固"] = df["保固"].replace(["Lifetime Warranty", 'lifetime warranty', '12(plus 12-month warranty extension)'], [100, 100, 24])

### 2.USB埠數量: 要切割成 2 欄，分別代表 USB-A 和 USB-C的數量

In [17]:
USB_C = []
USB_A = []

for i in df["USB埠數量"]:
    if len(i)//2 == 1:
        USB_C.append(i[::])
        x = i + "0A"
        y = x[2:]
        USB_A.append(y)
    else:
        C = slice(0, len(i)//2)
        A = slice(len(i)//2, len(i))
        USB_C.append(i[C]), USB_A.append(i[A]) 
        
# 把Ｃ替換成空白，只保留數值
USBC = [i.replace("C", "") for i in USB_C]
# 把A替換成空白，只保留數值
USBA = [i.replace("A", "") for i in USB_A]

# 新增新的兩個欄位USB-C,USB-A，並刪除欄位USB埠數量
df["USB-C數量"] = pd.DataFrame(USBC)
df["USB-A數量"] = pd.DataFrame(USBA)
df = df.drop(columns=["USB埠數量"], axis=1)

### 3.輸出: 類似作法分割成 2 columns，分別代表 USB-A 和 USB-C的最高輸出功率

In [18]:
# Import Library Regex
import re

# Define Function for Usb-A & Usb-C
def Extract_Usb_Max(input):
    numbers = max(map(float, re.findall('\d*\.\d+|\d+', input)))
    return numbers

def UsbC_Max(amount, type):
    for i in range(len(df)):
        if df[amount][i] == '1':
            type.append(Extract_Usb_Max(df["輸出"][i][:10]))
        
        if df[amount][i] == '2':
            type.append(Extract_Usb_Max(df["輸出"][i][:21]))
        
        if df[amount][i] == '3':
            type.append(Extract_Usb_Max(df["輸出"][i][:37]))
        
        if df[amount][i] == '4':
            type.append(Extract_Usb_Max(df["輸出"][i][:58]))

def UsbA_Max(amount, type):
    for i in range(len(df)):
        if df[amount][i] == '0':
            type.append(0)

        if df[amount][i] == '1':
            type.append(Extract_Usb_Max(df["輸出"][i][-10:]))
        
        if df[amount][i] == '2':
            type.append(Extract_Usb_Max(df["輸出"][i][-21:]))
        
        if df[amount][i] == '3':
            type.append(Extract_Usb_Max(df["輸出"][i][-32:]))
        
        if df[amount][i] == '4':
            type.append(Extract_Usb_Max(df["輸出"][i][-50:]))
            


# Execute Function & Get the Final Dataframe
numberC = []
numberA = []
UsbC_Max('USB-C數量', numberC)
UsbA_Max('USB-A數量', numberA)

df['USB-C Max'] = numberC
df['USB-A Max'] = numberA

### 4.“特徵”、“快充協議”和“認證”: 則可以看成是 multi-label 的問題 

In [19]:
# 匯入套件
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [20]:
# 特徵欄位處理

df_final = df.join(pd.DataFrame(mlb.fit_transform([str(df.loc[i,'特徵']).split("\n")
                                                   for i in range(len(df))]), columns=mlb.classes_))
col_特徵 = mlb.classes_.tolist()

In [21]:
# 快充協議欄位處理

df_final = df_final.join(pd.DataFrame(mlb.fit_transform([str(df.loc[i,'快充協議']).split("\n")
                                                   for i in range(len(df))]), columns=mlb.classes_))
col_快充 = mlb.classes_.tolist()
col_快充.remove('nan')
df_final = df_final.drop("nan", axis=1)

In [22]:
# 認證欄位處理

df_final = df_final.join(pd.DataFrame(mlb.fit_transform([str(df.loc[i,'認證']).split("\n")
                                                   for i in range(len(df))]), columns=mlb.classes_))
col_認證 = mlb.classes_.tolist()
col_認證.remove('nan')
df_final = df_final.drop("nan", axis=1)

In [23]:
# 改變資料型態為浮點數
df1 = df_final.astype({'重量':'float', 
                       "USB-C數量":"float",
                       "USB-A數量":"float", 
                       "總輸出功率":"float"})

In [24]:
# 刪除原始數據欄位
df = df1.drop(columns={'特徵', "輸出", "快充協議", "認證"}, axis=1)

In [25]:
spec_columns = df.columns.tolist()[1:]

# 4. 合併及建模

## 隨機森林回歸模型

In [78]:
# Parameters:
# n_estimators: 森林中樹木的數量，預設=100。
# max_features: 劃分時考慮的最大特徵數，預設auto。
# criterion: 評估切割點指標，mse/mae。
# max_depth: 樹的最大深度。
# splitter: 特徵劃分點選擇標準，best/random。預設為best。
# random_state: 亂數種子，確保每次訓練結果都一樣，splitter=random 才有用。
# min_samples_split: 至少有多少資料才能再分
# min_samples_leaf: 分完至少有多少資料才能分
# Attributes:
# feature_importances_: 查詢模型特徵的重要程度。

In [27]:
# 合併 data & new_mean datafeame
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

importances = np.empty((0,75), int)


number_list = [i for i in range(10, 110, 10)] # 設定 RandomForestRegression 模型裡的 n_estimaters 參數

mse_list = [] # 建立MSE串列
maxd_list = [1, 2, 3] # 建立樹的深度

for i in items:
    comment_temp = new_mean[new_mean['opinion_items'] == i].reset_index(drop = True)
    spec_comment = pd.merge(df, comment_temp, left_on= 'asin', right_on='asin')
    spec_comment.drop(columns=['opinion_items'], axis=1, inplace=True)
    
    X = spec_comment.iloc[:,1:-2].values
    y = spec_comment.iloc[:,-1].values
    
    # split into train test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)   

    # Fit the model
    best_mse = 1
    best_par = {}
    for n in number_list:
        for d in maxd_list:
            forest = RandomForestRegressor(n_estimators = n, criterion = "absolute_error", max_features = "sqrt", max_depth = d)
            mse = cross_val_score(forest, X_train, y_train, cv=len(y_train), scoring='neg_mean_squared_error').mean()
            if mse < best_mse:
                best_mse = mse
                best_par = {'number':n, 'max_depth': d}

    forest = RandomForestRegressor(n_estimators = best_par['number'], criterion = "mse", max_features = "sqrt", max_depth = best_par['max_depth'])
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)
    mse = metrics.mean_squared_error(y_test, predicted)
    mse_list.append([mse, best_par])

    importances = np.vstack([importances, forest.feature_importances_])

MSE = pd.DataFrame(mse_list, columns=["MSE Score", "Best parameters"])
MSE

InvalidParameterError: The 'criterion' parameter of RandomForestRegressor must be a str among {'friedman_mse', 'poisson', 'squared_error', 'absolute_error'}. Got 'mse' instead.

In [80]:
df = pd.DataFrame(importances, index = items, columns = spec_columns)

In [81]:
sum_col_特徵 = df[col_特徵].sum(axis = 1)
df1 = df.assign(特徵 = sum_col_特徵)
df = df1.drop(columns = col_特徵)

sum_col_快充 = df[col_快充].sum(axis = 1)
df2 = df.assign(快充 = sum_col_快充)
df = df2.drop(columns = col_快充)

sum_col_認證 = df[col_認證].sum(axis = 1)
df3 = df.assign(認證 = sum_col_認證)
df = df3.drop(columns = col_認證)

# 將 df 匯出成Excel
df.to_excel("/Users/howard/Desktop/商業發展研究院產學/Python ipynb/Excel檔/USB-C_充電器_60W以上訓練RandomForest.xlsx")