# XGBClassifier - 分類電商商品名

In [1]:
# !pip install jieba
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
import torch
import jieba
import jieba.analyse as analyse
import jieba.posseg as pseg  # 詞性標註
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.feature_extraction.text import TfidfVectorizer

jieba.load_userdict("./dict.txt")  # 匯入自訂義斷詞字典

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 0.628 seconds.
Prefix dict has been built successfully.


In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [18]:
df = pd.read_csv("./dataset/result_online.csv", encoding="utf-8")
print(df.head())

   item_id         item_name item_category_item_category2
0   505462         大維錠狀食品純素錠                   保健/醫療|機能保健
1   510277  樟芝天然綜合水果益生菌酵素發酵液                   保健/醫療|機能保健
2   473902         養氣人蔘禮盒ml入                   保健/醫療|機能保健
3   406767           雪麗維他命C錠                   保健/醫療|機能保健
4   322386         勇健鈣鎂鋅加強錠錠                   保健/醫療|機能保健


In [19]:
product_name_list = list(df['item_name'])
label = list(df['item_category_item_category2'])

print(len(np.unique(label)))  # 印資料的label有多少獨立類別

# concat product_name_list與label為一dataframe
df = pd.DataFrame(
    {'product_name': product_name_list,
     'item_category12': label
    })
print(df.item_category12.value_counts())  # 印出每類數量

407
圖書影音|文學小說        3846
圖書影音|童書/教具       3422
服飾|女裝            2766
傢俱收納|櫥櫃/櫃子       2730
鞋包箱|運動鞋          2362
                 ... 
彩妝保養|彩妝刷具           1
精品/飾品|鐘錶周邊          1
手機/平板|SONY相機        1
看看買|BEBEPOSHE       1
生鮮|冷凍快配             1
Name: item_category12, Length: 407, dtype: int64


## 類別平衡

In [20]:
# 做類別平衡&打亂
# 由於硬體設備限制，對於category1+2每類只平衡至200

shuffled = df
data = []
unique_label = np.unique(np.unique(df.item_category12)).tolist()  # 不重複category1+2

# 對各Category抽樣n筆資料
for i in unique_label:
    globals()['name' + str(i)] = shuffled[shuffled['item_category12'] == i].sample(n=200,replace=True)
    data.append(globals()['name' + str(i)])

# concat the dataset 將各Category合併
concated = data[0]
for i in range(1,len(unique_label)):
    concated = pd.concat([concated, data[i]], ignore_index=True)

# Shuffle the dataset
concated = shuffle(concated)

# 重新assign index
concated.reset_index(drop=True, inplace=True)
print(concated.shape)  # (81400, 2): 每個category1+2有200筆資料，包含product_name&item_category12兩欄
print(concated.head())

(81400, 2)
          product_name item_category12
0          透明衣物收納防塵套小#    傢俱收納|壓縮袋/收納袋
1     矜蘭妃蠶絲石墨烯下半身幸福男內褲     看看買|看看買-內塑衣
2         英吋打蠟機拋光布件超值組       修繕園藝|電動工具
3  職場通勤健步美型簡約漆皮扣後拉帶中跟鞋       鞋包箱|品牌休閒鞋
4               電擊式捕蚊燈         家電|捕蚊家電


In [21]:
# 檢查有無空值
concated.isna().sum()

product_name       0
item_category12    0
dtype: int64

In [22]:
# 把中文字以外的字元拿掉
rule = re.compile(r"[^\u4e00-\u9fa5]")
concated['product_name'] = [rule.sub('', name) for name in concated['product_name']]
print(concated['product_name'][:5])

0             透明衣物收納防塵套小
1       矜蘭妃蠶絲石墨烯下半身幸福男內褲
2           英吋打蠟機拋光布件超值組
3    職場通勤健步美型簡約漆皮扣後拉帶中跟鞋
4                 電擊式捕蚊燈
Name: product_name, dtype: object


In [23]:
# 載入停用字檔案
with open('stop_words.txt', encoding="utf-8") as f:
    stopword_list = [s for s in f.read().split('\n')]

In [24]:
# 斷詞(3種version: cut/topk: 選k個最重要的特徵/posseg: 依詞性斷詞)
seg_product_name = []
count = 0
for name in concated['product_name']:
    name = ''.join([w for w in name if w not in stopword_list])
    seg_product_name.append([w for w in jieba.cut(name)])
#     seg_product_name.append([w for w in jieba.analyse.extract_tags(name, topK=3, withWeight=False, allowPOS=())])
"""
    seg = []
    generator = pseg.cut(name)  # 返回生成器
    for word, flag in generator:   # word為斷的字/詞; flag為對應詞性
        # 去除形容詞、副詞與無詞性
        if(flag == 'a' or flag == 'd' or flag == 'ad' or flag == 'x'):  
            continue
        seg.append(word)
    seg_product_name.append(seg)
"""
print(seg_product_name[:5])

[['透明', '衣物', '收納', '防塵套'], ['矜蘭妃', '蠶絲', '石墨', '烯身', '幸福', '男褲'], ['英', '吋', '蠟機', '拋布件', '超值', '組'], ['職場', '通勤', '健步', '美型', '簡約', '漆皮', '扣拉中', '鞋'], ['電擊式', '捕', '蚊燈']]


In [25]:
for idx, name in enumerate(seg_product_name):
    seg_product_name[idx] = ' '.join([word for word in name])
print(seg_product_name[:5])

['透明 衣物 收納 防塵套', '矜蘭妃 蠶絲 石墨 烯身 幸福 男褲', '英 吋 蠟機 拋布件 超值 組', '職場 通勤 健步 美型 簡約 漆皮 扣拉中 鞋', '電擊式 捕 蚊燈']


## 特徵提取&降維

In [25]:
tfidf = TfidfVectorizer(max_df = 0.7, ngram_range=(1, 2), sublinear_tf=True, norm='l2', min_df = 0.00004)
features = tfidf.fit_transform(seg_product_name).toarray()
print(features.shape)

(81400, 20253)


In [23]:
X = features
y = concated['item_category12']

In [24]:
# Dimenionality reduction. Only using the 100 best features per category
from sklearn.decomposition import PCA
pca = PCA(n_components= 100,random_state=3)
X = pca.fit_transform(X)
X.shape

# Deal with PCA memory error 
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html 
from sklearn import decomposition 
"""
svd = decomposition.TruncatedSVD(n_components = 100, algorithm='arpack')
X = svd.fit_transform(features)
print(X.shape)
print(svd.explained_variance_ratio_.sum())
"""

"\nsvd = decomposition.TruncatedSVD(n_components = 100, algorithm='arpack')\nX = svd.fit_transform(features)\nprint(X.shape)\nprint(svd.explained_variance_ratio_.sum())\n"

In [78]:
# Train test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    random_state=3)

In [79]:
from xgboost import XGBClassifier
# model = XGBClassifier(tree_method='gpu_hist')
model = XGBClassifier(
        n_estimators=100, #樹的個數
        learning_rate= 0.5,   
        max_depth = 6,  # 構建樹的深度，越大越容易過擬合   
        subsample= 0.1, # 隨機取樣訓練樣本 訓練例項的子取樣比
        gamma= 0.1, # 用於控制是否後剪枝的引數,越大越保守，一般0.1、0.2這樣子
        reg_lambda=1,   # 控制模型複雜度的權重值的L2正則化項引數，引數越大，模型越不容易過擬合。
        colsample_bytree= 1, #最大增量步長，我們允許每個樹的權重估計。
    
        # 這個引數預設是 1，是每個葉子裡面 h 的和至少是多少，對正負樣本不均衡時的 0-1 分類而言
        # 假設 h 在 0.01 附近，min_child_weight 為 1 意味著葉子節點中最少需要包含 100 個樣本。
        #這個引數非常影響結果，控制葉子節點中二階導的和的最小值，該引數值越小，越容易 overfitting。
        min_child_weight = 1, 
        reg_alpha=0, # L1 正則項引數
        scale_pos_weight=1, #如果取值大於0的話，在類別樣本不平衡的情況下有助於快速收斂。平衡正負權重
        objective= 'multi:softmax', 
        silent=0 , # 設定成1則沒有執行資訊輸出，最好是設定為0.是否在執行升級時列印訊息。
        nthread=8,
        eval_metric= 'auc',
        tree_method = 'gpu_hist')
model.fit(X_train, y_train)

  return f(*args, **kwargs)


Parameters: { "scale_pos_weight", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='auc', gamma=0.1, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.5, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8, nthread=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              silent=0, subsample=0.1, tree_method='gpu_hist', ...)

In [16]:
# Save Model
import pickle

with open("XGB_model_cat1_cut.pkl", "wb") as file:
    pickle.dump(model, file)

## 預測Testing set (以下為預測offline data)

In [17]:
# 匯入模型
with open("XGB_model_cat12_cut.pkl", "rb") as file:
   model = pickle.load(file)

In [80]:
# 匯入offline data
testing_set = pd.read_csv("./dataset/統一資料集/offline_result.csv", encoding="utf-8")
testing_set = testing_set[:191]
print(testing_set.tail())

     sid      id  Original Item Name Preprocessed Item Name category1  \
186  186   82595               美國青花菜                  美國青花菜        生鮮   
187  187   38029  TOMMY’S BAKING軟質餅乾      TOMMY’SBAKING軟質餅乾      食品飲料   
188  188   90667            A++金牌肉燥飯                 A金牌肉燥飯      食品飲料   
189  189  100950            奇蹟活源水精華1                奇蹟活源水精華      彩妝保養   
190  190   59360          怡麗絲爾彈潤時控精粹             怡麗絲爾彈潤時控精粹      彩妝保養   

    category2    cat1|cat2  Unnamed: 7 Unnamed: 8 Unnamed: 9  Unnamed: 10  \
186      蔬菜水果      生鮮|蔬菜水果         NaN        NaN        NaN          NaN   
187      休閒零食    食品飲料|休閒零食         NaN        NaN        NaN          NaN   
188    米/麵/泡麵  食品飲料|米/麵/泡麵         NaN        NaN        NaN          NaN   
189    專櫃保養品牌  彩妝保養|專櫃保養品牌         NaN        NaN        NaN          NaN   
190    專櫃保養品牌  彩妝保養|專櫃保養品牌         NaN        NaN        NaN          NaN   

     Unnamed: 11  Unnamed: 12 Unnamed: 13 Unnamed: 14  
186          NaN          NaN         NaN 

In [81]:
# 斷詞&去除stopwords
processed_testing = []

for i in range(0, len(testing_set['Preprocessed Item Name'])):
    # 把中文字以外的字元拿掉
    seg = rule.sub('', testing_set['Preprocessed Item Name'][i])
    seg = ''.join([w for w in seg if w not in stopword_list])
    seg = ' '.join([w for w in jieba.cut(seg)])
    processed_testing.append(seg)

print(processed_testing[:10])  # X
print(testing_set['cat1|cat2'][:10])  # y
print(len(processed_testing))
print(len(testing_set['cat1|cat2']))

0        食品飲料|蛋糕甜點
1        食品飲料|進口零食
2        個人清潔|身體清潔
3      彩妝保養|開架彩妝品牌
4      彩妝保養|開架保養品牌
5        食品飲料|進口零食
6      彩妝保養|開架彩妝品牌
7        餐廚用品|碗盤餐具
8    食品飲料|罐頭/食材/烘焙
9          彩妝保養|香水
Name: cat1|cat2, dtype: object
['熊餅 乾 草莓 蛋糕 風味', '西村 嬰 野菜 蛋 酥', '熊野 油脂 超酷 涼 沐', '時尚 師色 眼影 盤 冷淡 沙漠', '保濕全 修護組', '黑巧克', '奢華 訂 製 鏡 水 唇 釉', '僑俐', '法國 洛克 福 乾酪', '香水 系列']
191
191


In [82]:
# tfidf特徵提取 + PCA降維(or truncated SVD)
tfidf = TfidfVectorizer(max_df = 0.7, ngram_range=(1, 2), sublinear_tf=True, norm='l2', min_df = 0.00004)
features_testing = tfidf.fit_transform(processed_testing).toarray()

from sklearn.decomposition import PCA
pca = PCA(n_components= 100,random_state=3)
X_test = pca.fit_transform(features_testing)

print(X_test.shape)
y_test = testing_set['cat1|cat2']

(191, 100)


In [83]:
#Parameters:
#   prediction: (List)預測結果; Y: (List)對應答案; classes: 包含所有分類的List
#Returns:
#   Overall precision, List of (class, precision, recall)
def precision_and_recall(prediction, Y, classes):

    count_dic = {}
    correct_dic = {}

    all_prec = float()
    class_prec = {}
    class_recall = {}
    ret = []

    #Overall precision/accuracy
    
    correct = 0
    for idx, s in enumerate(prediction):
        if s == Y[idx]:
            correct += 1
    all_prec = correct / len(Y)
    
#########################################

    #Recall of classes
    
    # 先初始化list
    for s in classes:
        count_dic[s] = 0
        correct_dic[s] = 0

    for idx, s in enumerate(Y):  # 對於每個正確答案類別
        count_dic[s] += 1    # 正確答案的各類別數ex. 正確答案是保健/醫療的有幾個(包含預測錯&預測對的)
        if s == prediction[idx]:   # 猜對的有幾個 ex. 猜對是保健/醫療的有幾個
            correct_dic[s] += 1

    for s in classes:
        if count_dic[s] == 0:
            class_recall[s] = None
        else:
            # 每個類別中，猜對數/答案是那個類別的數量
            class_recall[s] = correct_dic[s] / count_dic[s]
            
#######################################

    #Precision of classes
    
    # 先初始化list
    for s in classes:
        count_dic[s] = 0
        correct_dic[s] = 0

    for idx, s in enumerate(prediction):   # 對於每個預測的類別
        count_dic[s] += 1    # 模型猜某類別的數量ex. 預測是保健/醫療的有幾個(包含預測錯&對)
        if s == Y[idx]:    
            correct_dic[s] += 1    # 有猜對是某類別的數量ex. 正確答案也是保健/醫療的有幾個(預測對的)

    for s in classes:
        if count_dic[s] == 0:
            class_prec[s] = None
        else:
            # 每個類別中，猜對是那個類別的數量/所有猜那個類別的數量
            class_prec[s] = correct_dic[s] / count_dic[s]
            
######

    #Return
    for s in classes:
        ret.append((s, class_prec[s], class_recall[s]))

    return all_prec, ret

In [84]:
prediction = model.predict(X_test)
precision, recall = precision_and_recall(prediction.tolist(), y_test.tolist(), unique_label)

print("precision: ", precision)
print("precision and recall of classes: \n")
for s in recall:
    print("Class ", s[0], "\nPrecision: ", s[1], "\nRecall: ", s[2], "\n\n", sep='')

precision:  0.0
precision and recall of classes: 

Class outlet名品匯|品牌寢具
Precision: None
Recall: None


Class outlet名品匯|國際精品
Precision: None
Recall: 0.0


Class 保健/醫療|DV笛絲薇夢
Precision: None
Recall: None


Class 保健/醫療|Simply新普利
Precision: None
Recall: None


Class 保健/醫療|保健用品/體重(脂)計
Precision: 0.0
Recall: None


Class 保健/醫療|保險套/潤滑液
Precision: None
Recall: None


Class 保健/醫療|兒童保健
Precision: None
Recall: None


Class 保健/醫療|冷熱敷墊/護具
Precision: None
Recall: None


Class 保健/醫療|口罩/抗菌/防蚊
Precision: None
Recall: 0.0


Class 保健/醫療|台糖/李時珍
Precision: None
Recall: None


Class 保健/醫療|娘家/消費高手
Precision: None
Recall: None


Class 保健/醫療|媽媽寶寶
Precision: None
Recall: None


Class 保健/醫療|成人營養品
Precision: None
Recall: None


Class 保健/醫療|機能保健
Precision: None
Recall: 0.0


Class 保健/醫療|益生菌
Precision: None
Recall: None


Class 保健/醫療|看看買-嚴選保健
Precision: None
Recall: None


Class 保健/醫療|窈窕美體
Precision: None
Recall: None


Class 保健/醫療|維他命
Precision: None
Recall: None


Class 保健/醫療|葉黃素
Precision: None
Recall: None


Cl

Recall: None


Class 彩妝保養|擴香/蠟燭
Precision: None
Recall: None


Class 彩妝保養|水氧機/精油
Precision: 0.0
Recall: None


Class 彩妝保養|男士保養品牌
Precision: None
Recall: 0.0


Class 彩妝保養|看看買-美妝保養
Precision: None
Recall: None


Class 彩妝保養|眼眉彩
Precision: None
Recall: 0.0


Class 彩妝保養|私密保養
Precision: None
Recall: None


Class 彩妝保養|精油
Precision: 0.0
Recall: None


Class 彩妝保養|美容工具小物
Precision: 0.0
Recall: None


Class 彩妝保養|身體保養
Precision: None
Recall: 0.0


Class 彩妝保養|醫美保養品牌
Precision: None
Recall: 0.0


Class 彩妝保養|開架保養品牌
Precision: 0.0
Recall: 0.0


Class 彩妝保養|開架彩妝品牌
Precision: None
Recall: 0.0


Class 彩妝保養|香氛保養品牌
Precision: 0.0
Recall: 0.0


Class 彩妝保養|香水
Precision: None
Recall: 0.0


Class 戶外用品|戶外/露營
Precision: 0.0
Recall: None


Class 戶外用品|烤肉/野餐
Precision: None
Recall: None


Class 戶外用品|登山健行
Precision: 0.0
Recall: None


Class 戶外用品|軍迷用品
Precision: 0.0
Recall: None


Class 戶外用品|釣具
Precision: None
Recall: None


Class 戶外用品|雨傘/雨衣
Precision: None
Recall: None


Class 手機/平板|3C福利品
Precision: None
Recall: None

Class 電腦/週邊|墨水/碳粉匣
Precision: None
Recall: None


Class 電腦/週邊|外接式硬碟
Precision: None
Recall: None


Class 電腦/週邊|插座/延長線
Precision: None
Recall: None


Class 電腦/週邊|智能居家/視訊監控
Precision: None
Recall: None


Class 電腦/週邊|智能居家/電子鎖
Precision: None
Recall: None


Class 電腦/週邊|標籤機/相印機/其他列印
Precision: None
Recall: None


Class 電腦/週邊|機殼/電源/裝機配件
Precision: None
Recall: None


Class 電腦/週邊|滑鼠/鍵盤
Precision: None
Recall: None


Class 電腦/週邊|筆記型電腦
Precision: None
Recall: None


Class 電腦/週邊|筆電包/配件
Precision: 0.0
Recall: None


Class 電腦/週邊|網路設備/分享器
Precision: 0.0
Recall: None


Class 電腦/週邊|繪圖/手寫板
Precision: None
Recall: None


Class 電腦/週邊|記憶體
Precision: None
Recall: None


Class 電腦/週邊|辦公事務/營業設備
Precision: None
Recall: None


Class 電腦/週邊|隨身碟
Precision: None
Recall: None


Class 電腦/週邊|雷射印表機
Precision: None
Recall: None


Class 電腦/週邊|電競LCD電腦螢幕
Precision: None
Recall: None


Class 電腦/週邊|電競桌機/組裝電腦
Precision: None
Recall: None


Class 電腦/週邊|電競筆記型電腦
Precision: None
Recall: None


Class 電腦/週邊|電競週邊
Precision: None
Re

In [39]:
# 印出部分模型預測與實際label
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
y_pred = model.predict(X_test)
print("X: ", testing_set['Preprocessed Item Name'][:10])
print("Y hat: ", y_pred[:10])
print("Y: ", y_test[:10])
print(accuracy_score(y_test, y_pred))

print("---------------------------------")
print("\n印y hat各類數量: ")
(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print("共", len(frequencies), "類")

print("\n印Y各類數量: ")
(unique, counts) = np.unique(y_test, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print("共", len(frequencies), "類")

X:  0          小熊餅乾草莓蛋糕風味g
1            西村嬰兒野菜蛋酥g
2    熊野油脂PharmaACT超酷涼沐
3         時尚大師色眼影盤冷淡沙漠
4              保濕全能修護組
5         SIMONCOLL黑巧克
6            奢華訂製鏡光水唇釉
7                   僑俐
8      PAPILLON法國洛克福乾酪
9              香奈兒之水系列
Name: Preprocessed Item Name, dtype: object
Y hat:  ['食品飲料' '傢飾寢具' '電腦/週邊' '彩妝保養' '精品/飾品' '加值/軟體' '加值/軟體' '寵物' '戶外用品' '旅遊/住宿']
Y:  0    食品飲料
1    食品飲料
2    個人清潔
3    彩妝保養
4    彩妝保養
5    食品飲料
6    彩妝保養
7    餐廚用品
8    食品飲料
9    彩妝保養
Name: category1, dtype: object
0.015706806282722512
---------------------------------

印y hat各類數量: 
[['outlet名品匯' 4]
 ['保健/醫療' 1]
 ['修繕園藝' 3]
 ['個人清潔' 6]
 ['傢飾寢具' 10]
 ['加值/軟體' 15]
 ['圖書影音' 1]
 ['家電' 7]
 ['寵物' 18]
 ['彩妝保養' 3]
 ['戶外用品' 7]
 ['手機/平板' 13]
 ['文具樂器' 4]
 ['旅遊/住宿' 23]
 ['日用/紙品' 2]
 ['服飾' 6]
 ['母嬰玩具' 6]
 ['直配大陸' 3]
 ['相機/攝影' 10]
 ['票券' 2]
 ['精品/飾品' 7]
 ['綠色生活' 1]
 ['車' 12]
 ['運動/按摩' 1]
 ['電腦/週邊' 13]
 ['鞋包箱' 9]
 ['食品飲料' 1]
 ['餐廚用品' 3]]
共 28 類

印Y各類數量: 
[['outlet名品匯' 1]
 ['保健/醫療' 3]
 ['個人清潔' 7]
 ['傢俱收納' 2]
 ['內衣' 4]
 ['加值/軟