In [1]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import wordninja
from wordninja import LanguageModel
import re
import numpy as np
import gzip
import shutil

def preprocess(x):
    """Normalize one raw text field before matching.

    Steps, applied in order:
      1. delete every character in the range U+4E00-U+9FA5 (Chinese characters);
      2. delete the listed ASCII / full-width punctuation marks (the hyphen
         and the space are not in that set, so they are kept);
      3. delete newline, carriage-return and tab characters;
      4. strip leading and trailing whitespace.

    Characters are deleted, not replaced by a space, so the text on either
    side of a removed character ends up joined together.

    Args:
        x: the string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub('[\u4e00-\u9fa5]', '', x)
    cleaned = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', cleaned)
    for control_char in ('\n', '\r', '\t'):
        cleaned = cleaned.replace(control_char, '')
    return cleaned.strip()

# Load the raw records, cast every cell to str and keep only the '45A' and
# 'SPEC' columns. Note that astype(str) turns missing cells into the text 'nan'.
df = pd.read_excel('../data/combined_excel.xlsx', index_col=0).astype(str)[['45A', 'SPEC']]
# Clean both text columns with the same preprocessing.
df['45A'] = df['45A'].apply(preprocess)
df['SPEC'] = df['SPEC'].apply(preprocess)

# Product handbook ("Baodian") name set, extended version: two workbooks,
# first five columns of each, stacked under the first workbook's headers.
# Forward slashes are used as the separator: a backslash before the file name
# is an invalid string escape and only resolves as a path on Windows.
b1 = pd.read_excel('../data/台塑企業_ 產品寶典20210303.xlsx').iloc[:, :5]
b2 = pd.read_excel('../data/寶典.v3.台塑網.20210901.xlsx').iloc[:, :5]
b2.columns = b1.columns
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
產品集合 = set(pd.concat([b1, b2])['品名'].values.tolist())

def Collection_method(df,產品集合):
    """Label each row with the longest handbook product name found in its '45A' text.

    Args:
        df: DataFrame with a '45A' column of strings.
        產品集合: iterable of candidate product names. Entries that are not
            non-empty strings (for example NaN coming from a blank
            spreadsheet cell) are ignored: a non-string cannot be searched
            for in a str, and '' would match every row.

    Returns:
        DataFrame keyed by the index labels of ``df`` with a single column
        'Baodian_predict' holding the longest name that occurs in the row's
        '45A' text, or NaN when none does. When several matches share the
        maximum length the lexicographically greatest one is chosen, so the
        result does not depend on the iteration order of ``產品集合``.
    """
    # Validate the candidates once instead of once per row.
    candidates = [p for p in 產品集合 if isinstance(p, str) and p]
    labels = {}
    for i in tqdm(df.index):
        text = df.loc[i, '45A']  # looked up once per row, not once per candidate
        products = [p for p in candidates if p in text]
        # `default` covers the "nothing matched" case without a bare except
        # that would also hide unrelated errors.
        labels[i] = max(products, key=lambda p: (len(p), p), default=np.nan)
    return pd.DataFrame(
        {'Baodian_predict': list(labels.values())},
        index=list(labels.keys()),
    )
predict = Collection_method(df,產品集合)
# Attach the handbook-based prediction as the 'Baodian_predict' column.
df = df.join(predict)

# Build Y_label. For every row the candidates are tried in priority order:
#   1. the SPEC value, 2. the handbook prediction.
# The first candidate that occurs inside the row's '45A' text becomes the
# label; if neither does, the label is NaN.
ok_ = 0  # rows that received a label
no_ = 0  # rows where no candidate matched
y_label = []
row_values = zip(df['45A'].values, df['SPEC'].values, df['Baodian_predict'].values)
for context, spec, baodian in tqdm(row_values, total=len(df)):
    context = str(context)
    chosen = np.nan
    for candidate in (spec, baodian):
        if pd.isna(candidate):
            continue
        term = str(candidate)
        # '' is a substring of every string, and 'nan' is what a missing cell
        # becomes after astype(str) / str(); neither is a real label, so they
        # must not count as a match.
        if term in ('', 'nan'):
            continue
        if term in context:
            chosen = candidate
            break
    if pd.isna(chosen):
        no_ += 1
    else:
        ok_ += 1
    y_label.append(chosen)

df['Y_label'] = y_label

# Handbook product names, lower-cased. 產品集合 is the set built from the two
# handbook workbooks earlier in this cell, so the files are not read again.
# Sorting gives a reproducible order (set iteration order is not stable
# between interpreter runs because str hashing is randomized).
產品集合 = sorted({str(name).lower() for name in 產品集合})

# wordninja's word list, one entry per line.
with open('../data/wordninja_words.txt',encoding="utf-8") as f:
    wordninja_words_lst = f.read().split('\n')

# Lower-cased labels taken from Y_label. Missing labels are dropped so that
# the literal text 'nan' is not injected into the vocabulary.
SPEC集合 = [str(label).lower() for label in df['Y_label'].dropna()]

# Union of the three vocabularies, de-duplicated while preserving order:
# handbook names first, then labels, then wordninja's list in its own order.
# wordninja's LanguageModel derives each word's cost from its position in the
# word file, so the order written here must be reproducible; list(set(...))
# produced a different order on every kernel restart.
全部集合 = list(dict.fromkeys(產品集合 + SPEC集合 + wordninja_words_lst))

# Save the vocabulary as a plain-text file, one entry per line.
with open('全部集合.txt', 'w',encoding="utf-8") as f:
    f.writelines(entry + '\n' for entry in 全部集合)

# Re-read the text file as bytes and store a gzip-compressed copy, which is
# the file handed to the language model.
with open('全部集合.txt', 'rb') as f_in, gzip.open('全部集合.txt.gz', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Custom word-segmentation model
class LanguageModel2(LanguageModel):
    """LanguageModel variant whose ``split`` keeps '-' and "'" inside tokens."""

    def split(self, s):
        """Segment ``s`` into a flat list of words.

        ``s`` is first cut on every run of characters other than ASCII
        letters, digits, '-' and "'". Each resulting piece is handed to the
        inherited ``_split`` and the pieces' results are concatenated in
        their original order.
        """
        words = []
        for chunk in re.split("[^a-zA-Z0-9-']+", s):
            words.extend(self._split(chunk))
        return words

# Run word segmentation on the '45A' column using the custom vocabulary.
lm = LanguageModel2('全部集合.txt.gz')
display(' '.join(lm.split(df['45A'].values[0])))  # preview of the first row
lst = [' '.join(lm.split(text)) for text in tqdm(df['45A'].values)]
df['45A'] = lst

# Data cleaning: drop rows without a label, then keep only the rows whose
# label still occurs verbatim in the segmented text (segmentation may have
# inserted spaces that break the match).
df = df.dropna(subset=['45A','Y_label'],axis=0).reset_index(drop=True)
# One pass over the two column arrays instead of building two row Series per
# row with df.iloc[i][...].
keep_lst = [
    pos
    for pos, (label, text) in enumerate(zip(df['Y_label'].values, df['45A'].values))
    if label in text
]
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not raise SettingWithCopyWarning.
df = df.iloc[keep_lst].copy()

# Start / end position annotation
def str2index(context,string):
    """Return the character span of the first occurrence of ``string`` in ``context``.

    Args:
        context: text to search in.
        string: substring to locate.

    Returns:
        ``(start, end)`` such that ``context[start:end] == string``, where
        ``start`` is the offset of the first occurrence. When ``string`` does
        not occur, ``(-1, -1)`` is returned; previously the end offset was
        computed from the -1 sentinel and produced a meaningless span.
    """
    ys = context.find(string)
    if ys < 0:
        return -1, -1
    return ys, ys + len(string)

# Compute the (start, end) span of each row's label inside its '45A' text and
# store the two offsets as the 'string_Y_1' / 'string_Y_2' columns.
spans = [
    str2index(text, label)
    for text, label in zip(df['45A'].values, df['Y_label'].values)
]
ys_lst = [start for start, _ in spans]
ye_lst = [end for _, end in spans]
df['string_Y_1'] = ys_lst
df['string_Y_2'] = ye_lst
df.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(df.index):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4239.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(df))):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4239.0), HTML(value='')))




'MASS PVC RESIN B-57 QUANTITY 175 MT AT 1300 USD MT AS PER PROFORMA INVOICE NO A 0 B 002 - 11 D A T E D 10 - 12 - 2020 C I F PIPA VAV PORT INDIA INC O TERMS 2010'

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(df))):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4239.0), HTML(value='')))




Unnamed: 0,45A,SPEC,Baodian_predict,Y_label,string_Y_1,string_Y_2
0,MASS PVC RESIN B-57 QUANTITY 175 MT AT 1300 US...,MASS PVC RESIN B-57,PVC RESIN B-57,MASS PVC RESIN B-57,0,19
1,PHTHALIC ANHYDRIDE PA QUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,PHTHALIC ANHYDRIDE PA,PHTHALIC ANHYDRIDE,0,18
2,COMMODITY LLDPE TAISOX 3470 QUANTITY 320 MT 2 ...,LINEAR LOW DENSITYPOLYETHYLENE RESINTAISOX 3470,LLDPE TAISOX,LLDPE TAISOX,10,22
5,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HIGH DENSITYPOLYETHYLENE RESIN,HDPE TAISOX 8010,HDPE TAISOX 8010,7,23
6,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,ETHYLENE VINYL ACETATECOPOLYMERTAISOX 7360M,HDPE TAISOX 8010,HDPE TAISOX 8010,7,23


In [2]:
# Sanity check: for every row (not only the first one), slicing '45A' with the
# stored offsets must give back exactly the label.
extracted = [
    text[start:end]
    for text, start, end in zip(
        df['45A'].values, df['string_Y_1'].values, df['string_Y_2'].values
    )
]
mismatched = [
    pos
    for pos, (got, want) in enumerate(zip(extracted, df['Y_label'].values))
    if got != want
]
assert not mismatched, '%d span(s) do not reproduce Y_label; first at position %d' % (
    len(mismatched), mismatched[0]
)

# Print ten sampled rows (with replacement) for a visual spot check. The seed
# is fixed so a re-run shows the same rows.
if len(df):
    sampler = np.random.RandomState(0)
    for i in sampler.choice(df.index, size=10):
        print(df.loc[i,'45A'][df.loc[i,'string_Y_1']:df.loc[i,'string_Y_2']],'|',df.loc[i,'Y_label'])
print(df.shape)

EVA TAISOX 7350 | EVA TAISOX 7350
PROPYLENE | PROPYLENE
TAIRYFIL CARBON | TAIRYFIL CARBON
DINP | DINP
EVA TAISOX | EVA TAISOX
CARBON FIBER | CARBON FIBER
LLDPE TAISOX | LLDPE TAISOX
PLASTIC RESIN | PLASTIC RESIN
PVC SUSPENSION RESIN | PVC SUSPENSION RESIN
ABS RESIN GRADE NO AG20GF | ABS RESIN GRADE NO AG20GF
(4052, 6)


In [3]:
# Save the labelled dataset (including the span columns) as CSV.
output_csv = '../data/preprocess_for_SQUAD_產品.csv'
df.to_csv(output_csv)