In [1]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import wordninja
from wordninja import LanguageModel
import re
import numpy as np
import gzip
import shutil

def add_space(x):
    if (' ' not in x)&(len(x)<=5):
        return ' ' + x + ' '
    else:
        return x

# X預處理
def preprocess(x):
    x = str(x)
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 去除中文
    x = re.sub(r'[^\w\s]','',x) # 去除標點符號
    x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 換行符號去除
    str.strip(x) # 移除左右空白
    # 去除多重空白
    x = x.replace('   ', ' ')
    x = x.replace('  ', ' ')
    # 出現在頭的 就不可能對到前後加空格的 這種情形要想想怎麼對照(加上左右空白)
    x = ' ' + x + ' '
    return x

# Y預處理
def product_name_postprocess(x):
    x = x.replace('-',' ')
    x = x.strip()
    x = add_space(x)
    return x

# 寶典

In [2]:
df = pd.read_excel('../data/寶典.v4.20211001.xlsx',engine='openpyxl')
df = df.rename(columns={'ITEMNM':'品名','DIVNM':'公司事業部門','CODIV':'公司代號'})
df['品名'] = df['品名'].apply(lambda x:product_name_postprocess(x)) #Y處理(寶典)

# 載入產品csv
val_df = pd.read_csv('../data/preprocess_for_SQUAD_產品.csv',index_col=0)[['string_X_train','Y_label','EXPNO','from']].dropna(subset=['Y_label'],axis=0)
val_df['string_X_train'] = val_df['string_X_train'].apply(lambda x:preprocess(x)) #X處理
val_df['Y_label'] = val_df['Y_label'].apply(lambda x:product_name_postprocess(x)) #Y處理(SPEC)

# 合併寶典和SPEC
產品集合 = set(df['品名'].values.tolist() + val_df['Y_label'].values.tolist())
產品集合 = set(val_df['Y_label'].values.tolist())
寶典 = list(產品集合)
print(len(寶典))
寶典[:5]

161


['TRIOCTYL TRIMELLITATE',
 'TETRAHYDROFURAN',
 'TAISOX 3490',
 'GP550N',
 'VISCOSE RAYON']

# 資料讀取

In [3]:
df = pd.read_excel('../data/combined_excel.xlsx',index_col=0)
print(df.shape)
df = df.rename(columns={'45A':'string_X_train','SPEC':'Y_label'})
df = df[['string_X_train','Y_label','EXPNO','from']].dropna(subset=['string_X_train'],axis=0)
print(df.shape)
df['from'] = df['from'].apply(lambda x:''.join(x.split('-')[1:]))
df['string_X_train'] = df['string_X_train'].apply(preprocess)
df['Y_label'] = df['Y_label'].apply(lambda x:str(x).split('\n'))
df['p'] = 'not find'
for idx in tqdm(df.index):
    for p in df.loc[idx,'Y_label'] + 寶典:
        if p in df.loc[idx,'string_X_train']:
            df.loc[idx,'p'] = p
            continue
print(df[df['p']!='not find'].shape)
df = df[df['p']!='not find']
df['Y_label'] = df['p']
df = df.drop(['p'],axis=1)

for i in tqdm(df.index):
    x,y = df.loc[i,'string_X_train'],df.loc[i,'Y_label']
    df.loc[i,'string_X_train'] = x[:x.find(y)] + ' ' + y + ' ' + x[x.find(y)+len(y):]
print(df.shape)
df.head(10)

(20737, 17)
(15273, 4)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm(df.index):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15273.0), HTML(value='')))


(11073, 5)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(df.index):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11073.0), HTML(value='')))


(11073, 4)


Unnamed: 0,string_X_train,Y_label,EXPNO,from
0,SHIPMENT OF PVC SUSPENSION RESIN S65D QTY 18...,PVC SUSPENSION RESIN,,20210103.xlsx
2,TERMS OF SALE CIF NHAVA SHEVA PORT INDIA70 MT...,PVC RESIN S60,,20210103.xlsx
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT P...,PHTHALIC ANHYDRIDE,27.0,20210103.xlsx
5,COMMODITY LLDPE TAISOX 3470 QUANTITY 320MT...,TAISOX 3470,18.0,20210103.xlsx
6,TERM OF SALE CIF MUNDRA SEAPORT INDIA70 MT OF...,PVC SUSPENSION RESIN,,20210103.xlsx
7,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx
8,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx
9,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx
10,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx
11,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx


In [4]:
# 起始結束位置標註
def str2index(context,string):
    ys = context.find(string)
    ye = ys + len(string)
    return ys,ye

ys_lst,ye_lst = [],[]
for i in range(len(df)):
    ys,ye = str2index(df['string_X_train'].values[i],df['Y_label'].values[i])
    ys_lst.append(ys),ye_lst.append(ye)
    
df['string_Y_1'] = ys_lst
df['string_Y_2'] = ye_lst
print(1,df.shape)

df = df[df['Y_label']!=''] # 去掉空值
print(2,df.shape)

df = df.loc[df['string_Y_1']!=-1,:] # 去掉找不到答案的
print(3,df.shape)

df = df.dropna(subset=['string_X_train','Y_label'],axis=0) # 去掉NAN
print(4,df.shape)

df.head(10)

1 (11073, 6)
2 (11073, 6)
3 (11073, 6)
4 (11073, 6)


Unnamed: 0,string_X_train,Y_label,EXPNO,from,string_Y_1,string_Y_2
0,SHIPMENT OF PVC SUSPENSION RESIN S65D QTY 18...,PVC SUSPENSION RESIN,,20210103.xlsx,13,33
2,TERMS OF SALE CIF NHAVA SHEVA PORT INDIA70 MT...,PVC RESIN S60,,20210103.xlsx,51,64
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT P...,PHTHALIC ANHYDRIDE,27.0,20210103.xlsx,2,20
5,COMMODITY LLDPE TAISOX 3470 QUANTITY 320MT...,TAISOX 3470,18.0,20210103.xlsx,19,30
6,TERM OF SALE CIF MUNDRA SEAPORT INDIA70 MT OF...,PVC SUSPENSION RESIN,,20210103.xlsx,48,68
7,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx,107,130
8,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx,107,130
9,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx,107,130
10,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx,107,130
11,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER FILAMENT YARN,25.0,20210103.xlsx,107,130


In [5]:
# 最後檢查 Y1 Y2 位置 和 Y_LABEL要對應上
for i in tqdm(df.index):
    a = df.loc[i,'string_X_train'][df.loc[i,'string_Y_1']:df.loc[i,'string_Y_2']]
    b = df.loc[i,'Y_label']
    assert a==b

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(df.index):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11073.0), HTML(value='')))




In [6]:
from termcolor import colored

# 實際感受一下
def str2index(context,string):
    if type(string) != str:
        print(string)
    ys = context.find(string)
    ye = ys + len(string)
    return ys,ye

def color_output(text_input,text_output):
    ys,ye = str2index(text_input,text_output)
    left = text_input[:ys]
    mid = text_output
    right = text_input[ye:]
    print(left,colored(mid,'red'),right)

for j in range(10):
    i = np.random.randint(low=0,high=len(df))
    x = df['string_X_train'].values[i]
    p = df['Y_label'].values[i]
    print(j)
    color_output(x,p)
    print(colored(p,'green'))

0
 LLDPE  [31m GRADE [0m  3840UQUANTITY 300 MT UNIT PRICE USD980MTFOB TAIWAN UNDER SALES CONTRACT H9F295LLDPE GRADE LB1820E3QUANTITY 330 MT UNIT PRICE USD890MTFOB TAIWAN UNDER SALES CONTRACT H9F291HOPE GRADE 8001BLQUANTITY 306 MTUNIT PRICE USD1000MTFOB TAIWAN UNDER SALES CONTRACT H9F292LLDPE GRADE 3840UQUANTITY 200 MT UNIT PRICE USD980MTFOB TAIWAN UNDER SALES CONTRACT H9F292OTHER DETAILS ARE AS PER THE APPLICANTS SHIPPING INSTRUCTION 
[32m GRADE [0m
1
 SPECIFICATION POLYESTER STABLE  [31m FIBER [0m  15D X 64MM HC SUPER DRY QUANTITY 46000 KGS UNIT PRICE CIF HAIPHONG PORT IN VIETNAM INCOTERMS 2010097USDKGS AMOUNT USD 4462000 TOTAL AMOUNT USD 4462000ALL DETAILED SPECIFICATION STATED IN SALES CONTRACT NO N9179 
[32m FIBER [0m
2
 TAIRYFIL   [31mCARBON FIBER[0m  PRICE TERM CIF CHINA ANY PORT 
[32mCARBON FIBER[0m
3
 552 MTONS   [31mEPICHLOROHYDRIN[0m   ECH AT THE RATE OF USD 1820 PERMTON PAKING IN 230 KG PLASTIC DRUMS CFR KARACHI SEAPORTPAKISTANOTHER DETAILS ARE AS PER INDENT NO

In [7]:
# 保存
print(df.shape)
df.to_csv('../data/preprocess_for_SQUAD_產品.csv')
df

(11073, 6)


Unnamed: 0,string_X_train,Y_label,EXPNO,from,string_Y_1,string_Y_2
0,SHIPMENT OF PVC SUSPENSION RESIN S65D QTY 18...,PVC SUSPENSION RESIN,,20210103.xlsx,13,33
2,TERMS OF SALE CIF NHAVA SHEVA PORT INDIA70 MT...,PVC RESIN S60,,20210103.xlsx,51,64
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT P...,PHTHALIC ANHYDRIDE,27,20210103.xlsx,2,20
5,COMMODITY LLDPE TAISOX 3470 QUANTITY 320MT...,TAISOX 3470,18,20210103.xlsx,19,30
6,TERM OF SALE CIF MUNDRA SEAPORT INDIA70 MT OF...,PVC SUSPENSION RESIN,,20210103.xlsx,48,68
...,...,...,...,...,...,...
20732,PVC SUSPENSION RESIN S65D FOR 70MT AT USD 1...,PVC SUSPENSION RESIN,,20200712.xlsx,2,22
20733,7000 MT IN 4X20 FT CONTAINERS PVC RESIN S60 A...,PVC RESIN S60,11,20200712.xlsx,31,44
20734,EPOXY RESIN NPEF178FOB ANY PORT OF TAIWAN,EPOXY RESIN,,20200712.xlsx,2,13
20735,EPOXY RESIN NPEL128CIF SAVANNAH GA,EPOXY RESIN,,20200712.xlsx,2,13
