# 載入算法

In [302]:
import re 
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# MSKTS text-similarity matching algorithm.
# NOTE(review): this downloads Python source and exec()s it in the notebook's
# global namespace, i.e. it runs whatever the remote server returns. The URL
# embeds a revision hash, which presumably pins the content; vendoring
# MSKTS.py into the repository and importing it would be safer.
url = 'https://gist.githubusercontent.com/skywalker0803r/7c00d680d731b99ab549dd40a96006ce/raw/a363c9ba642495e6a8782d1739966db33d781a45/MSKTS.py'
_mskts_response = requests.get(url, timeout=30)  # do not hang forever on a dead connection
_mskts_response.raise_for_status()  # never exec an HTTP error page as if it were source code
exec(_mskts_response.text)

# Keep only alphabetic characters
def keep_alpha(str1):
  """Return ``str(str1)`` with every non-alphabetic character removed.

  The filter is ``str.isalpha``, so any character Python classifies as a
  letter is kept (not just A-Z / a-z); digits, whitespace and punctuation
  are dropped. Non-string inputs are converted with ``str()`` first.
  """
  return "".join(filter(str.isalpha, str(str1)))

# Rule-based keyword matching
def matching(sentence,database):
  """Return every entry of ``database`` that occurs as a substring of ``sentence``.

  The result is a new list whose order follows the iteration order of
  ``database``.
  """
  return [keyword for keyword in database if keyword in sentence]

# Drop matched words that are contained in another matched word
def substringSieve(string_list):
    """Return the strings that are not substrings of a longer (or equal) kept string.

    Candidates are visited longest first; a candidate is kept only if it does
    not occur inside any string already kept. Exact duplicates therefore
    collapse to a single entry.

    Args:
        string_list: iterable of strings. It is NOT modified.

    Returns:
        A new list, ordered from longest to shortest (ties keep input order).
    """
    # sorted() instead of list.sort(): the caller's list must stay untouched.
    longest_first = sorted(string_list, key=len, reverse=True)
    out = []
    for s in longest_first:
        # Generator (not a list) so any() can stop at the first containing string.
        if not any(s in kept for kept in out):
            out.append(s)
    return out

# Preprocess a raw input sentence
def preprocess_raw_sentence(x):
  """Normalise a raw text field before keyword matching.

  The value is converted to an upper-case string, stripped of CJK
  characters, punctuation and line-break/tab characters, trimmed, has runs
  of spaces collapsed to one, and is finally wrapped in exactly one leading
  and one trailing space.

  Args:
    x: any value; it is converted with ``str()``.

  Returns:
    The normalised string, always starting and ending with a single space.
  """
  x = str(x).upper() # upper-case string
  x = re.sub('[\u4e00-\u9fa5]', '', x) # remove Chinese characters
  x = re.sub(r'[^\w\s]','',x) # remove punctuation (keeps word characters and whitespace)
  x = x.replace('\n', '').replace('\r', '').replace('\t', '') # remove line breaks and tabs
  x = x.strip() # trim; the previous str.strip(x) call discarded its result
  x = re.sub(' {2,}', ' ', x) # collapse any run of spaces (the fixed 3->1 / 2->1 replaces missed longer runs)
  # Pad with one space on each side so a keyword stored with surrounding
  # spaces can still match at the very start or end of the text.
  x = ' ' + x + ' '
  return x

# Product-name post-processing
def product_name_postprocess(x):
  """Return ``str(x)`` upper-cased, with '-', '.' and ',' removed and outer whitespace trimmed."""
  text = str(x).upper()
  # Remove each punctuation mark in turn.
  for mark in ('-', '.', ','):
    text = text.replace(mark, '')
  return text.strip()

# Keyword-matching prediction function
def predict_keyword(title,test_df,Unrecognized,input_col,database,output_col):
  """Tag every row of ``test_df`` with the reference keywords found in its text.

  Args:
    title: description shown on the progress bar.
    test_df: DataFrame to annotate; it is modified in place.
    Unrecognized: keywords to exclude from ``database``.
    input_col: name of the column holding the text to search.
    database: iterable of candidate keywords.
    output_col: name of the column that receives, per row, the list of
      matched keywords after ``substringSieve`` has removed keywords
      contained in other matched keywords.

  Returns:
    The same ``test_df`` object, with ``output_col`` added or overwritten.
  """
  # Build the keyword list once; it is the same for every row.
  # Non-string entries (e.g. NaN from blank cells) cannot be substring-tested
  # against a str and are skipped. Sorting makes the order of the matches
  # reproducible, because iteration order of a set of strings is not fixed
  # across interpreter runs.
  excluded = set(Unrecognized)
  keywords = sorted({word for word in database if isinstance(word, str)} - excluded)

  result = []
  for i in tqdm(test_df.index, desc=title):
    candidate_list = matching(
        sentence = test_df.loc[i,input_col],
        database = keywords
        )
    result.append(substringSieve(candidate_list))
  test_df[output_col] = result
  return test_df

# 載入數據

In [303]:
# Database records
database = pd.read_excel('/content/drive/MyDrive/NLP_dataset/Letter-of-Credit-Intelligent-Auxiliary-Semantic-Analysis-System/data/combined_excel.xlsx')
# Test records
test_data = pd.read_csv('/content/drive/MyDrive/NLP_dataset/Letter-of-Credit-Intelligent-Auxiliary-Semantic-Analysis-System/data/測試數據/0927到2022.csv')

# Product-name reference table: keep three columns, rename them, then
# normalise every product name with product_name_postprocess
品名寶典 = (
    pd.read_excel('/content/drive/MyDrive/NLP_dataset/Letter-of-Credit-Intelligent-Auxiliary-Semantic-Analysis-System/data/寶典/寶典人工處理後/寶典.v8.202111202.xlsx',engine='openpyxl')[['CODIV','DIVNM','ITEMNM']]
    .rename(columns={'ITEMNM':'品名','DIVNM':'公司事業部門','CODIV':'公司代號'})
)
品名寶典['品名'] = 品名寶典['品名'].apply(product_name_postprocess)

# Applicant (L/C opener) reference table
開狀人寶典 = pd.read_csv('/content/drive/MyDrive/NLP_dataset/Letter-of-Credit-Intelligent-Auxiliary-Semantic-Analysis-System/data/寶典/開狀人寶典.csv')

# Company reference table
公司寶典 = pd.read_csv('/content/drive/MyDrive/NLP_dataset/Letter-of-Credit-Intelligent-Auxiliary-Semantic-Analysis-System/data/寶典/公司寶典加尾綴.csv')


# 預處理函數

In [304]:
# Preprocess the model's text input columns
def 預處理(df):
  """Normalise the three free-text input columns of ``df`` in place and return ``df``.

  Columns '45A' (product name), '50' (applicant) and '59' (beneficiary) are
  each passed through ``preprocess_raw_sentence``. The bank column
  'LTADDRESS.1' is not touched here.
  """
  文字欄位 = ('45A', '50', '59') # product name, applicant, beneficiary
  for 欄位 in 文字欄位:
    df[欄位] = df[欄位].apply(preprocess_raw_sentence)
  return df

# 抽特徵函數

In [305]:
def 抽特徵(df):
  """Add the keyword-derived feature columns to ``df`` and return it.

  Adds '產品名', '開狀人' and '受益人' (lists of reference keywords found in
  the corresponding text column, via ``predict_keyword``) and '開狀銀行'
  (the first 8 characters of the bank column). ``df`` is modified in place.
  """
  # Source column names. They used to exist only as locals inside 預處理, so
  # this function raised NameError on a fresh kernel unless same-named
  # globals happened to be left over from an earlier session.
  產品名輸入 = '45A' # product name
  開狀人輸入 = '50' # applicant
  受益人輸入 = '59' # beneficiary
  開狀銀行輸入 = 'LTADDRESS.1' # issuing bank

  # 1. Predict product
  df = predict_keyword(
      title = '正在預測產品',
      test_df = df,
      Unrecognized = ['PE','MA','EA','GRADE','INA','PACK','PP','PA','',' '],
      input_col = 產品名輸入,
      database = 品名寶典['品名'].values.tolist(),
      output_col = '產品名',
      )

  # 2. Predict applicant
  df = predict_keyword(
      title = '正在預測開狀人',
      test_df = df,
      Unrecognized = ['',' '],
      input_col = 開狀人輸入,
      database = 開狀人寶典['開狀人'].values.tolist(),
      output_col = '開狀人',
      )

  # 3. Predict beneficiary company
  df = predict_keyword(
      title = '正在預測受益人',
      test_df = df,
      Unrecognized = ['',' '],
      input_col = 受益人輸入,
      database = 公司寶典['公司英文名稱'].values.tolist(),
      output_col = '受益人',
      )

  # 4. Issuing bank: keep the first 8 characters of the bank field
  df['開狀銀行'] = df[開狀銀行輸入].apply(lambda x:str(x)[:8])
  return df

In [306]:
# Prepare the training data.
# 預處理 and 抽特徵 both modify and return the frame they are given, so
# train_data is the same object as `database`; the model-testing cell reads
# the feature columns through `database`.
train_data = 預處理(database)
train_data = 抽特徵(train_data)
train_data.iloc[:,-3:].head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/20737 [00:00<?, ?it/s]

  0%|          | 0/20737 [00:00<?, ?it/s]

  0%|          | 0/20737 [00:00<?, ?it/s]

Unnamed: 0,開狀人,受益人,開狀銀行
0,[RYTOIL PETROCHEMICALS LLP],[FORMOSA PLASTICS],PUNBINBB
1,[S R POLYVINYL LTD],[FORMOSA PLASTICS],INDBINBB
2,[OSWAL CABLE PRODUCTS PVT LTD],[FORMOSA PLASTICS],CITIINBX
3,[OJUS PETROCHEMICALS LLP],[FORMOSA PLASTICS],ICICINBB
4,[AHMED SAEED AFIFI FACTORY CO FOR],[NAN YA PLASTICS],NCBKSAJE


In [307]:
# Prepare the test data: normalise the text columns, then add the feature columns
test_data = 預處理(test_data)
test_data = 抽特徵(test_data)
test_data.iloc[:,-3:].head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/2427 [00:00<?, ?it/s]

  0%|          | 0/2427 [00:00<?, ?it/s]

  0%|          | 0/2427 [00:00<?, ?it/s]

Unnamed: 0,產品名,開狀人,開狀銀行
0,[PVC SUSPENSION RESIN],[DEEP JYOTI WAX TRADERS],ICICINBB
1,"[TETRAHYDROFURAN, EVA]",[],YESBINBB
2,"[ABS RESIN, TAIRILAC, EVA]",[SUPREME PETROCHEM LIMITED],ICICINBB
3,"[PVC SUSPENSION RESIN, EVA]",[],SBININBB
4,[PVC SUSPENSION RESIN],[KTM KIMYEVI MADDELER ITHALAT],AKBKTRIS


# 模型測試

In [308]:
def 根據特定欄位和索引給出候選答案清單(col,idx,k):
  """Return candidate answers for test row ``idx`` based on feature column ``col``.

  Side effect: writes a '處理過的資料' column (letters-only text of ``col``)
  into both the global ``database`` and ``test_data`` frames on every call.

  Args:
    col: feature column present in both ``database`` and ``test_data``.
    idx: index of the test row to query.
    k: passed to ``model.predict`` (presumably the number of nearest
      matches to return; confirm against the MSKTS source).

  Returns:
    List of the first two characters of 'EXPNO' for every ``database`` row
    whose processed text is among the predicted matches (rows with missing
    'EXPNO' are dropped).
  """
  # Preprocessing: reduce the column to letters only in both frames
  for frame in (database, test_data):
    frame['處理過的資料'] = frame[col].apply(keep_alpha)
  # Build the model
  # NOTE(review): the meaning of arg1/arg2 is defined by the downloaded MSKTS code, not visible here
  model = MSKTS(arg1=250,arg2=100*10)
  model.fit(database['處理過的資料'].values.tolist())
  # Query the model; only the first element of each returned item is used
  query_text = test_data['處理過的資料'][idx]
  matched_texts = [hit[0] for hit in model.predict(query_text,k=k)]
  # Map the matched texts back to database rows and take the EXPNO prefix
  is_match = database['處理過的資料'].isin(matched_texts)
  expno = database.loc[is_match,'EXPNO'].dropna()
  預測答案清單 = expno.apply(lambda x:str(x)[:2]).values.tolist()
  return 預測答案清單

# 正確率

In [309]:
# Accuracy over the first rows of test_data (at most 100).
# For each row the candidates from the three feature columns are pooled and
# the most frequent one is the ensemble prediction.
n_eval = min(100, len(test_data)) # do not index past the end of a short test set
correct = []
for idx in tqdm(range(n_eval)):
  o1 = 根據特定欄位和索引給出候選答案清單(col='產品名',idx=idx,k=3)
  o2 = 根據特定欄位和索引給出候選答案清單(col='開狀人',idx=idx,k=3)
  o3 = 根據特定欄位和索引給出候選答案清單(col='開狀銀行',idx=idx,k=3)
  o = o1+o2+o3
  # max() raises ValueError on an empty list; when no column produced a
  # candidate the row simply counts as a miss.
  ensemble_output = max(o,key=o.count) if o else None
  correct.append(bool(ensemble_output == test_data['推薦公司事業部'][idx]))
print('正確率:',np.mean(correct))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/100 [00:00<?, ?it/s]

正確率: 0.58
