In [1]:
from string import punctuation
from nltk import SnowballStemmer
from common import files
from constant import NEW_LOG, OLD_LOG
from BERT.json_processor import DataSetBuild
import pandas as pd
import re
import os
import json
os.chdir('/Users/pkun/PycharmProjects/ui_api_automated_test')
chinese_map_english = json.load(open('data/chinese_map_english.json', 'r'))

In [2]:
def clean_dataframe_train(train):
    stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
        text = re.sub(r"[^A-Za-z0-9]", " ", text)
        text = re.sub(r"what's", "", text)
        text = re.sub(r"What's", "", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"I'm", "I am", text)
        text = re.sub(r" m ", " am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r"60k", " 60000 ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e-mail", "email", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"quikly", "quickly", text)
        text = re.sub(r" usa ", " America ", text)
        text = re.sub(r" USA ", " America ", text)
        text = re.sub(r" u s ", " America ", text)
        text = re.sub(r" uk ", " England ", text)
        text = re.sub(r" UK ", " England ", text)
        text = re.sub(r"india", "India", text)
        text = re.sub(r"switzerland", "Switzerland", text)
        text = re.sub(r"china", "China", text)
        text = re.sub(r"chinese", "Chinese", text)
        text = re.sub(r"imrovement", "improvement", text)
        text = re.sub(r"intially", "initially", text)
        text = re.sub(r"quora", "Quora", text)
        text = re.sub(r" dms ", "direct messages ", text)
        text = re.sub(r"demonitization", "demonetization", text)
        text = re.sub(r"actived", "active", text)
        text = re.sub(r"kms", " kilometers ", text)
        text = re.sub(r"KMs", " kilometers ", text)
        text = re.sub(r" cs ", " computer science ", text)
        text = re.sub(r" upvotes ", " up votes ", text)
        text = re.sub(r" iPhone ", " phone ", text)
        text = re.sub(r"\0rs ", " rs ", text)
        text = re.sub(r"calender", "calendar", text)
        text = re.sub(r"ios", "operating system", text)
        text = re.sub(r"gps", "GPS", text)
        text = re.sub(r"gst", "GST", text)
        text = re.sub(r"programing", "programming", text)
        text = re.sub(r"bestfriend", "best friend", text)
        text = re.sub(r"dna", "DNA", text)
        text = re.sub(r"III", "3", text)
        text = re.sub(r"the US", "America", text)
        text = re.sub(r"Astrology", "astrology", text)
        text = re.sub(r"Method", "method", text)
        text = re.sub(r"Find", "find", text)
        text = re.sub(r"banglore", "Banglore", text)
        text = re.sub(r" J K ", " JK ", text)

        text = ''.join([c for c in text if c not in punctuation])

        if remove_stop_words:
            text = text.split()
            text = [w for w in text if not w in stop_words]
            text = " ".join(text)

        if stem_words:
            text = text.split()
            stemmer = SnowballStemmer('english')
            stemmed_words = [stemmer.stem(word) for word in text]
            text = " ".join(stemmed_words)


        return(text)

    def process_questions(question_list, questions, question_list_name, dataframe):
        for question in questions:
            question_list.append(text_to_wordlist(str(question)))
            if len(question_list) % 100000 == 0:
                progress = len(question_list)/len(dataframe) * 100
                print("{} is {}% complete.".format(question_list_name, round(progress, 1)))


    train_question1 = []
    process_questions(train_question1, train.question1, 'train_question1', train)

    train_question2 = []
    process_questions(train_question2, train.question2, 'train_question2', train)

    train["question1"] = train_question1
    train["question2"] = train_question2

    return train


In [3]:
def build_data_set(log):
    is_new_log = True if log == NEW_LOG else False
    # data_set= DataSet(is_remove_duplicate=is_remove_duplicate)
    data_set = list()
    df = dict()
    df['question1'] = list()
    df['question2'] = list()
    df['is_duplicated'] = list()
    for path in files(log):
        dsb = DataSetBuild(path, is_new_log)
        res = dsb.json_file_process()
        for example in res:
            query = example['query']
            if query == 'init' or query == 'system requests from direct business' or query == 'wrap do repair':
                continue
            text_b = example['positive_doc'].replace('\n', ' ')
            df['question1'].append(query)
            df['question2'].append(text_b)
            df['is_duplicated'].append(0)
            df['question1'].append(text_b)
            df['question2'].append(text_b)
            df['is_duplicated'].append(0)
            # df.add(Data('yes', query, text_b))
            for n_c in example['negative_docs']:
                df['question1'].append(text_b)
                df['question2'].append(n_c)
                df['is_duplicated'].append(0)
                # df.add(Data('no', query, n_c))
            data_set.append(pd.DataFrame(df))
            df['question1'] = list()
            df['question2'] = list()
            df['is_duplicated'] = list()
    return data_set

In [4]:
def translate_single_text_to_english(text):
    return chinese_map_english[text]

In [7]:
new_data = build_data_set(NEW_LOG)
old_data = build_data_set(OLD_LOG)
new_data[0]

Unnamed: 0,question1,question2,is_duplicated
0,confirm payment information to confirm payment...,立即支付,0
1,立即支付,立即支付,0
2,立即支付,"当前所在页面,支付",0
3,立即支付,返回,0
4,立即支付,支付,0
5,立即支付,body,0
6,立即支付,￥0.05,0
7,立即支付,收款方,0
8,立即支付,UAT普通直连测试商户,0
9,立即支付,支付安全由中国人民财产保险股份有限公司承保,0


In [8]:
for i in new_data:
    i['question2'] = i['question2'].apply(translate_single_text_to_english)
    t  =  translate_single_text_to_english(i['question1'][1])
    i['question1'][1:] = t
for i in old_data:
    i['question2'] = i['question2'].apply(translate_single_text_to_english)
    t  =  translate_single_text_to_english(i['question1'][1])
    i['question1'][1:] = t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
new_data[0]

Unnamed: 0,question1,question2,is_duplicated
0,confirm payment information to confirm payment...,pay immediately,0
1,pay immediately,pay immediately,0
2,pay immediately,"Current page, pay",0
3,pay immediately,return,0
4,pay immediately,To pay,0
5,pay immediately,body,0
6,pay immediately,￥0.05,0
7,pay immediately,Beneficiary,0
8,pay immediately,UAT ordinary direct connection test merchant,0
9,pay immediately,Payment security is underwritten by the People...,0


In [10]:
for i in range(len(new_data)):
    new_data[i].to_csv('data/new_train/temp'+str(i)+'.csv', index_label=False)
for i in range(len(old_data)):
    old_data[i].to_csv('data/new_test/temp' + str(i)+'.csv', index_label=False)

In [11]:
d = pd.read_csv('data/new_test/temp1.csv')
d

Unnamed: 0,question1,question2,is_duplicated
0,submit payment password authorized payment vie...,Discount 0.03 yuan,0
1,Discount 0.03 yuan,Discount 0.03 yuan,0
2,Discount 0.03 yuan,"Current page, pay",0
3,Discount 0.03 yuan,To pay,0
4,Discount 0.03 yuan,return,0
5,Discount 0.03 yuan,body,0
6,Discount 0.03 yuan,￥0.05,0
7,Discount 0.03 yuan,Beneficiary,0
8,Discount 0.03 yuan,UAT ordinary direct connection test merchant,0
9,Discount 0.03 yuan,pay immediately,0
