In [341]:
import jieba
import numpy as np
import os
from tqdm.notebook import tqdm

In [342]:
from os import listdir
from os.path import isfile, join,splitext
t_dir = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(t_dir) if isfile(join(t_dir, f))] #Article filenames

In [343]:
crop_list = open('Keywords/02crop.list.csv', "r",encoding='UTF-8-sig')
crop = crop_list.read()
crop_line_sep = crop.splitlines()

pest_list = open('Keywords/02pest.list.csv', "r",encoding='UTF-8-sig')
pest = pest_list.read()
pest_line_sep = pest.splitlines()

chem_list = open('Keywords/02chem.list.csv', "r",encoding='UTF-8-sig')
chem = chem_list.read()
chem_line_sep = chem.splitlines()
#Keywords split by lines, keyword with more than one entry will be on the same line

In [344]:
from itertools import chain
import csv
vector_dict = {}
word_list = []
#Keyword lookup with keyword as key and vector index as value
for idx,line in enumerate(chain(crop_line_sep,pest_line_sep,chem_line_sep)):
    l = line.split(',')
    for word in l:
        #Some line will have more than one entry, which should have the same vector index
        if(word == ''):continue
        jieba.add_word(word) #Each keyword is added to jieba
        vector_dict[word] = l[0] # same meaning of different word will have same key number

In [345]:
# adding user dictionary
userdict = ['多變', '溫差', '防檢局', '果農', '防範', '颱風', '台中市', '臺中市',
            '發布', '發佈', '復育', '轄區', '臺南區', '開花期', '莫拉克', '桃園市', '新竹縣']
for word in userdict:
    jieba.add_word(word)

In [346]:
# tokenization
import pandas as pd
df = pd.DataFrame([])
for fname in txt_fnames:
    txt = open(t_dir+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    df = pd.concat([df, pd.Series([jieba.lcut(content, cut_all=False)])], axis = 0)
df.columns = ['tokenization']
df['name'] = txt_fnames
df = df.reset_index(drop=True)

In [347]:
df

Unnamed: 0,tokenization,name
0,"[梅雨季, 來臨, ，, 文旦, 黑點病, 易, 發生, ，, 請, 注意, 病徵, ，, ...",1
1,"[天氣, 多變, 溫差, 大, ，, 近山區, 及, 偏施, 氮肥, 田區, 稻熱病, 發病...",10
2,"[新聞稿, -, 稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, ...",1000
3,"[稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, 防治, \n, ...",1005
4,"[乍暖, 還寒, ，, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, \n, 農委會,...",1007
...,...,...
555,"[苗栗, 區農業, 改良, 場, 發佈, 水稻, 白葉枯病, 警報, \n, 糧食, 作物,...",986
556,"[雨, 後, 適合, 稻熱病, 發生, ，, 請持續, 進行, 監測, 並指導, 農民, 防...",988
557,"[新, 入侵, 果實蠅, 緊急, 撲滅, 模擬, 演習, , , 新聞稿, \n, 新,...",992
558,"[梨木蝨, 危害, ，, 請, 農友, 注意, 防範, \n, 梨木蝨, 危害, ，, 請,...",997


In [348]:
# normalize the different word but have same meaning into one word
def keyword_normalization(list_to_be_mapped, key_dict):
    return [key_dict.get(a) if key_dict.get(a) else a for a in list_to_be_mapped]

In [349]:
# mapping all the docs
for index, row in df.iterrows():
    row['tokenization'] = keyword_normalization(df.iloc[index]['tokenization'], vector_dict)

In [350]:
df

Unnamed: 0,tokenization,name
0,"[梅雨季, 來臨, ，, 文旦柚, 黑點病, 易, 發生, ，, 請, 注意, 病徵, ，,...",1
1,"[天氣, 多變, 溫差, 大, ，, 近山區, 及, 偏施, 氮肥, 田區, 稻熱病, 發病...",10
2,"[新聞稿, -, 稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, ...",1000
3,"[稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, 防治, \n, ...",1005
4,"[乍暖, 還寒, ，, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, \n, 農委會,...",1007
...,...,...
555,"[苗栗, 區農業, 改良, 場, 發佈, 水稻, 白葉枯病, 警報, \n, 糧食, 作物,...",986
556,"[雨, 後, 適合, 稻熱病, 發生, ，, 請持續, 進行, 監測, 並指導, 農民, 防...",988
557,"[新, 入侵, 果實蠅, 緊急, 撲滅, 模擬, 演習, 三泰芬, 三泰芬, 新聞稿, \n...",992
558,"[梨木蝨, 危害, ，, 請, 農友, 注意, 防範, \n, 梨木蝨, 危害, ，, 請,...",997


In [351]:
# concat tokenized text with space
df2list = df.tokenization.tolist()
cut_corpus = []
for i in df2list:
    cut_corpus.append(' '.join(i))

In [352]:
len(cut_corpus)

560

In [353]:
cut_corpus[0]

'梅雨季 來臨 ， 文旦柚 黑點病 易 發生 ， 請 注意 病徵 ， 以及 早加強 防治 措施 。 \n 5 月 已 進入 梅雨季 節 ， 近日 連續 降雨 ， 為 文旦柚 黑點病 開始 感染 的 時機 ， 往年 文旦柚 在 經過 4 - 6 月 的 春雨 及 梅雨季 後 ， 原來 長 得 亮麗 的 果實 外表 ， 會 開始 出現 許多 小黑 點 ， 現在 文旦柚 已 開始 進入 中果期 ， 花蓮區 農業 改良 場呼籲 應 注意 防治 。 \n 除 冬季 清園 作業外 ， 在 4 - 8 月 時應 每月 施用 一次 56% 貝芬 硫 \x7f 可濕性 粉劑 800 倍 、 或 22.7% \x7f 硫 \x7f 水懸劑 1000 倍 、 或 80% 鋅錳乃浦 500 倍 、 或 33% 鋅錳乃浦 500 倍 等 政府 核准 登記 使用 之藥劑 防治 ， 並依 登記 使用 方法 使用 ， 尤其 雨前 及雨後要 特別 加強 防治 ， 若遇 連續 降雨 時則 可 利用 間 歇 時 分區 進行 施藥 以 即 時 達 到 防治效果 。 \n'

In [354]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# functions for analysis module

# CountVector
def CountVectorizer_model(data):
    count_vect = CountVectorizer()
    counts = count_vect.fit_transform(data).toarray() # type: np.array
    # names per every features: count_vect.get_feature_names(), type: list
    return count_vect, count_vect.get_feature_names(), counts

# TF-IDF
def TfidfVectorizer_model(data):
    tfidf_vect = TfidfVectorizer()
    tfidfs = tfidf_vect.fit_transform(data).toarray()
    return tfidf_vect, tfidf_vect.get_feature_names(), tfidfs

In [355]:
# for CountVectorizer module to calculate the term frequency
count_vect, count_feature, counts = CountVectorizer_model(cut_corpus)
# for TfidfVectorizer module to convert a collection of raw documents to a matrix of TF-IDF features.
tfidf_vect, tfidf_feature, tfidfs = TfidfVectorizer_model(cut_corpus)



In [356]:
len(tfidfs)

560

In [357]:
len(tfidf_feature)

10646

In [358]:
tfidfs

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.12346158, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.09047349, ..., 0.        , 0.        ,
        0.        ]])

In [359]:
tfidf_feature

['00',
 '000',
 '02',
 '03',
 '037',
 '039',
 '04',
 '05',
 '06',
 '069',
 '07',
 '08',
 '0800',
 '0800039131',
 '089',
 '090',
 '0905',
 '0932',
 '0935425837',
 '095',
 '0972',
 '10',
 '100',
 '1000',
 '10004',
 '1000x',
 '100g',
 '101',
 '103',
 '104',
 '105',
 '1051102',
 '106',
 '108',
 '109',
 '11',
 '110',
 '118',
 '12',
 '1200',
 '12000',
 '127',
 '13',
 '130',
 '1300',
 '131',
 '139',
 '14',
 '145',
 '146',
 '15',
 '150',
 '1500',
 '154',
 '1542',
 '15mm',
 '16',
 '1600',
 '1684',
 '1686',
 '17',
 '1700',
 '17512',
 '1783',
 '18',
 '1800',
 '1845',
 '19',
 '192',
 '1990',
 '1mm',
 '20',
 '200',
 '2000',
 '2002',
 '2006',
 '20492',
 '205',
 '206',
 '20for',
 '20japan1021226',
 '21',
 '22',
 '222111',
 '224',
 '23',
 '23431471',
 '23431473',
 '236583',
 '236619',
 '24',
 '25',
 '250',
 '2500',
 '250g',
 '251',
 '256',
 '2566',
 '26',
 '260',
 '2665',
 '2679526',
 '268',
 '27',
 '272',
 '277',
 '28',
 '29',
 '2face',
 '30',
 '300',
 '3000',
 '30000',
 '301',
 '302',
 '303',
 '305'

In [360]:
from gensim.models import Word2Vec
from scipy import spatial
## setting
vector_dim = 200
window_size = 5
min_count = 1
training_iter = 20

## model
word2vec_model = Word2Vec(sentences=df2list,
                          vector_size=vector_dim, window=window_size, 
                          min_count=min_count, epochs=training_iter)
# index to word
index_to_word = word2vec_model.wv.index_to_key  # len(word2vec_model.wv.index2word) = 8659
# index to vectors
index2vec = word2vec_model.wv.vectors # word2vec_model.wv.vectors.shape = (8659, 100)

def preprocess(s):
    return [i.lower() for i in s.split()]
def get_vector(s):
    for i in preprocess(s):
        try:
            values = np.sum(np.array([word2vec_model.wv[i]]), axis=0)#Change
        except KeyError:
            c = 0
    return values

word2vec_feature = np.array([])
for i in np.arange(len(cut_corpus)):
    if word2vec_feature.size == 0:
        word2vec_feature = np.array([get_vector(cut_corpus[i])])
    else:
        # concat the two vectors by different columns
        word2vec_feature = np.concatenate((word2vec_feature,np.array([get_vector(cut_corpus[i])])), axis = 0)

# word2vec_feature.shape = (560, 200)

In [361]:
labels = {}
pair_list = []
#Deprecated
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))
corr_list = open('TrainLabel.csv', "r",encoding='UTF-8-sig')
corr = corr_list.read()
corr_line_sep = corr.splitlines()
#Training label
for line in corr_line_sep[1:]:
    l = line.split(',')
    labels[(l[0],l[1])] = 1
print(len(labels))
print(sum(labels.values()))
print(len(pair_list))

313040
1383
313040


In [362]:
pos_pair_list = []
pos_labels = set() #faster lookup to filter out positive pairs, not used elsewhere
#All the associated article pairs given by train label
for line in corr_line_sep[1:]:
    l = line.split(',')
    pos_pair_list.append((l[0],l[1]))
    pos_labels.add((l[0],l[1]))
neg_pair_list = []
#Other pairs with no association
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        if((fname1,fname2) not in pos_labels): #Filter out positive pairs
            neg_pair_list.append((fname1,fname2))
print(len(pos_pair_list)+len(neg_pair_list))

313040


In [363]:
import random
# decide which feature you want, tfidf or countvector or others
def construct_input_vector(adapted_vector, tokenized_df, pos_pair_list, neg_pair_list):
    pl = len(pos_pair_list) # Number of positive pairs
    neg_keys = random.sample(neg_pair_list,pl) # same length of negative pairs
    sample_labels = {}
    for key in neg_keys:sample_labels[key] = 0
    for key in pos_pair_list:sample_labels[key] = 1
    sample_pair_list = pos_pair_list + neg_keys
    sample_labels_list = []
    input_vectors = np.array([])
    for i in np.arange(len(sample_pair_list)):
        Test, Ref = sample_pair_list[i] # order of docs
        sample_labels_list.append(sample_labels[(Test,Ref)]) # labels
        Test_vec = adapted_vector[tokenized_df[tokenized_df['name']==Test].index[0]] # from order of doc to find out the order of vectors
        Ref_vec = adapted_vector[tokenized_df[tokenized_df['name']==Ref].index[0]] # same
        temp_vectors = np.concatenate(([Test_vec],[Ref_vec]), axis = 1) # concat the two vectors in same row
        if input_vectors.size == 0:
            input_vectors = temp_vectors
        else:
            input_vectors = np.concatenate((input_vectors,temp_vectors)) # concat the two vectors by different columns
    return input_vectors, sample_labels_list

In [29]:
input_vectors0, sample_labels_list0 = construct_input_vector(tfidfs, df, pos_pair_list, neg_pair_list)
# input_vectors0.shape, len(sample_labels_list0) = (2766, 15122), 2766

In [30]:
input_vectors1, sample_labels_list1 = construct_input_vector(counts, df, pos_pair_list, neg_pair_list)

In [31]:
input_vectors2, sample_labels_list2 = construct_input_vector(word2vec_feature, df, pos_pair_list, neg_pair_list)

In [34]:
# using these code as embedding features
keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform',
                       embeddings_regularizer=None, activity_regularizer=None,
                       embeddings_constraint=None, mask_zero=False, input_length=None)
model.add(Embedding(index2vec.shape[0], index2vec.shape[1], weights=[index2vec], trainable=False))

NameError: name 'keras' is not defined

In [371]:
input_vectors0.shape,input_vectors1.shape,input_vectors2.shape

(560, 10646)

## Training

In [428]:
df.iloc[0]['name']
tfidf_dict = {}
for idx,vec in enumerate(tfidfs):
    tfidf_dict[df.iloc[idx]['name']] = vec
print(len(tfidf_dict))
w2v_dict = {}
for idx,vec in enumerate(word2vec_feature):
    w2v_dict[df.iloc[idx]['name']] = vec
print(len(w2v_dict))
counts_dict = {}
for idx,vec in enumerate(counts):
    counts_dict[df.iloc[idx]['name']] = vec
print(len(counts_dict))

560
560
560


In [429]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
# Dataset
#Custom dataset, currently generates a 50/50 split of positive and negative sample
#To change the split, change the second variable of random.sample and the __len__ function accordingly
class PartDataset(data.Dataset):
    def __init__(self, pos_pair_list, neg_pair_list, vectors):
        self.l = len(pos_pair_list)#Number of positive pairs
        neg_keys = random.sample(neg_pair_list,self.l)#Sample negative pairs, change the second variable to change the split
        self.labels = {}
        for key in neg_keys:self.labels[key] = 0
        for key in pos_pair_list:self.labels[key] = 1
        self.pair_list = pos_pair_list+neg_keys
        self.vectors = vectors
        
    def __len__(self):
        return self.l*2
    
    def __getitem__(self, idx):
        Test,Ref = self.pair_list[idx]
        label = self.labels[(Test,Ref)]
        #comb_vector = self.vectors[Test] + self.vectors[Ref]
        comb_vector = np.concatenate((self.vectors[Test], self.vectors[Ref]), axis=None)
        return torch.tensor(comb_vector), label

In [430]:
inputdim_tfidf = len(tfidfs[0])
class TfIdfNeuralNetwork(nn.Module):
    def __init__(self):
        super(TfIdfNeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputdim_tfidf*2, 2048),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 56),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(56, 2),
            nn.Softmax(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
    
inputdim_w2v = len(word2vec_feature[0])
class w2vNeuralNetwork(nn.Module):
    def __init__(self):
        super(w2vNeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputdim_w2v*2, 100),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(50, 20),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(20, 2),
            nn.Softmax(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [431]:
train_dataset = PartDataset(pos_pair_list, neg_pair_list, tfidf_dict)
test_dataset = PartDataset(pos_pair_list, neg_pair_list, tfidf_dict)
#train_dataset = PartDataset(pos_pair_list, neg_pair_list, w2v_dict)
#test_dataset = PartDataset(pos_pair_list, neg_pair_list, w2v_dict)
#train_dataset = PartDataset(pos_pair_list, neg_pair_list, counts_dict)
#test_dataset = PartDataset(pos_pair_list, neg_pair_list, counts_dict)

In [432]:
tfidf_train_dataloader = data.DataLoader(train_dataset, batch_size=4, shuffle=True)
tfidf_test_dataloader = data.DataLoader(test_dataset, batch_size=4, shuffle=False)
tfidf_dataloader_dict = {'train': tfidf_train_dataloader, 'test': tfidf_test_dataloader}

# Operation Check
print('Operation Check')
batch_iterator = iter(tfidf_train_dataloader)
inputs, label = next(batch_iterator)
#print(label,inputs[0][0:200]==inputs[0][200:])

Operation Check


In [433]:
net = TfIdfNeuralNetwork()
#net = w2vNeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
#cross entropy loss and stochastic gradient descent
print(net)

TfIdfNeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=21292, out_features=2048, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=2048, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=512, out_features=56, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.1, inplace=False)
    (9): Linear(in_features=56, out_features=2, bias=True)
    (10): Softmax(dim=None)
  )
)


In [434]:
import time
import copy
from tqdm.notebook import tqdm


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def train_model(net, dataloader_dict, criterion, optimizer, num_epoch):
    
    since = time.time()
    best_model_wts = copy.deepcopy(net.state_dict())
    best_acc = 0.0
    net = net.to(device)
    
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-'*20)
        
        for phase in ['train', 'test']:
            
            if phase == 'train':
                net.train()
            else:
                net.eval()
                
            epoch_loss = 0.0
            epoch_corrects = 0
            
            #tqdm for progress bar
            for inputs, labels in tqdm(dataloader_dict[phase]):
                inputs = inputs.type(torch.FloatTensor).to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
                    
            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())
                print(time.time())
                torch.save(net.state_dict(), 'best_checkpoint_last_TFIDF.pth')
                
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    net.load_state_dict(best_model_wts)
    return net

In [435]:
num_epoch = 40
net = train_model(net, tfidf_dataloader_dict, criterion, optimizer, num_epoch)

Epoch 1/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6937 Acc: 0.4989


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6933 Acc: 0.5000
1639384508.127628
Epoch 2/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6934 Acc: 0.4899


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6931 Acc: 0.5000
Epoch 3/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6932 Acc: 0.5025


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6930 Acc: 0.5000
Epoch 4/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6930 Acc: 0.5061


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6929 Acc: 0.5000
Epoch 5/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6930 Acc: 0.5119


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6928 Acc: 0.5000
Epoch 6/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6929 Acc: 0.5260


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6927 Acc: 0.6970
1639384572.1051893
Epoch 7/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6928 Acc: 0.5181


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6926 Acc: 0.5000
Epoch 8/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6926 Acc: 0.5325


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6924 Acc: 0.6652
Epoch 9/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6924 Acc: 0.5325


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6922 Acc: 0.5000
Epoch 10/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6923 Acc: 0.5557


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6920 Acc: 0.5000
Epoch 11/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6920 Acc: 0.5076


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6916 Acc: 0.6999
1639384636.149301
Epoch 12/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6914 Acc: 0.5965


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6911 Acc: 0.6941
Epoch 13/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6909 Acc: 0.6392


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6904 Acc: 0.6988
Epoch 14/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6900 Acc: 0.6298


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6893 Acc: 0.6580
Epoch 15/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6884 Acc: 0.6688


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6875 Acc: 0.6862
Epoch 16/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6860 Acc: 0.6352


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6844 Acc: 0.7111
1639384703.0954645
Epoch 17/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6815 Acc: 0.7050


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6783 Acc: 0.7054
Epoch 18/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6725 Acc: 0.7104


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6659 Acc: 0.7260
1639384729.3248913
Epoch 19/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6535 Acc: 0.7242


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6398 Acc: 0.7140
Epoch 20/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6170 Acc: 0.7426


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.5990 Acc: 0.7375
1639384755.6632469
Epoch 21/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.5746 Acc: 0.7693


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.5641 Acc: 0.7581
1639384770.1679895
Epoch 22/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.5395 Acc: 0.7921


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.5330 Acc: 0.7899
1639384784.451025
Epoch 23/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.5128 Acc: 0.8142


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.5076 Acc: 0.8203
1639384797.8543744
Epoch 24/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4942 Acc: 0.8337


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4957 Acc: 0.8283
1639384811.211729
Epoch 25/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4785 Acc: 0.8485


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4842 Acc: 0.8413
1639384824.973546
Epoch 26/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4691 Acc: 0.8565


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4755 Acc: 0.8471
1639384838.8112617
Epoch 27/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4608 Acc: 0.8626


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4709 Acc: 0.8529
1639384852.978666
Epoch 28/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4546 Acc: 0.8670


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4706 Acc: 0.8427
Epoch 29/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4515 Acc: 0.8702


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4653 Acc: 0.8507
Epoch 30/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4458 Acc: 0.8742


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4640 Acc: 0.8507
Epoch 31/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4422 Acc: 0.8774


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4628 Acc: 0.8507
Epoch 32/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4386 Acc: 0.8814


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4556 Acc: 0.8601
1639384918.7673357
Epoch 33/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4360 Acc: 0.8821


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4586 Acc: 0.8557
Epoch 34/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4323 Acc: 0.8894


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4546 Acc: 0.8594
Epoch 35/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4304 Acc: 0.8890


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4520 Acc: 0.8608
1639384958.0610454
Epoch 36/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4279 Acc: 0.8919


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4519 Acc: 0.8604
Epoch 37/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4263 Acc: 0.8930


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4485 Acc: 0.8641
1639384986.4552097
Epoch 38/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4245 Acc: 0.8933


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4479 Acc: 0.8659
1639385000.4497347
Epoch 39/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4229 Acc: 0.8962


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4484 Acc: 0.8615
Epoch 40/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4209 Acc: 0.8966


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4451 Acc: 0.8688
1639385028.5957649
Training complete in 8m 54s
Best val Acc: 0.868764


## testing

#### preprocessing

In [397]:
#datapath = 'Stage_2\dataPublicComplete_s2\dataPublicComplete'
datapath = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(datapath) if isfile(join(datapath, f))]

In [398]:
# tokenization
df = pd.DataFrame([])
for fname in txt_fnames:
    txt = open(datapath+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    df = pd.concat([df, pd.Series([jieba.lcut(content, cut_all=False)])], axis = 0)
df.columns = ['tokenization']
df['name'] = txt_fnames
df = df.reset_index(drop=True)

In [399]:
# mapping all the docs
for index, row in df.iterrows():
    row['tokenization'] = keyword_normalization(df.iloc[index]['tokenization'], vector_dict)

In [400]:
# concat tokenized text with space
df2list = df.tokenization.tolist()
cut_corpus = []
for i in df2list:
    cut_corpus.append(' '.join(i))

In [401]:
len(cut_corpus)

560

In [402]:
# testing input
# for CountVectorizer module to calculate the term frequency
counts = count_vect.transform(cut_corpus).toarray()
count_feature = count_vect.get_feature_names()
# for TfidfVectorizer module to convert a collection of raw documents to a matrix of TF-IDF features.
tfidfs = tfidf_vect.transform(cut_corpus).toarray()
tfidf_feature = tfidf_vect.get_feature_names()

In [403]:
word2vec_feature = np.array([])
for i in np.arange(len(cut_corpus)):
    if word2vec_feature.size == 0:
        word2vec_feature = np.array([get_vector(cut_corpus[i])])
    else:
        # concat the two vectors by different columns
        word2vec_feature = np.concatenate((word2vec_feature,np.array([get_vector(cut_corpus[i])])), axis = 0)
# word2vec_feature.shape = (421, 200)

In [237]:
# using these code as embedding features
keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform',
                       embeddings_regularizer=None, activity_regularizer=None,
                       embeddings_constraint=None, mask_zero=False, input_length=None)
model.add(Embedding(index2vec.shape[0], index2vec.shape[1], weights=[index2vec], trainable=False))

NameError: name 'keras' is not defined

In [404]:
labels = {}
pair_list = []
#Deprecated
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))

In [405]:
"""
# decide which feature you want, tfidf or countvector or others
def construct_test_input_vector(adapted_vector, tokenized_df, test_pair_list):
    input_vectors = np.array([])
    for i in np.arange(len(test_pair_list)):
        Test, Ref = test_pair_list[i] # order of docs
        Test_vec = adapted_vector[tokenized_df[tokenized_df['name']==Test].index[0]] # from order of doc to find out the order of vectors
        Ref_vec = adapted_vector[tokenized_df[tokenized_df['name']==Ref].index[0]] # same
        temp_vectors = np.concatenate(([Test_vec],[Ref_vec]), axis = 1) # concat the two vectors in same row
        if input_vectors.size == 0:
            input_vectors = temp_vectors
        else:
            input_vectors = np.concatenate((input_vectors,temp_vectors)) # concat the two vectors by different columns
    return input_vectors
"""

"\n# decide which feature you want, tfidf or countvector or others\ndef construct_test_input_vector(adapted_vector, tokenized_df, test_pair_list):\n    input_vectors = np.array([])\n    for i in np.arange(len(test_pair_list)):\n        Test, Ref = test_pair_list[i] # order of docs\n        Test_vec = adapted_vector[tokenized_df[tokenized_df['name']==Test].index[0]] # from order of doc to find out the order of vectors\n        Ref_vec = adapted_vector[tokenized_df[tokenized_df['name']==Ref].index[0]] # same\n        temp_vectors = np.concatenate(([Test_vec],[Ref_vec]), axis = 1) # concat the two vectors in same row\n        if input_vectors.size == 0:\n            input_vectors = temp_vectors\n        else:\n            input_vectors = np.concatenate((input_vectors,temp_vectors)) # concat the two vectors by different columns\n    return input_vectors\n"

In [406]:
#a = construct_test_input_vector(tfidfs, df, pair_list)

In [407]:
tfidf_test_dict = {}
for idx,vec in enumerate(tfidfs):
    tfidf_test_dict[df.iloc[idx]['name']] = vec
print(len(tfidf_test_dict))
w2v_test_dict = {}
for idx,vec in enumerate(word2vec_feature):
    w2v_test_dict[df.iloc[idx]['name']] = vec
print(len(w2v_dict))
counts_dict = {}
for idx,vec in enumerate(counts):
    counts_dict[df.iloc[idx]['name']] = vec
print(len(counts_dict))

560
560
560


In [412]:
out = []
with torch.no_grad():
    for test,ref in tqdm(pair_list):
        input_vec = torch.tensor(np.concatenate((counts_dict[test], counts_dict[ref]), axis=None))
        input_vec = input_vec.type(torch.FloatTensor).to(device)
        lbl = net(input_vec)
        out.append([(test,ref),lbl.cpu()])

  0%|          | 0/313040 [00:00<?, ?it/s]

([[('1', '10'), tensor([9.9908e-01, 9.1622e-04])],
  [('1', '1000'), tensor([0.9985, 0.0015])],
  [('1', '1005'), tensor([0.9989, 0.0011])],
  [('1', '1007'), tensor([9.9993e-01, 7.0890e-05])],
  [('1', '1010'), tensor([0.9854, 0.0146])],
  [('1', '1011'), tensor([9.9958e-01, 4.1698e-04])],
  [('1', '1015'), tensor([9.9986e-01, 1.3595e-04])],
  [('1', '1016'), tensor([0.9853, 0.0147])],
  [('1', '1023'), tensor([0.9960, 0.0040])],
  [('1', '1025'), tensor([0.8974, 0.1026])]],
 242581)

In [422]:
better = []
for o in out:
    if(o[1][1] > 0.999999):better.append(o[0])


1201 313040


In [424]:
count = 0
bads = []
for o in better:
    if(o in pos_labels):count+=1
    else:bads.append(o)
recall,precision = count/len(pos_labels),count/len(better)
print(count,len(better))
print('Recall:',recall,'Precision:',precision,'F1',2*(recall*precision)/(recall+precision))

292 1201
Recall: 0.2111352133044107 Precision: 0.24313072439633637 F1 0.2260061919504644


#### classification

In [None]:
#

In [340]:
import csv
print(len(better))
with open('val_w2v.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["Test"]+["Reference"])
    for row in better:  
        spamwriter.writerow(row)

980
