In [1]:
import jieba
import numpy as np
import os
from tqdm.notebook import tqdm

In [2]:
from os import listdir
from os.path import isfile, join,splitext
t_dir = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(t_dir) if isfile(join(t_dir, f))] #Article filenames

In [3]:
crop_list = open('Keywords/02crop.list.csv', "r",encoding='UTF-8-sig')
crop = crop_list.read()
crop_line_sep = crop.splitlines()

pest_list = open('Keywords/02pest.list.csv', "r",encoding='UTF-8-sig')
pest = pest_list.read()
pest_line_sep = pest.splitlines()

chem_list = open('Keywords/02chem.list.csv', "r",encoding='UTF-8-sig')
chem = chem_list.read()
chem_line_sep = chem.splitlines()
#Keywords split by lines, keyword with more than one entry will be on the same line

In [4]:
from itertools import chain
import csv
vector_dict = {}
word_list = []
#Keyword lookup with keyword as key and vector index as value
for idx,line in enumerate(chain(crop_line_sep,pest_line_sep,chem_line_sep)):
    l = line.split(',')
    for word in l:
        #Some line will have more than one entry, which should have the same vector index
        if(word == ''):continue
        jieba.add_word(word) #Each keyword is added to jieba
        vector_dict[word] = l[0] # same meaning of different word will have same key number

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Renewrr\AppData\Local\Temp\jieba.cache
Loading model cost 0.576 seconds.
Prefix dict has been built successfully.


In [5]:
# adding user dictionary
userdict = ['多變', '溫差', '防檢局', '果農', '防範', '颱風', '台中市', '臺中市',
            '發布', '發佈', '復育', '轄區', '臺南區', '開花期', '莫拉克', '桃園市', '新竹縣']
for word in userdict:
    jieba.add_word(word)

In [6]:
# tokenization
import pandas as pd
df = pd.DataFrame([])
for fname in txt_fnames:
    txt = open(t_dir+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    df = pd.concat([df, pd.Series([jieba.lcut(content, cut_all=False)])], axis = 0)
df.columns = ['tokenization']
df['name'] = txt_fnames
df = df.reset_index(drop=True)

In [7]:
df

Unnamed: 0,tokenization,name
0,"[梅雨季, 來臨, ，, 文旦, 黑點病, 易, 發生, ，, 請, 注意, 病徵, ，, ...",1
1,"[天氣, 多變, 溫差, 大, ，, 近山區, 及, 偏施, 氮肥, 田區, 稻熱病, 發病...",10
2,"[新聞稿, -, 稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, ...",1000
3,"[稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, 防治, \n, ...",1005
4,"[乍暖, 還寒, ，, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, \n, 農委會,...",1007
...,...,...
555,"[苗栗, 區農業, 改良, 場, 發佈, 水稻, 白葉枯病, 警報, \n, 糧食, 作物,...",986
556,"[雨, 後, 適合, 稻熱病, 發生, ，, 請持續, 進行, 監測, 並指導, 農民, 防...",988
557,"[新, 入侵, 果實蠅, 緊急, 撲滅, 模擬, 演習, , , 新聞稿, \n, 新,...",992
558,"[梨木蝨, 危害, ，, 請, 農友, 注意, 防範, \n, 梨木蝨, 危害, ，, 請,...",997


In [8]:
# normalize the different word but have same meaning into one word
def keyword_normalization(list_to_be_mapped, key_dict):
    return [key_dict.get(a) if key_dict.get(a) else a for a in list_to_be_mapped]

In [9]:
# mapping all the docs
for index, row in df.iterrows():
    row['tokenization'] = keyword_normalization(df.iloc[index]['tokenization'], vector_dict)

In [10]:
df

Unnamed: 0,tokenization,name
0,"[梅雨季, 來臨, ，, 文旦柚, 黑點病, 易, 發生, ，, 請, 注意, 病徵, ，,...",1
1,"[天氣, 多變, 溫差, 大, ，, 近山區, 及, 偏施, 氮肥, 田區, 稻熱病, 發病...",10
2,"[新聞稿, -, 稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, ...",1000
3,"[稻熱病, 進入, 好, 發季節, ，, 防檢局, 籲請, 農友, 加強, 防治, \n, ...",1005
4,"[乍暖, 還寒, ，, 防檢局, 籲請, 農友, 加強, 防治, 稻熱病, \n, 農委會,...",1007
...,...,...
555,"[苗栗, 區農業, 改良, 場, 發佈, 水稻, 白葉枯病, 警報, \n, 糧食, 作物,...",986
556,"[雨, 後, 適合, 稻熱病, 發生, ，, 請持續, 進行, 監測, 並指導, 農民, 防...",988
557,"[新, 入侵, 果實蠅, 緊急, 撲滅, 模擬, 演習, 三泰芬, 三泰芬, 新聞稿, \n...",992
558,"[梨木蝨, 危害, ，, 請, 農友, 注意, 防範, \n, 梨木蝨, 危害, ，, 請,...",997


In [11]:
# concat tokenized text with space
df2list = df.tokenization.tolist()
cut_corpus = []
for i in df2list:
    cut_corpus.append(' '.join(i))

In [12]:
len(cut_corpus)

560

In [13]:
cut_corpus[0]

'梅雨季 來臨 ， 文旦柚 黑點病 易 發生 ， 請 注意 病徵 ， 以及 早加強 防治 措施 。 \n 5 月 已 進入 梅雨季 節 ， 近日 連續 降雨 ， 為 文旦柚 黑點病 開始 感染 的 時機 ， 往年 文旦柚 在 經過 4 - 6 月 的 春雨 及 梅雨季 後 ， 原來 長 得 亮麗 的 果實 外表 ， 會 開始 出現 許多 小黑 點 ， 現在 文旦柚 已 開始 進入 中果期 ， 花蓮區 農業 改良 場呼籲 應 注意 防治 。 \n 除 冬季 清園 作業外 ， 在 4 - 8 月 時應 每月 施用 一次 56% 貝芬 硫 \x7f 可濕性 粉劑 800 倍 、 或 22.7% \x7f 硫 \x7f 水懸劑 1000 倍 、 或 80% 鋅錳乃浦 500 倍 、 或 33% 鋅錳乃浦 500 倍 等 政府 核准 登記 使用 之藥劑 防治 ， 並依 登記 使用 方法 使用 ， 尤其 雨前 及雨後要 特別 加強 防治 ， 若遇 連續 降雨 時則 可 利用 間 歇 時 分區 進行 施藥 以 即 時 達 到 防治效果 。 \n'

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# functions for analysis module

# CountVector
def CountVectorizer_model(data):
    count_vect = CountVectorizer()
    counts = count_vect.fit_transform(data).toarray() # type: np.array
    # names per every features: count_vect.get_feature_names(), type: list
    return count_vect, count_vect.get_feature_names(), counts

# TF-IDF
def TfidfVectorizer_model(data):
    tfidf_vect = TfidfVectorizer()
    tfidfs = tfidf_vect.fit_transform(data).toarray()
    return tfidf_vect, tfidf_vect.get_feature_names(), tfidfs

In [15]:
# for CountVectorizer module to calculate the term frequency
count_vect, count_feature, counts = CountVectorizer_model(cut_corpus)
# for TfidfVectorizer module to convert a collection of raw documents to a matrix of TF-IDF features.
tfidf_vect, tfidf_feature, tfidfs = TfidfVectorizer_model(cut_corpus)



In [356]:
len(tfidfs)

560

In [357]:
len(tfidf_feature)

10646

In [358]:
tfidfs

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.12346158, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.09047349, ..., 0.        , 0.        ,
        0.        ]])

In [359]:
tfidf_feature

['00',
 '000',
 '02',
 '03',
 '037',
 '039',
 '04',
 '05',
 '06',
 '069',
 '07',
 '08',
 '0800',
 '0800039131',
 '089',
 '090',
 '0905',
 '0932',
 '0935425837',
 '095',
 '0972',
 '10',
 '100',
 '1000',
 '10004',
 '1000x',
 '100g',
 '101',
 '103',
 '104',
 '105',
 '1051102',
 '106',
 '108',
 '109',
 '11',
 '110',
 '118',
 '12',
 '1200',
 '12000',
 '127',
 '13',
 '130',
 '1300',
 '131',
 '139',
 '14',
 '145',
 '146',
 '15',
 '150',
 '1500',
 '154',
 '1542',
 '15mm',
 '16',
 '1600',
 '1684',
 '1686',
 '17',
 '1700',
 '17512',
 '1783',
 '18',
 '1800',
 '1845',
 '19',
 '192',
 '1990',
 '1mm',
 '20',
 '200',
 '2000',
 '2002',
 '2006',
 '20492',
 '205',
 '206',
 '20for',
 '20japan1021226',
 '21',
 '22',
 '222111',
 '224',
 '23',
 '23431471',
 '23431473',
 '236583',
 '236619',
 '24',
 '25',
 '250',
 '2500',
 '250g',
 '251',
 '256',
 '2566',
 '26',
 '260',
 '2665',
 '2679526',
 '268',
 '27',
 '272',
 '277',
 '28',
 '29',
 '2face',
 '30',
 '300',
 '3000',
 '30000',
 '301',
 '302',
 '303',
 '305'

In [17]:
from gensim.models import Word2Vec
from scipy import spatial
## setting
vector_dim = 200
window_size = 5
min_count = 1
training_iter = 20

## model
word2vec_model = Word2Vec(sentences=df2list,
                          vector_size=vector_dim, window=window_size, 
                          min_count=min_count, epochs=training_iter)
# index to word
index_to_word = word2vec_model.wv.index_to_key  # len(word2vec_model.wv.index2word) = 8659
# index to vectors
index2vec = word2vec_model.wv.vectors # word2vec_model.wv.vectors.shape = (8659, 100)

def preprocess(s):
    return [i.lower() for i in s.split()]
def get_vector(s):
    for i in preprocess(s):
        try:
            values = np.sum(np.array([word2vec_model.wv[i]]), axis=0)#Change
        except KeyError:
            c = 0
    return values

word2vec_feature = np.array([])
for i in np.arange(len(cut_corpus)):
    if word2vec_feature.size == 0:
        word2vec_feature = np.array([get_vector(cut_corpus[i])])
    else:
        # concat the two vectors by different columns
        word2vec_feature = np.concatenate((word2vec_feature,np.array([get_vector(cut_corpus[i])])), axis = 0)

# word2vec_feature.shape = (560, 200)

In [22]:
labels = {}
pair_list = []
#Deprecated
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))
corr_list = open('TrainLabel.csv', "r",encoding='UTF-8-sig')
corr = corr_list.read()
corr_line_sep = corr.splitlines()
#Training label
for line in corr_line_sep[1:]:
    l = line.split(',')
    labels[(l[0],l[1])] = 1
print(len(labels))
print(sum(labels.values()))
print(len(pair_list))

313040
1383
313040


In [23]:
pos_pair_list = []
pos_labels = set() #faster lookup to filter out positive pairs, not used elsewhere
#All the associated article pairs given by train label
for line in corr_line_sep[1:]:
    l = line.split(',')
    pos_pair_list.append((l[0],l[1]))
    pos_labels.add((l[0],l[1]))
neg_pair_list = []
#Other pairs with no association
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        if((fname1,fname2) not in pos_labels): #Filter out positive pairs
            neg_pair_list.append((fname1,fname2))
print(len(pos_pair_list)+len(neg_pair_list))

313040


In [24]:
import random
# decide which feature you want, tfidf or countvector or others
def construct_input_vector(adapted_vector, tokenized_df, pos_pair_list, neg_pair_list):
    pl = len(pos_pair_list) # Number of positive pairs
    neg_keys = random.sample(neg_pair_list,pl) # same length of negative pairs
    sample_labels = {}
    for key in neg_keys:sample_labels[key] = 0
    for key in pos_pair_list:sample_labels[key] = 1
    sample_pair_list = pos_pair_list + neg_keys
    sample_labels_list = []
    input_vectors = np.array([])
    for i in np.arange(len(sample_pair_list)):
        Test, Ref = sample_pair_list[i] # order of docs
        sample_labels_list.append(sample_labels[(Test,Ref)]) # labels
        Test_vec = adapted_vector[tokenized_df[tokenized_df['name']==Test].index[0]] # from order of doc to find out the order of vectors
        Ref_vec = adapted_vector[tokenized_df[tokenized_df['name']==Ref].index[0]] # same
        temp_vectors = np.concatenate(([Test_vec],[Ref_vec]), axis = 1) # concat the two vectors in same row
        if input_vectors.size == 0:
            input_vectors = temp_vectors
        else:
            input_vectors = np.concatenate((input_vectors,temp_vectors)) # concat the two vectors by different columns
    return input_vectors, sample_labels_list

In [29]:
input_vectors0, sample_labels_list0 = construct_input_vector(tfidfs, df, pos_pair_list, neg_pair_list)
# input_vectors0.shape, len(sample_labels_list0) = (2766, 15122), 2766

In [30]:
input_vectors1, sample_labels_list1 = construct_input_vector(counts, df, pos_pair_list, neg_pair_list)

In [31]:
input_vectors2, sample_labels_list2 = construct_input_vector(word2vec_feature, df, pos_pair_list, neg_pair_list)

In [34]:
# using these code as embedding features
keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform',
                       embeddings_regularizer=None, activity_regularizer=None,
                       embeddings_constraint=None, mask_zero=False, input_length=None)
model.add(Embedding(index2vec.shape[0], index2vec.shape[1], weights=[index2vec], trainable=False))

NameError: name 'keras' is not defined

In [371]:
input_vectors0.shape,input_vectors1.shape,input_vectors2.shape

(560, 10646)

## Training

In [379]:
#df.iloc[0]['name']
#tfidf_dict = {}
#for idx,vec in enumerate(tfidfs):
    #tfidf_dict[df.iloc[idx]['name']] = vec
#print(len(tfidf_dict))
#w2v_dict = {}
#for idx,vec in enumerate(word2vec_feature):
    #w2v_dict[df.iloc[idx]['name']] = vec
#print(len(w2v_dict))
counts_dict = {}
for idx,vec in enumerate(counts):
    counts_dict[df.iloc[idx]['name']] = vec
print(len(counts_dict))

560


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
# Dataset
#Custom dataset, currently generates a 50/50 split of positive and negative sample
#To change the split, change the second variable of random.sample and the __len__ function accordingly
class PartDataset(data.Dataset):
    def __init__(self, pos_pair_list, neg_pair_list, vectors):
        self.l = len(pos_pair_list)#Number of positive pairs
        neg_keys = random.sample(neg_pair_list,self.l)#Sample negative pairs, change the second variable to change the split
        self.labels = {}
        for key in neg_keys:self.labels[key] = 0
        for key in pos_pair_list:self.labels[key] = 1
        self.pair_list = pos_pair_list+neg_keys
        self.vectors = vectors
        
    def __len__(self):
        return self.l*2
    
    def __getitem__(self, idx):
        Test,Ref = self.pair_list[idx]
        label = self.labels[(Test,Ref)]
        #comb_vector = self.vectors[Test] + self.vectors[Ref]
        comb_vector = np.concatenate((self.vectors[Test], self.vectors[Ref]), axis=None)
        return torch.tensor(comb_vector), label

In [26]:
inputdim_tfidf = len(tfidfs[0])
class TfIdfNeuralNetwork(nn.Module):
    def __init__(self):
        super(TfIdfNeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputdim_tfidf*2, 2048),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 56),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(56, 2),
            nn.Softmax(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
    
inputdim_w2v = len(word2vec_feature[0])
class w2vNeuralNetwork(nn.Module):
    def __init__(self):
        super(w2vNeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputdim_w2v*2, 100),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(50, 20),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(20, 2),
            nn.Softmax(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [390]:
#train_dataset = PartDataset(pos_pair_list, neg_pair_list, tfidf_dict)
#test_dataset = PartDataset(pos_pair_list, neg_pair_list, tfidf_dict)
#train_dataset = PartDataset(pos_pair_list, neg_pair_list, w2v_dict)
#test_dataset = PartDataset(pos_pair_list, neg_pair_list, w2v_dict)
train_dataset = PartDataset(pos_pair_list, neg_pair_list, counts_dict)
test_dataset = PartDataset(pos_pair_list, neg_pair_list, counts_dict)

In [391]:
tfidf_train_dataloader = data.DataLoader(train_dataset, batch_size=4, shuffle=True)
tfidf_test_dataloader = data.DataLoader(test_dataset, batch_size=4, shuffle=False)
tfidf_dataloader_dict = {'train': tfidf_train_dataloader, 'test': tfidf_test_dataloader}

# Operation Check
print('Operation Check')
batch_iterator = iter(tfidf_train_dataloader)
inputs, label = next(batch_iterator)
#print(label,inputs[0][0:200]==inputs[0][200:])

Operation Check


In [392]:
net = TfIdfNeuralNetwork()
#net = w2vNeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
#cross entropy loss and stochastic gradient descent
print(net)

TfIdfNeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=21292, out_features=2048, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=2048, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=512, out_features=56, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.1, inplace=False)
    (9): Linear(in_features=56, out_features=2, bias=True)
    (10): Softmax(dim=None)
  )
)


In [395]:
import time
import copy
from tqdm.notebook import tqdm


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def train_model(net, dataloader_dict, criterion, optimizer, num_epoch):
    
    since = time.time()
    best_model_wts = copy.deepcopy(net.state_dict())
    best_acc = 0.0
    net = net.to(device)
    
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-'*20)
        
        for phase in ['train', 'test']:
            
            if phase == 'train':
                net.train()
            else:
                net.eval()
                
            epoch_loss = 0.0
            epoch_corrects = 0
            
            #tqdm for progress bar
            for inputs, labels in tqdm(dataloader_dict[phase]):
                inputs = inputs.type(torch.FloatTensor).to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
                    
            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())
                print(time.time())
                torch.save(net.state_dict(), 'best_checkpoint_last_counts.pth')
                
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    net.load_state_dict(best_model_wts)
    return net

In [396]:
num_epoch = 40
net = train_model(net, tfidf_dataloader_dict, criterion, optimizer, num_epoch)

Epoch 1/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6593 Acc: 0.7281


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.5832 Acc: 0.7831
1639381984.4212077
Epoch 2/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.5290 Acc: 0.7986


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4943 Acc: 0.8265
1639381998.5416963
Epoch 3/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4765 Acc: 0.8424


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4704 Acc: 0.8445
1639382012.330852
Epoch 4/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4518 Acc: 0.8662


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4541 Acc: 0.8623
1639382026.089442
Epoch 5/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4371 Acc: 0.8792


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4422 Acc: 0.8717
1639382039.8292882
Epoch 6/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4258 Acc: 0.8897


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4393 Acc: 0.8709
Epoch 7/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4157 Acc: 0.8988


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4320 Acc: 0.8803
1639382067.955607
Epoch 8/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4060 Acc: 0.9114


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4208 Acc: 0.8919
1639382082.775338
Epoch 9/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4023 Acc: 0.9125


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4223 Acc: 0.8901
Epoch 10/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.4003 Acc: 0.9154


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4173 Acc: 0.8970
1639382108.9771674
Epoch 11/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3995 Acc: 0.9161


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4260 Acc: 0.8836
Epoch 12/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3999 Acc: 0.9140


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4137 Acc: 0.8988
1639382135.6756938
Epoch 13/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3947 Acc: 0.9201


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4172 Acc: 0.8966
Epoch 14/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3915 Acc: 0.9234


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4113 Acc: 0.9017
1639382162.7648337
Epoch 15/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3895 Acc: 0.9244


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4127 Acc: 0.8999
Epoch 16/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3933 Acc: 0.9205


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4175 Acc: 0.8944
Epoch 17/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3929 Acc: 0.9197


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4195 Acc: 0.8915
Epoch 18/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3904 Acc: 0.9223


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4283 Acc: 0.8829
Epoch 19/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3773 Acc: 0.9378


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3926 Acc: 0.9201
1639382229.0976589
Epoch 20/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3822 Acc: 0.9335


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4061 Acc: 0.9056
Epoch 21/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3720 Acc: 0.9422


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3934 Acc: 0.9165
Epoch 22/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3663 Acc: 0.9483


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3822 Acc: 0.9306
1639382268.576571
Epoch 23/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3626 Acc: 0.9512


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3896 Acc: 0.9234
Epoch 24/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3584 Acc: 0.9566


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3832 Acc: 0.9291
Epoch 25/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3598 Acc: 0.9544


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3903 Acc: 0.9223
Epoch 26/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3610 Acc: 0.9534


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3976 Acc: 0.9147
Epoch 27/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3658 Acc: 0.9476


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3936 Acc: 0.9179
Epoch 28/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3589 Acc: 0.9548


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3801 Acc: 0.9324
1639382346.5274894
Epoch 29/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3545 Acc: 0.9591


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3790 Acc: 0.9342
1639382360.5031013
Epoch 30/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3547 Acc: 0.9591


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3815 Acc: 0.9313
Epoch 31/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3551 Acc: 0.9588


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3837 Acc: 0.9273
Epoch 32/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3546 Acc: 0.9591


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3817 Acc: 0.9284
Epoch 33/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3515 Acc: 0.9620


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3755 Acc: 0.9375
1639382412.5924015
Epoch 34/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3507 Acc: 0.9628


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3768 Acc: 0.9367
Epoch 35/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3505 Acc: 0.9631


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3837 Acc: 0.9270
Epoch 36/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3554 Acc: 0.9581


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3872 Acc: 0.9237
Epoch 37/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3556 Acc: 0.9577


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3891 Acc: 0.9219
Epoch 38/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3502 Acc: 0.9631


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3729 Acc: 0.9393
1639382477.5874476
Epoch 39/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3484 Acc: 0.9646


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3751 Acc: 0.9375
Epoch 40/40
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3512 Acc: 0.9620


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.3898 Acc: 0.9201
Training complete in 8m 53s
Best val Acc: 0.939262


## testing

#### preprocessing

In [397]:
#datapath = 'Stage_2\dataPublicComplete_s2\dataPublicComplete'
datapath = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(datapath) if isfile(join(datapath, f))]

In [398]:
# tokenization
df = pd.DataFrame([])
for fname in txt_fnames:
    txt = open(datapath+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    df = pd.concat([df, pd.Series([jieba.lcut(content, cut_all=False)])], axis = 0)
df.columns = ['tokenization']
df['name'] = txt_fnames
df = df.reset_index(drop=True)

In [399]:
# mapping all the docs
for index, row in df.iterrows():
    row['tokenization'] = keyword_normalization(df.iloc[index]['tokenization'], vector_dict)

In [400]:
# concat tokenized text with space
df2list = df.tokenization.tolist()
cut_corpus = []
for i in df2list:
    cut_corpus.append(' '.join(i))

In [401]:
len(cut_corpus)

560

In [402]:
# testing input
# for CountVectorizer module to calculate the term frequency
counts = count_vect.transform(cut_corpus).toarray()
count_feature = count_vect.get_feature_names()
# for TfidfVectorizer module to convert a collection of raw documents to a matrix of TF-IDF features.
tfidfs = tfidf_vect.transform(cut_corpus).toarray()
tfidf_feature = tfidf_vect.get_feature_names()

In [18]:
word2vec_feature = np.array([])
for i in np.arange(len(cut_corpus)):
    if word2vec_feature.size == 0:
        word2vec_feature = np.array([get_vector(cut_corpus[i])])
    else:
        # concat the two vectors by different columns
        word2vec_feature = np.concatenate((word2vec_feature,np.array([get_vector(cut_corpus[i])])), axis = 0)
# word2vec_feature.shape = (421, 200)

In [237]:
# using these code as embedding features
keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform',
                       embeddings_regularizer=None, activity_regularizer=None,
                       embeddings_constraint=None, mask_zero=False, input_length=None)
model.add(Embedding(index2vec.shape[0], index2vec.shape[1], weights=[index2vec], trainable=False))

NameError: name 'keras' is not defined

In [404]:
labels = {}
pair_list = []
#Deprecated
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))

In [405]:
"""
# decide which feature you want, tfidf or countvector or others
def construct_test_input_vector(adapted_vector, tokenized_df, test_pair_list):
    input_vectors = np.array([])
    for i in np.arange(len(test_pair_list)):
        Test, Ref = test_pair_list[i] # order of docs
        Test_vec = adapted_vector[tokenized_df[tokenized_df['name']==Test].index[0]] # from order of doc to find out the order of vectors
        Ref_vec = adapted_vector[tokenized_df[tokenized_df['name']==Ref].index[0]] # same
        temp_vectors = np.concatenate(([Test_vec],[Ref_vec]), axis = 1) # concat the two vectors in same row
        if input_vectors.size == 0:
            input_vectors = temp_vectors
        else:
            input_vectors = np.concatenate((input_vectors,temp_vectors)) # concat the two vectors by different columns
    return input_vectors
"""

"\n# decide which feature you want, tfidf or countvector or others\ndef construct_test_input_vector(adapted_vector, tokenized_df, test_pair_list):\n    input_vectors = np.array([])\n    for i in np.arange(len(test_pair_list)):\n        Test, Ref = test_pair_list[i] # order of docs\n        Test_vec = adapted_vector[tokenized_df[tokenized_df['name']==Test].index[0]] # from order of doc to find out the order of vectors\n        Ref_vec = adapted_vector[tokenized_df[tokenized_df['name']==Ref].index[0]] # same\n        temp_vectors = np.concatenate(([Test_vec],[Ref_vec]), axis = 1) # concat the two vectors in same row\n        if input_vectors.size == 0:\n            input_vectors = temp_vectors\n        else:\n            input_vectors = np.concatenate((input_vectors,temp_vectors)) # concat the two vectors by different columns\n    return input_vectors\n"

In [406]:
#a = construct_test_input_vector(tfidfs, df, pair_list)

In [20]:
tfidf_dict = {}
for idx,vec in enumerate(tfidfs):
    tfidf_dict[df.iloc[idx]['name']] = vec
print(len(tfidf_dict))
w2v_dict = {}
for idx,vec in enumerate(word2vec_feature):
    w2v_dict[df.iloc[idx]['name']] = vec
print(len(w2v_dict))
counts_dict = {}
for idx,vec in enumerate(counts):
    counts_dict[df.iloc[idx]['name']] = vec
print(len(counts_dict))

560
560
560


In [33]:
net_tfidf,net_w2v,net_counts = TfIdfNeuralNetwork(),w2vNeuralNetwork(),TfIdfNeuralNetwork()
net_tfidf.load_state_dict(torch.load('best_checkpoint_last_TFIDF.pth'))
net_w2v.load_state_dict(torch.load('best_checkpoint_last_w2v.pth'))
net_counts.load_state_dict(torch.load('best_checkpoint_last_counts.pth'))
net_tfidf = net_tfidf.to(device)
net_w2v = net_w2v.to(device)
net_counts = net_counts.to(device)

In [34]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
out_counts = []
with torch.no_grad():
    for test,ref in tqdm(pair_list):
        input_vec = torch.tensor(np.concatenate((counts_dict[test], counts_dict[ref]), axis=None))
        input_vec = input_vec.type(torch.FloatTensor).to(device)
        lbl = net_counts(input_vec)
        out_counts.append([(test,ref),lbl.cpu()])
out_tfidf = []
with torch.no_grad():
    for test,ref in tqdm(pair_list):
        input_vec = torch.tensor(np.concatenate((tfidf_dict[test], tfidf_dict[ref]), axis=None))
        input_vec = input_vec.type(torch.FloatTensor).to(device)
        lbl = net_tfidf(input_vec)
        out_tfidf.append([(test,ref),lbl.cpu()])
out_w2v = []
with torch.no_grad():
    for test,ref in tqdm(pair_list):
        input_vec = torch.tensor(np.concatenate((w2v_dict[test], w2v_dict[ref]), axis=None))
        input_vec = input_vec.type(torch.FloatTensor).to(device)
        lbl = net_w2v(input_vec)
        out_w2v.append([(test,ref),lbl.cpu()])

  0%|          | 0/313040 [00:00<?, ?it/s]

  input = module(input)


  0%|          | 0/313040 [00:00<?, ?it/s]

  0%|          | 0/313040 [00:00<?, ?it/s]

[[('10', '727'), tensor([0.6064, 0.3936])],
 [('10', '73'), tensor([0.5423, 0.4577])],
 [('10', '730'), tensor([0.5866, 0.4134])],
 [('10', '732'), tensor([0.5894, 0.4106])],
 [('10', '733'), tensor([0.5926, 0.4074])],
 [('10', '740'), tensor([0.6210, 0.3790])],
 [('10', '741'), tensor([0.5922, 0.4078])],
 [('10', '744'), tensor([0.5860, 0.4140])],
 [('10', '746'), tensor([0.5769, 0.4231])],
 [('10', '747'), tensor([0.5958, 0.4042])]]

In [58]:
better_tfidf = []
for o in out_tfidf:
    if(o[1][1] > 0.3):better_tfidf.append(o[0])
better_counts = []
for o in out_counts:
    if(o[1][1] > 0.99):better_counts.append(o[0])
better_w2v = []
for o in out_w2v:
    if(o[1][1] > 0.5):better_w2v.append(o[0])
len(better_tfidf),len(better_counts),len(better_w2v)

(1172, 11025, 18320)

In [59]:
count = 0
for o in better_tfidf:
    if(o in pos_labels):count+=1
recall,precision = count/len(pos_labels),count/len(better_tfidf)
print(count,len(better_tfidf))
print('TFIDF - Recall:',recall,'Precision:',precision,'F1',2*(recall*precision)/(recall+precision))

count = 0
for o in better_counts:
    if(o in pos_labels):count+=1
recall,precision = count/len(pos_labels),count/len(better_counts)
print(count,len(better_counts))
print('Counts - Recall:',recall,'Precision:',precision,'F1',2*(recall*precision)/(recall+precision))

count = 0
for o in better_w2v:
    if(o in pos_labels):count+=1
recall,precision = count/len(pos_labels),count/len(better_w2v)
print(count,len(better_w2v))
print('W2V - Recall:',recall,'Precision:',precision,'F1',2*(recall*precision)/(recall+precision))

138 1172
TFIDF - Recall: 0.09978308026030369 Precision: 0.11774744027303755 F1 0.10802348336594912
682 11025
Counts - Recall: 0.49313087490961677 Precision: 0.061859410430839 F1 0.10992907801418439
137 18320
W2V - Recall: 0.09906001446131597 Precision: 0.0074781659388646286 F1 0.01390651169872608


#### classification

In [None]:
#

In [340]:
import csv
print(len(better))
with open('val_w2v.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["Test"]+["Reference"])
    for row in better:  
        spamwriter.writerow(row)

980
