In [145]:
import jieba
import numpy as np
import os, glob, time, copy, random, zipfile
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms

In [146]:
from os import listdir
from os.path import isfile, join,splitext
t_dir = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(t_dir) if isfile(join(t_dir, f))] #Article filenames

In [147]:
crop_list = open('Keywords/02crop.list.csv', "r",encoding='UTF-8-sig')
crop = crop_list.read()
crop_line_sep = crop.splitlines()

pest_list = open('Keywords/02pest.list.csv', "r",encoding='UTF-8-sig')
pest = pest_list.read()
pest_line_sep = pest.splitlines()

chem_list = open('Keywords/02chem.list.csv', "r",encoding='UTF-8-sig')
chem = chem_list.read()
chem_line_sep = chem.splitlines()
#Keywords split by lines, keyword with more than one entry will be on the same line

In [148]:
from itertools import chain,product
import csv
vector_dict = {}
#Keyword lookup with keyword as key and vector index as value
for idx,line in enumerate(chain(crop_line_sep,pest_line_sep,chem_line_sep)):
    l = line.split(',')
    for word in l:
        #Some line will have more than one entry, which should have the same vector index
        if(word == ''):continue
        jieba.add_word(word)#Each keyword is added to jieba
        vector_dict[word] = idx

In [149]:
vectors = {}
#Vector table with filename as key and vector as value
for fname in txt_fnames:
    txt = open(t_dir+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    seg_list = jieba.cut(content, cut_all=True)
    vectors[fname] = [0]*764
    #Initialize an 0 vector for each file
    for seg in seg_list:
        if(seg in vector_dict):
            vectors[fname][vector_dict[seg]] = 1
            #Count keyword appearence - Bag of words

In [150]:
labels = {}
pair_list = []
#Deprecated
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))
corr_list = open('TrainLabel.csv', "r",encoding='UTF-8-sig')
corr = corr_list.read()
corr_line_sep = corr.splitlines()
#Training label
for line in corr_line_sep[1:]:
    l = line.split(',')
    labels[(l[0],l[1])] = 1
print(len(labels))
print(sum(labels.values()))
print(len(pair_list))

313040
1383
313040


In [151]:
pos_pair_list = []
pos_labels = set() #faster lookup to filter out positive pairs, not used elsewhere
#All the associated article pairs given by train label
for line in corr_line_sep[1:]:
    l = line.split(',')
    pos_pair_list.append((l[0],l[1]))
    pos_labels.add((l[0],l[1]))
neg_pair_list = []
#Other pairs with no association
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        if((fname1,fname2) not in pos_labels): #Filter out positive pairs
            neg_pair_list.append((fname1,fname2))
print(len(pos_pair_list)+len(neg_pair_list))

313040


In [158]:
# Dataset
import random
#Custom dataset, currently generates a 50/50 split of positive and negative sample
#To change the split, change the second variable of random.sample and the __len__ function accordingly
class PartDataset(data.Dataset):
    
    def __init__(self, pos_pair_list, neg_pair_list, vectors, phase='train'):
        self.l = len(pos_pair_list)#Number of positive pairs
        neg_keys = random.sample(neg_pair_list,self.l)#Sample negative pairs, change the second variable to change the split
        self.labels = {}
        for key in neg_keys:self.labels[key] = 0
        for key in pos_pair_list:self.labels[key] = 1
        self.pair_list = pos_pair_list+neg_keys
        self.vectors = vectors
        self.phase = phase
        
    def __len__(self):
        return self.l*2
    
    def __getitem__(self, idx):
        Test,Ref = self.pair_list[idx]
        label = self.labels[(Test,Ref)]
        comb_vector = self.vectors[Test] + self.vectors[Ref]
        return torch.tensor(comb_vector), label

In [159]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [160]:
psl = len(pos_pair_list)
pos_pair_list_train,pos_pair_list_test = data.random_split(pos_pair_list, [int(psl*0.7),psl - int(psl*0.7)], generator=torch.Generator().manual_seed(42))
train_dataset = PartDataset(list(pos_pair_list_train), neg_pair_list, vectors,phase='train')
test_dataset = PartDataset(list(pos_pair_list_test), neg_pair_list, vectors,phase='test')

In [161]:
#Deprecated
#print('Operation Check')
#print(labels[('3','415')])
#print(pair_list.index(('3','415')))
#print(train_dataset.__getitem__(133886)[1])

In [162]:
# DataLoader
train_dataloader = data.DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = data.DataLoader(test_dataset, batch_size=4, shuffle=False)

dataloader_dict = {'train': train_dataloader, 'test': test_dataloader}

# Operation Check
print('Operation Check')
batch_iterator = iter(train_dataloader)
inputs, label = next(batch_iterator)
print(label)

Operation Check
tensor([0, 1, 1, 1])


In [163]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2*764, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 20),
            nn.Sigmoid(),
            nn.Linear(20, 2),
            nn.Softmax(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [164]:
net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
#cross entropy loss and stochastic gradient descent
print(net)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1528, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=20, bias=True)
    (5): Sigmoid()
    (6): Linear(in_features=20, out_features=2, bias=True)
    (7): Softmax(dim=None)
  )
)


In [165]:
import time
def train_model(net, dataloader_dict, criterion, optimizer, num_epoch):
    
    since = time.time()
    best_model_wts = copy.deepcopy(net.state_dict())
    best_acc = 0.0
    net = net.to(device)
    
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-'*20)
        
        for phase in ['train', 'test']:
            
            if phase == 'train':
                net.train()
            else:
                net.eval()
                
            epoch_loss = 0.0
            epoch_corrects = 0
            
            #tqdm for progress bar
            for inputs, labels in tqdm(dataloader_dict[phase]):
                inputs = inputs.type(torch.FloatTensor).to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
                    
            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())
                print(time.time())
                torch.save(net.state_dict(), 'best_checkpoint_last.pth')
                
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    net.load_state_dict(best_model_wts)
    return net

In [179]:
num_epoch = 20
net = train_model(net, dataloader_dict, criterion, optimizer, num_epoch)

Epoch 1/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3772 Acc: 0.9416


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4168 Acc: 0.9000
1639327602.0866709
Epoch 2/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3755 Acc: 0.9432


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4150 Acc: 0.9012
1639327603.6466193
Epoch 3/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3741 Acc: 0.9442


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4142 Acc: 0.9036
1639327605.2198057
Epoch 4/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3728 Acc: 0.9458


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4122 Acc: 0.9048
1639327606.7122393
Epoch 5/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3713 Acc: 0.9463


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4109 Acc: 0.9084
1639327608.2370026
Epoch 6/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3692 Acc: 0.9489


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4091 Acc: 0.9108
1639327609.7866397
Epoch 7/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3666 Acc: 0.9499


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4053 Acc: 0.9096
Epoch 8/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3619 Acc: 0.9561


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.4026 Acc: 0.9145
1639327612.7977304
Epoch 9/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3576 Acc: 0.9613


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3987 Acc: 0.9181
1639327614.328052
Epoch 10/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3522 Acc: 0.9654


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3897 Acc: 0.9289
1639327615.8481648
Epoch 11/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3468 Acc: 0.9726


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3865 Acc: 0.9301
1639327617.361362
Epoch 12/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3439 Acc: 0.9752


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3808 Acc: 0.9434
1639327618.8627808
Epoch 13/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3408 Acc: 0.9804


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3770 Acc: 0.9422
Epoch 14/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3393 Acc: 0.9804


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3758 Acc: 0.9434
Epoch 15/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3384 Acc: 0.9809


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3753 Acc: 0.9434
Epoch 16/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3377 Acc: 0.9809


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3746 Acc: 0.9434
Epoch 17/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3371 Acc: 0.9809


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3744 Acc: 0.9434
Epoch 18/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3365 Acc: 0.9814


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3741 Acc: 0.9422
Epoch 19/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3360 Acc: 0.9814


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3736 Acc: 0.9446
1639327629.5692062
Epoch 20/20
--------------------


  0%|          | 0/484 [00:00<?, ?it/s]

train Loss: 0.3357 Acc: 0.9814


  0%|          | 0/208 [00:00<?, ?it/s]

test Loss: 0.3733 Acc: 0.9458
1639327631.076554
Training complete in 0m 31s
Best val Acc: 0.945783


In [180]:
#inputs = vectors['3'] + vectors['415']
#outputs = net(torch.tensor(inputs).type(torch.FloatTensor).to(device))
outs = {}
with torch.no_grad():
    for Ref,Test in tqdm(product(txt_fnames,txt_fnames)):
        if(Ref == Test):continue
        inputs = vectors[Ref] + vectors[Test]
        outs[(Test,Ref)] = net(torch.tensor(inputs).type(torch.FloatTensor).to(device))

0it [00:00, ?it/s]

In [181]:
better = []
for key in outs:
    if(outs[key][1] > 0.5):better.append(key)
len(better)

20575

In [182]:
count = 0
bads = []
for o in better:
    if(o in pos_labels):count+=1
    else:bads.append(o)
recall,precision = count/len(pos_labels),count/len(better)
print(count,len(better))
print('Recall:',recall,'Precision:',precision,'F1',2*(recall*precision)/(recall+precision))

934 20575
Recall: 0.6753434562545192 Precision: 0.04539489671931956 F1 0.08507150013662447


0.49320188517046437