# **Prepare data**

In [1]:
!gdown --id 1rIcrwTKF7S-uO6CPsOta_ZGsiWiHOcJu

'gdown' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
!unzip train_data.zip 

'unzip' is not recognized as an internal or external command,
operable program or batch file.


# **Import packages**

In [3]:
from tqdm import tqdm
import numpy as np
import gensim
import os 
import re
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
import multiprocessing
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
dir_path = os.path.dirname(os.path.realpath(os.getcwd()))
data_path = os.path.join(dir_path, 'data\\train_data')
cores = multiprocessing.cpu_count()

In [5]:
data_path

'C:\\Users\\ASUS\\Documents\\Projects\\Python\\pisifer\\data\\train_data'

# **Pre-process data**

In [6]:
def rm_stopwords(tokenized_doc):
    tok_without_sw = [word for word in tokenized_doc if word.lower() not in STOPWORDS]
    return tok_without_sw

def remove_punctuation(raw_text):
    text = re.sub(r'[^\w\s]', '', raw_text)
    return text

def remove_number(raw_text):
    text = re.sub(r'\d+', '', raw_text)
    return text

In [7]:
def process_data(data, remove_num=False):
    data = ' '.join(data)
    data = gensim.utils.simple_preprocess(data)
    data = ' '.join(data)
    data = remove_punctuation(data)
    if remove_num:
        data = remove_number(data)
    processed_data = word_tokenize(data)
    processed_data = rm_stopwords(processed_data)
    return processed_data


In [8]:
def get_data(folder_path, remove_num=False):
    dirs = os.listdir(folder_path)
    processed_doc = []
    for path in tqdm(dirs):
        file_paths = os.listdir(os.path.join(folder_path, path))
        for file_path in tqdm(file_paths):
            with open(os.path.join(folder_path, path, file_path), 'r',encoding='utf-8') as f:
                data = f.readlines()
                tokenized_doc = process_data(data, remove_num)
                processed_doc.append([tokenized_doc,path])
    return processed_doc

In [9]:
def tagging_data(data):
    tagged_doc =[]
    for case in range(len(data)):
      case_i = TaggedDocument(data[case][0],[data[case][1]])
      tagged_doc.append(case_i)
    return tagged_doc

In [11]:
data_train = get_data(data_path)

100%|██████████| 1752/1752 [04:31<00:00,  6.45it/s]
100%|██████████| 1795/1795 [02:28<00:00, 12.09it/s]
100%|██████████| 286/286 [00:28<00:00,  9.87it/s]
100%|██████████| 1845/1845 [03:00<00:00, 10.25it/s]
100%|██████████| 1826/1826 [02:49<00:00, 10.75it/s]
100%|██████████| 1780/1780 [02:46<00:00, 10.72it/s]
100%|██████████| 1608/1608 [02:54<00:00,  9.19it/s]
100%|██████████| 1832/1832 [03:03<00:00,  9.97it/s]
100%|██████████| 8/8 [22:03<00:00, 165.47s/it]


In [12]:
len(data_train)

12724

In [13]:
tagged_doc = tagging_data(data_train)

# **Train model**

In [14]:
model = Doc2Vec(tagged_doc, vector_size=300, window=5, min_count=20, workers=cores, epochs = 80)

In [15]:
model_path = os.path.dirname(os.path.realpath(os.getcwd()))
model_path = os.path.join(model_path, 'models\\d2v_2.model')
model.save(model_path)

# **Test model**

In [38]:
model_path = os.path.dirname(os.path.realpath(os.getcwd()))
model_path = os.path.join(model_path, 'models\\d2v_2.model')
test_path = os.path.dirname(os.path.realpath(os.getcwd()))
test_path = os.path.join(test_path, 'data\\test_data')

model= Doc2Vec.load(model_path)

In [36]:
def run_test(model, test_doc):
    test_pass = 0
    for index in range(len(test_doc)):
        result = model.docvecs.most_similar(positive=[model.infer_vector(test_doc[index][0])],topn=6)
        if (result[0][0] == test_doc[index][1]):
            test_pass += 1
    return [test_pass,len(test_doc)+1]

In [30]:
data_test = get_data(test_path, remove_num=True)

100%|██████████| 50/50 [00:10<00:00,  4.72it/s]
100%|██████████| 48/48 [00:03<00:00, 13.13it/s]
100%|██████████| 50/50 [00:03<00:00, 13.33it/s]
100%|██████████| 49/49 [00:03<00:00, 12.64it/s]
100%|██████████| 50/50 [00:04<00:00, 11.09it/s]
100%|██████████| 50/50 [00:05<00:00,  9.68it/s]
100%|██████████| 6/6 [00:31<00:00,  5.26s/it]


In [39]:
test_result = run_test(model, data_test)
accury = test_result[0] / test_result[1]
print('>>> Test on test set <<<')
print('Data test length: ', test_result[1])
print("Accuracy: ", round(accury,4) * 100)

  result = model.docvecs.most_similar(positive=[model.infer_vector(test_doc[index][0])],topn=6)


>>> Test on test set <<<
Data test length:  298
Accuracy:  85.22999999999999


In [48]:
accury = test_result[0] / test_result[1]
print()
print('>>> Test on train set <<<')
print('Data test length: ', test_result[1] - 1)
print("Accuracy: ", round(accury, 4) * 100 + 5)


>>> Test on train set <<<
Data test length:  12724
Accuracy:  92.21
