In [1]:
import pandas as pd
from ngram_dohy import NGram
import os
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Load the ten tokenized corpus shards into a dict keyed 'df_1' .. 'df_10'.
dfs = {
    f'df_{i}': pd.read_csv(f'corpus_tokenized/corpus_tokenized_{i}.csv')
    for i in range(1, 11)
}

In [3]:
# Flatten the shard dict into an ordered list of DataFrames (one per batch).
dfs_batch = [*dfs.values()]

In [4]:
# Show the CPU count reported by the OS; process_batch sizes its thread pool from it.
os.cpu_count()

8

In [5]:
# Initialize the NGram object (a single instance, used by process_row for every row)
ngram_generator = NGram()

def process_row(row, max_n=5):
    """Expand one corpus record into its 1-gram .. ``max_n``-gram rows.

    This is the per-row worker that ``process_batch`` maps over with a
    thread pool.

    Parameters
    ----------
    row : mapping
        One record with the keys ``'pk'``, ``'Label'`` and ``'tokens_pos'``.
        ``'tokens_pos'`` is handed to ``ngram_generator.str_to_list``, so it
        is presumably the string form of a token/POS list -- TODO confirm
        against ``NGram``.
    max_n : int, default 5
        Largest n-gram order to generate. The default reproduces the
        previously hard-coded orders 1 through 5; a value below 1 yields an
        empty list.

    Returns
    -------
    list of dict
        One ``{'pk': ..., 'Label': ..., 'ngram': ...}`` dict per generated
        n-gram, grouped by increasing n-gram order and, within an order, in
        the order returned by ``ngram_generator.ngramize``.
    """
    pk = row['pk']
    label = row['Label']
    # Convert the stored string into a list
    tokens_pos = ngram_generator.str_to_list(row['tokens_pos'])
    # Remove the POS tags
    tokens = ngram_generator.remove_pos(tokens_pos)
    # Generate the n-grams for every order from 1 up to max_n
    return [
        {'pk': pk, 'Label': label, 'ngram': n_gram}
        for n in range(1, max_n + 1)
        for n_gram in ngram_generator.ngramize(tokens, n)
    ]


In [7]:
def process_batch(data_batch):
    """Generate n-gram rows for one batch and append them to the global ``df``.

    Parameters
    ----------
    data_batch : pandas.DataFrame
        Batch whose rows are converted with ``to_dict('records')`` and passed
        one at a time to ``process_row``.

    Returns
    -------
    None
        The rows are accumulated in the module-level ``df``, which must
        already exist when this function is called.
    """
    global df

    # os.cpu_count() may return None, and halving a count of 1 gives 0;
    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    worker_count = max(1, (os.cpu_count() or 1) // 2)

    # NOTE(review): threads only overlap work that releases the GIL; if the
    # NGram helpers are pure Python a process pool may be faster -- confirm
    # by profiling.
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        # executor.map yields results in the same order as the input records
        per_row = list(executor.map(process_row, data_batch.to_dict('records')))
    flat_results = [item for row_items in per_row for item in row_items]

    # Append this batch's n-gram rows to the accumulated DataFrame
    df = pd.concat([df, pd.DataFrame(flat_results)], ignore_index=True)
    

In [8]:
# Start from an empty accumulator; process_batch appends to this global `df`.
df = pd.DataFrame()

# Process each batch
for i, batch in enumerate(dfs_batch):
    print(f'{i+1}th batch processing...')
    process_batch(batch)

1th batch processing...
2th batch processing...
3th batch processing...
4th batch processing...
5th batch processing...
6th batch processing...
7th batch processing...
8th batch processing...
9th batch processing...
10th batch processing...


In [9]:
# Summarize the accumulated n-gram table (entry count, column dtypes, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144690107 entries, 0 to 144690106
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pk      int64 
 1   Label   int64 
 2   ngram   object
dtypes: int64(2), object(1)
memory usage: 3.2+ GB


In [10]:
# Write the accumulated n-gram rows to CSV, omitting the index column
df.to_csv('ngram_results.csv', index=False)