In [34]:
import pandas as pd
from ekonlpy.tag import Mecab
from multiprocessing import Pool, cpu_count

In [39]:
# Module-level tagger instance; tok_pos() below reads it as a global.
mecab = Mecab()

In [17]:
def tok_pos(text):
    """Run the shared Mecab tagger on ``text`` and return its POS-tagged tokens.

    The result is whatever ``mecab.pos`` returns; in this notebook's output it
    renders as a list of ``(token, tag)`` pairs.
    """
    tagged = mecab.pos(text)
    return tagged

In [41]:
# Load the full corpus, then draw a small sample for quick experiments.
df_corpus = pd.read_csv('corpus_data.csv')
# random_state pins the draw so re-running the notebook selects the same rows;
# min() keeps sample() from raising when the corpus has fewer than 1000 rows.
df_sample = df_corpus.sample(n=min(1000, len(df_corpus)), random_state=42)

In [42]:
# Tag every sampled document: each 'tokens' cell holds tok_pos(Content) for that row.
df_sample['tokens'] = df_sample['Content'].apply(tok_pos)

In [43]:
# Preview the first rows of the tagged column as a sanity check.
df_sample['tokens'].head()

13185     [(연일, NNG), (하락, NNG), (으로, JKB), (치닫, VV), (던...
113521    [(자료, NNG), (출처, NNG), (연합인포맥스, NNP), (Refinit...
4787      [(12, SN), (일, NNG), ((, SSO), (미국, NNG), (시간,...
66181     [(금리스와프, NNG), (IRS, NNG), (는, JX), (약보합세, NNG...
10727     [((, SSO), (홍성, NNG), (=, SY), (연합뉴스, NNP), ()...
Name: tokens, dtype: object

In [47]:
# Tokenizer & POS tagging
import pandas as pd
from ekonlpy.tag import Mecab
from multiprocessing import Pool

# Initialize the Mecab tagger once at module level; tokenize_and_pos() reads it
# as a global.
mecab = Mecab()

def tokenize_and_pos(text):
    """Tokenize ``text`` and POS-tag it with the module-level Mecab instance.

    Returns the value of ``mecab.pos(text)`` unchanged.
    """
    tagged_tokens = mecab.pos(text)
    return tagged_tokens

def process_in_batches(df, batch_size=1000, output_prefix='processed_batch'):
    """Tag ``df['Content']`` in fixed-size row batches, writing each batch to CSV.

    Each batch is a copy of up to ``batch_size`` consecutive rows (the last batch
    may be shorter) with an added ``tokens_pos`` column holding the result of
    ``tokenize_and_pos`` for every ``Content`` value. The batch is written to
    ``{output_prefix}_{start}.csv``, where ``start`` is the positional offset of
    its first row, so finished batches remain on disk if a later one fails.

    Args:
        df: DataFrame with a ``Content`` column.
        batch_size: Number of rows per batch; must be positive.
        output_prefix: Path prefix for the per-batch CSV files.

    Returns:
        None. ``df`` itself is not modified.

    Raises:
        ValueError: If ``batch_size`` is not positive. Without this check a
            negative value would silently process nothing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(df), batch_size):
        # iloc clamps the slice's upper bound, so the final short batch needs no min().
        # .copy() makes the new column land on the batch, not on a view of df.
        batch = df.iloc[start:start + batch_size].copy()
        batch['tokens_pos'] = batch['Content'].apply(tokenize_and_pos)
        # Save intermediate results
        batch.to_csv(f'{output_prefix}_{start}.csv', index=False)

def parallel_process(df):
    """Tag ``df['Content']`` across a pool of worker processes.

    Maps ``tokenize_and_pos`` over the ``Content`` column with a default-sized
    ``multiprocessing.Pool``, stores the results in a new ``tokens_pos`` column
    on ``df`` itself, and returns that same frame.
    """
    texts = df['Content']
    with Pool() as workers:
        tagged = workers.map(tokenize_and_pos, texts)
    df['tokens_pos'] = tagged
    return df

def main():
    """Tag the whole corpus in batches and merge the batch files into one CSV.

    Steps:
      1. Read ``corpus_data.csv`` (must contain a ``Content`` column).
      2. Run ``process_in_batches`` to write ``processed_batch_<start>.csv`` files.
      3. Concatenate those files in row order into ``tokenized.csv``.
      4. Delete the intermediate batch files.

    NOTE(review): the glob also matches ``processed_batch_*.csv`` files left in
    the working directory by an earlier run, and those would be merged too.
    NOTE(review): ``tokens_pos`` passes through CSV, so it is presumably stored
    as text in ``tokenized.csv`` and must be parsed back by consumers - confirm.
    """
    import glob
    import os

    # Load your dataset (adjust the path to your file)
    df = pd.read_csv("corpus_data.csv")  # Ensure your CSV file has a column named 'Content'

    # Process in batches
    process_in_batches(df, batch_size=1000, output_prefix='processed_batch')

    # List all batch files
    batch_files = glob.glob('processed_batch_*.csv')

    def batch_order(path):
        """Sort key: numeric start offset first, then any non-numeric names."""
        stem = os.path.splitext(os.path.basename(path))[0]
        suffix = stem.rsplit('_', 1)[-1]
        if suffix.isdigit():
            return (0, int(suffix), path)
        return (1, 0, path)

    # A plain sorted() is lexicographic ("..._10000" sorts before "..._2000"),
    # which would scramble the row order; sort on the integer offset instead.
    ordered_files = sorted(batch_files, key=batch_order)

    # Read every batch, then concatenate once rather than growing a frame in a loop.
    frames = [pd.read_csv(batch_file) for batch_file in ordered_files]
    # pd.concat rejects an empty list, so fall back to an empty frame.
    final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the final result to a different name
    final_df.to_csv('tokenized.csv', index=False)

    # Clean up intermediate batch files
    for batch_file in batch_files:
        os.remove(batch_file)

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()