In [1]:
import os
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold

In [2]:
# Root directory that holds every dataset read or written by this notebook.
DATA_DIR = '/media/dmlab/My Passport/DATA'

# All raw benchmark CSVs live under <DATA_DIR>/BenchmarkDataset.
_BENCHMARK_DIR = os.path.join(DATA_DIR, 'BenchmarkDataset')
# NOTE(review): 'Sentenes' looks like a misspelling of 'Sentences' -- confirm against the
# file on disk before renaming anything.
FINANCIAL_PHRASE_BANK_FILEPATH = os.path.join(
    _BENCHMARK_DIR, 'FinancialPhraseBank-v1.0', 'Sentenes_FourAgree.csv')
SEMEVAL_FILEPATH = os.path.join(
    _BENCHMARK_DIR, 'ssix-project-semeval-2017-task-5-subtask-2', 'Headline_Trainingdata.csv')
FIQA_FILEPATH = os.path.join(
    _BENCHMARK_DIR, 'FiQA_ABSA_task1', 'task1_headline_ABSA_train.csv')

# Output locations for the generated train/test CSVs.
SAVE_DIR = os.path.join(DATA_DIR, 'research-notes', 'lexicon')
# The '{}' placeholder is filled in later through str.format().
FINANCIAL_PHRASE_BANK_SAVE_DIR = os.path.join(SAVE_DIR, 'FinancialPhraseBank_DS{} (k-folds)')
SEMEVAL_SAVE_DIR = os.path.join(SAVE_DIR, 'SemEval-test (FPB-train)')
FIQA_SAVE_DIR = os.path.join(SAVE_DIR, 'FiQA-test (FPB-train)')

In [5]:
def score_to_polarity(score: float = None):
    """
    Map a signed sentiment score to a discrete polarity label.

    :param score: numeric sentiment score; only its sign is inspected
    :return: 'positive' if score > 0, 'negative' if score < 0, otherwise 'neutral'
    :raises TypeError: if score is None (the signature's default)
    """
    if score is None:
        # Comparing None with 0 would raise TypeError anyway; fail with a clearer message.
        raise TypeError('score_to_polarity() requires a numeric score, got None')
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    # Zero ends up here, and so does NaN because both comparisons above are False for it.
    return 'neutral'
    
def load_fpb(financial_phrase_bank_filepath, agreement_level):
    """
    Load the Financial PhraseBank CSV and keep only one agreement level.

    :param financial_phrase_bank_filepath: path of the CSV file; it must contain an
        'agreement' column
    :param agreement_level: value the 'agreement' column has to equal for a row to be kept
    :return: dataframe with the columns 'headline' and 'label'; the row index of the
        original file is preserved
    """
    phrases = pd.read_csv(financial_phrase_bank_filepath)
    keep_rows = phrases['agreement'] == agreement_level
    selected = phrases[keep_rows].drop(columns=['agreement'])
    # Positional rename: exactly two columns must remain at this point.
    # Presumably they are (sentence text, sentiment label) in that order -- TODO confirm.
    selected.columns = ['headline', 'label']
    return selected

def load_semeval(semeval_filepath):
    """
    Load the SemEval headline CSV and turn its sentiment scores into polarity labels.

    :param semeval_filepath: path of the CSV file; it must contain the columns
        'company' and 'sentiment'
    :return: dataframe with the columns 'headline' and 'label'
    """
    headlines = pd.read_csv(semeval_filepath).drop(columns=['company'])
    # Each sentiment score is replaced by the label score_to_polarity returns for it.
    headlines['sentiment'] = headlines['sentiment'].apply(score_to_polarity)
    # Positional rename: exactly two columns must remain at this point.
    # Presumably they are (headline text, sentiment) in that order -- TODO confirm.
    headlines.columns = ['headline', 'label']
    return headlines

def load_fiqa(fiqa_filepath):
    """
    Load the FiQA headline CSV and turn its sentiment scores into polarity labels.

    :param fiqa_filepath: path of the CSV file; it must contain the columns
        'target', 'aspects' and 'sentiment'
    :return: dataframe with the columns 'headline' and 'label'
    """
    raw = pd.read_csv(fiqa_filepath)
    trimmed = raw.drop(columns=['target', 'aspects'])
    # Each sentiment score is replaced by the label score_to_polarity returns for it.
    trimmed['sentiment'] = trimmed['sentiment'].apply(score_to_polarity)
    # Positional rename: exactly two columns must remain at this point.
    # Presumably they are (headline text, sentiment) in that order -- TODO confirm.
    trimmed.columns = ['headline', 'label']
    return trimmed

def prepare_k_fold(df, y, save_dir, k, random_state=None):
    """
    Split a dataframe into k stratified train/test folds and save every fold as CSV.

    For each fold i in 0..k-1 the files <save_dir>/fold=<i>/train.csv and
    <save_dir>/fold=<i>/test.csv are written without the index column.

    :param df: preprocessed dataframe to split
    :param y: class labels used for stratification, one per row of df, in df's row order
    :param save_dir: directory under which the per-fold sub-directories are created
    :param k: number of folds
    :param random_state: optional seed passed to the shuffle so the folds can be
        reproduced; None (the default) leaves the shuffle unseeded
    """
    # Shuffle rows and labels with the same permutation. Shuffling df alone would leave y
    # in the old row order, so the stratification would be computed against the wrong rows.
    df, y = shuffle(df, y, random_state=random_state)
    kf = StratifiedKFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(kf.split(df, y)):
        save_dir_k = os.path.join(save_dir, 'fold={}'.format(i))
        os.makedirs(save_dir_k, exist_ok=True)

        # The indices returned by split() are positional, hence iloc.
        df.iloc[train_index].to_csv(os.path.join(save_dir_k, 'train.csv'), index=False)
        df.iloc[test_index].to_csv(os.path.join(save_dir_k, 'test.csv'), index=False)

In [6]:
# Write 5-fold splits for every Financial PhraseBank agreement level.
for ds in ('50', '66', '75', 'All'):
    df = load_fpb(FINANCIAL_PHRASE_BANK_FILEPATH, ds)
    fold_root = FINANCIAL_PHRASE_BANK_SAVE_DIR.format(ds)
    prepare_k_fold(df, df['label'].values, fold_root, 5)

# Cross-dataset setups: the '50' agreement subset of the Financial PhraseBank is saved as
# train.csv, the other benchmark as test.csv.
for benchmark_save_dir, load_benchmark, benchmark_filepath in (
    (SEMEVAL_SAVE_DIR, load_semeval, SEMEVAL_FILEPATH),
    (FIQA_SAVE_DIR, load_fiqa, FIQA_FILEPATH),
):
    if not os.path.exists(benchmark_save_dir):
        os.makedirs(benchmark_save_dir)
    df = load_fpb(FINANCIAL_PHRASE_BANK_FILEPATH, '50')
    df.to_csv(os.path.join(benchmark_save_dir, 'train.csv'), index=False)
    df = load_benchmark(benchmark_filepath)
    df.to_csv(os.path.join(benchmark_save_dir, 'test.csv'), index=False)