In [1]:
import os
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold

In [2]:
# Root folder that holds the raw benchmark corpora (machine-specific absolute path).
DATA_DIR = '/media/dmlab/My Passport/DATA'

# Input CSVs, both located under <DATA_DIR>/BenchmarkDataset/.
# NOTE(review): 'Sentenes' looks like a typo for 'Sentences' -- confirm against the
# actual file name on disk before changing it.
FINANCIAL_PHRASE_BANK_FILEPATH = os.path.join(
    DATA_DIR, 'BenchmarkDataset', 'FinancialPhraseBank-v1.0', 'Sentenes_FourAgree.csv'
)
semeval_filepath = os.path.join(
    DATA_DIR, 'BenchmarkDataset', 'ssix-project-semeval-2017-task-5-subtask-2', 'Headline_Trainingdata.csv'
)

# Output root for the generated k-fold splits (relative to the working directory).
SAVE_DIR = './data'
# The '{}' placeholder is filled in later via str.format().
FINANCIAL_PHRASE_BANK_SAVE_DIR = os.path.join(SAVE_DIR, 'FinancialPhraseBank_DS{} (k-folds)')
semeval_save_dir = os.path.join(SAVE_DIR, 'SemEvalTask5-2 (k-folds)')

In [4]:
def score_to_polarity(score: float = None):
    """Map a numeric sentiment score to a polarity label.

    :param score: numeric sentiment score. The ``None`` default is kept only for
        signature compatibility; comparing ``None`` with 0 raises ``TypeError``.
    :return: ``'positive'`` if ``score > 0``, ``'negative'`` if ``score < 0``,
        otherwise ``'neutral'``.
    """
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    # The label previously carried a stray leading space (' neutral'), which made
    # it differ from the plain 'neutral' spelling; return the clean label instead.
    return 'neutral'
    
def load_fpb(financial_phrase_bank_filepath, agreement_level):
    """Load the Financial Phrase Bank rows that belong to one agreement level.

    :param financial_phrase_bank_filepath: path of the CSV file to read; it must
        contain an ``agreement`` column.
    :param agreement_level: value the ``agreement`` column has to equal for a row
        to be kept.
    :return: dataframe with the ``agreement`` column removed and the remaining
        columns renamed, by position, to ``headline`` and ``label`` (so exactly
        two columns must remain). The original row index is preserved.
    """
    raw = pd.read_csv(financial_phrase_bank_filepath)
    matches_level = raw['agreement'] == agreement_level

    # Filter and drop in one chain so no frame is mutated in place.
    subset = raw.loc[matches_level].drop(columns=['agreement'])
    subset.columns = ['headline', 'label']
    return subset

def load_semeval(filepath):
    """Load the SemEval headline CSV and convert its scores to polarity labels.

    :param filepath: path of the CSV file to read; it must contain a ``company``
        column, which is discarded.
    :return: dataframe whose remaining columns are renamed, by position, to
        ``headline`` and ``label`` (so exactly two columns must remain), with
        every ``label`` value passed through ``score_to_polarity``.
    """
    frame = pd.read_csv(filepath).drop(columns=['company'])
    frame.columns = ['headline', 'label']
    # The function is passed directly; wrapping it in a lambda adds nothing.
    frame['label'] = frame['label'].apply(score_to_polarity)
    return frame

def prepare_k_fold(df, y, save_dir, k, random_state=None):
    """
    Split a dataframe into k stratified train/test folds and write them to disk.

    For every fold ``i`` the files ``<save_dir>/fold=<i>/train.csv`` and
    ``<save_dir>/fold=<i>/test.csv`` are written without the index column.

    :param df: dataframe to split
    :param y: labels used for stratification, one per row of ``df`` and in the
        same row order as ``df``
    :param save_dir: directory under which the ``fold=<i>`` folders are created
    :param k: number of folds
    :param random_state: optional seed forwarded to the shuffle so the splits can
        be reproduced; ``None`` (default) keeps the previous unseeded behaviour
    """
    # Shuffle rows and labels with the SAME permutation. Shuffling only ``df``
    # left ``y`` in its original order, so the stratification was computed on
    # labels that no longer matched the rows being written out.
    df, y = shuffle(df, y, random_state=random_state)
    kf = StratifiedKFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(kf.split(df, y)):
        save_dir_k = os.path.join(save_dir, 'fold={}'.format(i))
        os.makedirs(save_dir_k, exist_ok=True)

        # The split indices are positional, hence iloc.
        train_df = df.iloc[train_index]
        train_df.to_csv(os.path.join(save_dir_k, 'train.csv'), index=False)

        test_df = df.iloc[test_index]
        test_df.to_csv(os.path.join(save_dir_k, 'test.csv'), index=False)

Financial Phrase Bank

In [6]:
# Write 5-fold splits for each agreement-level subset of the Financial Phrase Bank;
# the level is also substituted into the output directory name.
for ds in ('50', '66', '75', 'All'):
    df = load_fpb(FINANCIAL_PHRASE_BANK_FILEPATH, agreement_level=ds)
    prepare_k_fold(
        df,
        df['label'].values,
        save_dir=FINANCIAL_PHRASE_BANK_SAVE_DIR.format(ds),
        k=5,
    )

SemEval 2017 Task 5 SubTask 2

In [7]:
# Load the SemEval headlines with polarity labels and write their 5-fold splits.
df = load_semeval(semeval_filepath)
prepare_k_fold(df, df['label'].values, save_dir=semeval_save_dir, k=5)