In [48]:
import pandas as pd
import numpy as np
import os
from functools import partial
import re

In [29]:
# Directory holding the pickled inputs, and the pickle files that belong to
# each experiment.
path = 'data'
experiment_data = {
    'exp_1': ['data_D1.pkl'],
    'exp_2': ['data_D2.pkl', 'data_D3.pkl', 'data_D4.pkl'],
    'exp_3': ['data_D5.pkl', 'data_D6.pkl'],
}

# Experiment 1 reformat: rename the *_text columns to *_abstract, add empty
# title columns, show the labels, then narrow the frame to a fixed column set.
rename_mapping = {"a_text": "a_abstract", "b_text": "b_abstract", "c_text": "c_abstract"}
columns_to_keep = ['id', 'title', 'abstract', 'categories', 'authors', 'date', 'research_type', 'y_true']

files = experiment_data['exp_1']
for file in files:
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(f'{path}/{file}')
    df = df.rename(columns=rename_mapping)

    # Experiment 1 carries no titles, so the title columns are filled with NaN.
    for title_col in ("a_title", "b_title", "c_title"):
        df[title_col] = np.nan

    # Re-assign the labels from their raw values (index alignment is bypassed).
    df['y_true'] = df['y_true'].values
    print(df['y_true'])

    df = df[columns_to_keep]
    # Export is currently disabled:
    # df.to_csv('data_exp_1.csv', index=False)


0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: y_true, Length: 1000, dtype: bool


In [33]:
# handle experiment 2 reformat:
def unsqueeze_title_abs(row, abs):
    """Split a combined "title; abstract" cell into its two parts.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must support ``row[abs]``.
    abs : str
        Name of the field holding the combined text. The name shadows the
        ``abs`` builtin but is kept because callers pass it as a keyword
        (``partial(unsqueeze_title_abs, abs=...)``).

    Returns
    -------
    pd.Series
        Two entries, in this order: ``'title'`` (text before the first ``;``)
        and ``'abs'`` (everything after it, unmodified). Callers assign the
        result to two columns positionally, so the order matters.

    Notes
    -----
    When the cell contains no ``;`` the whole text is returned as ``'abs'``
    and ``'title'`` is NaN (the notebook's placeholder for a missing title)
    instead of raising ``IndexError``. A non-string cell (e.g. NaN) is passed
    through as ``'abs'`` with a NaN title instead of raising
    ``AttributeError``.
    """
    text = row[abs]
    if not isinstance(text, str):
        # Missing / non-text cell: nothing to split.
        return pd.Series({'title': np.nan, 'abs': text})

    # partition always yields three parts, so a missing separator cannot
    # trigger an out-of-range index as split(";", 1)[1] did.
    head, sep, tail = text.partition(";")
    if not sep:
        return pd.Series({'title': np.nan, 'abs': head})
    return pd.Series({'title': head, 'abs': tail})

# Experiment 2 reformat: one CSV per input pickle. The b/c text columns hold
# "title;abstract" strings that are split into separate columns.
files = experiment_data['exp_2']
for idx, file in enumerate(files):
    df = pd.read_pickle(f'{path}/{file}')
    rename_mapping = {"a_text": "a_abstract", "b_text": "b_abstract", "c_text": "c_abstract"}
    df = df.rename(columns=rename_mapping)

    # Placeholder columns, created in this order so the output column order
    # is stable; the b/c titles are overwritten by the split below.
    for new_col in ("a_title", "a_categories", "b_title", "c_title"):
        df[new_col] = np.nan

    # Replace each combined column with its (title, abstract) pair.
    for title_col, abs_col in (('b_title', 'b_abstract'), ('c_title', 'c_abstract')):
        split_parts = df.apply(partial(unsqueeze_title_abs, abs=abs_col), axis=1)
        df[[title_col, abs_col]] = split_parts

    df.to_csv(f'data_exp_2_{idx+1}.csv', index=False)



In [81]:
# handle experiment 3 reformat:

def unsqueeze_list(row):
    """Parse a numbered "N) title; abstract" list into parallel lists.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        ``row["list"]`` must be a string such as
        ``"1) Title A; Abstract A ; 2) Title B; Abstract B"``.

    Returns
    -------
    dict
        ``{"title": [...], "abstract": [...]}`` with one stripped entry per
        parsed item, in order of appearance. Items lacking a ``;`` separator,
        a title, or an abstract are skipped.

    Notes
    -----
    Items are located by their ``N)`` marker (at the start of the text or of
    a line, or right after the ``;`` that closes the previous item); each
    item is then cut at its first ``;``. The previous single-regex approach
    ended the abstract at the first ``.`` or ``;``, which truncated every
    multi-sentence abstract to its first sentence. ``row["list_true"]`` is no
    longer read: its value was converted to ``int`` but never used.
    """
    input_text = row["list"]

    # Start of an item: "N)" at text/line start or directly after a ";".
    item_start = re.compile(r'(?:^|;)\s*\d+\)\s*', re.MULTILINE)

    dic = {"title": [], "abstract": []}
    markers = list(item_start.finditer(input_text))
    for pos, marker in enumerate(markers):
        # An item runs up to the next marker (or the end of the text).
        if pos + 1 < len(markers):
            item_end = markers[pos + 1].start()
        else:
            item_end = len(input_text)

        title, sep, abstract = input_text[marker.end():item_end].partition(";")
        title = title.strip()
        # Drop a dangling item terminator such as "... text ;".
        abstract = abstract.strip().rstrip(";").rstrip()

        if sep and title and abstract:
            dic["title"].append(title)
            dic["abstract"].append(abstract)

    return dic



# Experiment 3 reformat: one JSON file (list of records) per input pickle.
files = experiment_data['exp_3']
for idx, file in enumerate(files):
    df = pd.read_pickle(f'{path}/{file}')
    rename_mapping = {
        "a_text": "start_title",
        "main_text": "start_abstract",
        "main_categories": "start_categories",
        "target_text": "target_abstract",
    }
    df = df.rename(columns=rename_mapping)

    # Placeholder so the target title column exists before the split fills it.
    df["target_title"] = np.nan

    # Split each combined "title;abstract" column into its two columns.
    for title_col, abs_col in (
        ('start_title', 'start_abstract'),
        ('target_title', 'target_abstract'),
    ):
        split_parts = df.apply(partial(unsqueeze_title_abs, abs=abs_col), axis=1)
        df[[title_col, abs_col]] = split_parts

    # Replace the raw numbered list text with its parsed title/abstract dict.
    df["list"] = df.apply(unsqueeze_list, axis=1)

    df = df.drop(columns='main_id')
    df.to_json(f'data_exp_3_{idx+1}.json', indent=2, index=False, orient='records')

In [47]:
import re

# Scratch check of the list-parsing regex on a hand-copied sample.
text = "1) Non-extendablity of Shelukhin's quasimorphism and non-triviality of Reznikov's class; Shelukhin constructed a quasimorphism on the universal covering of the group of Hamiltonian diffeomorphisms for a general closed symplectic manifold. In the present paper, we prove the non-extendability of that quasimorphism for certain symplectic manifolds, such as a blow-up of torus and the product of a surface ; 2) Towards low-dimensionalization of four-dimensional QCD; nspired by the one-dimensional color-electric flux-tube in a hadron, we propose a possible way of low-dimensionalization of 4D QCD."

# "N) <title> ; <abstract>" where the abstract stops at the first ';', '.'
# or end of input.
pattern = r'\d+\)\s*(?P<title>[^;]+?)\s*;\s*(?P<abstract>[^;]+?)(?:;|\.|$)'

# Collected results, one list entry per matched item.
dic = {"title": [], "abstract": []}

# Walk every match and file both named groups under their own key.
for match in re.compile(pattern).finditer(text):
    for field in ("title", "abstract"):
        dic[field].append(match.group(field).strip())

print(dic)

{'title': ["Non-extendablity of Shelukhin's quasimorphism and non-triviality of Reznikov's class", 'Towards low-dimensionalization of four-dimensional QCD'], 'abstract': ['Shelukhin constructed a quasimorphism on the universal covering of the group of Hamiltonian diffeomorphisms for a general closed symplectic manifold', 'nspired by the one-dimensional color-electric flux-tube in a hadron, we propose a possible way of low-dimensionalization of 4D QCD']}


In [83]:
from fireworks.client import Fireworks