In [14]:
import os
import pandas as pd
import re

def add_filename_column(csv_path):
    """Add a ``filename`` column holding the CSV's base name, in place.

    The file at ``csv_path`` is read, every row gets the file's name without
    its final extension, and the result is written back to the same path
    (without the index column).

    Args:
        csv_path: Path of the CSV file to rewrite.
    """
    # splitext strips only the last extension, so a name with inner dots
    # such as "journal.v2.csv" keeps its full stem ("journal.v2") instead
    # of being cut at the first dot.
    filename = os.path.splitext(os.path.basename(csv_path))[0]
    df = pd.read_csv(csv_path)
    df['filename'] = filename
    df.to_csv(csv_path, index=False)

def extract_classname(row):
    """Return the text inside the first ``{{...}}`` marker of a row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            ``'ParaCleanedContent'`` entry.

    Returns:
        The text between the first pair of double braces, or ``None`` when
        the cell has no such marker or is not a string.
    """
    content = row['ParaCleanedContent']
    # Empty CSV cells are loaded by pandas as NaN (a float); re.search would
    # raise TypeError on them, so treat any non-string as "no class name".
    if not isinstance(content, str):
        return None
    match = re.search(r'{{(.*?)}}', content)
    return match.group(1) if match else None

def remove_markup_and_class(row):
    """Strip every ``{{...}}`` marker from a row's ``ParaCleanedContent``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            ``'ParaCleanedContent'`` entry.

    Returns:
        The cell text with all double-brace markers removed and surrounding
        whitespace trimmed. A non-string cell (such as the NaN pandas uses
        for an empty CSV field) is returned unchanged.
    """
    content = row['ParaCleanedContent']
    # re.sub raises TypeError on NaN/None; leave such cells as they are so
    # they round-trip through the CSV as empty fields.
    if not isinstance(content, str):
        return content
    return re.sub(r'{{.*?}}', '', content).strip()

def remove_classname_from_paracontent(row):
    """Strip every ``{{...}}`` marker from a row's ``ParaContent``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            ``'ParaContent'`` entry.

    Returns:
        The cell text with all double-brace markers removed and surrounding
        whitespace trimmed. A non-string cell (such as the NaN pandas uses
        for an empty CSV field) is returned unchanged.
    """
    content = row['ParaContent']
    # re.sub raises TypeError on NaN/None; leave such cells as they are so
    # they round-trip through the CSV as empty fields.
    if not isinstance(content, str):
        return content
    return re.sub(r'{{.*?}}', '', content).strip()

def add_classname_column(csv_path):
    """Extract ``{{...}}`` class markers into a ``classname`` column, in place.

    For every row of the CSV at ``csv_path``:

    * ``classname`` receives the result of ``extract_classname`` (taken from
      ``ParaCleanedContent`` before it is cleaned);
    * ``ParaCleanedContent`` and ``ParaContent`` have their markers removed.

    ``classname`` is written as the last column and the file is overwritten
    (without the index column).

    Running the function again on an already processed file keeps the class
    names stored by the earlier run: once the markers are stripped nothing
    can be extracted any more, so an existing ``classname`` value is only
    replaced when a new marker is actually found.

    Args:
        csv_path: Path of the CSV file to rewrite.
    """
    df = pd.read_csv(csv_path)

    # Snapshot the rows first so all three derived columns are computed from
    # the values as read, independent of assignment order. Iterating
    # explicitly also keeps a header-only CSV (no rows) working, where
    # apply(axis=1) is not guaranteed to hand back a Series.
    rows = [row for _, row in df.iterrows()]
    classnames = [extract_classname(row) for row in rows]
    cleaned_content = [remove_markup_and_class(row) for row in rows]
    para_content = [remove_classname_from_paracontent(row) for row in rows]

    if 'classname' in df.columns:
        # Re-run: fall back to the stored value where no marker is left.
        classnames = [
            new if new is not None else old
            for new, old in zip(classnames, df['classname'])
        ]

    df['classname'] = classnames
    df['ParaCleanedContent'] = cleaned_content
    df['ParaContent'] = para_content

    # Reorder only when writing, so no column is ever assigned on a sliced
    # frame; 'classname' always ends up as the last column.
    ordered = [col for col in df.columns if col != 'classname'] + ['classname']
    df[ordered].to_csv(csv_path, index=False)

def process_folder(folder_path):
    """Annotate every CSV in a folder and merge them into one dataset.

    Each ``.csv`` file directly inside ``folder_path`` is rewritten in place
    by ``add_filename_column`` and ``add_classname_column``; the processed
    files are then concatenated and saved as ``combined_dataset.csv`` in the
    same folder.

    Args:
        folder_path: Directory containing the input CSV files.
    """
    output_name = 'combined_dataset.csv'
    frames = []
    # sorted() makes the row order of the combined file reproducible;
    # os.listdir returns entries in arbitrary order.
    for entry in sorted(os.listdir(folder_path)):
        if not entry.endswith('.csv'):
            continue
        # The output lives in the input folder: skip it so a second run does
        # not treat the previous result as input and duplicate every row.
        if entry == output_name:
            continue
        csv_path = os.path.join(folder_path, entry)
        add_filename_column(csv_path)
        add_classname_column(csv_path)
        frames.append(pd.read_csv(csv_path))

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all accumulated rows on every iteration. pd.concat rejects
    # an empty list, so fall back to an empty frame when no CSV was found.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    combined_df.to_csv(os.path.join(folder_path, output_name), index=False)

# Script entry point for this cell: process every CSV in the folder below and
# write the merged dataset next to them.
if __name__ == "__main__":
    # NOTE(review): absolute path on the author's machine; change it before
    # running elsewhere.
    folder_path = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/test'
    process_folder(folder_path)


In [15]:
import pandas as pd

def remove_empty_classname_rows(csv_path):
    """Rewrite a CSV in place, keeping only rows that have a ``classname``.

    Rows whose ``classname`` cell is missing (NaN after loading) are
    discarded; the remaining rows are saved back to ``csv_path`` without
    the index column.

    Args:
        csv_path: Path of the CSV file to filter.
    """
    frame = pd.read_csv(csv_path)
    has_classname = frame['classname'].notna()
    frame[has_classname].to_csv(csv_path, index=False)

# Script entry point for this cell: drop the rows without a class name from
# the dataset file below, rewriting it in place.
if __name__ == "__main__":
    # NOTE(review): absolute path on the author's machine; change it before
    # running elsewhere.
    csv_path = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/dataset/journals_ep_dataset.csv'
    remove_empty_classname_rows(csv_path)


  df = pd.read_csv(csv_path)
