### Step 1: Load train_labeled.csv

In [3]:
import pandas as pd

# Location of the labeled training contexts (absolute path on the author's machine).
labeled_csv_path = r"D:\Data Science Projects\Data Citation Intent Classification\data\processed\train_labeled.csv"
train_df = pd.read_csv(labeled_csv_path)

# Report how many rows were read, then preview the first few.
loaded_row_count = len(train_df)
print(f"✅ Loaded {loaded_row_count} rows from train_labeled.csv")
train_df.head()


✅ Loaded 44899 rows from train_labeled.csv


Unnamed: 0,article_id,ref_id,context,labels
0,10.1002_2017jc013030,,"(Volk and Hoffert, 1985;","[('https://doi.org/10.17882/49388', 'Primary')]"
1,10.1002_2017jc013030,,"Honjo et al., 2014;","[('https://doi.org/10.17882/49388', 'Primary')]"
2,10.1002_2017jc013030,,"Legendre et al., 2015)","[('https://doi.org/10.17882/49388', 'Primary')]"
3,10.1002_2017jc013030,,"(Riser and Johnson, 2008;","[('https://doi.org/10.17882/49388', 'Primary')]"
4,10.1002_2017jc013030,,"Graff et al., 2012","[('https://doi.org/10.17882/49388', 'Primary')]"


### Step 2: Keep only contexts labeled Primary or Secondary

In [4]:
import ast


def _parse_labels(value):
    """Return the label list for one row, parsing it only if it is still a string.

    Freshly read from CSV, a `labels` cell is the string form of a list of
    (dataset_id, type) tuples. After this cell has run once the cell already
    holds the parsed list, and `ast.literal_eval` raises ValueError on a
    non-string, so already-parsed values are passed through unchanged. That
    keeps the cell safe to re-run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Keep only contexts with labels (drop missing cells and the literal '[]').
# `.copy()` gives an independent frame for the column assignments below.
train_df = train_df[train_df['labels'].notnull() & (train_df['labels'] != '[]')].copy()

# Convert string of list back to actual list (only where still needed)
train_df['labels'] = train_df['labels'].apply(_parse_labels)

# Take the type of the first (dataset_id, type) pair for each context;
# an empty list falls back to 'Missing' and is dropped by the filter below.
train_df['dataset_type'] = train_df['labels'].apply(
    lambda pairs: pairs[0][1] if len(pairs) > 0 else 'Missing'
)

# Keep only contexts with Primary or Secondary.
# `.copy()` so that later cells can add columns to `train_df` without
# assigning into a slice of the previous frame (SettingWithCopyWarning).
train_df = train_df[train_df['dataset_type'].isin(['Primary', 'Secondary'])].copy()

print(f"✅ Filtered to {len(train_df)} contexts with Primary/Secondary labels")
train_df.head()


✅ Filtered to 21215 contexts with Primary/Secondary labels


Unnamed: 0,article_id,ref_id,context,labels,dataset_type
0,10.1002_2017jc013030,,"(Volk and Hoffert, 1985;","[(https://doi.org/10.17882/49388, Primary)]",Primary
1,10.1002_2017jc013030,,"Honjo et al., 2014;","[(https://doi.org/10.17882/49388, Primary)]",Primary
2,10.1002_2017jc013030,,"Legendre et al., 2015)","[(https://doi.org/10.17882/49388, Primary)]",Primary
3,10.1002_2017jc013030,,"(Riser and Johnson, 2008;","[(https://doi.org/10.17882/49388, Primary)]",Primary
4,10.1002_2017jc013030,,"Graff et al., 2012","[(https://doi.org/10.17882/49388, Primary)]",Primary


### Step 3: Clean context text

In [5]:
import re

def clean_text(text):
    """Clean one context string.

    Steps, in order:
      1. collapse every whitespace run (including newlines) to one space,
      2. delete parenthesized spans ``(...)`` and bracketed spans ``[...]``,
      3. delete every character other than ``A-Za-z0-9``, ``.,;:!?'"`` and
         space (this also drops unmatched parentheses, hyphens, slashes and
         non-ASCII letters),
      4. collapse whitespace again and strip both ends.

    Args:
        text: The raw context. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing.
    """
    # A missing cell arrives as None or float NaN (NaN is the only float that
    # is not equal to itself). str() would turn it into the text "None"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    text = re.sub(r'\s+', ' ', text)                 # flatten whitespace so `.` below can span former newlines
    text = re.sub(r'\(.*?\)', '', text)             # remove content in parentheses
    text = re.sub(r'\[[^\]]*\]', '', text)          # remove content in brackets
    text = re.sub(r'[^A-Za-z0-9.,;:!?\'" ]+', '', text)  # keep only letters, numbers, and basic punctuation
    # The removals above can leave two spaces side by side
    # ("foo (bar) baz" -> "foo  baz"), so collapse once more.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run the cleaner over every raw context and store the result as a new column.
cleaned_contexts = train_df['context'].map(clean_text)
train_df['clean_context'] = cleaned_contexts
train_df.head()


Unnamed: 0,article_id,ref_id,context,labels,dataset_type,clean_context
0,10.1002_2017jc013030,,"(Volk and Hoffert, 1985;","[(https://doi.org/10.17882/49388, Primary)]",Primary,"Volk and Hoffert, 1985;"
1,10.1002_2017jc013030,,"Honjo et al., 2014;","[(https://doi.org/10.17882/49388, Primary)]",Primary,"Honjo et al., 2014;"
2,10.1002_2017jc013030,,"Legendre et al., 2015)","[(https://doi.org/10.17882/49388, Primary)]",Primary,"Legendre et al., 2015"
3,10.1002_2017jc013030,,"(Riser and Johnson, 2008;","[(https://doi.org/10.17882/49388, Primary)]",Primary,"Riser and Johnson, 2008;"
4,10.1002_2017jc013030,,"Graff et al., 2012","[(https://doi.org/10.17882/49388, Primary)]",Primary,"Graff et al., 2012"


### Step 4: Save preprocessed data

In [6]:
# Columns written to the preprocessed file; the raw context and the parsed
# label list are left out.
export_columns = ['article_id', 'ref_id', 'clean_context', 'dataset_type']
preprocessed_df = train_df[export_columns]

# index=False keeps the DataFrame index out of the CSV.
preprocessed_df.to_csv("train_preprocessed.csv", index=False)
print("✅ Saved cleaned dataset to train_preprocessed.csv")


✅ Saved cleaned dataset to train_preprocessed.csv
