In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize,sent_tokenize
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
from nltk import pos_tag

from pprint import pprint
import pandas as pd
from synergy_dataset import Dataset, iter_datasets

# NLTK data packages needed by the tokenizer, POS tagger, stopword list and
# lemmatizer used further down.
# Newer NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled 'punkt' /
# 'averaged_perceptron_tagger' packages, so both generations are fetched to
# keep the notebook runnable on a fresh environment with either version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
)
for nltk_resource in NLTK_RESOURCES:
    nltk.download(nltk_resource)

In [None]:
def _join_author_names(authorships):
    """Return the authors' display names joined by ', ' ('' when none are usable)."""
    if not isinstance(authorships, (list, tuple)):
        # Covers None / NaN when a record carries no authorship list.
        return ""
    names = []
    for entry in authorships:
        author = entry.get("author") if isinstance(entry, dict) else None
        name = author.get("display_name") if isinstance(author, dict) else None
        if name:
            # Entries without a display name are skipped; str.join would
            # otherwise raise TypeError on None.
            names.append(str(name))
    return ", ".join(names)


def get_dataset(file_name):
    """Load one dataset through ``synergy_dataset.Dataset`` as a flat DataFrame.

    Per-record metadata (authorships, citation count, publication year) is
    read with ``Dataset.to_dict`` and left-joined on ``"openalex-id"`` with
    the frame returned by ``Dataset.to_frame``. The nested authorship list is
    replaced by a single ``author_names`` string column.

    Parameters
    ----------
    file_name : str
        Dataset name passed straight to ``Dataset`` (e.g. ``"Hall_2012"``).

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``to_dict``, with the columns
        ``openalex-id``, ``cited_by_count``, ``publication_year``, whatever
        columns ``to_frame`` provides, and ``author_names``.
    """
    dataset = Dataset(file_name)
    # Each value is assumed to be a dict keyed by the requested field names
    # (it is accessed with .get below) -- TODO confirm against synergy_dataset.
    data_dict = dataset.to_dict(["authorships", "cited_by_count", "publication_year"])

    records = [
        {
            "openalex-id": openalex_id,
            "authorships": data.get("authorships", []),
            "cited_by_count": data.get("cited_by_count", None),
            "publication_year": data.get("publication_year", None),
        }
        for openalex_id, data in data_dict.items()
    ]
    # Explicit columns keep the merge key present even when there are no records.
    df_metadata = pd.DataFrame(
        records,
        columns=["openalex-id", "authorships", "cited_by_count", "publication_year"],
    )

    # to_frame() is expected to be indexed by 'openalex_id'; move the index
    # into a column and align its name with the metadata frame's key.
    df_labels = (
        dataset.to_frame()
        .reset_index()
        .rename(columns={'openalex_id': 'openalex-id'})
    )
    df = pd.merge(df_metadata, df_labels, on="openalex-id", how="left")

    df['author_names'] = df['authorships'].apply(_join_author_names)
    return df.drop(columns=['authorships'])

In [None]:

# NLTK's English stopword list, held in a set for fast membership checks
# inside clean_text.
stop_words = {*stopwords.words("english")}
# Single lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once at definition time rather than on every call.
_RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)  # runs of whitespace (collapsed to one space)
_RE_TAGS = re.compile(r"<[^>]+>")  # HTML/XML-style tags
_RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)  # anything but ASCII/European letters and space
_RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)  # stand-alone single letters
# Variants used when for_embedding=True: the punctuation , . ! ? is kept.
_RE_ASCII_PUNCT = re.compile(r"[^A-Za-zÀ-ž,.!? ]", re.IGNORECASE)
# NOTE(review): because of the surrounding \b, a punctuation mark is matched
# here only when it sits directly between two word characters -- confirm
# this is intended.
_RE_SINGLECHAR_PUNCT = re.compile(r"\b[A-Za-zÀ-ž,.!?]\b", re.IGNORECASE)


def clean_text(text, for_embedding=False):
    """Normalise raw text and keep only its nouns and adjectives.

    Steps: strip tags, replace every character outside the allowed letter
    ranges with a space, blank out single-character tokens, collapse
    whitespace, tokenize, POS-tag, and keep tokens whose tag starts with
    ``NN`` or ``JJ``. Unless ``for_embedding`` is set, the kept tokens are
    additionally lowercased, filtered against ``stop_words`` and lemmatized.

    Parameters
    ----------
    text : object
        Input text; non-string values are converted with ``str``.
        ``None`` and float NaN (pandas' missing-value marker) are treated as
        empty text instead of being cleaned as the literal words
        "None" / "nan".
    for_embedding : bool, default False
        When True, the punctuation ``, . ! ?`` survives the character filter
        and the lowercase / stopword / lemmatization step is skipped.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``""`` when nothing remains.
    """
    # Missing values: NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    re_ascii = _RE_ASCII_PUNCT if for_embedding else _RE_ASCII
    re_singlechar = _RE_SINGLECHAR_PUNCT if for_embedding else _RE_SINGLECHAR

    text = _RE_TAGS.sub(" ", str(text))
    text = re_ascii.sub(" ", text)
    text = re_singlechar.sub(" ", text)
    text = _RE_WSPACE.sub(" ", text)

    # Nothing left to tokenize: same result as running the NLTK pipeline on
    # blank text, without the tokenizer/tagger calls.
    if not text.strip():
        return ""

    pos_tags = pos_tag(word_tokenize(text))

    # Filter words to keep only nouns and adjectives
    filtered_words = [
        word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))
    ]

    if not for_embedding:
        # Lowercase first so the stopword lookup sees the lowercased token,
        # then lemmatize what survives.
        lowered = (word.lower() for word in filtered_words)
        filtered_words = [
            lemmatizer.lemmatize(word) for word in lowered if word not in stop_words
        ]

    return " ".join(filtered_words)


In [None]:
# Dataset to process; the commented-out names are alternatives to switch to.
file_name = "Hall_2012"
# file_name = "Jeyaraman_2020"
# file_name = "Radjenovic_2013"
# file_name = "Smid_2020"
df = get_dataset(file_name)
# A missing title or abstract is NaN, and NaN + str stays NaN, which would
# then be cleaned as the literal text "nan" and drop the other field too.
# Filling with '' keeps whichever of the two fields is present.
df['Corpus'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
df['Corpus'] = df['Corpus'].apply(clean_text)
# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Write the cleaned frame to "<file_name>_cleaned.csv" (a relative path),
# leaving the DataFrame index out of the file.
output_csv = "".join([file_name, "_cleaned.csv"])
df.to_csv(path_or_buf=output_csv, index=False)