In [2]:
import ast
import os
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine

In [None]:
# Connection string for the target PostgreSQL database. It can be overridden
# through the ARXIV_DATABASE_URL environment variable, so a different host or
# credentials never have to be edited into (or committed with) the notebook.
# When the variable is unset the original local database URL is used.
DATABASE_URL = os.environ.get(
    'ARXIV_DATABASE_URL',
    'postgresql://postgres@localhost:5432/arxivdb'
)
engine = create_engine(DATABASE_URL)

## `category` Table

In [None]:
# Load the category list, keeping only the two columns used further down
# (in this order).
category_df = pd.read_csv('../Dataset/categories.csv')[['code', 'name']]

In [None]:
# Rename the CSV headers to the column names used when the frame is written
# to the `category` table.
category_df = category_df.rename(
    columns={'code': 'categorycode', 'name': 'categoryname'}
)

In [7]:
# Append the rows to the `category` table; the DataFrame index is not written.
category_df.to_sql(name='category', con=engine, if_exists='append', index=False)

155

## `author` Table

In [8]:
# Read `id` as text. Left to type inference, a numeric-looking identifier such
# as "0704.0010" is parsed as the float 704.001, which silently drops its
# leading and trailing zeros before it is written out as `arxivid`.
# NOTE(review): assumes the CSV holds arXiv-style identifiers — confirm.
df = pd.read_csv('../Dataset/data1000000.csv', dtype={'id': str})

In [None]:
# Collect every distinct author name in the form "<initials> <lastname>".
# These strings become the keys of `author_id_map`, so any later lookup has to
# build its key in exactly the same way.
unique_authors = set()

# Only `authors_parsed` is needed, so iterate that column directly instead of
# `df.iterrows()`, which builds a full Series object for every row.
for raw_authors in df['authors_parsed'].dropna():
    try:
        # The cell text is evaluated as a Python literal; each item is indexed
        # as a[0] (last name) and, when present, a[1] (initials / first names).
        for a in ast.literal_eval(raw_authors):
            lastname = a[0].strip()
            initials = a[1].strip() if len(a) > 1 else ''
            unique_authors.add(f"{initials} {lastname}")
    except Exception:
        # Best effort: a malformed cell is skipped (names added before the
        # failure are kept). `Exception` rather than a bare `except` so that
        # interrupting the kernel still stops this long-running loop.
        continue

In [None]:
# Assign ids 1..N in sorted name order. Enumerating the set directly relies on
# set iteration order, which for strings can differ between interpreter runs,
# so the same author could get a different id after a kernel restart.
author_id_map = {
    name: author_id
    for author_id, name in enumerate(sorted(unique_authors), start=1)
}

# Two-column frame for the `author` table, id first.
authors_df = pd.DataFrame({
    'authorid': list(author_id_map.values()),
    'authorname': list(author_id_map.keys()),
})

In [None]:
# Append the rows to the `author` table; the DataFrame index is not written.
authors_df.to_sql(name='author', con=engine, if_exists='append', index=False)

392

## `paper` Table

In [None]:
def get_submitter_id(full_name):
    """Look up the author id for a submitter name.

    The name is first tried verbatim against ``author_id_map``. If that
    misses and the name has at least two whitespace-separated tokens, a
    second lookup is made with the key ``"<first letter>. <last token>"``.

    Returns the mapped id, or ``None`` when ``full_name`` is not a string
    or neither lookup succeeds.
    """
    if isinstance(full_name, str):
        if full_name in author_id_map:
            return author_id_map[full_name]
        tokens = full_name.split()
        if len(tokens) > 1:
            # Abbreviated form, e.g. "John Smith" -> "J. Smith".
            return author_id_map.get(f"{tokens[0][0]}. {tokens[-1]}")
    return None

In [None]:
# Normalise `update_date` to plain `date` objects; values that cannot be
# parsed are coerced to a missing value instead of raising.
df['update_date'] = pd.to_datetime(
    df['update_date'],
    errors='coerce'
).dt.date

# Resolve each submitter name to an author id (None when nothing matches).
df['submitterid'] = df['submitter'].apply(get_submitter_id)

# Keep only papers whose submitter was resolved, then restore an integer
# dtype: mixing integer ids with None makes pandas hold the column as float64,
# so without the cast the ids would be written out as 12.0 rather than 12.
df = df[df['submitterid'].notna()].astype({'submitterid': 'int64'})

In [14]:
# Independent copy of just the columns that go into the `paper` table.
papers_df = df.loc[
    :,
    [
        'id', 'title', 'abstract', 'license', 'doi',
        'report-no', 'journal-ref', 'comments',
        'update_date', 'submitterid',
    ],
].copy()

In [15]:
# Map the CSV column names onto the names used when writing to `paper`;
# columns not listed here keep their names.
papers_df = papers_df.rename(
    columns={
        'id': 'arxivid',
        'update_date': 'updatedate',
        'report-no': 'reportno',
        'journal-ref': 'journalref',
    }
)

In [None]:
# Append the rows to the `paper` table; the DataFrame index is not written.
papers_df.to_sql(name='paper', con=engine, if_exists='append', index=False)

511

## `papercategory` Table

In [None]:
# One {'arxivid', 'categorycode'} record per (paper, category) pair.
# `categories` is split on whitespace, and rows where it is missing are
# skipped. Only the two columns that are read get iterated, which avoids the
# per-row Series that `df.iterrows()` would build for every paper.
paper_category_rows = [
    {'arxivid': arxiv_id, 'categorycode': code}
    for arxiv_id, categories in zip(df['id'], df['categories'])
    if not pd.isna(categories)
    for code in categories.split()
]

In [18]:
# Explicit columns keep the frame well-formed even when no records were
# produced. Exact duplicate (paper, category) pairs are dropped before the
# insert, mirroring what is done for `paper_author_df`.
# NOTE(review): presumably `papercategory` is keyed on this pair — confirm.
paper_category_df = pd.DataFrame(
    paper_category_rows,
    columns=['arxivid', 'categorycode']
).drop_duplicates()

In [None]:
# Every distinct category code that appears on any paper. `str.split()` with
# no arguments already yields tokens without surrounding whitespace.
all_categories = {
    code
    for categories in df['categories'].dropna()
    for code in categories.split()
}

In [None]:
# Category codes that are already stored in the database.
existing_cats = pd.read_sql(sql='SELECT categorycode FROM category', con=engine)
existing_codes = set(existing_cats['categorycode'])

In [26]:
# Codes used by the papers that the `category` table does not contain yet.
missing_cats = all_categories.difference(existing_codes)
missing_cats

{'astro-ph'}

In [None]:
# Display names for codes that were found to be absent from the `category`
# table. Extend this mapping when a new code shows up in `missing_cats`.
MISSING_CATEGORY_NAMES = {
    'astro-ph': 'Astrophysics',
    'hep-ph': 'High Energy Physics - Phenomenology',
}

# Fail loudly rather than invent a display name for an unrecognised code.
unnamed_cats = sorted(set(missing_cats).difference(MISSING_CATEGORY_NAMES))
if unnamed_cats:
    raise ValueError(
        f"Add a name to MISSING_CATEGORY_NAMES for: {unnamed_cats}"
    )

# Insert exactly the codes reported in `missing_cats` instead of a hard-coded
# row. Once the missing-category check is recomputed after this insert it is
# empty, so rerunning the cell appends nothing rather than a duplicate.
new_cat = pd.DataFrame(
    [
        {'categorycode': code, 'categoryname': MISSING_CATEGORY_NAMES[code]}
        for code in sorted(missing_cats)
    ],
    columns=['categorycode', 'categoryname']
)

new_cat.to_sql(
    'category',
    con=engine,
    if_exists='append',
    index=False
)

1

In [None]:
# Append the rows to the `papercategory` table; the DataFrame index is not
# written.
paper_category_df.to_sql(
    name='papercategory', con=engine, if_exists='append', index=False
)

78

## `paperauthor` Table

In [29]:
# One {'arxivid', 'authorid'} record per (paper, author) pair.
paper_author_rows = []

# Only `id` and `authors_parsed` are read, so iterate those two columns
# instead of building a Series per row with `df.iterrows()`.
for arxiv_id, raw_authors in zip(df['id'], df['authors_parsed']):
    if pd.isna(raw_authors):
        continue
    try:
        for a in ast.literal_eval(raw_authors):
            # Build the key exactly as it was built when `unique_authors`
            # (and hence `author_id_map`) was filled: strip each part, then
            # join as "<initials> <lastname>". Stripping the joined string
            # instead yields a different key whenever a part is empty or
            # padded, and those authors were silently left out. A name with
            # no second element is treated as having empty initials rather
            # than aborting the rest of the paper's author list.
            lastname = a[0].strip()
            initials = a[1].strip() if len(a) > 1 else ''
            author_id = author_id_map.get(f"{initials} {lastname}")
            if author_id is not None:
                paper_author_rows.append({
                    'arxivid': arxiv_id,
                    'authorid': author_id
                })
    except Exception:
        # Best effort: skip a malformed cell (records appended before the
        # failure are kept). `Exception` rather than a bare `except` so that
        # interrupting the kernel still stops the loop.
        continue

In [None]:
# Tabulate the collected records, then drop exact duplicate
# (arxivid, authorid) pairs.
paper_author_df = pd.DataFrame(paper_author_rows)
paper_author_df = paper_author_df.drop_duplicates()

In [None]:
# Append the rows to the `paperauthor` table; the DataFrame index is not
# written.
paper_author_df.to_sql(
    name='paperauthor', con=engine, if_exists='append', index=False
)

195

## `version` Table

In [None]:
# One {'arxivid', 'versionno', 'createdate'} record per entry in `versions`.
version_rows = []

# Only `id` and `versions` are read, so iterate those two columns instead of
# building a Series per row with `df.iterrows()`.
for arxiv_id, raw_versions in zip(df['id'], df['versions']):
    try:
        # The cell text is evaluated as a Python literal; each item is read
        # through `.get('version')` and `.get('created')`.
        for v in ast.literal_eval(raw_versions):
            created_str = v.get('created')
            try:
                create_date = datetime.strptime(
                    created_str, "%a, %d %b %Y %H:%M:%S %Z"
                ).date()
            except (ValueError, TypeError):
                # ValueError: the text does not match the format;
                # TypeError: `created` is missing or not a string.
                create_date = None
            version_rows.append({
                'arxivid': arxiv_id,
                'versionno': v.get('version'),
                'createdate': create_date
            })
    except Exception:
        # Best effort: skip a missing or malformed cell (records appended
        # before the failure are kept). `Exception` rather than a bare
        # `except` so that interrupting the kernel still stops the loop.
        continue

In [33]:
# Tabulate the collected version records (one row per record).
version_df = pd.DataFrame(data=version_rows)

In [None]:
# Append the rows to the `version` table; the DataFrame index is not written.
version_df.to_sql(name='version', con=engine, if_exists='append', index=False)

117