In [1]:
import os
import pandas as pd
from collections import defaultdict
import logging
import gc

# Set up logging: every record goes to a log file in the working directory and to the console.
# force=True replaces whatever handlers are already attached to the root logger. Without it,
# basicConfig() does nothing once the root logger is configured (e.g. when this cell is re-run),
# and the FileHandler constructed below would be opened but never attached or closed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/math_papers_log.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Define all math subcategories (32 subcategories)
SELECTED_SUBCATEGORIES = [
    'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA',
    'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS',
    'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT',
    'math.HO', 'math.IT', 'math.KT', 'math.LO', 'math.MG',
    'math.MP', 'math.NA', 'math.NT', 'math.OA', 'math.OC',
    'math.PR', 'math.QA', 'math.RA', 'math.RT', 'math.SG',
    'math.SP', 'math.ST'
]

# Map deprecated subcategory names to the math.* name that replaces them
DEPRECATED_MAP = {
    'q-alg': 'math.QA',
    'alg-geom': 'math.AG',
    'dg-ga': 'math.DG',
    'funct-an': 'math.FA'
}

# Main category label written to the counts table
MAIN_CATEGORY = 'math'

# Output paths
OUTPUT_DIR = '/kaggle/working/'
COUNTS_OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'math_papers_per_category.csv')

# Verify output directory: create it when missing, then require write access
logger.info(f"Checking output directory: {OUTPUT_DIR}")
if not os.path.exists(OUTPUT_DIR):
    try:
        os.makedirs(OUTPUT_DIR)
        logger.info(f"Created output directory: {OUTPUT_DIR}")
    except Exception as e:
        logger.error(f"Failed to create output directory {OUTPUT_DIR}: {e}")
        raise
if not os.access(OUTPUT_DIR, os.W_OK):
    logger.error(f"Output directory {OUTPUT_DIR} is not writable")
    raise PermissionError(f"Output directory {OUTPUT_DIR} is not writable")

# Path to input CSV
csv_path = '/kaggle/input/mathematics/math_papers_3000_per_category.csv'

# Verify file exists; fail early with a clear error instead of inside the chunked reader
logger.info(f"Checking for CSV file at {csv_path}...")
if not os.path.exists(csv_path):
    logger.error(f"CSV file not found: {csv_path}")
    raise FileNotFoundError(f"CSV file not found: {csv_path}")

# Initialize storage
paper_counts = defaultdict(int)  # Paper count per subcategory; unseen subcategories read as 0

# Process CSV in chunks.
# A paper is counted under its `subcategory` when that subcategory is selected and every
# whitespace-separated tag in `categories` is a math tag. Deprecated tags are translated
# through DEPRECATED_MAP first; all mapped values already start with 'math.', so a single
# prefix test is sufficient.
logger.info("Processing CSV file for math subcategories...")
try:
    chunk_size = 10000  # Balanced for ~100–500 MB CSV
    total_papers = 0
    for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
        # Rows missing either field are ignored
        labelled = chunk.dropna(subset=['categories', 'subcategory'])
        labelled = labelled[labelled['subcategory'].isin(SELECTED_SUBCATEGORIES)]
        math_only = labelled['categories'].map(
            lambda tags: all(
                DEPRECATED_MAP.get(tag, tag).startswith('math.')
                for tag in tags.split()
            )
        ).astype(bool)
        # Tally the surviving rows per subcategory in one vectorised step
        chunk_counts = labelled.loc[math_only, 'subcategory'].value_counts()
        for subcat, n_papers in chunk_counts.items():
            paper_counts[subcat] += int(n_papers)
        # Progress counts every row read, not only the rows that were tallied
        total_papers += len(chunk)
        logger.info(f"Processed {total_papers} papers...")
        # Clear memory
        del chunk
        gc.collect()
except Exception as e:
    logger.error(f"Error processing CSV: {e}")
    raise

# One record per selected subcategory, sorted alphabetically by subcategory name
counts_data = [
    dict(Main_Category=MAIN_CATEGORY, Subcategory=subcat, Paper_Count=paper_counts[subcat])
    for subcat in SELECTED_SUBCATEGORIES
]
counts_df = pd.DataFrame(counts_data).sort_values('Subcategory')

# Write the table and confirm it reached the disk before reporting success
logger.info(f"Saving math papers per category to {COUNTS_OUTPUT_PATH}...")
try:
    counts_df.to_csv(COUNTS_OUTPUT_PATH, index=False)
    if not os.path.exists(COUNTS_OUTPUT_PATH):
        logger.error(f"Failed to verify saved file: {COUNTS_OUTPUT_PATH}")
        raise FileNotFoundError(f"Failed to verify saved file: {COUNTS_OUTPUT_PATH}")
    logger.info(f"Math papers per category successfully saved to {COUNTS_OUTPUT_PATH}")
    logger.info(f"File size: {os.path.getsize(COUNTS_OUTPUT_PATH)} bytes")
except Exception as e:
    logger.error(f"Error saving counts CSV to {COUNTS_OUTPUT_PATH}: {e}")
    raise

# Show the table in the notebook and mirror it into the log
print("\nMath Papers Per Category:")
print(counts_df)
logger.info("\nMath Papers Per Category:\n" + counts_df.to_string())

# Summary figures, computed once and reported on both channels
n_subcategories = len(SELECTED_SUBCATEGORIES)
n_counted = counts_df['Paper_Count'].sum()
print(f"\nTotal Subcategories: {n_subcategories}")
print(f"Total Papers Counted: {n_counted}")
logger.info(f"\nTotal Subcategories: {n_subcategories}")
logger.info(f"Total Papers Counted: {n_counted}")

logger.info("Processing complete.")
print("Processing complete.")


Math Papers Per Category:
   Main_Category Subcategory  Paper_Count
0           math     math.AC         3000
1           math     math.AG         3000
2           math     math.AP         3000
3           math     math.AT         3000
4           math     math.CA         3000
5           math     math.CO         3000
6           math     math.CT         3000
7           math     math.CV         3000
8           math     math.DG         3000
9           math     math.DS         3000
10          math     math.FA         3000
11          math     math.GM         3000
12          math     math.GN         3000
13          math     math.GR         3000
14          math     math.GT         3000
15          math     math.HO         3000
16          math     math.IT            0
17          math     math.KT         3000
18          math     math.LO         3000
19          math     math.MG         3000
20          math     math.MP            0
21          math     math.NA         3000
22     

In [2]:
import os
import pandas as pd
from collections import defaultdict
import logging
import gc

# Set up logging: every record goes to a log file in the working directory and to the console.
# force=True replaces the handlers already attached to the root logger. Without it,
# basicConfig() does nothing once the root logger is configured, and the FileHandler
# constructed below would be opened but never attached or closed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/math_papers_log.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Define math subcategories (29 subcategories, excluding math.ST, math.MP, math.IT)
SELECTED_SUBCATEGORIES = [
    'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA',
    'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS',
    'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT',
    'math.HO', 'math.KT', 'math.LO', 'math.MG', 'math.NA',
    'math.NT', 'math.OA', 'math.OC', 'math.PR', 'math.QA',
    'math.RA', 'math.RT', 'math.SG', 'math.SP'
]

# Map deprecated subcategory names to the math.* name that replaces them
DEPRECATED_MAP = {
    'q-alg': 'math.QA',
    'alg-geom': 'math.AG',
    'dg-ga': 'math.DG',
    'funct-an': 'math.FA'
}

# Main category label written to the counts table
MAIN_CATEGORY = 'math'

# Output paths
OUTPUT_DIR = '/kaggle/working/'
COUNTS_OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'math_papers_per_category.csv')

# Verify output directory: create it when missing, then require write access
logger.info(f"Checking output directory: {OUTPUT_DIR}")
if not os.path.exists(OUTPUT_DIR):
    try:
        os.makedirs(OUTPUT_DIR)
        logger.info(f"Created output directory: {OUTPUT_DIR}")
    except Exception as e:
        logger.error(f"Failed to create output directory {OUTPUT_DIR}: {e}")
        raise
if not os.access(OUTPUT_DIR, os.W_OK):
    logger.error(f"Output directory {OUTPUT_DIR} is not writable")
    raise PermissionError(f"Output directory {OUTPUT_DIR} is not writable")

# Path to input CSV
csv_path = '/kaggle/input/mathematics/math_papers_3000_per_category.csv'

# Verify file exists; fail early with a clear error instead of inside the chunked reader
logger.info(f"Checking for CSV file at {csv_path}...")
if not os.path.exists(csv_path):
    logger.error(f"CSV file not found: {csv_path}")
    raise FileNotFoundError(f"CSV file not found: {csv_path}")

# Initialize storage
paper_counts = defaultdict(int)  # Paper count per subcategory; unseen subcategories read as 0

# Process CSV in chunks.
# A paper is counted under its `subcategory` when that subcategory is selected and every
# whitespace-separated tag in `categories` is a math tag. Deprecated tags are translated
# through DEPRECATED_MAP first; all mapped values already start with 'math.', so a single
# prefix test is sufficient.
logger.info("Processing CSV file for math subcategories...")
try:
    chunk_size = 10000  # Balanced for ~100–500 MB CSV
    total_papers = 0
    for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
        # Rows missing either field are ignored
        labelled = chunk.dropna(subset=['categories', 'subcategory'])
        labelled = labelled[labelled['subcategory'].isin(SELECTED_SUBCATEGORIES)]
        math_only = labelled['categories'].map(
            lambda tags: all(
                DEPRECATED_MAP.get(tag, tag).startswith('math.')
                for tag in tags.split()
            )
        ).astype(bool)
        # Tally the surviving rows per subcategory in one vectorised step
        chunk_counts = labelled.loc[math_only, 'subcategory'].value_counts()
        for subcat, n_papers in chunk_counts.items():
            paper_counts[subcat] += int(n_papers)
        # Progress counts every row read, not only the rows that were tallied
        total_papers += len(chunk)
        logger.info(f"Processed {total_papers} papers...")
        # Clear memory
        del chunk
        gc.collect()
except Exception as e:
    logger.error(f"Error processing CSV: {e}")
    raise

# One record per selected subcategory, sorted alphabetically by subcategory name
counts_data = [
    dict(Main_Category=MAIN_CATEGORY, Subcategory=subcat, Paper_Count=paper_counts[subcat])
    for subcat in SELECTED_SUBCATEGORIES
]
counts_df = pd.DataFrame(counts_data).sort_values('Subcategory')

# Write the table and confirm it reached the disk before reporting success
logger.info(f"Saving math papers per category to {COUNTS_OUTPUT_PATH}...")
try:
    counts_df.to_csv(COUNTS_OUTPUT_PATH, index=False)
    if not os.path.exists(COUNTS_OUTPUT_PATH):
        logger.error(f"Failed to verify saved file: {COUNTS_OUTPUT_PATH}")
        raise FileNotFoundError(f"Failed to verify saved file: {COUNTS_OUTPUT_PATH}")
    logger.info(f"Math papers per category successfully saved to {COUNTS_OUTPUT_PATH}")
    logger.info(f"File size: {os.path.getsize(COUNTS_OUTPUT_PATH)} bytes")
except Exception as e:
    logger.error(f"Error saving counts CSV to {COUNTS_OUTPUT_PATH}: {e}")
    raise

# Show the table in the notebook and mirror it into the log
print("\nMath Papers Per Category:")
print(counts_df)
logger.info("\nMath Papers Per Category:\n" + counts_df.to_string())

# Summary figures, computed once and reported on both channels
n_subcategories = len(SELECTED_SUBCATEGORIES)
n_counted = counts_df['Paper_Count'].sum()
print(f"\nTotal Subcategories: {n_subcategories}")
print(f"Total Papers Counted: {n_counted}")
logger.info(f"\nTotal Subcategories: {n_subcategories}")
logger.info(f"Total Papers Counted: {n_counted}")

logger.info("Processing complete.")
print("Processing complete.")


Math Papers Per Category:
   Main_Category Subcategory  Paper_Count
0           math     math.AC         3000
1           math     math.AG         3000
2           math     math.AP         3000
3           math     math.AT         3000
4           math     math.CA         3000
5           math     math.CO         3000
6           math     math.CT         3000
7           math     math.CV         3000
8           math     math.DG         3000
9           math     math.DS         3000
10          math     math.FA         3000
11          math     math.GM         3000
12          math     math.GN         3000
13          math     math.GR         3000
14          math     math.GT         3000
15          math     math.HO         3000
16          math     math.KT         3000
17          math     math.LO         3000
18          math     math.MG         3000
19          math     math.NA         3000
20          math     math.NT         3000
21          math     math.OA         3000
22     

In [3]:
import os
import pandas as pd
import logging
import gc

# Set up logging: every record goes to a log file in the working directory and to the console.
# force=True replaces the handlers already attached to the root logger. Without it,
# basicConfig() does nothing once the root logger is configured, and the FileHandler
# constructed below would be opened but never attached or closed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/math_papers_log.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Define math subcategories (29 subcategories, excluding math.ST, math.MP, math.IT)
SELECTED_SUBCATEGORIES = [
    'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA',
    'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS',
    'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT',
    'math.HO', 'math.KT', 'math.LO', 'math.MG', 'math.NA',
    'math.NT', 'math.OA', 'math.OC', 'math.PR', 'math.QA',
    'math.RA', 'math.RT', 'math.SG', 'math.SP'
]

# Map deprecated subcategory names to the math.* name that replaces them
DEPRECATED_MAP = {
    'q-alg': 'math.QA',
    'alg-geom': 'math.AG',
    'dg-ga': 'math.DG',
    'funct-an': 'math.FA'
}

# Output paths
OUTPUT_DIR = '/kaggle/working/'
OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'math_papers_title_abstract.csv')

# Verify output directory: create it when missing, then require write access
logger.info(f"Checking output directory: {OUTPUT_DIR}")
if not os.path.exists(OUTPUT_DIR):
    try:
        os.makedirs(OUTPUT_DIR)
        logger.info(f"Created output directory: {OUTPUT_DIR}")
    except Exception as e:
        logger.error(f"Failed to create output directory {OUTPUT_DIR}: {e}")
        raise
if not os.access(OUTPUT_DIR, os.W_OK):
    logger.error(f"Output directory {OUTPUT_DIR} is not writable")
    raise PermissionError(f"Output directory {OUTPUT_DIR} is not writable")

# Path to input CSV
csv_path = '/kaggle/input/mathematics/math_papers_3000_per_category.csv'

# Verify file exists; fail early with a clear error instead of inside the chunked reader
logger.info(f"Checking for CSV file at {csv_path}...")
if not os.path.exists(csv_path):
    logger.error(f"CSV file not found: {csv_path}")
    raise FileNotFoundError(f"CSV file not found: {csv_path}")

# Initialize storage for processed data (one two-column frame per chunk)
processed_papers = []


def _combine_title_abstract(title, abstract):
    """Join a paper's title and abstract into one text field.

    Both present: "<title> <abstract>" with surrounding whitespace stripped.
    Only one present: that value as a string. Neither present: empty string.
    """
    has_title = pd.notna(title)
    has_abstract = pd.notna(abstract)
    if has_title and has_abstract:
        return (str(title) + ' ' + str(abstract)).strip()
    if has_title:
        return str(title)
    if has_abstract:
        return str(abstract)
    return ''


# Process CSV in chunks
logger.info("Processing CSV file for math papers...")
try:
    chunk_size = 10000  # Balanced for ~100–500 MB CSV
    total_papers = 0
    for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
        # Filter for selected subcategories
        selected = chunk[chunk['subcategory'].isin(SELECTED_SUBCATEGORIES)]
        if selected.empty:
            continue
        # Keep only papers whose every category tag is a math tag. Deprecated tags are
        # translated through DEPRECATED_MAP first; a missing `categories` value fails the check.
        # The mask is a separate Series, so no column is assigned onto a filtered slice
        # (which is what raises SettingWithCopyWarning).
        math_only = selected['categories'].map(
            lambda tags: bool(pd.notna(tags)) and all(
                DEPRECATED_MAP.get(tag, tag).startswith('math.')
                for tag in str(tags).split()
            )
        ).astype(bool)
        selected = selected[math_only]
        if selected.empty:
            continue
        # Build the two output columns directly instead of mutating the slice
        papers = pd.DataFrame({
            'subcategory': selected['subcategory'],
            'title_abstract': [
                _combine_title_abstract(title, abstract)
                for title, abstract in zip(selected['title'], selected['abstract'])
            ],
        })
        # Append to processed papers
        processed_papers.append(papers)
        total_papers += len(papers)
        logger.info(f"Processed {total_papers} papers...")
        # Clear memory
        del chunk
        gc.collect()
except Exception as e:
    logger.error(f"Error processing CSV: {e}")
    raise

# Concatenate all chunks; fall back to an empty frame with the expected columns
logger.info("Concatenating processed papers...")
try:
    if processed_papers:
        final_df = pd.concat(processed_papers, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=['subcategory', 'title_abstract'])
    logger.info(f"Total papers after processing: {len(final_df)}")
except Exception as e:
    logger.error(f"Error concatenating data: {e}")
    raise

# Write the processed papers and confirm the file reached the disk before reporting success
logger.info(f"Saving processed papers to {OUTPUT_PATH}...")
try:
    final_df.to_csv(OUTPUT_PATH, index=False)
    if not os.path.exists(OUTPUT_PATH):
        logger.error(f"Failed to verify saved file: {OUTPUT_PATH}")
        raise FileNotFoundError(f"Failed to verify saved file: {OUTPUT_PATH}")
    logger.info(f"Processed papers successfully saved to {OUTPUT_PATH}")
    logger.info(f"File size: {os.path.getsize(OUTPUT_PATH)} bytes")
except Exception as e:
    logger.error(f"Error saving CSV to {OUTPUT_PATH}: {e}")
    raise

# Summary figures, computed once and reported to both the notebook and the log
n_papers = len(final_df)
n_subcategories = final_df['subcategory'].nunique()
per_subcategory = final_df['subcategory'].value_counts().sort_index()

print("\nProcessed Papers Summary:")
print(f"Total Papers: {n_papers}")
print(f"Subcategories: {n_subcategories}")
print("\nPapers per Subcategory:")
print(per_subcategory)
logger.info(f"\nProcessed Papers Summary:\nTotal Papers: {n_papers}")
logger.info(f"Subcategories: {n_subcategories}")
logger.info("\nPapers per Subcategory:\n" + per_subcategory.to_string())

logger.info("Processing complete.")
print("Processing complete.")


Processed Papers Summary:
Total Papers: 87000
Subcategories: 29

Papers per Subcategory:
subcategory
math.AC    3000
math.AG    3000
math.AP    3000
math.AT    3000
math.CA    3000
math.CO    3000
math.CT    3000
math.CV    3000
math.DG    3000
math.DS    3000
math.FA    3000
math.GM    3000
math.GN    3000
math.GR    3000
math.GT    3000
math.HO    3000
math.KT    3000
math.LO    3000
math.MG    3000
math.NA    3000
math.NT    3000
math.OA    3000
math.OC    3000
math.PR    3000
math.QA    3000
math.RA    3000
math.RT    3000
math.SG    3000
math.SP    3000
Name: count, dtype: int64
Processing complete.


In [4]:
import os
import pandas as pd
import logging

# Set up logging: every record goes to a log file in the working directory and to the console.
# force=True replaces the handlers already attached to the root logger. Without it,
# basicConfig() does nothing once the root logger is configured, and the FileHandler
# constructed below would be opened but never attached or closed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/math_papers_log.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Path to the CSV file
csv_path = '/kaggle/working/math_papers_title_abstract.csv'

# Verify file exists
logger.info(f"Checking for CSV file at {csv_path}...")
if not os.path.exists(csv_path):
    logger.error(f"CSV file not found: {csv_path}")
    raise FileNotFoundError(f"CSV file not found: {csv_path}")

# Load the CSV in one go
logger.info(f"Loading CSV file from {csv_path}...")
try:
    df = pd.read_csv(csv_path)
    logger.info(f"Successfully loaded CSV with {len(df)} rows and {len(df.columns)} columns")
except Exception as e:
    logger.error(f"Error loading CSV: {e}")
    raise

# Display the first few rows (notebook output and log)
logger.info("Displaying the first 5 rows of the dataset...")
print("\nFirst 5 rows of math_papers_title_abstract.csv:")
print(df.head())
logger.info("\nFirst 5 rows of math_papers_title_abstract.csv:\n" + df.head().to_string())

# Print basic info: row count, number of distinct subcategories, column names
print(f"\nTotal Papers: {len(df)}")
print(f"Subcategories: {df['subcategory'].nunique()}")
print("\nColumns:", list(df.columns))
logger.info(f"\nTotal Papers: {len(df)}")
logger.info(f"Subcategories: {df['subcategory'].nunique()}")
logger.info(f"\nColumns: {list(df.columns)}")

logger.info("Display complete.")
print("Display complete.")


First 5 rows of math_papers_title_abstract.csv:
  subcategory                                     title_abstract
0     math.AC  Duality and normalization, variations on a the...
1     math.AC  Matrix invertible extensions over commutative ...
2     math.AC  Lcm-lattice, Taylor Bases and Minimal Free Res...
3     math.AC  On the $(S_2)$-condition of edge rings for cac...
4     math.AC  Adjacency Spectrum and Wiener Index of the Ess...

Total Papers: 87000
Subcategories: 29

Columns: ['subcategory', 'title_abstract']
Display complete.


In [5]:
import pandas as pd
import os
import csv
from sklearn.model_selection import train_test_split

# Location of the combined title+abstract dataset
file_path = '/kaggle/working/math_papers_title_abstract.csv'

# Stop immediately when the dataset has not been generated
print(f"Checking for {file_path}...")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}. Please ensure math_papers_title_abstract.csv is generated.")

# Parser settings: Python engine, warn on malformed lines instead of failing
read_options = {
    'quoting': csv.QUOTE_ALL,
    'on_bad_lines': 'warn',
    'engine': 'python',
}

# Read the whole file and report its size and columns
print("Loading math_papers_title_abstract.csv...")
try:
    df = pd.read_csv(file_path, **read_options)
    print(f"Loaded {len(df)} rows")
    print(f"Columns: {df.columns.tolist()}")
except Exception as e:
    print(f"Error loading math_papers_title_abstract.csv: {e}")
    raise

# Define selected 29 math categories; the position in this list is the integer class label
categories = [
    'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA',
    'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS',
    'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT',
    'math.HO', 'math.KT', 'math.LO', 'math.MG', 'math.NA',
    'math.NT', 'math.OA', 'math.OC', 'math.PR', 'math.QA',
    'math.RA', 'math.RT', 'math.SG', 'math.SP'
]
label_map = {cat: i for i, cat in enumerate(categories)}
math_prefix = 'math.'

# Use subcategory as category
print("Counting math subcategories...")
df['category'] = df['subcategory']

# Count and display available papers per category
category_counts = df['category'].value_counts().to_dict()
print("\nFound math subcategories:\n")
for cat in categories:
    print(f"{cat}: {category_counts.get(cat, 0)} papers")

# Map labels (categories outside `categories` map to NaN and are dropped by the filter below)
df['label'] = df['category'].map(label_map)

# Filter to N rows per category.
# The cap is defined first so the progress message always reports the value actually used
# (it previously announced 3,500 while capping at 3000).
rows_per_category = 3000
print(f"\nFiltering to {rows_per_category:,} rows per category...")
per_category_frames = []
for cat in categories:
    cat_df = df[df['category'] == cat].head(rows_per_category)
    if len(cat_df) < rows_per_category:
        print(f"⚠ Only {len(cat_df)} papers found for {cat}")
    per_category_frames.append(cat_df)
# Concatenate once; growing a DataFrame inside the loop re-copies all earlier rows each time
filtered_df = pd.concat(per_category_frames, ignore_index=True)
# Every remaining row has a mapped category, so labels can safely be stored as integers
filtered_df['label'] = filtered_df['label'].astype(int)

# Final check: the filtered frame must hold min(available, cap) rows for each category
actual_counts = filtered_df['category'].value_counts().to_dict()
expected_total = sum(min(category_counts.get(cat, 0), rows_per_category) for cat in categories)
if len(filtered_df) != expected_total:
    raise ValueError(f"Mismatch in expected filtered count. Got {len(filtered_df)} instead of {expected_total}.")

print(f"\nFiltered to {len(filtered_df)} rows")
print(f"Label distribution:\n{filtered_df['label'].value_counts().sort_index()}")

# Stratified 80/10/10 split: hold out 20% first, then halve the hold-out into val and test
print("Splitting dataset...")
train_df, temp_df = train_test_split(
    filtered_df,
    test_size=0.2,
    stratify=filtered_df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

# Write each split next to the other notebook outputs
output_dir = '/kaggle/working/'
print(f"Saving train.csv, val.csv, test.csv to {output_dir}...")
for split_file, split_df in (('train.csv', train_df), ('val.csv', val_df), ('test.csv', test_df)):
    split_df.to_csv(os.path.join(output_dir, split_file), index=False)

print(f"\nTrain size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")
print("Step1 complete.")

Checking for /kaggle/working/math_papers_title_abstract.csv...
Loading math_papers_title_abstract.csv...
Loaded 87000 rows
Columns: ['subcategory', 'title_abstract']
Counting math subcategories...

Found math subcategories:

math.AC: 3000 papers
math.AG: 3000 papers
math.AP: 3000 papers
math.AT: 3000 papers
math.CA: 3000 papers
math.CO: 3000 papers
math.CT: 3000 papers
math.CV: 3000 papers
math.DG: 3000 papers
math.DS: 3000 papers
math.FA: 3000 papers
math.GM: 3000 papers
math.GN: 3000 papers
math.GR: 3000 papers
math.GT: 3000 papers
math.HO: 3000 papers
math.KT: 3000 papers
math.LO: 3000 papers
math.MG: 3000 papers
math.NA: 3000 papers
math.NT: 3000 papers
math.OA: 3000 papers
math.OC: 3000 papers
math.PR: 3000 papers
math.QA: 3000 papers
math.RA: 3000 papers
math.RT: 3000 papers
math.SG: 3000 papers
math.SP: 3000 papers

Filtering to 3,500 rows per category...

Filtered to 87000 rows
Label distribution:
label
0     3000
1     3000
2     3000
3     3000
4     3000
5     3000
6     300

In [6]:
import pandas as pd
from transformers import LongformerTokenizer
import torch
import pickle
import logging
import os

# Set up logging for the tokenization step (own log file plus console).
# force=True is required here: once the root logger has handlers, basicConfig() silently
# does nothing, so records would keep going to the previously configured file and
# step2_log.txt would be opened but never written to.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/step2_log.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Define input/output directories
input_dir = '/kaggle/working/'  # Files are in working directory
output_dir = '/kaggle/working/'  # Save outputs here

# Find train.csv and val.csv anywhere under input_dir; stop walking once both are located
print("Searching for train.csv and val.csv in /kaggle/working/...")
logger.info("Searching for train.csv and val.csv in /kaggle/working/...")
train_path = None
val_path = None

for root, dirs, files in os.walk(input_dir):
    for file in files:
        if file == 'train.csv':
            train_path = os.path.join(root, file)
        if file == 'val.csv':
            val_path = os.path.join(root, file)
    if train_path and val_path:
        break

if not train_path or not val_path:
    logger.error("train.csv or val.csv not found in /kaggle/working/.")
    raise FileNotFoundError("train.csv or val.csv not found. Please ensure they are generated.")

print(f"Found train.csv at {train_path}")
print(f"Found val.csv at {val_path}")
logger.info(f"Found train.csv at {train_path}")
logger.info(f"Found val.csv at {val_path}")

# Load tokenizer (fetched from the Hugging Face hub when not cached)
print("Loading Longformer tokenizer...")
logger.info("Loading Longformer tokenizer...")
try:
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
except Exception as e:
    logger.error(f"Error loading tokenizer: {e}")
    raise

# Load datasets
print("Loading datasets...")
logger.info("Loading datasets...")
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Tokenization function
def tokenize_data(df, max_length=4096):
    """Tokenize the `title_abstract` texts of `df` with the module-level `tokenizer`.

    Args:
        df: DataFrame with a `title_abstract` text column and a `label` column.
        max_length: Maximum number of tokens per text; longer texts are truncated.

    Returns:
        dict with `input_ids` and `attention_mask` (PyTorch tensors, as requested via
        return_tensors='pt') and `labels` (plain Python list taken from `df['label']`).
    """
    # An empty title_abstract is written to CSV as an empty field and is read back as NaN
    # (a float), which the tokenizer rejects. Coerce every entry to str so such rows are
    # tokenized as empty text instead of aborting the whole batch.
    texts = df['title_abstract'].fillna('').astype(str).tolist()
    labels = df['label'].tolist()
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    }

# Tokenize in batches
batch_size = 100


def _tokenize_in_batches(df, split_name, batch_size):
    """Tokenize `df` in consecutive slices of `batch_size` rows.

    Progress is printed and logged as "Tokenized <split_name> batch i/total".
    Returns the list of per-batch dicts produced by `tokenize_data`.
    """
    # Ceiling division: the previous `len(df) // batch_size + 1` over-reported the total by
    # one whenever the row count was an exact multiple of batch_size (696 batches shown as 697).
    total_batches = (len(df) + batch_size - 1) // batch_size
    tokenized = []
    for batch_number, start in enumerate(range(0, len(df), batch_size), start=1):
        # .iloc makes the positional slicing explicit
        tokenized.append(tokenize_data(df.iloc[start:start + batch_size]))
        print(f"Tokenized {split_name} batch {batch_number}/{total_batches}")
        logger.info(f"Tokenized {split_name} batch {batch_number}/{total_batches}")
    return tokenized


print("Tokenizing training data...")
logger.info("Tokenizing training data...")
train_tokenized = _tokenize_in_batches(train_df, 'train', batch_size)

print("Tokenizing validation data...")
logger.info("Tokenizing validation data...")
val_tokenized = _tokenize_in_batches(val_df, 'val', batch_size)

# Save tokenized datasets (lists of per-batch dicts) as pickles for the training step
print(f"Saving tokenized datasets to {output_dir}...")
logger.info(f"Saving tokenized datasets to {output_dir}...")
with open(os.path.join(output_dir, 'train_tokenized.pkl'), 'wb') as f:
    pickle.dump(train_tokenized, f)
with open(os.path.join(output_dir, 'val_tokenized.pkl'), 'wb') as f:
    pickle.dump(val_tokenized, f)

print(f"Train tokenized: {len(train_tokenized)} batches, Val tokenized: {len(val_tokenized)} batches")
logger.info(f"Train tokenized: {len(train_tokenized)} batches, Val tokenized: {len(val_tokenized)} batches")
print("Step 2 complete.")
logger.info("Step 2 complete.")

Searching for train.csv and val.csv in /kaggle/working/...
Found train.csv at /kaggle/working/train.csv
Found val.csv at /kaggle/working/val.csv
Loading Longformer tokenizer...


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Loading datasets...
Tokenizing training data...
Tokenized train batch 1/697
Tokenized train batch 2/697
Tokenized train batch 3/697
Tokenized train batch 4/697
Tokenized train batch 5/697
Tokenized train batch 6/697
Tokenized train batch 7/697
Tokenized train batch 8/697
Tokenized train batch 9/697
Tokenized train batch 10/697
Tokenized train batch 11/697
Tokenized train batch 12/697
Tokenized train batch 13/697
Tokenized train batch 14/697
Tokenized train batch 15/697
Tokenized train batch 16/697
Tokenized train batch 17/697
Tokenized train batch 18/697
Tokenized train batch 19/697
Tokenized train batch 20/697
Tokenized train batch 21/697
Tokenized train batch 22/697
Tokenized train batch 23/697
Tokenized train batch 24/697
Tokenized train batch 25/697
Tokenized train batch 26/697
Tokenized train batch 27/697
Tokenized train batch 28/697
Tokenized train batch 29/697
Tokenized train batch 30/697
Tokenized train batch 31/697
Tokenized train batch 32/697
Tokenized train batch 33/697
Toke

In [7]:
# Install the `datasets` package, which the training cell imports (`from datasets import Dataset`).
# NOTE(review): the install is unpinned; the recorded output shows pip replacing fsspec 2025.3.2
# with 2024.12.0 and reporting dependency conflicts - consider pinning a known-good version.
!pip install datasets


Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torc

In [None]:
import os
import glob
import re
import torch
import numpy as np
import pickle
import gc
import logging
import time
import pandas as pd
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support
from transformers import (
    LongformerForSequenceClassification,
    LongformerTokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# ----------------------------
# Setup
# ----------------------------
# Restrict CUDA to the first GPU and allow TF32 matmuls
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cuda.matmul.allow_tf32 = True

# Training gets its own log file plus console output.
# force=True is required: once the root logger has handlers, basicConfig() silently does
# nothing, so training_log.txt would be opened but never written to.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/training_log.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Inputs are the pickles written by the tokenization step; checkpoints go under results/
input_dir = '/kaggle/working/'
train_tokenized_path = os.path.join(input_dir, 'train_tokenized.pkl')
val_tokenized_path = os.path.join(input_dir, 'val_tokenized.pkl')
results_dir = os.path.join(input_dir, 'results')
os.makedirs(results_dir, exist_ok=True)

# ----------------------------
# Checkpoint
# ----------------------------
def get_latest_checkpoint(results_dir):
    """Return the path of the highest-numbered `checkpoint-<N>` directory, or None.

    Args:
        results_dir: Directory that holds the `checkpoint-<N>` sub-directories.

    Returns:
        Path of the checkpoint directory with the largest numeric suffix, or None when
        `results_dir` contains no such directory.
    """
    numbered = []
    for path in glob.glob(os.path.join(results_dir, 'checkpoint-*')):
        # Match the entry name only (not the whole path) and require a purely numeric
        # suffix, so entries such as 'checkpoint-best' are skipped instead of raising
        # AttributeError on a failed regex match.
        match = re.fullmatch(r'checkpoint-(\d+)', os.path.basename(path))
        if match and os.path.isdir(path):
            numbered.append((int(match.group(1)), path))
    if not numbered:
        return None
    # Tuples compare by step number first, so max() selects the latest checkpoint
    return max(numbered)[1]

# Resume from the newest checkpoint when one exists
checkpoint_path = get_latest_checkpoint(results_dir)
if checkpoint_path:
    print(f"Checkpoint: {checkpoint_path}")
else:
    print("No checkpoints found.")

# ----------------------------
# Load Data
# ----------------------------
# NOTE: pickle.load executes arbitrary code from the file; only load pickles this notebook wrote.
with open(train_tokenized_path, 'rb') as train_file:
    train_tokenized = pickle.load(train_file)
with open(val_tokenized_path, 'rb') as val_file:
    val_tokenized = pickle.load(val_file)

def flatten_batches(batched_data, max_length=1024):
    """Flatten a list of batches into a flat list of per-example dicts.

    Args:
        batched_data: Iterable of batches. Each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
            each holding one indexable entry per example.
        max_length: Maximum number of positions kept from each example's
            ``input_ids`` and ``attention_mask`` (default 1024). ``None``
            disables truncation.

    Returns:
        A list with one dict per example, containing the truncated
        ``'input_ids'`` and ``'attention_mask'`` and the label cast to
        ``int``. Sequences are only truncated, never padded, so shorter
        examples keep their original length.
    """
    flat_data = []
    for batch in batched_data:
        # The number of examples in a batch is taken from its input_ids.
        for i in range(len(batch['input_ids'])):
            flat_data.append({
                'input_ids': batch['input_ids'][i][:max_length],
                'attention_mask': batch['attention_mask'][i][:max_length],
                'labels': int(batch['labels'][i])
            })
    return flat_data

# Flatten each split's batches and wrap the result in a Dataset
# (train first, then validation).
train_dataset, val_dataset = [
    Dataset.from_list(flatten_batches(split_batches))
    for split_batches in (train_tokenized, val_tokenized)
]

# The raw batch lists are no longer needed once the Datasets exist.
del train_tokenized
del val_tokenized
gc.collect()
torch.cuda.empty_cache()

# ----------------------------
# Class Weights
# ----------------------------
labels = np.array(train_dataset['labels'])
num_labels = 29

# Fail early with a readable message: an out-of-range label would otherwise
# only surface later as a size mismatch or an opaque error inside the loss.
if labels.size == 0:
    raise ValueError("Training set contains no labels; cannot compute class weights")
if labels.min() < 0 or labels.max() >= num_labels:
    raise ValueError(
        f"Training labels must lie in [0, {num_labels - 1}], "
        f"got range [{labels.min()}, {labels.max()}]"
    )

# compute_class_weight returns one weight per entry of `classes`, in that
# order, i.e. only for the classes actually present in the training labels.
present_classes = np.unique(labels)
class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=labels)

# Scatter each weight to the index of its own class. Appending padding at the
# end instead would shift weights onto the wrong classes whenever a class id
# below the maximum is missing from the training data.
# Classes absent from training keep a neutral weight of 1.0: they never occur
# in training batches, and a zero weight could turn the weighted mean into
# 0/0 for an evaluation batch made up only of such classes.
class_weights_tensor = torch.ones(num_labels, dtype=torch.float)
class_weights_tensor[torch.as_tensor(present_classes, dtype=torch.long)] = torch.as_tensor(
    class_weights, dtype=torch.float
)

# ----------------------------
# Tokenizer & Model
# ----------------------------
# The tokenizer always comes from the base model; its length cap is set to 1024.
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer.model_max_length = 1024

# Load weights from the latest checkpoint when one exists, otherwise from the
# pretrained base model. Only the checkpoint load passes
# ignore_mismatched_sizes=True.
model_load_kwargs = {'num_labels': num_labels}
if checkpoint_path:
    model_source = checkpoint_path
    model_load_kwargs['ignore_mismatched_sizes'] = True
else:
    model_source = 'allenai/longformer-base-4096'
model = LongformerForSequenceClassification.from_pretrained(model_source, **model_load_kwargs)

# Use the GPU when available, otherwise stay on the CPU.
target_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(target_device)

# ----------------------------
# Custom Trainer
# ----------------------------
class WeightedTrainer(Trainer):
    """Trainer whose loss is a (optionally class-weighted) cross-entropy.

    Args:
        class_weights: Optional 1-D tensor passed as ``weight`` to
            ``torch.nn.CrossEntropyLoss``. When ``None`` (the default) the
            loss is an unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must
                expose ``.logits``.
            inputs: Batch mapping. The ``"labels"`` entry is removed from it
                before the forward pass, so the model does not compute its
                own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            **kwargs: Accepted and ignored, so extra keyword arguments
                supplied by the caller do not break the override.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # class_weights defaults to None; only move it to the logits' device
        # when it was actually provided, instead of calling .to() on None.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# ----------------------------
# Metrics
# ----------------------------
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, or a tuple whose first
            element holds them).

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision_weighted'``,
        ``'recall_weighted'`` and ``'f1_weighted'``.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return extra outputs alongside the logits, in which case
    # predictions arrives as a tuple with the logits first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=1)
    # zero_division=0 yields the same values as the default ("warn" also
    # substitutes 0) but without emitting a warning on every evaluation for
    # classes that were never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return {
        'accuracy': (preds == labels).mean(),
        'precision_weighted': precision,
        'recall_weighted': recall,
        'f1_weighted': f1
    }

# ----------------------------
# Training Arguments
# ----------------------------
training_args = TrainingArguments(
    output_dir=results_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 examples per step x 2 accumulation steps per optimizer update.
    gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=os.path.join(input_dir, 'logs'),
    logging_steps=10,
    logging_first_step=True,
    # Evaluate and checkpoint once per epoch.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=False,
    # fp16 mixed precision is rejected by TrainingArguments when no CUDA
    # device is present; the model placement in this cell already falls back
    # to CPU, so enable fp16 only when a GPU is actually available.
    fp16=torch.cuda.is_available(),
    report_to='none',
    log_level="info",
    disable_tqdm=False
)

# ----------------------------
# Trainer and Training
# ----------------------------
# Collator that pads each batch using the tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer, padding=True)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=padding_collator,
    class_weights=class_weights_tensor,
)
trainer = WeightedTrainer(**trainer_kwargs)

# Time only the train() call; checkpoint_path is None when there is nothing
# to resume from.
print("Starting training...")
start_time = time.time()
trainer.train(resume_from_checkpoint=checkpoint_path)
end_time = time.time()

# ----------------------------
# Save Final Model
# ----------------------------
final_model_path = os.path.join(input_dir, 'final_model')
trainer.save_model(final_model_path)
print(f"Model saved to {final_model_path}")
# The reported duration covers only the trainer.train() call timed above.
print(f"Training completed in {(end_time - start_time)/60:.2f} minutes.")

# ----------------------------
# Save Final Evaluation Metrics
# ----------------------------
# Run a final evaluation pass and persist the resulting metrics as a one-row CSV.
metrics = trainer.evaluate()
pd.DataFrame([metrics]).to_csv(os.path.join(input_dir, "final_eval_metrics.csv"), index=False)
print("Metrics saved to final_eval_metrics.csv")

# Cleanup
# Collect garbage BEFORE emptying the CUDA cache: empty_cache() can only
# release blocks that are no longer referenced, so objects that are still
# awaiting collection must be reclaimed first. This also matches the order
# used after the datasets are built earlier in this cell.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

No checkpoints found.


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--allenai--longformer-base-4096/snapshots/301e6a42cb0d9976a6d6a26a079fef81c18aa895/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--allenai--longformer-base-4096/snapshots/301e6a42cb0d9976a6d6a26a079fef81c18aa895/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--allenai--longformer-base-4096/snapshots/301e6a42cb0d9976a6d6a26a079fef81c18aa895/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--longformer-base-4096/snapshots/301e6a42cb0d9976a6d6a26a079fef81c18aa895/config.json
Model config LongformerConfig {
  "attention_mode": "longformer",
  "attention_probs_dropout_pro

Starting training...


Epoch,Training Loss,Validation Loss
