<a href="https://colab.research.google.com/github/srk-ch/arxiv-classifier/blob/main/arxiv_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# BLOCK 1 – Fresh Start
from google.colab import drive
drive.mount('/content/drive')

!pip install -q catboost scikit-learn tqdm flask pyngrok

import json, re, numpy as np, pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
import joblib, gc, os, shutil
from collections import defaultdict, Counter

print("Setup complete! Ready for glory.")

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hSetup complete! Ready for glory.


In [4]:
# BLOCK 2 – Balanced dataset: 5 coarse classes, up to 3000 papers each
DATA_PATH = '/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json'

# Coarse label -> arXiv category codes. A paper receives the first label (in
# dict order) for which one of these codes occurs as a substring of any of
# the paper's categories.
CLASS_MAP = {
    'AI_ML': [
        'cs.LG', 'cs.AI', 'cs.CL', 'cs.CV', 'cs.NE', 'cs.RO', 'stat.ML',
    ],
    'Physics': [
        'hep-ph', 'hep-th', 'astro-ph', 'gr-qc', 'quant-ph', 'nucl-th', 'nucl-ex',
    ],
    'Mathematics': [
        'math.AG', 'math.AT', 'math.CO', 'math.DG', 'math.NT', 'math.PR', 'math.ST',
    ],
    'Biology_Health': [
        'q-bio.BM', 'q-bio.GN', 'q-bio.NC', 'q-bio.QM', 'q-bio.SC',
    ],
    'Chemistry_Mat': [
        'cond-mat.mtrl-sci', 'cond-mat.str-el', 'cond-mat.supr-con', 'cond-mat.soft',
    ],
}

# Coarse label -> indicative terms. Each term is tested with `in` against the
# cleaned text, and the number of hits per label becomes one extra feature
# column next to the TF-IDF features.
DOMAIN_KEYWORDS = {
    'AI_ML': [
        'neural', 'network', 'deep', 'learning',
        'transformer', 'attention', 'lstm', 'cnn', 'rnn',
        'gradient', 'backpropagation', 'supervised', 'reinforcement',
        'dataset', 'accuracy', 'nlp',
    ],
    'Physics': [
        'quantum', 'particle', 'photon', 'electron',
        'proton', 'neutron', 'qubit', 'entanglement',
        'relativity', 'gravity', 'cosmology', 'black hole',
        'schrodinger', 'hamiltonian',
    ],
    'Mathematics': [
        'theorem', 'proof', 'lemma', 'corollary',
        'manifold', 'topology', 'algebra', 'geometry',
        'differential', 'integral', 'polynomial', 'prime',
        'matrix', 'vector space',
    ],
    'Biology_Health': [
        'gene', 'protein', 'dna', 'rna',
        'genome', 'cell', 'mutation', 'crispr', 'enzyme',
        'sequencing', 'pathway', 'phenotype', 'genotype',
        'molecular',
    ],
    'Chemistry_Mat': [
        'material', 'crystal', 'molecule', 'synthesis',
        'lattice', 'superconductor',
        'electronic structure', 'band gap', 'doping', 'phase transition',
    ],
}

def clean_text(text):
    """Normalise a title/abstract string into lowercase space-separated words.

    The text is lowercased, then the substitutions below are applied in
    order, and finally leading/trailing whitespace is stripped.

    Args:
        text: Raw string (must support ``.lower()``).

    Returns:
        A string containing only ``a-z`` letters and single spaces.
    """
    substitutions = (
        (r'\$[^\$]+\$', ' '),    # inline math between a pair of dollar signs
        (r'\\[a-zA-Z]+', ' '),   # backslash commands such as \alpha
        (r'[^a-z\s]', ' '),      # anything that is not a lowercase letter or whitespace
        (r'\s+', ' '),           # collapse whitespace runs into one space
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

print("Building balanced dataset (5 classes × 3000)...")

# label -> list of cleaned documents collected so far
data = defaultdict(list)
target_per_class = 3000

# The snapshot is read as JSON Lines (one paper record per line); decode it
# explicitly as UTF-8 instead of relying on the platform default.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Scanning arXiv"):
        try:
            paper = json.loads(line)
            cats = paper.get('categories', '').split()
            title = paper.get('title', '')
            abstract = paper.get('abstract', '')
            text = clean_text(title + " " + abstract)
        except (ValueError, AttributeError, TypeError):
            # Best effort: skip malformed JSON lines (JSONDecodeError is a
            # ValueError) and records whose fields are not strings, but let
            # KeyboardInterrupt and genuine programming errors surface.
            continue

        # Ignore very short documents (fewer than 30 words after cleaning).
        if len(text.split()) < 30:
            continue

        # Assign the paper to the first class whose category codes match. Once
        # a class is full, further matching papers are dropped, not re-routed.
        for label, prefixes in CLASS_MAP.items():
            if any(p in cat for cat in cats for p in prefixes):
                if len(data[label]) < target_per_class:
                    data[label].append(text)
                break

        # Stop only when EVERY class in CLASS_MAP is full. Iterating over
        # data.values() would be wrong: labels not yet seen are absent from
        # the defaultdict, so all() could be vacuously True and end the scan
        # early. `.get` is used so the check does not create keys and thereby
        # change the insertion order of `data`.
        if all(len(data.get(label, ())) >= target_per_class for label in CLASS_MAP):
            break

# Flatten the per-class buckets into two parallel lists (documents, labels)
texts = [doc for docs in data.values() for doc in docs]
labels = [label for label, docs in data.items() for _ in docs]

print(f"Final dataset: {len(texts):,} papers")
class_counts = Counter(labels)
for l in CLASS_MAP.keys():
    print(f"  {l}: {class_counts[l]:,}")

# Stratified 70 / 15 / 15 split: hold out 30 %, then halve it into validation and test
train_t, temp_t, train_l, temp_l = train_test_split(
    texts, labels, test_size=0.3, stratify=labels, random_state=42
)
valid_t, test_t, valid_l, test_l = train_test_split(
    temp_t, temp_l, test_size=0.5, stratify=temp_l, random_state=42
)

print(f"Train: {len(train_t)} | Valid: {len(valid_t)} | Test: {len(test_t)}")

Building balanced dataset (5 classes × 3000)...


Scanning arXiv: 0it [00:00, ?it/s]

Final dataset: 15,000 papers
  AI_ML: 3,000
  Physics: 3,000
  Mathematics: 3,000
  Biology_Health: 3,000
  Chemistry_Mat: 3,000
Train: 10500 | Valid: 2250 | Test: 2250


In [5]:
# BLOCK 3 – Hybrid model: TF-IDF n-grams plus per-domain keyword counts, fed to CatBoost
print("Training FAST hybrid CatBoost (keyword-boosted)...")

# Unigram + bigram TF-IDF with a vocabulary capped at 15,000 terms
vec = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

# Fit the vocabulary on the training split only; the other splits are just transformed
X_train = vec.fit_transform(train_t)
X_valid, X_test = (vec.transform(split) for split in (valid_t, test_t))

# Keyword features (your genius idea)
def get_keyword_features(texts, keywords=None, whole_word=False):
    """Count, per document, how many keywords of each domain occur in it.

    Args:
        texts: Sequence of (already cleaned) document strings.
        keywords: Mapping ``label -> list of keyword strings``. Defaults to the
            module-level ``DOMAIN_KEYWORDS``; pass a mapping explicitly (for
            example one loaded from ``keywords.pkl``) to reuse this function
            at inference time.
        whole_word: If False (default, the original behaviour) a keyword counts
            when it occurs anywhere as a substring, so ``'gene'`` also hits
            "general" and ``'rna'`` hits "journal". If True, a keyword only
            counts when it is delimited by word boundaries.

    Returns:
        A dense float ``numpy`` array of shape ``(len(texts), len(keywords))``.
        Column ``j`` holds the number of distinct keywords of the ``j``-th
        label (in mapping order) found in each document. Repeated occurrences
        of the same keyword count once.
    """
    if keywords is None:
        keywords = DOMAIN_KEYWORDS
    groups = list(keywords.values())

    feats = np.zeros((len(texts), len(groups)))

    if whole_word:
        # Compile one pattern per keyword up front rather than per document.
        matchers = [
            [re.compile(r'\b' + re.escape(w) + r'\b') for w in words]
            for words in groups
        ]
        for i, text in enumerate(texts):
            for j, patterns in enumerate(matchers):
                feats[i, j] = sum(1 for pat in patterns if pat.search(text))
    else:
        for i, text in enumerate(texts):
            for j, words in enumerate(groups):
                feats[i, j] = sum(1 for w in words if w in text)
    return feats

print("Adding keyword magic...")
# One keyword-count matrix per split (one column per DOMAIN_KEYWORDS label)
kw_train, kw_valid, kw_test = (
    get_keyword_features(split) for split in (train_t, valid_t, test_t)
)

# Append the keyword columns to the right of the TF-IDF columns
from scipy.sparse import hstack, csr_matrix
X_train_full, X_valid_full, X_test_full = (
    hstack([tfidf, csr_matrix(kw)])
    for tfidf, kw in zip((X_train, X_valid, X_test), (kw_train, kw_valid, kw_test))
)

# CatBoost classifier trained on the combined TF-IDF + keyword-count features
cat = CatBoostClassifier(
    iterations=400,
    learning_rate=0.15,
    depth=6,
    eval_metric='Accuracy',
    early_stopping_rounds=50,  # stop when validation accuracy stalls for 50 rounds
    random_seed=42,
    verbose=50,                # log every 50 iterations
    thread_count=4
)

# The recorded run of this cell took about 24 minutes for 400 iterations, so
# the message no longer promises "3–4 minutes".
print("Training started — this may take 20+ minutes (the recorded run took ~24)...")
cat.fit(X_train_full, train_l, eval_set=(X_valid_full, valid_l), use_best_model=True)

# Persist everything needed to rebuild the feature pipeline at inference time.
# These files land in the current working directory.
cat.save_model('catboost_hybrid.cbm')
joblib.dump(vec, 'vectorizer.pkl')
joblib.dump(DOMAIN_KEYWORDS, 'keywords.pkl')

# Held-out test accuracy
test_acc = cat.score(X_test_full, test_l)
print(f"\nHYBRID MODEL ACCURACY: {test_acc*100:.2f}%")

# Sanity check: actually classify a well-known title instead of printing a
# hard-coded claim about how it would be classified. The features are built
# the same way as for training: clean -> TF-IDF -> append keyword counts.
sample_title = "Attention Is All You Need"
sample_clean = clean_text(sample_title)
X_sample = hstack([
    vec.transform([sample_clean]),
    csr_matrix(get_keyword_features([sample_clean])),
])
sample_pred = np.ravel(cat.predict(X_sample))[0]
print(f"Sanity check: '{sample_title}' → predicted {sample_pred}")
print("Training complete! Run Block 4 & 5 now")

Training FAST hybrid CatBoost (keyword-boosted)...
Adding keyword magic...
Training started — will finish in 3–4 minutes...
0:	learn: 0.5517143	test: 0.5453333	best: 0.5453333 (0)	total: 4.91s	remaining: 32m 39s
50:	learn: 0.7919048	test: 0.7791111	best: 0.7791111 (50)	total: 3m 7s	remaining: 21m 22s
100:	learn: 0.8449524	test: 0.8293333	best: 0.8293333 (99)	total: 6m 6s	remaining: 18m 4s
150:	learn: 0.8732381	test: 0.8444444	best: 0.8453333 (145)	total: 9m 7s	remaining: 15m 2s
200:	learn: 0.8889524	test: 0.8537778	best: 0.8551111 (177)	total: 12m 5s	remaining: 11m 58s
250:	learn: 0.8963810	test: 0.8577778	best: 0.8577778 (247)	total: 15m 4s	remaining: 8m 57s
300:	learn: 0.9034286	test: 0.8591111	best: 0.8595556 (283)	total: 18m 2s	remaining: 5m 56s
350:	learn: 0.9098095	test: 0.8640000	best: 0.8644444 (344)	total: 21m	remaining: 2m 55s
399:	learn: 0.9149524	test: 0.8662222	best: 0.8671111 (373)	total: 23m 54s	remaining: 0us

bestTest = 0.8671111111
bestIteration = 373

Shrink model to

In [6]:
# CELL A — verify that the saved training artifacts exist in the working directory
import os, sys

files = ['catboost_hybrid.cbm', 'vectorizer.pkl', 'keywords.pkl']
print("Working dir:", os.getcwd())

# Relative names are resolved against the working directory printed above
missing = [name for name in files if not os.path.exists(name)]
if not missing:
    print("✅ All required files present:", files)
    print("You can proceed to the patch + launch cells.")
else:
    print("❌ Missing files:", missing)
    print("Make sure you ran training and saved the files into the Colab working directory.")


Working dir: /content
✅ All required files present: ['catboost_hybrid.cbm', 'vectorizer.pkl', 'keywords.pkl']
You can proceed to the patch + launch cells.


In [None]:
from google.colab import files

# Send each saved artifact to the browser as a download, in the same order as before
for artifact in ('catboost_hybrid.cbm', 'vectorizer.pkl', 'keywords.pkl'):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>