# Train a domain classifier on the [Semantic Scholar dataset](https://api.semanticscholar.org/corpus)

In [None]:
import json
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from sklearn import metrics, set_config
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from great_ai.utilities.clean import clean
from great_ai.utilities.parallel_map import parallel_map
from great_ai.utilities.language import is_english, predict_language
from great_ai import save_model, configure, LargeFile

from preprocess import preprocess

## Configuration

In [None]:
# Key of the raw corpus handed to LargeFile.
DATASET_KEY = "data"
# Upper bound on how many corpus files are cleaned and later loaded.
MAX_FILE_COUNT = 5
# Filename prefix that marks the cleaned copy of a corpus file.
PREFIX = "domain-"
# Key under which the trained classifier is saved.
MODEL_KEY = "small-domain-prediction-v2"

In [None]:
# Initialise great_ai, then resolve the corpus location from its key.
configure()
corpus_path = LargeFile(DATASET_KEY).get()

# Show scikit-learn estimators as diagrams when they are displayed.
set_config(display="diagram")

# Large, white-backed figures with no horizontal padding around the data.
plt.rcParams.update(
    {
        "figure.figsize": (30, 15),
        "figure.facecolor": "white",
        "font.size": 12,
        "axes.xmargin": 0,
    }
)

## Preprocessing

In [None]:
def clean_file(p: Path) -> None:
    """Clean one raw corpus file and store the result next to it.

    The input is loaded as JSON and iterated as a collection of records with
    ``title``, ``abstract`` and ``domain`` entries. Records are kept only if
    both ``domain`` and ``abstract`` are truthy and the abstract is detected
    as English. Each kept record becomes one ``{text: domain}`` entry, where
    ``text`` is the cleaned and preprocessed ``"<title> <abstract>"`` string
    (records yielding the same text collapse into a single entry).

    The output is written to a sibling file whose name is the input name
    prefixed with ``PREFIX``. If that file already exists, nothing is done.

    Args:
        p: Path of the raw corpus JSON file.

    Returns:
        None. Any failure is reported with ``print`` and swallowed so that a
        single bad file does not abort processing of the other files.
    """
    partial_path = None
    try:
        processed_path = p.with_name(f"{PREFIX}{p.name}")

        if processed_path.exists():
            return

        with open(p, encoding="utf-8") as f:
            content = json.load(f)

        result = {
            preprocess(
                clean(f'{c["title"]} {c["abstract"]}', convert_to_ascii=True)
            ): c["domain"]
            for c in content
            if (
                c["domain"]
                and c["abstract"]
                and is_english(predict_language(c["abstract"]))
            )
        }

        # Write to a temporary sibling first and rename afterwards: an
        # interrupted run must not leave a truncated file at the final path,
        # because the existence check above would then skip it forever.
        # The ".tmp" suffix keeps it out of the "*.json" globs.
        partial_path = processed_path.with_name(f"{processed_path.name}.tmp")
        with open(partial_path, "w", encoding="utf-8") as f:
            json.dump(result, f)
        partial_path.replace(processed_path)
    except Exception as e:
        if partial_path is not None:
            try:
                partial_path.unlink()
            except OSError:
                # Nothing to clean up (never created, or already renamed).
                pass
        print(f"Error ({e}) processing {p}")


# Clean at most MAX_FILE_COUNT raw corpus files in parallel, one file per chunk.
# Path.glob yields entries in arbitrary order, so sort before slicing to make
# the selected subset reproducible across runs and machines.
parallel_map(
    clean_file,
    sorted(corpus_path.glob("s2-corpus-*.json"))[:MAX_FILE_COUNT],
    chunk_size=1,
)
# Trailing None keeps the notebook cell from echoing the call's return value.
None

In [None]:
# Path.glob yields entries in arbitrary order; sort before slicing so the same
# cleaned files (in the same order) feed the seeded split below on every run.
corpora = sorted(corpus_path.glob(f"{PREFIX}*.json"))[:MAX_FILE_COUNT]
print(f"Found {len(corpora)} files")

# Each cleaned file holds a {text: domains} mapping; collect (text, domains) pairs.
data = []
for p in corpora:
    with open(p, encoding="utf-8") as f:
        data.extend(json.load(f).items())

print(f"Found {len(data)} documents")

X_train, X_test, y_train, y_test = train_test_split(
    [d[0] for d in data], [d[1] for d in data], test_size=0.1, random_state=1
)

# A document can carry several domains: repeat it once per domain so the
# single-label classifier sees every (text, domain) combination. Both lists
# are derived from one expansion pass so they are aligned by construction,
# instead of re-zipping an already expanded X_train with the old y_train.
train_pairs = [
    (text, domain)
    for text, domains in zip(X_train, y_train)
    for domain in domains
]
X_train = [text for text, _ in train_pairs]
y_train = [domain for _, domain in train_pairs]

## Naive Bayes

In [None]:
# TF-IDF features (every run of non-space characters is a token) fed into
# a multinomial Naive Bayes model.
search_pipeline = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer(token_pattern=r"[^ ]+")),
        ("classifier", MultinomialNB()),
    ]
)

# Candidate hyperparameters, addressed as "<step name>__<parameter>".
search_space = {
    "vectorizer__max_df": [0.05, 0.1, 0.3],
    "vectorizer__min_df": [5, 10, 30],
    "vectorizer__sublinear_tf": [True, False],
    "classifier__alpha": [0.1, 0.25, 0.5, 0.75, 1],
    "classifier__fit_prior": [True, False],
}

# Exhaustive 3-fold search over the grid, ranked by macro-averaged F1.
classifier = GridSearchCV(
    estimator=search_pipeline,
    param_grid=search_space,
    scoring="f1_macro",
    cv=3,
    n_jobs=4,
    verbose=1,
)
classifier.fit(X_train, y_train)

# Best-ranked configurations first; the last expression is shown by the notebook.
results = pd.DataFrame(classifier.cv_results_)
results.sort_values(by="rank_test_score")

In [None]:
# Fixed hyperparameters; every value is one of the grid-search candidates
# (presumably the best-ranked combination -- confirm against the results table).
tfidf = TfidfVectorizer(
    token_pattern=r"[^ ]+",
    sublinear_tf=True,
    max_df=0.05,
    min_df=10,
)
naive_bayes = MultinomialNB(fit_prior=False, alpha=0.5)

classifier = Pipeline(steps=[("vectorizer", tfidf), ("classifier", naive_bayes)])

# Train on the expanded training set; the fitted pipeline is the cell's output.
classifier.fit(X_train, y_train)

In [None]:
predicted = classifier.predict(X_test)

# Test documents keep their full list of domains. A prediction counts as
# correct when it matches any of them; otherwise the first listed domain is
# used as the reference label.
y_test_aligned = [
    guess if guess in true_domains else true_domains[0]
    for guess, true_domains in zip(predicted, y_test)
]

report = metrics.classification_report(y_test_aligned, predicted)
print(report)

# Confusion matrix normalised over the predicted labels.
metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=y_test_aligned,
    y_pred=predicted,
    normalize="pred",
    values_format=".2f",
    xticks_rotation="vertical",
)
# Trailing None keeps the notebook cell from echoing the display object.
None

In [None]:
# Persist the fitted pipeline under MODEL_KEY.
# NOTE(review): keep_last_n=1 presumably retains only the most recent saved
# version -- confirm against the great_ai documentation.
save_model(classifier, key=MODEL_KEY, keep_last_n=1)