# Files donwload

Download python files from the repository

In [None]:
import os
# Arquivo .py que implementa o algoritmo bigrama
if not os.path.exists("find_best_hyperparameters.py"):
  url = "https://raw.githubusercontent.com/rubensmchaves/unb/refs/heads/main/nlp/A03_text_classifier/find_best_hyperparameters.py"
  cmd = !wget {url}

Download text corpus

In [None]:
dest_folder = "data"

file_nsf = "NSF"
file_dmoz = "Dmoz-Science"

# create folder
if not os.path.exists(dest_folder):
  os.mkdir(dest_folder)

# download file
if not os.path.exists(dest_folder + "/" + file_nsf + ".csv"):
  url = f"https://raw.githubusercontent.com/ragero/text-collections/refs/heads/master/complete_texts_csvs/{file_nsf}.csv"
  cmd = !wget -P {dest_folder} {url}

if not os.path.exists(dest_folder + "/" + file_dmoz + ".csv"):
  url = f"https://raw.githubusercontent.com/ragero/text-collections/refs/heads/master/complete_texts_csvs/{file_dmoz}.csv"
  cmd = !wget -P {dest_folder} {url}

# Text classification

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from find_best_hyperparameters import fit_tuning
from find_best_hyperparameters import get_multinomial_naive_bayes_params
from find_best_hyperparameters import get_logistic_regression_params
from find_best_hyperparameters import get_support_vector_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import read_dataset
from find_best_hyperparameters import show_score_params
from find_best_hyperparameters import results_to_csv
from find_best_hyperparameters import validate

## Corpus: NSF

### Load texts

Load texts from the dataset file (corpus)

In [None]:
df, X, y = read_dataset(dest_folder + f"/{file_nsf}.csv")

Raw dataframe loaded.

In [None]:
df

Unnamed: 0,file_name,text,class
0,management_data_management.a9820721.txt,an intelligent visual database system hierarch...,data
1,management_data_management.a9116988.txt,spatio-temporal database management for global...,data
2,management_data_management.a9457613.txt,nyi theory and implementation of declarative d...,data
3,management_data_management.a9734191.txt,pecase providing a coherent view of diverse di...,data
4,management_data_management.a9116798.txt,an object-oriented toolbox for use with the pr...,data
...,...,...,...
10519,computing_theory_computing.a9003356.txt,fixed-point logic in finite structures,theory
10520,computing_theory_computing.a0092761.txt,making exponential-time learning algorithms ef...,theory
10521,computing_theory_computing.a9985458.txt,efficient algorithms for problems in combinato...,theory
10522,computing_theory_computing.a9877122.txt,on learning and characterizing classes of bool...,theory


Count the amount of each class.

In [None]:
print(df["class"].value_counts())

class
economics       1409
math            1339
geophysics      1202
oceanography     990
ecology          889
sociology        739
statistics       647
politic          603
software         524
theory           442
data             402
hydro            355
networking       345
neuroscience     307
metals           201
gravitional      130
Name: count, dtype: int64


### Data split

Split the corpus into training data and test data.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1979)

In [None]:
# Show the final size of each set.
print(f"len(X_train): {len(X_train)}")
print(f"len(X_test): {len(X_test)}")

len(X_train): 8419
len(X_test): 2105


### Pipelines

Create pipelines using TF-IDF to create training attribute and add the model.

In [None]:
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])

### GridSearch

For fine tuning we used a pipeline with TF-IDF and three different models (Multinomial Naive Bayes, Logistic Regression and Support Vector Machine). We perform hyperparameter tuning using <code>GridSearchCV</code> with 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the strategy to evaluate the performance of the cross-validated model on the test set (<code>scoring</code>).

Get params for fine tuning of the models.

In [None]:
paramMNB = get_multinomial_naive_bayes_params("clf")
paramLR = get_logistic_regression_params("clf")
paramSVC = get_support_vector_params("clf")

#### Multinomial Naive Bayes

Grid Search fit and tuning to find the best parameters.

In [None]:
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

Show best training scores and params.

In [None]:
show_score_params(grid_search_MNB)


SVC best params:
  Best Score: 0.8222915423483269
  Best Params: {'clf__alpha': 0.1, 'clf__fit_prior': False, 'clf__force_alpha': True}


Saving result into file

In [None]:
results_to_csv(grid_search_MNB, file_nsf + "-mnb-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
2        0.288176      0.046492         0.119773        0.016992   
3        0.258327      0.058524         0.105789        0.012422   
6        0.111138      0.011474         0.051747        0.000491   
7        0.104210      0.002515         0.049406        0.000911   
10       0.153802      0.036169         0.089116        0.003680   
11       0.181750      0.009000         0.084850        0.001706   
15       0.103558      0.003289         0.050657        0.001922   
14       0.173822      0.013680         0.081625        0.018784   
19       0.104049      0.003967         0.051893        0.003430   
18       0.107895      0.005871         0.050668        0.001136   
0        0.358561      0.063182         0.139649        0.051400   
1        0.235342      0.015704         0.123619        0.048206   
22       0.101902      0.002339         0.053625        0.004575   
23       0.110005      0.0059

#### Logistic Regression

Grid Search fit and tuning to find the best parameters.

In [None]:
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Show best training scores and params.

In [None]:
show_score_params(grid_search_LR)


SVC best params:
  Best Score: 0.8349633711471355
  Best Params: {'clf__C': 5.994842503189409, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__solver': 'lbfgs'}


Saving result into file

In [46]:
results_to_csv(grid_search_LR, file_nsf + "-lr-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
54       0.326577      0.011490         0.092989        0.010204   
58       0.467232      0.204499         0.107048        0.024515   
56       1.261659      0.347720         0.115120        0.026260   
48       0.388791      0.059007         0.098051        0.016780   
50       0.332911      0.012281         0.087066        0.002386   
..            ...           ...              ...             ...   
15       0.338235      0.090081         0.113577        0.014951   
17       0.339171      0.119893         0.120440        0.003903   
5        0.420528      0.050461         0.130144        0.004551   
3        0.303910      0.070405         0.124891        0.020166   
1        0.331601      0.083577         0.135417        0.005794   

    param_clf__C param_clf__class_weight  param_clf__max_iter  \
54      5.994843                balanced                  100   
58      5.994843                ba

#### Support Vector Classifier

Grid Search fit and tuning to find the best parameters.

In [None]:
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

Show best training scores and params.

In [None]:
show_score_params(grid_search_SVC)


SVC best params:
  Best Score: 0.8363693766855297
  Best Params: {'clf__class_weight': 'balanced', 'clf__tol': 0.05179474679231213}


Saving result into file

In [None]:
results_to_csv(grid_search_SVC, file_nsf + "-svc-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
19       0.178456      0.006591         0.054943        0.001025   
15       0.242085      0.001171         0.057367        0.001233   
18       0.190429      0.003676         0.054354        0.001350   
17       0.202432      0.006375         0.053388        0.001118   
16       0.221245      0.006060         0.056614        0.001910   
20       0.148184      0.002758         0.056488        0.001600   
5        0.216664      0.046669         0.078723        0.027947   
4        0.269591      0.005475         0.101543        0.003609   
1        0.222689      0.008110         0.056730        0.001844   
0        0.283840      0.036704         0.059090        0.003287   
2        0.289731      0.053492         0.095591        0.006543   
3        0.298541      0.010386         0.096309        0.004724   
14       0.125182      0.006373         0.062940        0.002969   
6        0.117432      0.0058

### Best estimator training

Train the model using the whole `X_train` and `y_train` sets before validate it using the test set (`X_test`).

Get the best estimator (pipeline).

In [None]:
estimator_MNB = grid_search_SVC.best_estimator_
estimator_LR = grid_search_SVC.best_estimator_
estimator_SVC = grid_search_SVC.best_estimator_
print(estimator_MNB)
print(estimator_LR)
print(estimator_SVC)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 LinearSVC(class_weight='balanced', tol=0.05179474679231213))])
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 LinearSVC(class_weight='balanced', tol=0.05179474679231213))])
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 LinearSVC(class_weight='balanced', tol=0.05179474679231213))])


Train the estimator using the training sets (`X_train` and `y_train`). A new training step is necessary because the cross-validation step never use the whole training set to train the model.

In [None]:
estimator_MNB.fit(X_train, y_train)
estimator_LR.fit(X_train, y_train)
estimator_SVC.fit(X_train, y_train)

### Validation

Validate the model using the test set (`X_test`).

In [None]:
metrics_mnb = validate(estimator_MNB, X_test, y_test)
metrics_lr  = validate(estimator_LR, X_test, y_test)
metrics_svc = validate(estimator_SVC, X_test, y_test)

Show the validation results from the model (MNB, LR and SVC)

In [None]:
test_results = {
  "Metrics": ["Accuracy", "F1 macro", "F1 micro"],
  "MNB": metrics_mnb,
  "LR": metrics_lr,
  "SVC": metrics_svc
}
print(pd.DataFrame(test_results))

    Metrics       MNB        LR       SVC
0  Accuracy  0.866033  0.866033  0.866033
1  F1 macro  0.864246  0.864246  0.864246
2  F1 micro  0.866033  0.866033  0.866033


## Corpus: Dmoz-Science

### Load texts

Load texts from the dataset file (corpus)

In [None]:
df, X, y = read_dataset(dest_folder + f"/{file_dmoz}.csv")

Raw dataframe loaded.

In [None]:
df

Unnamed: 0,file_name,text,class
0,2786497.txt,Texas A&amp;M Horticultural Extension - Vegeta...,Agriculture
1,2784741.txt,Algaculture Information from Wikipedia on this...,Agriculture
2,2785016.txt,Annual Canarygrass Factsheet on this grain cro...,Agriculture
3,2786133.txt,Black Sigatoka Photographs and information on ...,Agriculture
4,2786989.txt,Irrigation Training and Research Center Univer...,Agriculture
...,...,...,...
5995,2881706.txt,Canmet Energy Technology Center Information ab...,Technology
5996,2886909.txt,Utah Space Association Provides information ab...,Technology
5997,2887110.txt,Mission 51-L Provides a critical view of the r...,Technology
5998,2887157.txt,"New York (SEAoNY) News announcements, committe...",Technology


Count the amount of each class.

In [None]:
print(df["class"].value_counts())

class
Agriculture    500
Astronomy      500
Biology        500
Chemistry      500
Earth          500
Environment    500
Instruments    500
Math           500
Physics        500
Science        500
Social         500
Technology     500
Name: count, dtype: int64


### Data split

Split the corpus into training data and test data.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1979)

In [None]:
# Show the final size of each set.
print(f"len(X_train): {len(X_train)}")
print(f"len(X_test): {len(X_test)}")

len(X_train): 4800
len(X_test): 1200


### Pipelines

Create pipelines using TF-IDF to create training attribute and add the model.

In [None]:
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])

### GridSearch

For fine tuning we used a pipeline with TF-IDF and three different models (Multinomial Naive Bayes, Logistic Regression and Support Vector Machine). We perform hyperparameter tuning using <code>GridSearchCV</code> with 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the strategy to evaluate the performance of the cross-validated model on the test set (<code>scoring</code>).

Get params for fine tuning of the models.

In [None]:
paramMNB = get_multinomial_naive_bayes_params("clf")
paramLR = get_logistic_regression_params("clf")
paramSVC = get_support_vector_params("clf")

#### Multinomial Naive Bayes

Grid Search fit and tuning to find the best parameters.

In [None]:
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

Show best training scores and params.

In [None]:
show_score_params(grid_search_MNB)


SVC best params:
  Best Score: 0.7088756661352962
  Best Params: {'clf__alpha': 0.4111111111111111, 'clf__fit_prior': False, 'clf__force_alpha': True}


Saving result into file

In [None]:
results_to_csv(grid_search_MNB, file_dmoz + "-mnb-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
10       0.111189      0.003292         0.047309        0.001976   
11       0.113910      0.001920         0.047001        0.001797   
8        0.113638      0.001934         0.046400        0.000887   
9        0.111097      0.003127         0.046737        0.002084   
6        0.109482      0.001287         0.046898        0.001778   
7        0.123145      0.008278         0.047194        0.001657   
4        0.119497      0.002908         0.048004        0.000791   
5        0.117915      0.003181         0.047278        0.000883   
12       0.190500      0.004484         0.079672        0.001043   
13       0.194794      0.011436         0.077720        0.004943   
14       0.200258      0.009346         0.080941        0.003115   
15       0.233364      0.011681         0.091323        0.006596   
16       0.152860      0.045082         0.056757        0.016555   
17       0.118606      0.0067

#### Logistic Regression

Grid Search fit and tuning to find the best parameters.

In [None]:
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

Show best training scores and params.

In [None]:
show_score_params(grid_search_LR)


SVC best params:
  Best Score: 0.7254091882083982
  Best Params: {'clf__C': 5.994842503189409, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__solver': 'liblinear'}


Saving result into file

In [45]:
results_to_csv(grid_search_LR, file_dmoz + "-lr-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
54       0.326577      0.011490         0.092989        0.010204   
58       0.467232      0.204499         0.107048        0.024515   
56       1.261659      0.347720         0.115120        0.026260   
48       0.388791      0.059007         0.098051        0.016780   
50       0.332911      0.012281         0.087066        0.002386   
..            ...           ...              ...             ...   
15       0.338235      0.090081         0.113577        0.014951   
17       0.339171      0.119893         0.120440        0.003903   
5        0.420528      0.050461         0.130144        0.004551   
3        0.303910      0.070405         0.124891        0.020166   
1        0.331601      0.083577         0.135417        0.005794   

    param_clf__C param_clf__class_weight  param_clf__max_iter  \
54      5.994843                balanced                  100   
58      5.994843                ba

#### Support Vector Classifier

Grid Search fit and tuning to find the best parameters.

In [None]:
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

Show best training scores and params.

In [None]:
show_score_params(grid_search_SVC)


SVC best params:
  Best Score: 0.7295312061480009
  Best Params: {'clf__class_weight': None, 'clf__tol': 0.4393970560760795}


Saving result into file

In [44]:
results_to_csv(grid_search_SVC, file_dmoz + "-svc-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
5        0.138770      0.008844         0.049094        0.001017   
0        0.286087      0.042463         0.075992        0.017698   
17       0.177869      0.002322         0.050078        0.000534   
16       0.189564      0.002523         0.054641        0.008533   
1        0.303036      0.006955         0.087821        0.002285   
15       0.217624      0.005637         0.049704        0.001991   
2        0.294087      0.004707         0.086950        0.001336   
18       0.167556      0.009364         0.047032        0.000946   
3        0.234285      0.054766         0.064790        0.019756   
4        0.159707      0.011556         0.049218        0.001658   
19       0.148738      0.003893         0.048006        0.001279   
20       0.148691      0.009650         0.059365        0.016161   
21       0.191767      0.003695         0.087664        0.003408   
12       0.112309      0.0026

### Best estimator training

Train the model using the whole `X_train` and `y_train` sets before validate it using the test set (`X_test`).

Get the best estimator (pipeline).

In [None]:
estimator_MNB = grid_search_MNB.best_estimator_
estimator_LR = grid_search_LR.best_estimator_
estimator_SVC = grid_search_SVC.best_estimator_
print(estimator_MNB)
print(estimator_LR)
print(estimator_SVC)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 MultinomialNB(alpha=0.4111111111111111, fit_prior=False))])
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 LogisticRegression(C=5.994842503189409,
                                    class_weight='balanced', random_state=1979,
                                    solver='liblinear'))])
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(tol=0.4393970560760795))])


Train the estimator using the training sets (`X_train` and `y_train`). A new training step is necessary because the cross-validation step never use the whole training set to train the model.

In [None]:
estimator_MNB.fit(X_train, y_train)
estimator_LR.fit(X_train, y_train)
estimator_SVC.fit(X_train, y_train)

### Validation

Validate the model using the test set (`X_test`).

In [None]:
metrics_mnb = validate(estimator_MNB, X_test, y_test)
metrics_lr  = validate(estimator_LR, X_test, y_test)
metrics_svc = validate(estimator_SVC, X_test, y_test)

Show the validation results from the model (MNB, LR and SVC)

In [None]:
test_results = {
  "Metrics": ["Accuracy", "F1 macro", "F1 micro"],
  "MNB": metrics_mnb,
  "LR": metrics_lr,
  "SVC": metrics_svc
}
print(pd.DataFrame(test_results))

    Metrics       MNB        LR       SVC
0  Accuracy  0.717500  0.725000  0.725000
1  F1 macro  0.715701  0.723996  0.723868
2  F1 micro  0.717500  0.725000  0.725000
