# File download

Download python files from the repository

In [14]:
import os
import urllib.request

# Helper module (.py) providing the dataset-loading and hyperparameter-search
# functions imported later in this notebook.
if not os.path.exists("find_best_hyperparameters.py"):
  url = "https://raw.githubusercontent.com/rubensmchaves/unb/refs/heads/main/nlp/A03_text_classifier/find_best_hyperparameters.py"
  # Pure-Python download: needs no external `wget` binary and raises on
  # network/HTTP errors instead of failing silently.
  urllib.request.urlretrieve(url, "find_best_hyperparameters.py")

Download text corpus

In [15]:
import urllib.request

dest_folder = "data"

file_nsf = "NSF"
file_dmoz = "Dmoz-Science"

# Create the destination folder; exist_ok keeps the cell safe to re-run.
os.makedirs(dest_folder, exist_ok=True)

# Download each corpus CSV only when it is not already present locally.
base_url = "https://raw.githubusercontent.com/ragero/text-collections/refs/heads/master/complete_texts_csvs"
for corpus_name in (file_nsf, file_dmoz):
  csv_path = f"{dest_folder}/{corpus_name}.csv"
  if not os.path.exists(csv_path):
    url = f"{base_url}/{corpus_name}.csv"
    # Raises on network/HTTP errors, so a failed download is visible here
    # rather than surfacing later as a missing-file error.
    urllib.request.urlretrieve(url, csv_path)

# Text classification

In [16]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Helpers from the project module downloaded in the first cell.
# (get_tfidf_params was previously imported twice; each name is now listed once.)
from find_best_hyperparameters import (
  fit_tuning,
  get_multinomial_naive_bayes_params,
  get_logistic_regression_params,
  get_support_vector_params,
  get_tfidf_params,
  read_dataset,
  show_score_params,
  results_to_csv,
  score_test_set,
)

## Corpus: NSF

### Load texts

Load texts from the dataset file (corpus)

In [17]:
# Load the NSF corpus: the helper returns the raw dataframe plus the X / y
# objects that are split into train and test sets further down.
df, X, y = read_dataset(f"{dest_folder}/{file_nsf}.csv")

Raw dataframe loaded.

In [18]:
# Bare expression as the last line of the cell, so the notebook renders the dataframe.
df

Unnamed: 0,file_name,text,class
0,management_data_management.a9820721.txt,an intelligent visual database system hierarch...,data
1,management_data_management.a9116988.txt,spatio-temporal database management for global...,data
2,management_data_management.a9457613.txt,nyi theory and implementation of declarative d...,data
3,management_data_management.a9734191.txt,pecase providing a coherent view of diverse di...,data
4,management_data_management.a9116798.txt,an object-oriented toolbox for use with the pr...,data
...,...,...,...
10519,computing_theory_computing.a9003356.txt,fixed-point logic in finite structures,theory
10520,computing_theory_computing.a0092761.txt,making exponential-time learning algorithms ef...,theory
10521,computing_theory_computing.a9985458.txt,efficient algorithms for problems in combinato...,theory
10522,computing_theory_computing.a9877122.txt,on learning and characterizing classes of bool...,theory


Count the number of documents in each class.

In [19]:
# Rows per value of the "class" column, to check class balance before the stratified split.
print(df["class"].value_counts())

class
economics       1409
math            1339
geophysics      1202
oceanography     990
ecology          889
sociology        739
statistics       647
politic          603
software         524
theory           442
data             402
hydro            355
networking       345
neuroscience     307
metals           201
gravitional      130
Name: count, dtype: int64


### Data split

Split the corpus into training data and test data.

In [20]:
# Hold out 20% of the corpus for testing. Stratifying on y keeps the class
# proportions in both subsets; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=1979,
)

In [21]:
# Report how many samples ended up in the training and test subsets.
for subset_name, subset in (("X_train", X_train), ("X_test", X_test)):
  print(f"len({subset_name}): {len(subset)}")

len(X_train): 8419
len(X_test): 2105


### Pipelines

Create one pipeline per model: each pipeline uses TF-IDF to turn the texts into training features and then feeds them to the classifier.

In [22]:
# One pipeline per classifier: a TF-IDF vectorizer ("tfidf") followed by the model ("clf").
# The step name "clf" is the same string passed to the get_*_params helpers.
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
# LinearSVC shuffles the data randomly when the dual solver is used, so it is
# seeded like LogisticRegression to keep the search results reproducible.
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(random_state=1979))])

### GridSearch

For hyperparameter tuning we use a pipeline that combines TF-IDF with one of three models (Multinomial Naive Bayes, Logistic Regression and Support Vector Machine). The tuning is performed with <code>GridSearchCV</code> using 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the metric (<code>scoring</code>) that evaluates each candidate on the held-out validation fold of every split. The separate test set is not used during tuning.

Get params for fine tuning of the models.

In [23]:
# Build the search space for each model. "clf" is the name of the classifier
# step in the pipelines (the best-params output shows keys such as "clf__alpha").
paramMNB, paramLR, paramSVC = (
  build_params("clf")
  for build_params in (
    get_multinomial_naive_bayes_params,
    get_logistic_regression_params,
    get_support_vector_params,
  )
)

#### Multinomial Naive Bayes

Training.

In [24]:
# Run the hyperparameter search for the Naive Bayes pipeline on the training split only;
# the test split is not passed in here.
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

Show best training scores and params.

In [25]:
# Print the best cross-validation score and the parameters that achieved it.
# NOTE(review): the recorded output is headed "SVC best params:" although this is the
# Multinomial Naive Bayes search; that header text is printed by show_score_params.
show_score_params(grid_search_MNB)


SVC best params:
  Best Score: 0.8222915423483269
  Best Params: {'clf__alpha': 0.1, 'clf__fit_prior': False, 'clf__force_alpha': True}


Saving result into file

In [26]:
# Export the search results; the second argument ("NSF-mnb-results.csv") is presumably
# the output file name. The helper also prints the results table.
results_to_csv(grid_search_MNB, file_nsf + "-mnb-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
2        0.163839      0.000704         0.082256        0.003586   
3        0.173643      0.005211         0.085768        0.003853   
6        0.095978      0.002315         0.047971        0.001117   
7        0.098818      0.006992         0.046179        0.000162   
10       0.098163      0.001108         0.049911        0.001162   
11       0.095811      0.001981         0.049887        0.003193   
15       0.098259      0.000563         0.048204        0.002453   
14       0.095836      0.002271         0.046629        0.000817   
19       0.098679      0.004691         0.046802        0.000991   
18       0.097464      0.002354         0.051470        0.004504   
0        0.173939      0.026636         0.094547        0.004649   
1        0.165702      0.001214         0.085054        0.006334   
22       0.096018      0.000719         0.046398        0.000571   
23       0.095174      0.0003

Verify the result score in the test set.

In [27]:
# Score the tuned Naive Bayes search object on the held-out test split (prints macro-F1).
score_test_set(grid_search_MNB, X_test, y_test)


Test evaluation:
  F1-macro: 0.8467467117985801


#### Logistic Regression

Training.

In [28]:
# Run the hyperparameter search for the Logistic Regression pipeline on the training split.
# NOTE(review): the recorded run emitted an lbfgs "TOTAL NO. of ITERATIONS REACHED LIMIT"
# warning, so some candidates did not converge within their max_iter budget.
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Show best training scores and params.

In [29]:
# Print the best cross-validation score and the parameters that achieved it.
# NOTE(review): the recorded output is headed "SVC best params:" although this is the
# Logistic Regression search; that header text is printed by show_score_params.
show_score_params(grid_search_LR)


SVC best params:
  Best Score: 0.8349633711471355
  Best Params: {'clf__C': 5.994842503189409, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__solver': 'lbfgs'}


Saving result into file

In [30]:
# Export the Logistic Regression search results.
# NOTE(review): "-lt-" looks like a typo for "-lr-" (the other files use "-mnb-" and "-svc-");
# confirm nothing reads "NSF-lt-results.csv" before renaming.
results_to_csv(grid_search_LR, file_nsf + "-lt-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
59       2.814038      0.170119         0.115840        0.013411   
55       3.655732      0.661706         0.126490        0.029397   
57       3.505612      0.846266         0.142011        0.017937   
54       0.364597      0.007879         0.054432        0.003834   
56       0.362180      0.002704         0.055876        0.005546   
..            ...           ...              ...             ...   
5        4.482603      1.470172         0.122758        0.010926   
15       0.852089      0.043278         0.125393        0.004369   
3        4.723911      1.260914         0.113809        0.016184   
17       0.840442      0.180675         0.139278        0.009991   
1        5.513883      1.470810         0.123356        0.009740   

    param_clf__C param_clf__class_weight  param_clf__max_iter  \
59      5.994843                balanced                  500   
55      5.994843                ba

Verify the result score in the test set.

In [31]:
# Score the tuned Logistic Regression search object on the held-out test split (prints macro-F1).
score_test_set(grid_search_LR, X_test, y_test)


Test evaluation:
  F1-macro: 0.864368871660672


#### Support Vector Classifier

Training.

In [32]:
# Run the hyperparameter search for the LinearSVC pipeline on the training split only.
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

Show best training scores and params.

In [33]:
# Print the best cross-validation score and the parameters that achieved it.
show_score_params(grid_search_SVC)


SVC best params:
  Best Score: 0.8373968294014311
  Best Params: {'clf__class_weight': 'balanced', 'clf__tol': 0.4393970560760795}


Saving result into file

In [34]:
# Export the search results; the second argument ("NSF-svc-results.csv") is presumably
# the output file name. The helper also prints the results table.
results_to_csv(grid_search_SVC, file_nsf + "-svc-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
20       0.142953      0.005974         0.052780        0.001373   
15       0.228691      0.000668         0.051972        0.000208   
18       0.172589      0.002156         0.051699        0.000774   
17       0.189185      0.002649         0.053527        0.001472   
16       0.208313      0.004510         0.052080        0.001306   
19       0.159968      0.004832         0.051916        0.000459   
5        0.134941      0.000607         0.052030        0.001268   
3        0.169047      0.003205         0.052174        0.000665   
4        0.160000      0.008087         0.053003        0.001476   
1        0.209341      0.012125         0.053054        0.000804   
0        0.222358      0.003323         0.058323        0.008343   
2        0.180856      0.002091         0.051062        0.000472   
11       0.160417      0.004112         0.094135        0.004914   
8        0.094583      0.0009

Verify the result score in the test set.

In [35]:
# Score the tuned LinearSVC search object on the held-out test split (prints macro-F1).
score_test_set(grid_search_SVC, X_test, y_test)


Test evaluation:
  F1-macro: 0.86368707204037


## Corpus: Dmoz-Science

### Load texts

Load texts from the dataset file (corpus)

In [36]:
# Load the Dmoz-Science corpus. This rebinds df, X and y, so the NSF data loaded
# earlier is no longer reachable through these names.
df, X, y = read_dataset(f"{dest_folder}/{file_dmoz}.csv")

Raw dataframe loaded.

In [37]:
# Bare expression as the last line of the cell, so the notebook renders the dataframe.
df

Unnamed: 0,file_name,text,class
0,2786497.txt,Texas A&amp;M Horticultural Extension - Vegeta...,Agriculture
1,2784741.txt,Algaculture Information from Wikipedia on this...,Agriculture
2,2785016.txt,Annual Canarygrass Factsheet on this grain cro...,Agriculture
3,2786133.txt,Black Sigatoka Photographs and information on ...,Agriculture
4,2786989.txt,Irrigation Training and Research Center Univer...,Agriculture
...,...,...,...
5995,2881706.txt,Canmet Energy Technology Center Information ab...,Technology
5996,2886909.txt,Utah Space Association Provides information ab...,Technology
5997,2887110.txt,Mission 51-L Provides a critical view of the r...,Technology
5998,2887157.txt,"New York (SEAoNY) News announcements, committe...",Technology


Count the number of documents in each class.

In [38]:
# Rows per value of the "class" column, to check class balance before the stratified split.
print(df["class"].value_counts())

class
Agriculture    500
Astronomy      500
Biology        500
Chemistry      500
Earth          500
Environment    500
Instruments    500
Math           500
Physics        500
Science        500
Social         500
Technology     500
Name: count, dtype: int64


### Data split

Split the corpus into training data and test data.

In [39]:
# Hold out 20% of the corpus for testing. Stratifying on y keeps the class
# proportions in both subsets; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=1979,
)

In [40]:
# Report how many samples ended up in the training and test subsets.
for subset_name, subset in (("X_train", X_train), ("X_test", X_test)):
  print(f"len({subset_name}): {len(subset)}")

len(X_train): 4800
len(X_test): 1200


### Pipelines

Create one pipeline per model: each pipeline uses TF-IDF to turn the texts into training features and then feeds them to the classifier.

In [41]:
# Fresh, unfitted pipelines for the Dmoz-Science corpus: TF-IDF ("tfidf") followed by the model ("clf").
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
# LinearSVC shuffles the data randomly when the dual solver is used, so it is
# seeded like LogisticRegression to keep the search results reproducible.
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(random_state=1979))])

### GridSearch

For hyperparameter tuning we use a pipeline that combines TF-IDF with one of three models (Multinomial Naive Bayes, Logistic Regression and Support Vector Machine). The tuning is performed with <code>GridSearchCV</code> using 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the metric (<code>scoring</code>) that evaluates each candidate on the held-out validation fold of every split. The separate test set is not used during tuning.

Get params for fine tuning of the models.

In [42]:
# Build the search space for each model. "clf" is the name of the classifier
# step in the pipelines (the best-params output shows keys such as "clf__alpha").
paramMNB, paramLR, paramSVC = (
  build_params("clf")
  for build_params in (
    get_multinomial_naive_bayes_params,
    get_logistic_regression_params,
    get_support_vector_params,
  )
)

#### Multinomial Naive Bayes

Training.

In [43]:
# Run the hyperparameter search for the Naive Bayes pipeline on the Dmoz training split only.
# This rebinds grid_search_MNB, replacing the NSF search object.
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

Show best training scores and params.

In [44]:
# Print the best cross-validation score and the parameters that achieved it.
# NOTE(review): the recorded output is headed "SVC best params:" although this is the
# Multinomial Naive Bayes search; that header text is printed by show_score_params.
show_score_params(grid_search_MNB)


SVC best params:
  Best Score: 0.7088756661352962
  Best Params: {'clf__alpha': 0.4111111111111111, 'clf__fit_prior': False, 'clf__force_alpha': True}


Saving result into file

In [45]:
# Export the search results; the second argument ("Dmoz-Science-mnb-results.csv") is
# presumably the output file name. The helper also prints the results table.
results_to_csv(grid_search_MNB, file_dmoz + "-mnb-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
10       0.097688      0.000712         0.042661        0.001659   
11       0.101924      0.004874         0.043173        0.001044   
8        0.100044      0.002001         0.042304        0.000637   
9        0.103782      0.010585         0.042821        0.001058   
6        0.099201      0.003821         0.044030        0.001578   
7        0.103580      0.009458         0.043204        0.001333   
4        0.173426      0.000712         0.077210        0.002796   
5        0.157764      0.042966         0.061657        0.014031   
12       0.097289      0.001374         0.042941        0.002021   
13       0.097178      0.001051         0.044002        0.002711   
14       0.100819      0.005750         0.043878        0.002486   
15       0.097786      0.001673         0.043342        0.001475   
16       0.097579      0.001894         0.048086        0.004169   
17       0.097632      0.0017

Verify the result score in the test set.

In [46]:
# Score the tuned Naive Bayes search object on the held-out test split (prints macro-F1).
score_test_set(grid_search_MNB, X_test, y_test)


Test evaluation:
  F1-macro: 0.7157006501279084


#### Logistic Regression

Training.

In [47]:
# Run the hyperparameter search for the Logistic Regression pipeline on the Dmoz training split only.
# This rebinds grid_search_LR, replacing the NSF search object.
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

Show best training scores and params.

In [48]:
# Print the best cross-validation score and the parameters that achieved it.
# NOTE(review): the recorded output is headed "SVC best params:" although this is the
# Logistic Regression search; that header text is printed by show_score_params.
show_score_params(grid_search_LR)


SVC best params:
  Best Score: 0.7254091882083982
  Best Params: {'clf__C': 5.994842503189409, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__solver': 'liblinear'}


Saving result into file

In [49]:
# Export the Logistic Regression search results.
# NOTE(review): "-lt-" looks like a typo for "-lr-" (the other files use "-mnb-" and "-svc-");
# confirm nothing reads "Dmoz-Science-lt-results.csv" before renaming.
results_to_csv(grid_search_LR, file_dmoz + "-lt-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
54       0.633176      0.453761         0.079443        0.000208   
58       0.313573      0.011970         0.080535        0.001253   
56       0.363075      0.060279         0.095949        0.020180   
48       0.316799      0.013582         0.080598        0.000811   
50       0.313325      0.010293         0.080397        0.001210   
..            ...           ...              ...             ...   
15       0.234905      0.048951         0.118862        0.010880   
17       0.300474      0.055946         0.112599        0.022969   
5        0.344031      0.043469         0.127658        0.001925   
3        0.247871      0.011390         0.111865        0.019259   
1        0.342114      0.010799         0.124857        0.010064   

    param_clf__C param_clf__class_weight  param_clf__max_iter  \
54      5.994843                balanced                  100   
58      5.994843                ba

Verify the result score in the test set.

In [50]:
# Score the tuned Logistic Regression search object on the held-out test split (prints macro-F1).
score_test_set(grid_search_LR, X_test, y_test)


Test evaluation:
  F1-macro: 0.7239961027902931


#### Support Vector Classifier

Training.

In [51]:
# Run the hyperparameter search for the LinearSVC pipeline on the Dmoz training split only.
# This rebinds grid_search_SVC, replacing the NSF search object.
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

Show best training scores and params.

In [52]:
# Print the best cross-validation score and the parameters that achieved it.
show_score_params(grid_search_SVC)


SVC best params:
  Best Score: 0.7302191211071118
  Best Params: {'clf__class_weight': None, 'clf__tol': 0.4393970560760795}


Saving result into file

In [53]:
# Export the search results; the second argument ("Dmoz-Science-svc-results.csv") is
# presumably the output file name. The helper also prints the results table.
results_to_csv(grid_search_SVC, file_dmoz + "-svc-results.csv")


CV Results: False
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
5        0.124844      0.005530         0.046003        0.001284   
19       0.230182      0.012645         0.080502        0.000533   
0        0.199049      0.010100         0.047715        0.002147   
1        0.184599      0.016588         0.046116        0.001170   
17       0.239840      0.034069         0.085734        0.004206   
16       0.175288      0.004990         0.048209        0.004424   
15       0.189310      0.002255         0.045307        0.000312   
2        0.163785      0.006290         0.046936        0.002897   
18       0.237091      0.011158         0.080985        0.002064   
4        0.137166      0.003172         0.046857        0.002355   
3        0.152810      0.003347         0.045584        0.000266   
20       0.174952      0.039530         0.059291        0.017063   
24       0.102221      0.002577         0.048479        0.002428   
10       0.100373      0.0021

Verify the result score in the test set.

In [54]:
# Score the tuned LinearSVC search object on the held-out test split (prints macro-F1).
score_test_set(grid_search_SVC, X_test, y_test)


Test evaluation:
  F1-macro: 0.7237096664333351
