# File download

Download the Python helper module from the repository.

In [18]:
import os
import urllib.request

# Helper module that provides fit_tuning, read_dataset and the get_*_params
# functions imported further down in this notebook.
helper_file = "find_best_hyperparameters.py"
if not os.path.exists(helper_file):
  url = "https://raw.githubusercontent.com/rubensmchaves/unb/refs/heads/main/nlp/A03_text_classifier/find_best_hyperparameters.py"
  # Pure-Python download: no dependency on an external `wget` binary, and an
  # HTTP/network failure raises here instead of surfacing later as an ImportError.
  urllib.request.urlretrieve(url, helper_file)

Download text corpus

In [19]:
from urllib.request import urlretrieve

dest_folder = "data"

# create folder (no-op when it already exists)
os.makedirs(dest_folder, exist_ok=True)

# download file
corpus_path = dest_folder + "/CSTR.csv"
if not os.path.exists(corpus_path):
  url = "https://raw.githubusercontent.com/ragero/text-collections/refs/heads/master/complete_texts_csvs/CSTR.csv"
  # Pure-Python download: a failed request raises immediately instead of being
  # silently captured in an unused variable.
  urlretrieve(url, corpus_path)

# Text classification

In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from find_best_hyperparameters import fit_tuning
from find_best_hyperparameters import get_multinomial_naive_bayes_params
from find_best_hyperparameters import get_logistic_regression_params
from find_best_hyperparameters import get_support_vector_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import read_dataset

## Load texts

Load texts from the dataset file (corpus)

In [21]:
# read_dataset comes from the downloaded find_best_hyperparameters.py helper.
# It returns the full dataframe plus X and y (presumably the texts and their
# class labels -- confirm against the helper's implementation).
df, X, y = read_dataset(dest_folder + "/CSTR.csv")

Raw dataframe loaded.

In [22]:
# Bare expression so the notebook renders the dataframe as a table.
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


Count the number of documents in each class.

In [23]:
# Number of rows per value of the "class" column, most frequent first.
print(df["class"].value_counts())

class
ArtificiallIntelligence    128
Robotics                   100
Theory                      46
Systems                     25
Name: count, dtype: int64


## Data split

Split the corpus into training data and test data.

In [24]:
# Hold out 30% of the corpus for testing. The split is stratified on the labels
# and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  stratify=y,
  random_state=1979,
)

In [25]:
# Report how many samples ended up in each partition.
for label, subset in (("X_train", X_train), ("X_test", X_test)):
  print(f"len({label}): {len(subset)}")

len(X_train): 209
len(X_test): 90


## Pipelines

Create one pipeline per model: a TF-IDF vectorizer turns the texts into feature vectors, and the classifier is added as the final step.

In [26]:
# Every pipeline has the same first step ("tfidf") and differs only in the
# final "clf" estimator.
# LinearSVC is seeded with the same value as LogisticRegression so that its
# fit is repeatable across runs.
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(random_state=1979))])

## GridSearch

Get the hyperparameter grids used to fine-tune each model.

In [27]:
# One hyperparameter grid per classifier. The argument is the name of the
# pipeline step the parameters belong to; the grid-search output further down
# shows the resulting keys in the "clf__<parameter>" form (e.g. 'clf__alpha').
paramMNB = get_multinomial_naive_bayes_params("clf")
paramLR = get_logistic_regression_params("clf")
paramSVC = get_support_vector_params("clf")

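Get the params for fine-tuning the TF-IDF vectorizer. Concatenating them to the model params is currently disabled (the `update` calls below are commented out), so only the classifier params are tuned.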

In [28]:
# Grid for the "tfidf" pipeline step.
paramTfidf = get_tfidf_params("tfidf")
# NOTE: merging the TF-IDF grid into the model grids is disabled below, so in
# the cells visible here the searches receive classifier parameters only.
# Uncomment to tune the vectorizer together with each classifier.
# paramMNB.update(paramTfidf)
# paramLR.update(paramTfidf)
# paramSVC.update(paramTfidf)

### Multinomial Naive Bayes

In [32]:
# Tune the Multinomial Naive Bayes pipeline on the training split.
grid_search = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

# Summarise the winning configuration.
print("\nMNB best params:")
for label, value in (
  ("Best Score", grid_search.best_score_),
  ("Best Params", grid_search.best_params_),
):
  print(f"  {label}: ", value)


MNB best params:
  Best Score:  0.8708529192224844
  Best Params:  {'clf__alpha': 0.1, 'clf__fit_prior': False, 'clf__force_alpha': True}


In [35]:
print("\nCV Results:")
# Tabulate every evaluated parameter combination, best mean test score first.
df_results = (
  pd.DataFrame(grid_search.cv_results_)
  .sort_values(by="mean_test_score", ascending=False)
)
df_results


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__fit_prior,param_clf__force_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.028097,0.001389,0.012106,0.00019,0.1,False,True,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.866896,0.855808,0.889855,0.870853,0.014178,1
3,0.029733,0.001879,0.013146,0.001432,0.1,False,False,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.866896,0.855808,0.889855,0.870853,0.014178,1
7,0.032964,0.004606,0.013454,0.002068,0.255556,False,False,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.806595,0.834392,0.74742,0.796135,0.036268,3
6,0.028982,0.000381,0.0148,0.002542,0.255556,False,True,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.806595,0.834392,0.74742,0.796135,0.036268,3
0,0.028786,0.00217,0.01257,0.000909,0.1,True,True,"{'clf__alpha': 0.1, 'clf__fit_prior': True, 'c...",0.806595,0.798388,0.74742,0.784134,0.026176,5
1,0.028301,0.001648,0.012447,0.000288,0.1,True,False,"{'clf__alpha': 0.1, 'clf__fit_prior': True, 'c...",0.806595,0.798388,0.74742,0.784134,0.026176,5
10,0.027234,0.000808,0.012468,0.000112,0.411111,False,True,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.687042,0.697886,0.639363,0.674764,0.02542,7
11,0.027362,0.000265,0.012429,0.00035,0.411111,False,False,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.687042,0.697886,0.639363,0.674764,0.02542,7
14,0.027178,0.000374,0.012009,0.000508,0.566667,False,True,"{'clf__alpha': 0.5666666666666667, 'clf__fit_p...",0.592634,0.641406,0.629563,0.621201,0.02077,9
15,0.030722,0.004045,0.013612,0.002249,0.566667,False,False,"{'clf__alpha': 0.5666666666666667, 'clf__fit_p...",0.592634,0.641406,0.629563,0.621201,0.02077,9
