# Files download

Download the Python helper module from the repository.

In [90]:
import os
import urllib.request

# Helper module that provides the tuning utilities imported further down
# (fit_tuning, get_*_params, read_dataset). It is fetched only when it is not
# already present in the working directory.
# The download uses urllib instead of a captured `!wget`, so an HTTP or network
# failure raises here rather than surfacing later as a confusing import error.
module_file = "find_best_hyperparameters.py"
if not os.path.exists(module_file):
  url = "https://raw.githubusercontent.com/rubensmchaves/unb/refs/heads/main/nlp/A03_text_classifier/find_best_hyperparameters.py"
  # Download to a temporary name first: an interrupted transfer must not leave
  # a partial file that would satisfy the exists-check on the next run.
  tmp_file = module_file + ".part"
  urllib.request.urlretrieve(url, tmp_file)
  os.replace(tmp_file, module_file)

Download text corpus

In [91]:
# Imported in this cell because the download below needs it before the main
# import cell runs.
import urllib.request

dest_folder = "data"

# create folder (no error if it already exists, no check-then-create race)
os.makedirs(dest_folder, exist_ok=True)

# download file
corpus_file = dest_folder + "/CSTR.csv"
if not os.path.exists(corpus_file):
  url = "https://raw.githubusercontent.com/ragero/text-collections/refs/heads/master/complete_texts_csvs/CSTR.csv"
  # urllib raises on HTTP/network errors instead of silently capturing the
  # command output; the temporary name keeps a partial download from being
  # mistaken for the finished corpus on a later run.
  tmp_file = corpus_file + ".part"
  urllib.request.urlretrieve(url, tmp_file)
  os.replace(tmp_file, corpus_file)

# Text classification

In [92]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from find_best_hyperparameters import fit_tuning
from find_best_hyperparameters import get_multinomial_naive_bayes_params
from find_best_hyperparameters import get_logistic_regression_params
from find_best_hyperparameters import get_support_vector_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import read_dataset

## Load texts

Load texts from the dataset file (corpus)

In [93]:
# Load the corpus CSV downloaded above. read_dataset returns the raw frame plus
# X and y (presumably the texts and their class labels -- confirm in the helper module).
corpus_csv = dest_folder + "/CSTR.csv"
df, X, y = read_dataset(corpus_csv)

Raw dataframe loaded.

In [94]:
# Show the loaded dataframe as the cell output.
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


Count the number of documents in each class.

In [95]:
# Rows per class label (the train/test split further down is stratified on y).
print(df["class"].value_counts())

class
ArtificiallIntelligence    128
Robotics                   100
Theory                      46
Systems                     25
Name: count, dtype: int64


## Data split

Split the corpus into training data and test data.

In [96]:
# Hold out 20% of the corpus for testing. The split is stratified on the labels
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1979,
)

In [97]:
# Report how many samples ended up in the training and test splits.
print(f"len(X_train): {len(X_train)}", f"len(X_test): {len(X_test)}", sep="\n")

len(X_train): 239
len(X_test): 60


## Pipelines

Create one pipeline per model: TF-IDF turns the texts into training features, and the classifier is added as the final step.

In [98]:
# One pipeline per classifier: a TF-IDF vectorizer ("tfidf" step) feeding the model ("clf" step).
# These step names are the prefixes passed to the get_*_params helpers when the search grids are built.
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
# LinearSVC accepts a random_state as well (its solver can shuffle the data); pin it to the
# seed used for the split and for LogisticRegression so repeated runs give the same search results.
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(random_state=1979))])

## GridSearch

For fine-tuning we use a pipeline with TF-IDF and three different models (Multinomial Naive Bayes, Logistic Regression and Support Vector Machine). We perform hyperparameter tuning using <code>GridSearchCV</code> with 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the metric (<code>scoring</code>) used to evaluate each candidate configuration on the held-out fold of every split.

Get params for fine tuning of the models.

In [99]:
# Build the search grid for each classifier. The argument is the pipeline step
# name, which the helpers use as the key prefix (e.g. "clf__alpha").
clf_step = "clf"
paramMNB = get_multinomial_naive_bayes_params(clf_step)
paramLR = get_logistic_regression_params(clf_step)
paramSVC = get_support_vector_params(clf_step)

Get the TF-IDF tuning parameters and merge them into each model's parameter grid.

In [100]:
# Search grid for the vectorizer, keyed for the "tfidf" pipeline step.
paramTfidf = get_tfidf_params("tfidf")

# Merge the vectorizer grid into every model grid (in place), so each search
# tunes the TF-IDF step and the classifier together.
for model_grid in (paramMNB, paramLR, paramSVC):
  model_grid.update(paramTfidf)

### Multinomial Naive Bayes

In [101]:
# Tune the TF-IDF + Multinomial Naive Bayes pipeline on the training split.
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

# Report the best cross-validated score and the configuration that achieved it.
print("\nMNB best params:")
for label, value in (
    ("  Best Score: ", grid_search_MNB.best_score_),
    ("  Best Params: ", grid_search_MNB.best_params_),
):
  print(label, value)


MNB best params:
  Best Score:  0.8823014612119159
  Best Params:  {'clf__alpha': 0.1, 'clf__fit_prior': False, 'clf__force_alpha': True, 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}


In [102]:
print("\nCV Results:")
# Tabulate every evaluated configuration, best mean cross-validation score first.
df_MNB = (
    pd.DataFrame(grid_search_MNB.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
df_MNB


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__fit_prior,param_clf__force_alpha,param_tfidf__binary,param_tfidf__lowercase,param_tfidf__ngram_range,param_tfidf__stop_words,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
55,0.028879,0.000108,0.012648,0.000128,0.100000,False,True,True,False,"(1, 1)",english,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.905396,0.859416,0.882093,0.882301,0.018772,1
79,0.029391,0.000589,0.012748,0.000522,0.100000,False,False,True,False,"(1, 1)",english,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.905396,0.859416,0.882093,0.882301,0.018772,1
253,0.028848,0.000751,0.013023,0.000444,0.411111,False,True,False,True,"(1, 1)",english,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.882728,0.856439,0.887416,0.875528,0.013633,3
277,0.027663,0.000377,0.012122,0.000283,0.411111,False,False,False,True,"(1, 1)",english,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.882728,0.856439,0.887416,0.875528,0.013633,3
61,0.029266,0.000884,0.013021,0.001015,0.100000,False,True,False,True,"(1, 1)",english,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.905396,0.822749,0.887416,0.871854,0.035490,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,0.055036,0.000531,0.015188,0.000193,1.500000,True,False,True,False,"(3, 3)",english,"{'clf__alpha': 1.5, 'clf__fit_prior': True, 'c...",0.186268,0.164563,0.166506,0.172446,0.009806,947
868,0.079074,0.001746,0.019975,0.000893,1.500000,True,True,True,True,"(3, 3)",,"{'clf__alpha': 1.5, 'clf__fit_prior': True, 'c...",0.168300,0.181982,0.166506,0.172262,0.006912,957
874,0.080665,0.000699,0.017280,0.000306,1.500000,True,True,True,False,"(3, 3)",,"{'clf__alpha': 1.5, 'clf__fit_prior': True, 'c...",0.168300,0.181982,0.166506,0.172262,0.006912,957
898,0.080800,0.001926,0.017483,0.000432,1.500000,True,False,True,False,"(3, 3)",,"{'clf__alpha': 1.5, 'clf__fit_prior': True, 'c...",0.168300,0.181982,0.166506,0.172262,0.006912,957


In [103]:
# Persist the sorted MNB cross-validation table to a CSV file in the working directory
# (pandas writes the row index as the first column by default).
df_MNB.to_csv("mnb_results.csv")

### Logistic Regression

In [104]:
# Tune the TF-IDF + Logistic Regression pipeline on the training split.
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

# Report the best cross-validated score and the configuration that achieved it.
# The header names the model tuned in this cell (Logistic Regression).
print("\nLR best params:")
print("  Best Score: ", grid_search_LR.best_score_)
print("  Best Params: ", grid_search_LR.best_params_)


MNB best params:
  Best Score:  0.8512042026075503
  Best Params:  {'clf__C': 268.2695795279727, 'tfidf__binary': False, 'tfidf__lowercase': True, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': None}


In [105]:
print("\nCV Results:")
# Tabulate every evaluated configuration, best mean cross-validation score first.
df_LR = (
    pd.DataFrame(grid_search_LR.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
df_LR


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_tfidf__binary,param_tfidf__lowercase,param_tfidf__ngram_range,param_tfidf__stop_words,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
204,0.614407,0.320353,0.050955,0.004851,268.269580,False,True,"(1, 1)",,"{'clf__C': 268.2695795279727, 'tfidf__binary':...",0.881547,0.832756,0.839310,0.851204,0.021622,1
229,0.313322,0.058669,0.033193,0.008677,2275.845926,False,True,"(1, 1)",english,"{'clf__C': 2275.845926074791, 'tfidf__binary':...",0.887350,0.817352,0.847379,0.850694,0.028673,2
228,0.491248,0.018184,0.024823,0.001814,2275.845926,False,True,"(1, 1)",,"{'clf__C': 2275.845926074791, 'tfidf__binary':...",0.881547,0.816634,0.839310,0.845830,0.026899,3
235,0.387496,0.089919,0.038134,0.013923,2275.845926,False,False,"(1, 1)",english,"{'clf__C': 2275.845926074791, 'tfidf__binary':...",0.879545,0.801051,0.855712,0.845436,0.032859,4
211,0.685954,0.262234,0.050598,0.005748,268.269580,False,False,"(1, 1)",english,"{'clf__C': 268.2695795279727, 'tfidf__binary':...",0.897763,0.786709,0.847596,0.844023,0.045408,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.188992,0.032450,0.030135,0.010641,0.006105,False,True,"(1, 1)",english,"{'clf__C': 0.006105402296585327, 'tfidf__binar...",0.149123,0.149123,0.150442,0.149563,0.000622,241
86,0.241037,0.017208,0.054843,0.016935,0.006105,False,True,"(2, 2)",,"{'clf__C': 0.006105402296585327, 'tfidf__binar...",0.149123,0.149123,0.150442,0.149563,0.000622,241
87,0.189178,0.014196,0.038776,0.008423,0.006105,False,True,"(2, 2)",english,"{'clf__C': 0.006105402296585327, 'tfidf__binar...",0.149123,0.149123,0.150442,0.149563,0.000622,241
88,0.314510,0.082400,0.048404,0.007835,0.006105,False,True,"(3, 3)",,"{'clf__C': 0.006105402296585327, 'tfidf__binar...",0.149123,0.149123,0.150442,0.149563,0.000622,241


In [106]:
# Persist the sorted Logistic Regression cross-validation table to a CSV file in the working directory
# (pandas writes the row index as the first column by default).
df_LR.to_csv("lr_results.csv")

### Support Vector Classifier

In [107]:
# Tune the TF-IDF + LinearSVC pipeline on the training split.
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

# Report the best cross-validated score and the configuration that achieved it.
# The header names the model tuned in this cell (Support Vector Classifier).
print("\nSVC best params:")
print("  Best Score: ", grid_search_SVC.best_score_)
print("  Best Params: ", grid_search_SVC.best_params_)


LR best params:
  Best Score:  0.879830991092924
  Best Params:  {'clf__class_weight': 'balanced', 'clf__tol': 0.4393970560760795, 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': None}


In [108]:
print("\nCV Results:")
# Tabulate every evaluated configuration, best mean cross-validation score first.
df_SVC = (
    pd.DataFrame(grid_search_SVC.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
df_SVC


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__tol,param_tfidf__binary,param_tfidf__lowercase,param_tfidf__ngram_range,param_tfidf__stop_words,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
486,0.030788,0.000543,0.013331,0.000092,balanced,4.393971e-01,True,False,"(1, 1)",,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.881419,0.839563,0.918511,0.879831,0.032250,1
384,0.036001,0.000769,0.015868,0.003389,balanced,8.483429e-05,True,True,"(1, 1)",,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.896844,0.831765,0.888603,0.872404,0.028932,2
432,0.033533,0.000546,0.013434,0.000250,balanced,6.105402e-03,True,True,"(1, 1)",,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.896844,0.831765,0.888603,0.872404,0.028932,2
408,0.058554,0.002221,0.026762,0.003119,balanced,7.196857e-04,True,True,"(1, 1)",,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.896844,0.831765,0.888603,0.872404,0.028932,2
360,0.064615,0.003780,0.025052,0.001469,balanced,1.000000e-05,True,True,"(1, 1)",,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.896844,0.831765,0.888603,0.872404,0.028932,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,0.053698,0.002315,0.018955,0.006084,balanced,2.275846e+03,True,False,"(3, 3)",english,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.065217,0.126168,0.150442,0.113943,0.035851,716
646,0.079952,0.002556,0.017277,0.000460,balanced,1.637894e+05,False,False,"(3, 3)",,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.054983,0.122642,0.147321,0.108315,0.039034,717
275,0.053401,0.001077,0.015088,0.000415,,1.637894e+05,True,False,"(3, 3)",english,"{'clf__class_weight': None, 'clf__tol': 163789...",0.168300,0.089464,0.065934,0.107899,0.043777,718
347,0.052342,0.000418,0.015624,0.001183,,1.000000e+08,True,False,"(3, 3)",english,"{'clf__class_weight': None, 'clf__tol': 100000...",0.141644,0.089464,0.080952,0.104020,0.026830,719


In [109]:
# Persist the sorted SVC cross-validation table to a CSV file in the working directory
# (pandas writes the row index as the first column by default).
df_SVC.to_csv("svc_results.csv")