In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
import pandas as pd
from sklearn.model_selection import GridSearchCV
from loguru import logger as lg
import multiprocessing
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings("ignore")
# Download necessary NLTK corpora
# nltk.download('stopwords')

In [2]:
# Number of parallel workers handed to scikit-learn / XGBoost through ``n_jobs``.
# Two cores are left free for the rest of the machine. The floor of 1 keeps the
# value usable on hosts with two cores or fewer: ``n_jobs=0`` is rejected by
# joblib, and ``-1`` would mean "use every core" instead of "use fewer".
NUM_PROCESSORS = max(1, multiprocessing.cpu_count() - 2)
# Verbosity level forwarded to GridSearchCV.
VERBOSE = 0
# Upper bound on how many departments are picked for training.
DEPARTMENTS_QUANTITY_TO_TRAIN = 6
# Also write the log to a file, rotated once it reaches 1024 MB.
lg.add("finnlp_binary_intra_class_experiment_2.log", rotation="1024 MB")
lg.info("\n\n####################################### NEW EXECUTION #######################################\n\n")
lg.info(f"Processors (cores) available: {NUM_PROCESSORS}")

2024-02-28 22:36:02.373 | INFO     | __main__:<module>:5 - 

####################################### NEW EXECUTION #######################################


2024-02-28 22:36:02.374 | INFO     | __main__:<module>:6 - Processors (cores) available: 10


In [3]:
# Load the corpus CSV (semicolon-separated, UTF-8). ``low_memory=False`` makes
# pandas parse the file in one pass so column dtypes are inferred consistently.
brc = pd.read_csv(
    "Banking_Regulation_Corpora_BRC_anonymous.csv",
    sep=";",
    encoding="utf-8",
    low_memory=False,
)

In [4]:
brc.head()

Unnamed: 0,class,department,entry_date,general_id,normative_identifier,publication_date,regulatory_authority,subject,subject_length,subject_unique_words,subject_words,text,text_length,text_unique_words,text_words,type,unique_document_id,url
0,1,DPT_25,2020-12-23,2128,36560,2020-12-23,BACEN,Divulga nome aprovado de pessoas eleitas/nomea...,167,21.0,23.0,Divulgamos o nome aprovado de pessoas eleitas/...,656,67.0,127.0,COMUNICADO,771984,https://www.bcb.gov.br/estabilidadefinanceira/...
1,1,DPT_25,2020-12-28,2129,36569,2020-12-28,BACEN,Divulga nome aprovado de pessoas eleitas/nomea...,167,21.0,23.0,Divulgamos o nome aprovado de pessoas eleitas/...,2110,156.0,322.0,COMUNICADO,772973,https://www.bcb.gov.br/estabilidadefinanceira/...
2,1,DPT_25,2020-12-22,2130,36554,2020-12-22,BACEN,Divulga comunicado do Grupo de Ação Financeira...,120,16.0,18.0,Comunicamos que o Grupo de Ação Financeira con...,792,76.0,127.0,COMUNICADO,771452,https://www.bcb.gov.br/estabilidadefinanceira/...
3,1,DPT_25,2020-12-23,2131,4880,2020-12-23,BACEN,Dispõe sobre o horário de atendimento ao públi...,222,32.0,34.0,"O Banco Central do Brasil, na forma do art. 9º...",4964,365.0,875.0,RESOLUÇÃO CMN,772051,https://www.bcb.gov.br/estabilidadefinanceira/...
4,1,DPT_25,2020-12-23,2132,1125/2020,2020-12-22,BACEN,DEPARTAMENTO DE RESOLUÇÃO E DE AÇÃO SANCIONADO...,165,19.0,23.0,DEPARTAMENTO DE RESOLUÇÃO E DE AÇÃO SANCIONADO...,3382,304.0,492.0,PROCESSO ADMINISTRATIVO SANCIONADOR,771966,https://www.bcb.gov.br/estabilidadefinanceira/...


<h3>Evaluate which regulators appear in each department to design the intra-class balancing strategy</h3>

In [5]:
# Departments whose regulator distribution is inspected in the cells below.
lt = [
    'DPT_25', 'DPT_3', 'DPT_2',
    'DPT_9', 'DPT_5', 'DPT_16',
]

In [6]:
# Rows of the corpus labelled with class 1 (the "relevant" class).
data_relevant = brc.loc[brc['class'].eq(1)]

In [7]:
# For every department, print its six most frequent regulators in the relevant class.
for dept in lt:
    dept_docs = data_relevant[data_relevant.department == dept]
    counts_by_regulator = dict(dept_docs.regulatory_authority.value_counts())
    ranked = sorted(counts_by_regulator.items(), key=lambda pair: pair[1], reverse=True)
    print(dept, ' - ', ranked[:6])

DPT_25  -  [('BACEN', 666), ('PRESIDÊNCIA_DA_REPÚBLICA', 29), ('COAF', 28), ('DOU', 26), ('ANAC', 14), ('RFB', 14)]
DPT_3  -  [('B3', 302), ('BACEN', 175), ('CETIP', 99), ('ANBIMA', 55), ('CVM', 49), ('RFB', 7)]
DPT_2  -  [('CVM', 253), ('ANBIMA', 221), ('B3', 147), ('COAF', 18), ('BACEN', 16), ('RFB', 13)]
DPT_9  -  [('BACEN', 205), ('CIP', 198), ('DOU', 64), ('BNDES', 55), ('PRESIDÊNCIA_DA_REPÚBLICA', 32), ('NUCLEA', 17)]
DPT_5  -  [('BACEN', 193), ('CVM', 126), ('DOU', 107), ('RFB', 79), ('CFC', 22), ('CPC', 13)]
DPT_16  -  [('B3', 97), ('CETIP', 85), ('BACEN', 68), ('STN', 67), ('RFB', 24), ('ANBIMA', 20)]


In [8]:
# We chose the regulators with the most documents in the relevant class of each of the three departments.
# The regulators must exist in both classes of the department.
# Only regulators with more than 10 documents in the relevant class were chosen.
# To undersample, every selected regulator is capped at the document count of the selected regulator with the fewest documents in the relevant class of that department (e.g. 26 documents for DPT_25).

# DPT_25  -  [('BACEN', 666), ('PRESIDÊNCIA_DA_REPÚBLICA', 29), ('COAF', 28), ('DOU', 26)] # relevant
# DPT_25  -  [('DOU', 2982), ('BACEN', 467), ('COAF', 48), ('PRESIDÊNCIA_DA_REPÚBLICA', 32)] # irrelevant

# DPT_3  -  [('B3', 302), ('BACEN', 175), ('ANBIMA', 55), ('CVM', 49)] # relevant
# DPT_3  -  [('B3', 592), ('CVM', 368), ('BACEN', 257), ('ANBIMA', 85)] # irrelevant

# DPT_2  -  [('CVM', 253), ('ANBIMA', 221), ('B3', 147), ('RFB', 13)] # relevant
# DPT_2  -  [('B3', 1921), ('CVM', 783), ('RFB', 678), ('ANBIMA', 399)] # irrelevant

In [9]:
# Rows of the corpus labelled with class 0 (the "irrelevant" class).
data_irrelevant = brc.loc[brc['class'].eq(0)]

In [10]:
# For every department, print its six most frequent regulators in the irrelevant class.
for dept in lt:
    dept_docs = data_irrelevant[data_irrelevant.department == dept]
    counts_by_regulator = dict(dept_docs.regulatory_authority.value_counts())
    ranked = sorted(counts_by_regulator.items(), key=lambda pair: pair[1], reverse=True)
    print(dept, ' - ', ranked[:6])

DPT_25  -  [('DOU', 2982), ('BACEN', 467), ('ANAC', 421), ('COAF', 48), ('PRESIDÊNCIA_DA_REPÚBLICA', 32), ('NUCLEA', 26)]
DPT_3  -  [('B3', 592), ('CVM', 368), ('STN', 264), ('BACEN', 257), ('RFB', 161), ('ANBIMA', 85)]
DPT_2  -  [('B3', 1921), ('CVM', 783), ('CAMARA_MUNICIPAL_DO_RIO_DE_JANEIRO', 744), ('RFB', 678), ('ANBIMA', 399), ('SUSEP', 391)]
DPT_9  -  [('BACEN', 826), ('CIP', 712), ('CVM', 531), ('RFB', 331), ('STN', 269), ('DOU', 230)]
DPT_5  -  [('DOU', 1147), ('BACEN', 658), ('RFB', 327), ('PRESIDÊNCIA_DA_REPÚBLICA', 201), ('CVM', 158), ('NUCLEA', 60)]
DPT_16  -  [('BACEN', 337), ('STN', 294), ('RFB', 270), ('CVM', 162), ('B3', 121), ('CAMARA_MUNICIPAL_DO_RIO_DE_JANEIRO', 101)]


In [11]:
# Rebuild the relevant subset (class 1) from the full corpus, so the sampling
# cell that follows starts from the unsampled rows.
data_relevant = brc.loc[brc['class'].eq(1)]

In [12]:
# Per-regulator cap for each department: the relevant-class document count of
# the least represented regulator among the four selected for that department.
SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_25 = 26
SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_3 = 49
SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_2 = 13

# department -> (cap, regulators to keep). Department and regulator order is
# preserved because it fixes the row order of the concatenated frame.
relevant_sampling_plan = {
    'DPT_25': (SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_25,
               ['BACEN', 'PRESIDÊNCIA_DA_REPÚBLICA', 'COAF', 'DOU']),
    'DPT_3': (SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_3,
              ['B3', 'BACEN', 'ANBIMA', 'CVM']),
    'DPT_2': (SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_2,
              ['CVM', 'ANBIMA', 'B3', 'RFB']),
}

aux_relevant = []
for department, (cap, regulators) in relevant_sampling_plan.items():
    department_docs = data_relevant[data_relevant.department == department]
    for authority in regulators:
        regulator = department_docs[department_docs.regulatory_authority == authority]
        # Drop texts shorter than 50 words, then keep the first `cap` rows in corpus order.
        regulator = regulator[regulator.text_words >= 50][:cap]
        aux_relevant.append(regulator)

data_relevant = pd.concat(aux_relevant)

# Sanity check: this selection is expected to be empty.
data_relevant[data_relevant.text_words < 50]

Unnamed: 0,class,department,entry_date,general_id,normative_identifier,publication_date,regulatory_authority,subject,subject_length,subject_unique_words,subject_words,text,text_length,text_unique_words,text_words,type,unique_document_id,url


In [13]:
data_relevant.shape

(352, 18)

In [14]:
(13*4)+(49*4)+(26*4)

352

In [15]:
# The irrelevant class is capped with the same per-regulator quantities that
# were derived from the relevant class, so both classes end up the same size.
SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_25 = 26
SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_3 = 49
SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_2 = 13

# department -> (cap, regulators to keep). Department and regulator order is
# preserved because it fixes the row order of the concatenated frame.
irrelevant_sampling_plan = {
    'DPT_25': (SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_25,
               ['BACEN', 'PRESIDÊNCIA_DA_REPÚBLICA', 'COAF', 'DOU']),
    'DPT_3': (SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_3,
              ['B3', 'BACEN', 'ANBIMA', 'CVM']),
    'DPT_2': (SMALLEST_QUANTITY_RELEVANT_SAMPLES_DPT_2,
              ['CVM', 'ANBIMA', 'B3', 'RFB']),
}

aux_irrelevant = []
for department, (cap, regulators) in irrelevant_sampling_plan.items():
    department_docs = data_irrelevant[data_irrelevant.department == department]
    for authority in regulators:
        regulator = department_docs[department_docs.regulatory_authority == authority]
        # Drop texts shorter than 50 words, then keep the first `cap` rows in corpus order.
        regulator = regulator[regulator.text_words >= 50][:cap]
        aux_irrelevant.append(regulator)

data_irrelevant = pd.concat(aux_irrelevant)

# Sanity check: this selection is expected to be empty.
data_irrelevant[data_irrelevant.text_words < 50]

Unnamed: 0,class,department,entry_date,general_id,normative_identifier,publication_date,regulatory_authority,subject,subject_length,subject_unique_words,subject_words,text,text_length,text_unique_words,text_words,type,unique_document_id,url


In [16]:
data_irrelevant.shape

(352, 18)

In [17]:
data_irrelevant.head(3)

Unnamed: 0,class,department,entry_date,general_id,normative_identifier,publication_date,regulatory_authority,subject,subject_length,subject_unique_words,subject_words,text,text_length,text_unique_words,text_words,type,unique_document_id,url
7846,0,DPT_25,2020-08-31,2169,0301/2020,2020-08-31,BACEN,Caracterizado fornecimento intempestivo ao Ba...,294,36.0,49.0,​DEPARTAMENTO DE RESOLUÇÃO E DE AÇÃO SANCIONAD...,758,81.0,113.0,PROCESSO ADMINISTRATIVO SANCIONADOR,729080,https://www.bcb.gov.br/estabilidadefinanceira/...
7847,0,DPT_25,2020-08-31,2170,36121,2020-08-31,BACEN,Divulga nome aprovado de pessoas eleitas/nomea...,167,21.0,23.0,Divulgamos o nome aprovado de pessoas eleitas/...,7080,407.0,966.0,COMUNICADO,729131,https://www.bcb.gov.br/estabilidadefinanceira/...
7848,0,DPT_25,2020-08-31,2171,36123,2020-08-31,BACEN,Divulga nome de pessoas com intenção de ocupar...,140,19.0,21.0,Divulgamos os nomes de pretendentes a cargos d...,1558,149.0,276.0,COMUNICADO,729151,https://www.bcb.gov.br/estabilidadefinanceira/...


In [18]:
data_relevant.count()

class                   352
department              352
entry_date              352
general_id              352
normative_identifier    352
publication_date        352
regulatory_authority    352
subject                 352
subject_length          352
subject_unique_words    352
subject_words           352
text                    352
text_length             352
text_unique_words       352
text_words              352
type                    352
unique_document_id      352
url                     352
dtype: int64

In [19]:
# Re-apply the 50-word minimum (already enforced per regulator during sampling)
# and show the per-column non-null counts.
data_relevant = data_relevant.loc[data_relevant.text_words.ge(50)]
data_relevant.count()

class                   352
department              352
entry_date              352
general_id              352
normative_identifier    352
publication_date        352
regulatory_authority    352
subject                 352
subject_length          352
subject_unique_words    352
subject_words           352
text                    352
text_length             352
text_unique_words       352
text_words              352
type                    352
unique_document_id      352
url                     352
dtype: int64

In [20]:
data_irrelevant.count()

class                   352
department              352
entry_date              352
general_id              352
normative_identifier    352
publication_date        352
regulatory_authority    352
subject                 352
subject_length          352
subject_unique_words    352
subject_words           352
text                    352
text_length             352
text_unique_words       352
text_words              352
type                    352
unique_document_id      352
url                     352
dtype: int64

In [21]:
# Re-apply the 50-word minimum (already enforced per regulator during sampling)
# and show the per-column non-null counts.
data_irrelevant = data_irrelevant.loc[data_irrelevant.text_words.ge(50)]
data_irrelevant.count()

class                   352
department              352
entry_date              352
general_id              352
normative_identifier    352
publication_date        352
regulatory_authority    352
subject                 352
subject_length          352
subject_unique_words    352
subject_words           352
text                    352
text_length             352
text_unique_words       352
text_words              352
type                    352
unique_document_id      352
url                     352
dtype: int64

In [22]:
data_relevant['department'].unique()

array(['DPT_25', 'DPT_3', 'DPT_2'], dtype=object)

In [23]:
len(list(data_relevant['department'].unique()))

3

In [24]:
# NLTK's Portuguese stop words, de-duplicated through a set and stored in a
# one-column frame that is exported to CSV and previewed.
stop_words = set(stopwords.words('portuguese'))
df_stopwords = pd.DataFrame({'words': list(stop_words)})
df_stopwords.to_csv('stop_words_nltk_portuguese.csv', encoding="utf-8", sep=';')
df_stopwords.head(3)

Unnamed: 0,words
0,estivesse
1,fui
2,serão


In [25]:
def remove_special_characters(df, column):
    """Lower-case a text column and replace accented letters by their plain form.

    The column is rewritten in place on ``df`` and the same frame is returned,
    so both ``df = remove_special_characters(df, col)`` and a bare call work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to normalise. It is modified in place.
    column : str
        Name of a string column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The ``df`` object that was passed in.
    """
    # Plain letter -> accented variants folded into it (acute, circumflex,
    # grave, diaeresis, tilde; plus the cedilla). Only these are touched.
    accent_groups = {
        'a': 'áâàäã',
        'e': 'éêèëẽ',
        'i': 'íîìïĩ',
        'o': 'óôòöõ',
        'u': 'úûùüũ',
        'c': 'ç',
    }
    translation = {
        ord(accented): plain
        for plain, variants in accent_groups.items()
        for accented in variants
    }

    # Lower-casing first means the table only needs lower-case entries.
    # One translate pass replaces the former chain of per-character replaces.
    df.loc[:, column] = df[column].str.lower().str.translate(translation)

    return df

In [26]:
# Give the stop words the same lower-casing / accent stripping that is later
# applied to the corpus text, so they can match the normalised tokens.
df_stopwords = remove_special_characters(df_stopwords,'words')

In [27]:
df_stopwords.head(3)

Unnamed: 0,words
0,estivesse
1,fui
2,serao


In [28]:
# Plain Python list of the normalised stop words; handed to cleaning() further down.
stop_words = df_stopwords['words'].tolist()

In [29]:
def clean_text(text, stop_words):
    """Remove stop words from ``text``.

    ``text`` is split on single spaces, tokens found in ``stop_words`` are
    dropped, and the rest is joined back with single spaces. Empty tokens
    produced by consecutive spaces are kept unless the empty string is itself
    listed in ``stop_words``.
    """
    kept_tokens = []
    for token in text.split(" "):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [30]:
def cleaning(df, column, stop_words):
    """Strip URLs, e-mail addresses, digits, punctuation and stop words from a text column.

    The column is rewritten in place on ``df`` and the same frame is returned.
    The repeated-letter rule only looks at lower-case ASCII letters, so the
    column should already be lower-cased and accent-free when this is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean. It is modified in place.
    column : str
        Name of a string column of ``df``.
    stop_words : iterable of str
        Tokens removed from the text after all other substitutions.

    Returns
    -------
    pandas.DataFrame
        The ``df`` object that was passed in.
    """
    # Every one of these characters is replaced by a single space.
    noise_chars = (
        'ºª°§•▪'
        '0123456789'
        '/\\\r\n\t'
        '-–“”’_.,|=@$'
        '%&*+:;!?#\'"'
        '<>(){}[]'
    )
    noise_table = {ord(char): ' ' for char in noise_chars}

    text = df[column]

    # Remove URLs. Matching up to the next whitespace (instead of requiring a
    # trailing whitespace character) also removes a URL that ends the text.
    text = text.replace(r'https?://\S+', ' ', regex=True)
    # Remove URLs written without a scheme; the dot after "www" is literal.
    text = text.replace(r'\bwww\.\S+', ' ', regex=True)
    # Remove e-mail addresses. This must run before '@' and '.' are blanked out.
    text = text.replace(r'(?P<email_address>[\w\.-]+@[\w\.-]+\.[\w]+)', ' ', regex=True)

    # Digits, ordinal marks, punctuation and control whitespace -> one space each.
    text = text.str.translate(noise_table)

    # Remove runs of the same letter repeated three or more times.
    text = text.replace(r'([a-z])\1{2,}', ' ', regex=True)

    # Collapse consecutive whitespace into a single space.
    text = text.replace(r'\s+', ' ', regex=True)

    # Remove stop words. A frozenset gives constant-time membership tests;
    # non-string cells (e.g. missing values) are passed through untouched.
    stop_set = frozenset(stop_words)
    text = text.apply(lambda doc: clean_text(doc, stop_set) if isinstance(doc, str) else doc)

    df.loc[:, column] = text

    return df

In [31]:
def remove_duplicates(sentence):
    """Return the distinct space-separated words of ``sentence`` in first-seen order.

    The sentence is split on single spaces, so consecutive spaces yield an
    empty-string token, which is kept once like any other token.

    Returns
    -------
    list of str
        The unique tokens, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order, which avoids the
    # quadratic "is it already in the list" scan.
    return list(dict.fromkeys(sentence.split(" ")))

<h3>Selecting the Departments</h3>

In [32]:
# Departments present in the sampled relevant data, in order of first
# appearance (``unique`` does not sort by frequency), limited to at most
# DEPARTMENTS_QUANTITY_TO_TRAIN entries.
boards = data_relevant['department']
boards_list = list(boards.unique()[:DEPARTMENTS_QUANTITY_TO_TRAIN])
lg.info(f"boards_list: {boards_list}")

2024-02-28 22:36:09.764 | INFO     | __main__:<module>:4 - boards_list: ['DPT_25', 'DPT_3', 'DPT_2']


<h3>Parameters</h3>

In [33]:
from sklearn.metrics import make_scorer
import sklearn.metrics as sk

# Number of cross-validation folds used by the grid search.
CROSS_VALIDATION = 10

# Seed shared by every estimator that accepts one, and by the hold-out split.
RAND_STATE = 42
# Name of the METRICS entry used to pick and refit the best candidate.
REFIT_VAL = 'f1'

# Scorers computed for every hyper-parameter candidate.
METRICS = dict(
    accuracy=make_scorer(sk.accuracy_score),
    precision=make_scorer(sk.precision_score, average = 'weighted', zero_division=0),
    recall=make_scorer(sk.recall_score, average = 'weighted', zero_division=0),
    f1=make_scorer(sk.f1_score, average = 'binary', zero_division=0),
    f1_macro=make_scorer(sk.f1_score, average = 'macro', zero_division=0),
    f1_weighted=make_scorer(sk.f1_score, average = 'weighted', zero_division=0),
)

# Classifier name -> estimator instance plus the grid explored for it.
# NOTE(review): the estimators below receive n_jobs=NUM_PROCESSORS and the grid
# search is given the same value; confirm the nested parallelism is intended.
# NOTE(review): the XGBoost grid lists a regression and a multi-class objective
# for a two-class task, and `silent` may not be a current XGBoost parameter;
# confirm against the installed version.
cls = {
    'random_forest': {
        'estimator': RandomForestClassifier(random_state=RAND_STATE, n_jobs = NUM_PROCESSORS),
        'parameters': {
            'n_estimators': [100, 400, 1000],
            'max_depth': [10,30, 100],
            'criterion':['entropy', 'log_loss', 'gini'],
        },
    },
    'svm': {
        'estimator': SVC(random_state=RAND_STATE),
        'parameters': {
            'C': [0.025, 0.08, 0.1, 0.5, 0.8, 1.0, 2.0, 10.0, 100.0, 500.0, 1000.0],
            'kernel': ['linear','poly','rbf','sigmoid'],
        },
    },
    'xgboost': {
        'estimator': XGBClassifier(random_state=RAND_STATE, num_class=2, verbosity = 0, silent=True, n_jobs=NUM_PROCESSORS),
        'parameters': {
            'objective':['reg:squarederror','binary:logistic','multi:softmax','binary:hinge'],
            'n_estimators': [100,1000],
            'max_depth': [10,30],
            'learning_rate':[0.01,0.5],
        },
    },
    'nb': {
        'estimator': MultinomialNB(),
        'parameters': {
            'alpha':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
            'force_alpha': [True, False],
            'fit_prior': [True, False],
        },
    },
}

In [34]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

def training_each_inductor_holdout(classifiers, board, X, y):
    """Tune every classifier on a training split and log its scores on a held-out split.

    The data is split 60/40 (stratified, seeded with ``RAND_STATE``) once, up
    front. For each entry of ``classifiers`` a ``GridSearchCV`` is run on the
    training part only, so the held-out rows never take part in choosing the
    hyper-parameters. The winning estimator, which the search has already
    refitted on the whole training part, is then scored on the held-out rows.

    Parameters
    ----------
    classifiers : dict
        Maps a classifier name to a dict with the keys ``'estimator'`` and
        ``'parameters'`` (the grid passed as ``param_grid``).
    board : str
        Department label; only used in log messages.
    X : array-like or sparse matrix
        Feature matrix.
    y : array-like
        Class labels aligned with ``X``.

    Returns
    -------
    None
        Results are reported through the ``lg`` logger.
    """
    # The split is the same for every classifier (fixed seed), so do it once,
    # before any model selection takes place.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=RAND_STATE, stratify=y)

    for key, spec in classifiers.items():
        lg.info(f"\n\n####### {key} INDUCTOR EXECUTING ####### - {board}\n\n")

        lg.debug(f"[+] classifiers[key]['estimator']: {spec['estimator']} - {key} - {board}")
        lg.debug(f"[+] classifiers[key]['parameters']: {spec['parameters']} - {key} - {board}")

        search = GridSearchCV(estimator=spec['estimator'],
                            param_grid=spec['parameters'],
                            cv = CROSS_VALIDATION,
                            scoring = METRICS,
                            refit = REFIT_VAL,
                            n_jobs = NUM_PROCESSORS,
                            verbose = VERBOSE)

        lg.debug(f"[+] Start GridSearchCV - {key} - {board}")
        # Search on the training rows only; the test rows stay unseen.
        search.fit(x_train, y_train)
        lg.debug(f"[+] End GridSearchCV - {key} - {board}")

        lg.info(f"search.best_estimator_: {search.best_estimator_} \n")

        # refit=REFIT_VAL already retrained the best candidate on x_train.
        clf = search.best_estimator_

        y_pred = clf.predict(x_test)

        lg.success(f"\n\n####### METRICS {key} - {board} #######\n\n")

        lg.success(f"{key} - {board} - test_precision: {metrics.precision_score(y_test, y_pred)}")

        lg.success(f"{key} - {board} - test_recall: {metrics.recall_score(y_test, y_pred)}")

        lg.success(f"{key} - {board} - test_f1_binary: {metrics.f1_score(y_test, y_pred)}")

        lg.success(f"{key} - {board} - test_f1_macro: {metrics.f1_score(y_test, y_pred, average='macro')}")

<h3>The Training and Evaluation</h3>

In [35]:
# For every selected department: build a balanced relevant/irrelevant dataset,
# clean the text, vectorise it with TF-IDF and train/evaluate every classifier.
for board in boards_list:
    lg.info(f"\n\n####### {board} EXECUTING #######\n\n")
    total_relevant_documents_of_the_board = data_relevant[data_relevant.department == board].shape[0]
    lg.info(f"total_relevant_documents_of_the_board: {total_relevant_documents_of_the_board} - {board}")
    lg.info(f"Total irrelevant documents of the department: {data_irrelevant[data_irrelevant.department == board].shape[0]}")

    data_relevant_board = data_relevant[data_relevant.department == board]
    lg.info(f"data_relevant_board: {data_relevant_board.shape} - {board}")

    # The number of relevant documents (total_relevant_documents_of_the_board) is
    # used to take the same number of leading documents from the irrelevant class.
    data_irrelevant_board = data_irrelevant[data_irrelevant.department == board][:total_relevant_documents_of_the_board]
    lg.info(f"data_irrelevant_board: {data_irrelevant_board.shape} - {board}")

    # Stack both classes into one frame. concat returns a new object, so the
    # in-place cleaning below does not touch data_relevant / data_irrelevant.
    df_final_dataset = pd.concat([data_relevant_board, data_irrelevant_board])

    lg.info(f"df_final_dataset: {df_final_dataset.shape} - {board}")

    # Cleaning the text and title
    lg.debug("[+] Start cleaning text and title")
    df_final_dataset = remove_special_characters(df_final_dataset, 'text')
    df_final_dataset = remove_special_characters(df_final_dataset, 'subject')
    df_final_dataset = cleaning(df_final_dataset, 'text', stop_words)
    df_final_dataset = cleaning(df_final_dataset, 'subject', stop_words)
    lg.debug("[+] End cleaning text and title")

    # Keep only documents with at least three words left after preprocessing.
    # The words are counted by splitting on whitespace; the previous check
    # compared the character length of the text instead.
    df_final_dataset = df_final_dataset[df_final_dataset['text'].str.split().str.len() >= 3]

    data_x = df_final_dataset.text
    lg.debug("[+] Start TF-IDF")

    # NOTE(review): the vectoriser is fitted on every document of the department,
    # i.e. before the hold-out split made during training; confirm this is acceptable.
    feature_extraction = TfidfVectorizer()
    data_x = feature_extraction.fit_transform(data_x)
    lg.debug("[+] End TF-IDF")

    data_y = df_final_dataset['class']

    training_each_inductor_holdout(cls, board, data_x, data_y)

    lg.success("\n\n[+][+][+] SUCCESS! [+][+][+]\n\n")

2024-02-28 22:36:09.790 | INFO     | __main__:<module>:2 - 

####### DPT_25 EXECUTING #######


2024-02-28 22:36:09.792 | INFO     | __main__:<module>:4 - total_relevant_documents_of_the_board: 104 - DPT_25
2024-02-28 22:36:09.794 | INFO     | __main__:<module>:5 - Total irrelevant documents of the department: 104
2024-02-28 22:36:09.795 | INFO     | __main__:<module>:8 - data_relevant_board: (104, 18) - DPT_25
2024-02-28 22:36:09.796 | INFO     | __main__:<module>:12 - data_irrelevant_board: (104, 18) - DPT_25
2024-02-28 22:36:09.799 | INFO     | __main__:<module>:20 - df_final_dataset: (208, 18) - DPT_25
2024-02-28 22:36:09.800 | DEBUG    | __main__:<module>:23 - [+] Start cleaning text and title
2024-02-28 22:36:10.677 | DEBUG    | __main__:<module>:28 - [+] End cleaning text and title
2024-02-28 22:36:10.679 | DEBUG    | __main__:<module>:34 - [+] Start TF-IDF
2024-02-28 22:36:10.815 | DEBUG    | __main__:<module>:38 - [+] End TF-IDF
2024-02-28 22:36:10.816 | INFO     | __main__:tr