# Instalação dos pacotes Python necessários
Primeiramente, é preciso instalar o sklearn e o pandas

In [1]:
!pip install scikit-learn



In [2]:
!pip install pandas



# Carregamento dos dados e vetorização
Após isso, carrega-se os dados da base **classic4.csv** e o texto dessa base é vetorizada.


In [30]:
import utils
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import linear_model
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
# lendo csv
df = utils.read_csv("classic4.csv")
# divide a base de dados em 80% train e 20% test
x_train, x_test = train_test_split(df, test_size=0.2, stratify=df['class'])

categories_on_df = list(set(df['class']))   # pega todas as categorias da base de dados
dict_cats = {y: x for x, y in enumerate(categories_on_df)}
lista_cats = list(dict_cats.keys())
print(lista_cats)
print(f"Numero de classes: {len(lista_cats)}")

x_train_tf, x_test_tf = utils.occ_vectorizer(x_train['text'], x_test['text'])

['med', 'cacm', 'cisi', 'cran']
Numero de classes: 4


# TFIDF
Após a vetorização, conta-se o número de ocorrências de cada termo em um documento em relação a toda a base de dados. Essa medida estatística é conhecida como **TFIDF** (Term Frequency-Inverse Document Frequency)

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_t = TfidfTransformer()
xtrain_tfidf, xtest_tfidf = utils.tfidif(x_train_tf, x_test_tf)

y_train = utils.get_ytrain(x_train['class'], dict_cats)
y_test = utils.get_ytest(x_test['class'], dict_cats)

# Melhores resultados do Grid Search
Iremos utilizar os melhores hiperparâmetros definidos pelo GridSearch para cada um desses três modelos: Multinomial Naive Bayes, Random Forest e Logistic Regression

In [32]:
csv_file_path = "Grid_Search_Results.csv"

df = pd.read_csv(csv_file_path)

df

Unnamed: 0,Model,Best Score,Best Parameters,Test Set f1_macro
0,Random Forest,0.930299,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",0.945736
1,Logistic Regression,0.958253,"{'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}",0.965469
2,Naive Bayes,0.961453,"{'alpha': 0.1, 'fit_prior': True, 'force_alpha...",0.967583


# Multinomial Naive Bayes

In [33]:
mnb = MultinomialNB(alpha =  0.1, fit_prior = True, force_alpha = True).fit(xtrain_tfidf, y_train)
predicted = mnb.predict(xtest_tfidf)

print(f"Accuracy: {metrics.accuracy_score(y_test, predicted)}")
print(confusion_matrix(y_test, predicted))
print(f"F1 Score Macro: {f1_score(y_test, predicted, average='macro')}")
print(f"F1 Score Micro: {f1_score(y_test, predicted, average='micro')}")

Accuracy: 0.9548978153629316
[[200   0   6   0]
 [  0 604  34   3]
 [  0  12 280   0]
 [  0   7   2 271]]
F1 Score Macro: 0.9578273362132105
F1 Score Micro: 0.9548978153629316


# Logistic Regression


In [34]:
lreg = linear_model.LogisticRegression(C=1.0, penalty = 'l2', solver = 'saga')
lreg.fit(xtrain_tfidf, y_train)
predicted = lreg.predict(xtest_tfidf)

print(f"Accuracy: {metrics.accuracy_score(y_test, predicted)}")
print(confusion_matrix(y_test, predicted))
print(f"F1 Score Macro: {f1_score(y_test, predicted, average='macro')}")
print(f"F1 Score Micro: {f1_score(y_test, predicted, average='micro')}")

Accuracy: 0.9541930937279774
[[198   2   6   0]
 [  0 630  11   0]
 [  0  28 264   0]
 [  0  15   3 262]]
F1 Score Macro: 0.955275290718991
F1 Score Micro: 0.9541930937279774


# Random Forest

In [35]:
random_f = RandomForestClassifier(n_estimators=200, max_depth = None, min_samples_leaf = 1, min_samples_split = 10).fit(xtrain_tfidf, y_train)
predicted = random_f.predict(xtest_tfidf)

print(f"Accuracy: {metrics.accuracy_score(y_test, predicted)}")
print(confusion_matrix(y_test, predicted))
print(f"F1 Score Macro: {f1_score(y_test, predicted, average='macro')}")
print(f"F1 Score Micro: {f1_score(y_test, predicted, average='micro')}")

Accuracy: 0.9203664552501761
[[180  17   8   1]
 [  0 629  12   0]
 [  0  46 246   0]
 [  0  25   4 251]]
F1 Score Macro: 0.9195146617392481
F1 Score Micro: 0.9203664552501761
