In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [3]:
from tqdm import tqdm
import string
import textblob
from xgboost import XGBClassifier


In [4]:
from tensorflow import keras
from tensorflow.keras import models, optimizers
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.layers import *
# from tensorflow.keras.layers import LSTM, Convolution1D, GRU, Dense, Dropout, Input, Embedding, SpatialDropout1D, Bidirectional

2023-11-12 19:47:56.813154: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-12 19:47:56.896622: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-12 19:47:56.898735: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
df = pd.DataFrame()
for n in range(5):
    df = pd.concat([
        df,
        pd.read_parquet(f'../../dataset/processed/artigos_de_partidos/artigos_partidos_nan.parquet')
    ])

In [6]:
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(df["Conteudo"], df["Vies"], stratify = df["Vies"])

In [7]:
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [8]:
count_vect = CountVectorizer(analyzer = "word", token_pattern = r'\w{1,}')
count_vect.fit(df["Conteudo"])
X_train_count =  count_vect.transform(X_train)
X_valid_count =  count_vect.transform(X_valid)

In [9]:
tfidf_vect = TfidfVectorizer(analyzer = "word", max_features = 100)
tfidf_vect.fit(df["Conteudo"])
X_train_tfidf = tfidf_vect.transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)

In [10]:
tfidf_vect_ngram = TfidfVectorizer(analyzer = "word", ngram_range = (1,3), max_features = 100)
tfidf_vect_ngram.fit(df["Conteudo"])
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_valid_tfidf_ngram =  tfidf_vect_ngram.transform(X_valid)

In [12]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer = "char", ngram_range = (1,3), max_features = 100)
tfidf_vect_ngram_chars.fit(df["Conteudo"])
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_valid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_valid) 

In [13]:
token = text.Tokenizer()
token.fit_on_texts(df["Conteudo"])
word_index = token.word_index

In [14]:
X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=150)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid), maxlen=150)

In [15]:
def train_model(model, X_train, y_train, X_valid, is_neural):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    if is_neural:
        return metrics.accuracy_score(np.round(y_pred), y_valid)
    else:
        return metrics.accuracy_score(y_pred, y_valid)

In [16]:
model = LogisticRegression(solver='liblinear')

accuracy = train_model(
    model, 
    X_train_count, 
    y_train, 
    X_valid_count, 
    False
)
print ("LR, Count Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf, 
    False
)
print ("LR, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram, 
    False
)
print ("LR, N-Gram Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram_chars, 
    y_train, 
    X_valid_tfidf_ngram_chars, 
    False
)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.9996585165960934
LR, WordLevel TF-IDF:  0.8947548149159951
LR, N-Gram Vectors:  0.8970086053817784
LR, CharLevel Vectors:  0.7688840322360333
