In [None]:
# Import libraries
from __future__ import print_function  # a __future__ import must precede every other statement

import io
import json
import os
import string

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import xgboost as xgb
from gensim.models import KeyedVectors
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from transformers import BertTokenizer, BertModel
from xgboost import XGBClassifier

In [None]:
# Function to print dataset rows
def print_str_cells(df, col_name, n_lines, hide_special_chars=False):
    """Print the first ``n_lines`` values of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from.
    col_name : str
        Name of the column whose values are printed.
    n_lines : int
        Number of rows to print. Rows are taken by position, so the function
        works with any index and prints fewer rows when the frame is shorter.
    hide_special_chars : bool, default False
        When False each value is printed through ``repr`` so escape sequences
        (newlines, tabs, ...) stay visible; when True the raw value is printed.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    # Positional slice: exactly n_lines rows (the previous range(0, n_lines + 1)
    # printed one row too many) and no KeyError on short or re-indexed frames.
    selected = df[col_name].iloc[:max(n_lines, 0)]
    for row_label, value in selected.items():
        print("LINE NUMBER: {}".format(row_label))
        print(value if hide_special_chars else repr(value))
        print("================== \n")

In [None]:
# Load File
# The CSV is read from a path relative to the notebook's working directory.
df = pd.read_csv('./dataset_II_pre_process_II.csv')

In [None]:
# Show the DataFrame that was just loaded.
df

# Exploring the dataset

In [None]:
# Show the DataFrame again at the start of the exploration section.
df

In [None]:
# Number of rows per value of the 'Relevante' column
df['Relevante'].value_counts()

In [None]:
# Print the first rows of the `title` column; values go through repr() so escape sequences stay visible
print_str_cells(df, "title", 3)

In [None]:
# Print the first rows of the `content` column; values go through repr() so escape sequences stay visible
print_str_cells(df, "content", 3)

In [None]:
# Distinct values present in the 'Relevante' (target) column
df['Relevante'].unique()

In [None]:
# Horizontal bar chart with the number of posts in each 'Relevante' class
rel_dist = (
    df.groupby('Relevante')
    .size()
    .reset_index()
    .rename(columns={0: 'posts'})
)
rel_dist.plot(kind='barh', x="Relevante", y="posts", figsize=(10,6))
plt.show()

# Preprocessing

In [None]:
# Disabled reference code: according to the note inside, this cleaning was already applied
# upstream (pre_process_II) and the result lives in df['full_text']. It is kept as a string
# purely for documentation. The string is raw (r''') so the regex backslashes inside it do
# not trigger Python's invalid-escape-sequence warnings.
# NOTE(review): the full_text concatenation below joins "content" and "answers" without a
# separating space - confirm that this matches how the stored column was produced before
# re-enabling the code.
r'''
# In this case, data cleaning has already been done in pre_process_II
# The cleaned data is in the column df['full_text']
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from unidecode import unidecode


###############

df['title'] = df['title'].astype(str)
df['content'] = df['content'].astype(str)
df['answers'] = df['answers'].astype(str)

full_text_col = 'full_text'
required_columns = ["title", "content", "answers", full_text_col]
df[full_text_col] = df["title"] + " " + df["content"] + df["answers"]

###############

nltk.download('stopwords')

stop_words = set(stopwords.words('portuguese'))
stop_words.update(["pra","etc", "none", "vai", "ter", "nan", "user", "author", "title", "none",
                    "name", "score", "content", "down", "votes", "created", "comments", "comment",
                    "answercontent", "vote", "type", "points", "aqui", "pode", "sobre", "fazer",
                    "alguem", "tudo", "regular", "coisa", "bem", "vou", "sei", "boca", "algum",
                    "alguns", "alguma", "algo", "nada", "bom", "entao", "acho", "quer", "the",
                    "and", "you", "cara", "coisas", "sim", "ainda", "ver", "usar", "assim",
                    "index"])

# Replace characters with accents with their unaccented equivalents
stop_words_without_accents = set()
for word in stop_words:
    stop_words_without_accents.add(unidecode(word))
stop_words = stop_words_without_accents

def preprocess_text(text):    
    # Replace characters with accents with their unaccented equivalents
    text = unidecode(text)
    # Convert the text to lowercase
    text = text.lower()
    # Replace \\n with a white space
    text = re.sub(r'\s*\\n\s*', ' ', text)
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Remove .onion links
    text = re.sub(r'\S*\.onion\S*', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove the titles that appear in answers
    text = re.sub(r"'title': [^,]*,", ' ', text)
    # Remove the usernames that appear in answers
    text = re.sub(r"'name': [^,]*,", ' ', text)
    # Remove the types that appear in answers 
    text = re.sub(r"'type': [^,]*,", ' ', text)
    # Remove the authors that appear in answers 
    text = re.sub(r"'author': [^,]*,", ' ', text)
    # Remove sequences of kkkk
    text = re.sub(r'k{2,}\S*', ' ', text)
    
    # Remove terms with more than 4 consecutive consonants
    consonants_5m = "([bcdfghjklmnpqrstvwxyz]{5,})"
    text = re.sub(consonants_5m, " ", text)
    # Remove terms with more than 5 consecutive vowels
    vowels_6m = "([aeiou]{6,})"
    text = re.sub(vowels_6m, " ", text)
    
    # Replace non-letter characters with white spaces
    text = re.sub('[^A-Za-z]+', ' ', text)
    # Remove extra white spaces and replace sequences of white spaces with a single white space
    text = re.sub('\s+', ' ', text.strip())
    # Remove stop words
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if len(word) > 2]
    text = ' '.join(words)
    return text
    
# Apply the preprocessing function to df[full_text_col]
df[full_text_col] = df[full_text_col].apply(preprocess_text)

'''

# Features: TF Unigram, TF Bigram, TF-IDF Unigram and TF-IDF Bigram

In [None]:
####
# Column full_text (X_text) - Column Relevant (y)

# Encoder that maps the 'Relevante' values to integer class ids
le = LabelEncoder()
# Text column fed to the TF / TF-IDF vectorizers
X_text = df['full_text']

# Integer-encoded target, one entry per row of df
y = le.fit_transform(df['Relevante'])


In [None]:
# Show the encoded target array
y

In [None]:
# Peek at the first four texts of the corpus as a plain array
teste_exemplo = X_text.iloc[:4]
teste_exemplo.values

# Term Frequency

In [None]:
# Four bag-of-words views of the same corpus; every row is L1-normalised.
# NOTE: each vectorizer is fitted on the whole of X_text, i.e. before any train/test split,
# so the vocabulary (and the IDF weights) also see the rows that later end up in a test set.

# Term frequency only (IDF weighting switched off)
tf_vec_unigram = TfidfVectorizer(norm="l1", use_idf=False, ngram_range=(1, 1))
X_tf_unigram = tf_vec_unigram.fit_transform(X_text)

tf_vec_bigram = TfidfVectorizer(norm="l1", use_idf=False, ngram_range=(2, 2))
X_tf_bigram = tf_vec_bigram.fit_transform(X_text)

# Term frequency weighted by inverse document frequency
tfidf_vec_unigram = TfidfVectorizer(norm="l1", ngram_range=(1, 1))
X_tfidf_unigram = tfidf_vec_unigram.fit_transform(X_text)

tfidf_vec_bigram = TfidfVectorizer(norm="l1", ngram_range=(2, 2))
X_tfidf_bigram = tfidf_vec_bigram.fit_transform(X_text)

In [None]:
#print(X_tf_unigram)
#print(X_tf_bigram)
#print(X_tfidf_unigram)
#print(X_tfidf_bigram)
#X_tfidf_unigram.shape

# Word2vec

In [None]:
# Load the pretrained word vectors (word2vec text format).
# The file location can be overridden with the W2V_PATH environment variable
# (e.g. W2V_PATH=../wiki.pt.vec); the default keeps the original machine-specific
# path so existing runs behave exactly as before.
# NOTE(review): the file name suggests Portuguese Wikipedia vectors even though the
# variable is called `en_model` - the name is kept because later cells use it.
W2V_PATH = os.environ.get('W2V_PATH', '/home/sfilho/wiki.pt.vec')
en_model = KeyedVectors.load_word2vec_format(W2V_PATH)

In [None]:
# Getting the tokens
# Iterating the key -> index mapping yields every vocabulary token in order
words = list(en_model.key_to_index)

In [None]:
# Number of tokens in the embedding vocabulary
len(words)

In [None]:
# Ten sample tokens from the vocabulary (positions 700-709)
words[700:710]

In [None]:
####
# feature (full_text)

# Fresh encoder for the Word2vec pipeline (re-binds the notebook-level `le`)
le = LabelEncoder()
# Same text column that feeds the bag-of-words features
X_w_text = df['full_text']

# Integer-encoded target, one entry per row of df
y_w = le.fit_transform(df['Relevante'])

In [None]:
#X_w[1]
#X_w_text[1]
#X_w_ioc

In [None]:
# DF TRAIN
# Plain Python lists: every text and every encoded label converted to a string
texts = [str(text) for text in X_w_text]
labels = [str(label) for label in y_w]

In [None]:
# Number of rows per value of the 'Relevante' column
df['Relevante'].value_counts()

In [None]:
# First three texts of the list
texts[0:3]

In [None]:
# First three labels (still strings at this point)
labels[0:3]

In [None]:
# Using only the full_text column to create features
# Every document becomes the mean of its token vectors; tokens that are missing from the
# pretrained vocabulary contribute an all-zero vector.
embedding_dim = en_model.vector_size  # read from the model instead of hard-coding 300
X_vec = []
for text in texts:
    token_vectors = []
    for raw_token in text.split():
        token = raw_token.replace("(", "").replace(")", "")
        try:
            token_vectors.append(en_model.get_vector(token))
        except KeyError:
            # Only "token not in vocabulary" is expected here; any other error should surface.
            token_vectors.append(np.zeros(embedding_dim))
    if token_vectors:
        X_vec.append(np.mean(token_vectors, axis=0))
    else:
        # np.mean of an empty list is a scalar NaN, which would corrupt the feature
        # matrix; an empty document is represented by a zero vector instead.
        X_vec.append(np.zeros(embedding_dim))

In [None]:
from sklearn import preprocessing

# Fit an encoder on the string labels and show the classes it found
le = preprocessing.LabelEncoder().fit(labels)
le.classes_

In [None]:
# Replace the string labels by their integer codes.
# NOTE: `labels` is overwritten in place, so this cell is meant to run once after the fit
# cell above; re-running it would pass the integer codes to an encoder fitted on strings.
labels = le.transform(labels)

In [None]:
# First ten encoded labels
labels[0:10]

In [None]:
# Class balance of the encoded labels
pd.Series(labels).value_counts()

In [None]:
# Train/Test Split - Word2vec
# A fixed seed makes the split reproducible across kernel restarts, and stratifying on the
# labels keeps the class ratio the same in the train and test parts.
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_vec, labels, test_size=.2, random_state=42, stratify=labels
)

In [None]:
# Sizes of the four Word2vec split parts, one per line: train X, test X, train y, test y
print(len(X_train_w2v), len(X_test_w2v), len(y_train_w2v), len(y_test_w2v), sep="\n")

In [None]:
#X_test_w2v[5]

In [None]:
# Wrap the Word2vec feature lists in DataFrames
X_train_w2v, X_test_w2v = pd.DataFrame(X_train_w2v), pd.DataFrame(X_test_w2v)

In [None]:
# (rows, columns) of the Word2vec training features
X_train_w2v.shape

In [None]:
# (rows, columns) of the Word2vec test features
X_test_w2v.shape

In [None]:
#labels[320:340]
#X_vec[12]

# Train/Test Split - TF Unigram - TF Bigram - TF-IDF Unigram - TF-IDF Bigram

In [None]:
# Train/Test Split - TF Unigram - TF Bigram - TF-IDF Unigram - TF-IDF Bigram
# All four matrices hold one row per document in the same order. Splitting each of them with
# the same seed and the same stratification labels sends identical documents to every test
# set, which keeps the feature sets comparable and makes the run reproducible.
split_kwargs = dict(test_size=.2, random_state=42, stratify=y)

X_train_tf_unigram, X_test_tf_unigram, y_train_tf_unigram, y_test_tf_unigram = train_test_split(
    X_tf_unigram, y, **split_kwargs)
X_train_tf_bigram, X_test_tf_bigram, y_train_tf_bigram, y_test_tf_bigram = train_test_split(
    X_tf_bigram, y, **split_kwargs)
X_train_tfidf_unigram, X_test_tfidf_unigram, y_train_tfidf_unigram, y_test_tfidf_unigram = train_test_split(
    X_tfidf_unigram, y, **split_kwargs)
X_train_tfidf_bigram, X_test_tfidf_bigram, y_train_tfidf_bigram, y_test_tfidf_bigram = train_test_split(
    X_tfidf_bigram, y, **split_kwargs)

In [None]:
# Row counts of the train and test part of every bag-of-words split
for heading, train_part, test_part in (
    ("TF Unigram - Treino:", X_train_tf_unigram, X_test_tf_unigram),
    ("TF Bigram - Treino:", X_train_tf_bigram, X_test_tf_bigram),
    ("TF-IDF Unigram - Treino:", X_train_tfidf_unigram, X_test_tfidf_unigram),
    ("TF-IDF Bigram - Treino:", X_train_tfidf_bigram, X_test_tfidf_bigram),
):
    print(heading, train_part.shape[0], "Teste:", test_part.shape[0])


In [None]:
# Samples per encoded class (np.bincount) in every split, reported train first, then test

train_classes_tf_unigram = np.bincount(y_train_tf_unigram)
test_classes_tf_unigram = np.bincount(y_test_tf_unigram)
print("TF Unigram - Treino:", train_classes_tf_unigram)
print("TF Unigram - Teste:", test_classes_tf_unigram)

train_classes_tf_bigram = np.bincount(y_train_tf_bigram)
test_classes_tf_bigram = np.bincount(y_test_tf_bigram)
print("TF Bigram - Treino:", train_classes_tf_bigram)
print("TF Bigram - Teste:", test_classes_tf_bigram)

train_classes_tfidf_unigram = np.bincount(y_train_tfidf_unigram)
test_classes_tfidf_unigram = np.bincount(y_test_tfidf_unigram)
print("TF-IDF Unigram - Treino:", train_classes_tfidf_unigram)
print("TF-IDF Unigram - Teste:", test_classes_tfidf_unigram)

train_classes_tfidf_bigram = np.bincount(y_train_tfidf_bigram)
test_classes_tfidf_bigram = np.bincount(y_test_tfidf_bigram)
print("TF-IDF Bigram - Treino:", train_classes_tfidf_bigram)
print("TF-IDF Bigram - Teste:", test_classes_tfidf_bigram)

# Support Vector Machines

In [None]:
## USE TQDM - SVM
from tqdm import tqdm
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Values of the regularisation parameter C explored by the grid search
param_grid = [
  {'C': [1, 10, 100, 1000]}
 ]
# Base estimator handed to every grid search in this section
svc = LinearSVC(max_iter=1200000)


# Helper that runs one GridSearchCV inside a tqdm progress bar
def grid_search_with_tqdm(clf, X, y, grid=None):
    """Fit a GridSearchCV for ``clf`` on ``(X, y)`` and return the fitted search.

    The progress bar has a single step that completes when the whole search has
    finished; it does not report per-candidate or per-fold progress.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    X, y
        Training features and labels forwarded to ``GridSearchCV.fit``.
    grid : dict or list of dict, optional
        Parameter grid to search. When omitted, the notebook-level ``param_grid``
        is used, which is what the function did before this argument existed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_grid = param_grid if grid is None else grid
    # total=1: one bar step per search, independent of how the grid is written
    with tqdm(total=1, desc="GridSearchCV") as pbar:
        search = GridSearchCV(clf, search_grid)
        search.fit(X, y)
        pbar.update(1)
    return search


# One grid search per feature set on its training part, followed by predictions for its test part

clf_svm_tf_unigram = grid_search_with_tqdm(svc, X_train_tf_unigram, y_train_tf_unigram)
clf_svm_tf_bigram = grid_search_with_tqdm(svc, X_train_tf_bigram, y_train_tf_bigram)
clf_svm_tfidf_unigram = grid_search_with_tqdm(svc, X_train_tfidf_unigram, y_train_tfidf_unigram)
clf_svm_tfidf_bigram = grid_search_with_tqdm(svc, X_train_tfidf_bigram, y_train_tfidf_bigram)
clf_svm_w2v = grid_search_with_tqdm(svc, X_train_w2v, y_train_w2v)

# Predictions of every fitted search on the matching held-out features
(
    svm_y_pred_tf_unigram,
    svm_y_pred_tf_bigram,
    svm_y_pred_tfidf_unigram,
    svm_y_pred_tfidf_bigram,
    svm_y_pred_w2v,
) = (
    clf_svm_tf_unigram.predict(X_test_tf_unigram),
    clf_svm_tf_bigram.predict(X_test_tf_bigram),
    clf_svm_tfidf_unigram.predict(X_test_tfidf_unigram),
    clf_svm_tfidf_bigram.predict(X_test_tfidf_bigram),
    clf_svm_w2v.predict(X_test_w2v),
)

In [None]:
# SVM on TF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_unigram, svm_y_pred_tf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='SVM TF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('SVM_TF_Unigram.png', dpi=500)
print(classification_report(y_test_tf_unigram, svm_y_pred_tf_unigram))

In [None]:
# SVM on TF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_bigram, svm_y_pred_tf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='SVM TF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('SVM_TF_Bigram.png', dpi=500)
print(classification_report(y_test_tf_bigram, svm_y_pred_tf_bigram))

In [None]:
# SVM on TF-IDF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_unigram, svm_y_pred_tfidf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='SVM TF-IDF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('SVM_TF-IDF_Unigram.png', dpi=500)
print(classification_report(y_test_tfidf_unigram, svm_y_pred_tfidf_unigram))

In [None]:
# SVM on TF-IDF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_bigram, svm_y_pred_tfidf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='SVM TF-IDF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('SVM_TF-IDF_Bigram.png', dpi=500)
print(classification_report(y_test_tfidf_bigram, svm_y_pred_tfidf_bigram))

In [None]:
# SVM on Word2vec features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_w2v, svm_y_pred_w2v)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='SVM W2V', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('SVM_W2V.png', dpi=500)
print(classification_report(y_test_w2v, svm_y_pred_w2v))

# Random Forest

In [None]:
# Random Forest
# Same settings for every feature set: 100 trees and a fixed seed
rf_settings = dict(n_estimators=100, random_state=42)

clf_rf_tf_unigram = RandomForestClassifier(**rf_settings).fit(X_train_tf_unigram, y_train_tf_unigram)
clf_rf_tf_bigram = RandomForestClassifier(**rf_settings).fit(X_train_tf_bigram, y_train_tf_bigram)
clf_rf_tfidf_unigram = RandomForestClassifier(**rf_settings).fit(X_train_tfidf_unigram, y_train_tfidf_unigram)
clf_rf_tfidf_bigram = RandomForestClassifier(**rf_settings).fit(X_train_tfidf_bigram, y_train_tfidf_bigram)
clf_rf_w2v = RandomForestClassifier(**rf_settings).fit(X_train_w2v, y_train_w2v)

# Predictions of every forest on the matching held-out features
(
    rf_y_pred_tf_unigram,
    rf_y_pred_tf_bigram,
    rf_y_pred_tfidf_unigram,
    rf_y_pred_tfidf_bigram,
    rf_y_pred_w2v,
) = (
    clf_rf_tf_unigram.predict(X_test_tf_unigram),
    clf_rf_tf_bigram.predict(X_test_tf_bigram),
    clf_rf_tfidf_unigram.predict(X_test_tfidf_unigram),
    clf_rf_tfidf_bigram.predict(X_test_tfidf_bigram),
    clf_rf_w2v.predict(X_test_w2v),
)

In [None]:
# Random Forest on TF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_unigram, rf_y_pred_tf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='RF TF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('RF_TF_Unigram.png', dpi=500)
print(classification_report(y_test_tf_unigram, rf_y_pred_tf_unigram))

In [None]:
# Random Forest on TF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_bigram, rf_y_pred_tf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='RF TF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('RF_TF_Bigram.png', dpi=500)
print(classification_report(y_test_tf_bigram, rf_y_pred_tf_bigram))

In [None]:
# Random Forest on TF-IDF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_unigram, rf_y_pred_tfidf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='RF TF-IDF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('RF_TF-IDF_Unigram.png', dpi=500)
print(classification_report(y_test_tfidf_unigram, rf_y_pred_tfidf_unigram))

In [None]:
# Random Forest on TF-IDF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_bigram, rf_y_pred_tfidf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='RF TF-IDF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('RF_TF-IDF_Bigram.png', dpi=500)
print(classification_report(y_test_tfidf_bigram, rf_y_pred_tfidf_bigram))

In [None]:
# Random Forest on Word2vec features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_w2v, rf_y_pred_w2v)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='RF W2V', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('RF_W2V.png', dpi=500)
print(classification_report(y_test_w2v, rf_y_pred_w2v))

# Logistic Regression

In [None]:
# Logistic Regression
# Same settings for every feature set: C=1000, up to 1000 solver iterations, fixed seed
lr_settings = dict(C=1000, max_iter=1000, random_state=42)

clf_lr_tf_unigram = LogisticRegression(**lr_settings).fit(X_train_tf_unigram, y_train_tf_unigram)
clf_lr_tf_bigram = LogisticRegression(**lr_settings).fit(X_train_tf_bigram, y_train_tf_bigram)
clf_lr_tfidf_unigram = LogisticRegression(**lr_settings).fit(X_train_tfidf_unigram, y_train_tfidf_unigram)
clf_lr_tfidf_bigram = LogisticRegression(**lr_settings).fit(X_train_tfidf_bigram, y_train_tfidf_bigram)
clf_lr_w2v = LogisticRegression(**lr_settings).fit(X_train_w2v, y_train_w2v)

# Predictions of every model on the matching held-out features
(
    lr_y_pred_tf_unigram,
    lr_y_pred_tf_bigram,
    lr_y_pred_tfidf_unigram,
    lr_y_pred_tfidf_bigram,
    lr_y_pred_w2v,
) = (
    clf_lr_tf_unigram.predict(X_test_tf_unigram),
    clf_lr_tf_bigram.predict(X_test_tf_bigram),
    clf_lr_tfidf_unigram.predict(X_test_tfidf_unigram),
    clf_lr_tfidf_bigram.predict(X_test_tfidf_bigram),
    clf_lr_w2v.predict(X_test_w2v),
)

In [None]:
# Logistic Regression on TF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_unigram, lr_y_pred_tf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LR TF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LR_TF_Unigram.png', dpi=500)
print(classification_report(y_test_tf_unigram, lr_y_pred_tf_unigram))

In [None]:
# Logistic Regression on TF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_bigram, lr_y_pred_tf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LR TF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LR_TF_Bigram.png', dpi=500)
print(classification_report(y_test_tf_bigram, lr_y_pred_tf_bigram))

In [None]:
# Logistic Regression on TF-IDF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_unigram, lr_y_pred_tfidf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LR TF-IDF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LR_TF-IDF_Unigram.png', dpi=500)
print(classification_report(y_test_tfidf_unigram, lr_y_pred_tfidf_unigram))

In [None]:
# Logistic Regression on TF-IDF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_bigram, lr_y_pred_tfidf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LR TF-IDF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LR_TF-IDF_Bigram.png', dpi=500)
print(classification_report(y_test_tfidf_bigram, lr_y_pred_tfidf_bigram))

In [None]:
# Logistic Regression on Word2vec features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_w2v, lr_y_pred_w2v)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LR W2V', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LR_W2V.png', dpi=500)
print(classification_report(y_test_w2v, lr_y_pred_w2v))

# LightGBM

In [None]:
# LightGBM
# Booster settings shared by the five models
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Inputs: the TF / TF-IDF features and labels are cast to float64 ...
lgb_X_train_tf_unigram = X_train_tf_unigram.astype(np.float64)
lgb_X_test_tf_unigram = X_test_tf_unigram.astype(np.float64)
lgb_y_train_tf_unigram = y_train_tf_unigram.astype(np.float64)
lgb_y_test_tf_unigram = y_test_tf_unigram.astype(np.float64)

lgb_X_train_tf_bigram = X_train_tf_bigram.astype(np.float64)
lgb_X_test_tf_bigram = X_test_tf_bigram.astype(np.float64)
lgb_y_train_tf_bigram = y_train_tf_bigram.astype(np.float64)
lgb_y_test_tf_bigram = y_test_tf_bigram.astype(np.float64)

lgb_X_train_tfidf_unigram = X_train_tfidf_unigram.astype(np.float64)
lgb_X_test_tfidf_unigram = X_test_tfidf_unigram.astype(np.float64)
lgb_y_train_tfidf_unigram = y_train_tfidf_unigram.astype(np.float64)
lgb_y_test_tfidf_unigram = y_test_tfidf_unigram.astype(np.float64)

lgb_X_train_tfidf_bigram = X_train_tfidf_bigram.astype(np.float64)
lgb_X_test_tfidf_bigram = X_test_tfidf_bigram.astype(np.float64)
lgb_y_train_tfidf_bigram = y_train_tfidf_bigram.astype(np.float64)
lgb_y_test_tfidf_bigram = y_test_tfidf_bigram.astype(np.float64)

# ... while the Word2vec features and labels are passed through unchanged
lgb_X_train_w2v = X_train_w2v
lgb_X_test_w2v  = X_test_w2v
lgb_y_train_w2v = y_train_w2v
lgb_y_test_w2v = y_test_w2v

# Training: 100 boosting rounds per feature set (train_data is re-bound for each one)
train_data = lgb.Dataset(lgb_X_train_tf_unigram, label=lgb_y_train_tf_unigram)
clf_lgb_tf_unigram = lgb.train(params, train_data, num_boost_round=100)

train_data = lgb.Dataset(lgb_X_train_tf_bigram, label=lgb_y_train_tf_bigram)
clf_lgb_tf_bigram = lgb.train(params, train_data, num_boost_round=100)

train_data = lgb.Dataset(lgb_X_train_tfidf_unigram, label=lgb_y_train_tfidf_unigram)
clf_lgb_tfidf_unigram = lgb.train(params, train_data, num_boost_round=100)

train_data = lgb.Dataset(lgb_X_train_tfidf_bigram, label=lgb_y_train_tfidf_bigram)
clf_lgb_tfidf_bigram = lgb.train(params, train_data, num_boost_round=100)

train_data = lgb.Dataset(lgb_X_train_w2v, label=lgb_y_train_w2v)
clf_lgb_w2v = lgb.train(params, train_data, num_boost_round=100)

# Scoring: predict() output is continuous; later cells round it to class labels
(
    lgb_y_pred_tf_unigram,
    lgb_y_pred_tf_bigram,
    lgb_y_pred_tfidf_unigram,
    lgb_y_pred_tfidf_bigram,
    lgb_y_pred_w2v,
) = (
    clf_lgb_tf_unigram.predict(lgb_X_test_tf_unigram),
    clf_lgb_tf_bigram.predict(lgb_X_test_tf_bigram),
    clf_lgb_tfidf_unigram.predict(lgb_X_test_tfidf_unigram),
    clf_lgb_tfidf_bigram.predict(lgb_X_test_tfidf_bigram),
    clf_lgb_w2v.predict(lgb_X_test_w2v),
)


In [None]:
# Turn the continuous LightGBM outputs into integer class labels by rounding.
# The variable is overwritten in place; re-running only rounds already-integer values again.
lgb_y_pred_tf_unigram = np.round(lgb_y_pred_tf_unigram).astype(int)


In [None]:
# LightGBM on TF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(lgb_y_test_tf_unigram, lgb_y_pred_tf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LGB TF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LGB_TF_Unigram.png', dpi=500)
print(classification_report(lgb_y_test_tf_unigram, lgb_y_pred_tf_unigram))

In [None]:
# Turn the continuous LightGBM outputs into integer class labels by rounding.
# The variable is overwritten in place; re-running only rounds already-integer values again.
lgb_y_pred_tf_bigram = np.round(lgb_y_pred_tf_bigram).astype(int)

In [None]:
# LightGBM on TF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(lgb_y_test_tf_bigram, lgb_y_pred_tf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LGB TF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LGB_TF_Bigram.png', dpi=500)
print(classification_report(lgb_y_test_tf_bigram, lgb_y_pred_tf_bigram))

In [None]:
# Turn the continuous LightGBM outputs into integer class labels by rounding.
# The variable is overwritten in place; re-running only rounds already-integer values again.
lgb_y_pred_tfidf_unigram = np.round(lgb_y_pred_tfidf_unigram).astype(int)

In [None]:
# LightGBM on TF-IDF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(lgb_y_test_tfidf_unigram, lgb_y_pred_tfidf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LGB TF-IDF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LGB_TF-IDF_Unigram.png', dpi=500)
print(classification_report(lgb_y_test_tfidf_unigram, lgb_y_pred_tfidf_unigram))

In [None]:
# Turn the continuous LightGBM outputs into integer class labels by rounding.
# The variable is overwritten in place; re-running only rounds already-integer values again.
lgb_y_pred_tfidf_bigram = np.round(lgb_y_pred_tfidf_bigram).astype(int)

In [None]:
# LightGBM on TF-IDF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(lgb_y_test_tfidf_bigram, lgb_y_pred_tfidf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LGB TF-IDF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LGB_TF-IDF_Bigram.png', dpi=500)
print(classification_report(lgb_y_test_tfidf_bigram, lgb_y_pred_tfidf_bigram))

In [None]:
# Turn the continuous LightGBM outputs into integer class labels by rounding.
# The variable is overwritten in place; re-running only rounds already-integer values again.
lgb_y_pred_w2v = np.round(lgb_y_pred_w2v).astype(int)

In [None]:
# LightGBM on Word2vec features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(lgb_y_test_w2v, lgb_y_pred_w2v)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='LGB W2V', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('LGB_W2V.png', dpi=500)
print(classification_report(lgb_y_test_w2v, lgb_y_pred_w2v))

# XGBoost

In [None]:
# XGBoost

# One shared hyper-parameter set for the five classifiers (it used to be copy-pasted five
# times). The deprecated argument names are replaced by their current equivalents in the
# scikit-learn wrapper: silent=True -> verbosity=0, nthread -> n_jobs, seed -> random_state.
xgb_params = dict(
    max_depth=9,
    learning_rate=0.1,
    n_estimators=2245,
    verbosity=0,
    n_jobs=-1,
    missing=np.nan,
    objective='binary:logistic',
    gamma=0.0,
    min_child_weight=1,
    max_delta_step=2,
    subsample=0.39,
    colsample_bytree=0.53,
    base_score=0.5,
    random_state=395277,
)

# A separate estimator instance per feature set
clf_xgb_tf_unigram = XGBClassifier(**xgb_params)
clf_xgb_tf_bigram = XGBClassifier(**xgb_params)
clf_xgb_tfidf_unigram = XGBClassifier(**xgb_params)
clf_xgb_tfidf_bigram = XGBClassifier(**xgb_params)
clf_xgb_w2v = XGBClassifier(**xgb_params)
###

clf_xgb_tf_unigram.fit(X_train_tf_unigram, y_train_tf_unigram)
clf_xgb_tf_bigram.fit(X_train_tf_bigram, y_train_tf_bigram)
clf_xgb_tfidf_unigram.fit(X_train_tfidf_unigram, y_train_tfidf_unigram)
clf_xgb_tfidf_bigram.fit(X_train_tfidf_bigram, y_train_tfidf_bigram)
clf_xgb_w2v.fit(X_train_w2v, y_train_w2v)


# Class predictions on the matching held-out features
xgb_y_pred_tf_unigram = clf_xgb_tf_unigram.predict(X_test_tf_unigram)
xgb_y_pred_tf_bigram = clf_xgb_tf_bigram.predict(X_test_tf_bigram)
xgb_y_pred_tfidf_unigram = clf_xgb_tfidf_unigram.predict(X_test_tfidf_unigram)
xgb_y_pred_tfidf_bigram = clf_xgb_tfidf_bigram.predict(X_test_tfidf_bigram)
xgb_y_pred_w2v = clf_xgb_w2v.predict(X_test_w2v)

In [None]:
# XGBoost on TF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_unigram, xgb_y_pred_tf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='XGB TF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('XGB_TF_Unigram.png', dpi=500)
print(classification_report(y_test_tf_unigram, xgb_y_pred_tf_unigram))

In [None]:
# XGBoost on TF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tf_bigram, xgb_y_pred_tf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='XGB TF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('XGB_TF_Bigram.png', dpi=500)
print(classification_report(y_test_tf_bigram, xgb_y_pred_tf_bigram))

In [None]:
# XGBoost on TF-IDF unigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_unigram, xgb_y_pred_tfidf_unigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='XGB TF-IDF Unigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('XGB_TF-IDF_Unigram.png', dpi=500)
print(classification_report(y_test_tfidf_unigram, xgb_y_pred_tfidf_unigram))

In [None]:
# XGBoost on TF-IDF bigram features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_tfidf_bigram, xgb_y_pred_tfidf_bigram)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='XGB TF-IDF Bigram', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('XGB_TF-IDF_Bigram.png', dpi=500)
print(classification_report(y_test_tfidf_bigram, xgb_y_pred_tfidf_bigram))

In [None]:
# XGBoost on Word2vec features: confusion-matrix heatmap (saved as PNG) and per-class report
# NOTE(review): tick labels assume encoded class 0 is 'NÃO' and 1 is 'SIM' - confirm with le.classes_
target_names = ['NÃO', 'SIM']
cf_matrix = metrics.confusion_matrix(y_test_w2v, xgb_y_pred_w2v)
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')
ax.set(title='XGB W2V', xlabel='Valores previstos', ylabel='Valores reais')
ax.set_xticklabels(target_names)
ax.set_yticklabels(target_names)
figure = ax.figure
figure.savefig('XGB_W2V.png', dpi=500)
print(classification_report(y_test_w2v, xgb_y_pred_w2v))

# Save the Best Model for Use in New Posts

In [None]:
import pickle

# Persist the LightGBM booster trained on the TF-IDF unigram features so that it can be
# loaded later to score new posts
with open('modelo_lgb_tfidf_unigram_ptbr.pkl', 'wb') as model_file:
    pickle.dump(clf_lgb_tfidf_unigram, model_file)

In [None]:
# Persist the fitted TF-IDF unigram vectorizer next to the model: new posts must be
# transformed with the same vocabulary before they are scored
with open('tfidf_vec_unigram_ptbr.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vec_unigram, vectorizer_file)