In [22]:
import json

# Load the labelled training set. The context manager releases the file handle
# even when json.load raises on malformed input, and the explicit encoding keeps
# decoding independent of the platform's locale default.
with open("./training_data_set.json", encoding="utf-8") as data_file:
    raw_data = json.load(data_file)

# "sitesAndTags" is iterated further down as (url, usefulness) pairs; fall back
# to an empty list when the key is absent.
sites = raw_data.get("sitesAndTags", [])
print("Quantidade total de sites:", len(sites))

Quantidade total de sites: 200


In [23]:
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup

def remove_tags(html):
    """Return the text content of an HTML document as one space-separated string.

    The markup is parsed with BeautifulSoup's "html.parser" backend, every
    <style> and <script> element is removed together with its contents, and the
    remaining whitespace-stripped text fragments are joined with single spaces.
    """
    document = BeautifulSoup(html, "html.parser")

    # Calling a soup object directly is shorthand for find_all().
    for element in document.find_all(['style', 'script']):
        element.decompose()

    text_fragments = document.stripped_strings
    return ' '.join(text_fragments)

# Seconds to wait on a single server before giving up on that page.
REQUEST_TIMEOUT_SECONDS = 30

sites_contents = []
y_sites = []

for url, usefulness in sites:
    try:
        # Without a timeout, requests can wait indefinitely on an unresponsive host.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # Skip the whole sample (text and label) so both lists stay aligned,
        # instead of aborting the entire crawl on one unreachable site.
        print("Skipping", url, "-", error)
        continue

    sites_contents.append(remove_tags(response.content))
    y_sites.append(usefulness)
    print(url)

https://www.bigboygames.com.br/jogo-fallen-legion-revenants-vanguard-edition-nintendo-switch
https://www.bigboygames.com.br/world-of-warriors-ps4-4955-p993556
https://www.bigboygames.com.br/jogo-monster-hunter-world-seminovo-ps4-11807-br-p1006227
https://www.bigboygames.com.br/jogo-back-4-blood-xbox-one
https://www.bigboygames.com.br/jogo-far-cry-6-xbox-one-p1006787
https://www.bigboygames.com.br/controle-baseus-sem-fio-transparente-seminovo-nintendo-switch
https://www.bigboygames.com.br/cartao-xbox-live-brasil-r200-5598-p995690
https://www.bigboygames.com.br/case-zelda-botw-seminovo-nintendo-switch-lite-15816
https://www.bigboygames.com.br/console-nintendo-switch-lite-amarelo-seminovo-16101
https://www.bigboygames.com.br/case-protetora-nintendo-swtich-lite-cinza-amarelo-14940
https://www.ibyte.com.br/jogo-hades-xbox/p
https://www.ibyte.com.br/marvel-s-spider-man-miles-morales-ps5/p
https://www.ibyte.com.br/game-fifa-18-xbox-one/p
https://www.shockgames.com.br/produto/horizon-zero-dawn

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support

In [25]:
from sklearn.model_selection import train_test_split

# Split the scraped texts and their labels into training and held-out test
# parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sites_contents,
    y_sites,
    random_state=1337,
)

Otimização do modelo SVM

In [26]:
# Token counts -> TF-IDF weighting -> linear classifier trained with SGD
# (hinge loss, L2 penalty).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=1337,
            max_iter=5,
            tol=None,
        ),
    ),
])

# Candidate hyper-parameters, keyed as "<pipeline step>__<parameter>".
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__min_df=(0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2),
    vect__max_features=range(500, 10000, 500),
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

# Exhaustive search with 5-fold cross-validation on all available cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [27]:
# Run the 5-fold cross-validated grid search over `parameters` on the training split.
gs_clf.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        SGDClassifier(alpha=0.001, max_iter=5,
                                                      random_state=1337,
                                                      tol=None))]),
             n_jobs=-1,
             param_grid={'clf__alpha': (0.01, 0.001),
                         'tfidf__use_idf': (True, False),
                         'vect__max_features': range(500, 10000, 500),
                         'vect__min_df': (0.001, 0.005, 0.01, 0.05, 0.1, 0.15,
                                          0.2),
                         'vect__ngram_range': [(1, 1), (1, 2)]})

In [28]:
# Best mean cross-validation score reached by any parameter combination in the search.
gs_clf.best_score_

0.7

In [29]:
# Show the winning value of every hyper-parameter that was searched.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

print("Stats for optimized svm")

# Score the refitted best pipeline on the held-out test split; each metric is
# reported per label, in the order given by `labels`.
pred = gs_clf.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=pred,
    labels=[True, False],
)
print("Precision:", precision)
print("recall:", recall)
print("f1:", f1)
print("===============================================================")

clf__alpha: 0.001
tfidf__use_idf: True
vect__max_features: 1500
vect__min_df: 0.05
vect__ngram_range: (1, 1)


Testing for Gaussian Naive Bayes

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import FunctionTransformer
import numpy as np


def to_dense(matrix):
    """Convert a SciPy sparse matrix into a dense NumPy array."""
    return matrix.toarray()


gnb_text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # GaussianNB only accepts dense arrays, while the two text steps above emit
    # a sparse matrix; without this step every fit in the search fails.
    ('dense', FunctionTransformer(to_dense, accept_sparse=True)),
    ('clf', GaussianNB()),
])

# Candidate hyper-parameters, keyed as "<pipeline step>__<parameter>".
# NOTE: this grid has 2 * 7 * 19 * 2 * 100 = 53,200 combinations, i.e. 266,000
# fits with cv=5, so expect a long run time.
gnb_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__min_df': (0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2),
    'vect__max_features': range(500, 10000, 500),
    'tfidf__use_idf': (True, False),
    'clf__var_smoothing': np.logspace(0,-9, num=100),
}

gs_gnb_clf = GridSearchCV(gnb_text_clf, gnb_parameters, cv=5, n_jobs=-1)

In [32]:
# Run the 5-fold cross-validated grid search over `gnb_parameters` on the training split.
gs_gnb_clf.fit(X_train, y_train)

In [None]:
# Show the winning value of every hyper-parameter searched for the Naive Bayes pipeline.
for param_name in sorted(gnb_parameters.keys()):
    print("%s: %r" % (param_name, gs_gnb_clf.best_params_[param_name]))

print("Stats for optimized gnb")

# Predict with the Naive Bayes search (gs_gnb_clf). Using gs_clf here would
# silently report the SVM's metrics under the "gnb" heading.
pred = gs_gnb_clf.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, labels=[True, False])
print("Precision:", precision)
print("recall:", recall)
print("f1:", f1)
print("===============================================================")