<a href="https://colab.research.google.com/github/seomukul/BeRT-Internal-Linking/blob/main/bertopic_serp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapea y Clasifica los tópicos de la SERP:

1.   Cambia en el código la keyword por lo que quieras (ej. keyword = "mejores cafeteras")
2.   Pulsa el play
3.   Espera a que el script acabe

=> Los tópicos aparecerán debajo del script (una tabla completa por cada tópico y 2 gráficas resumen)

Un saludo desde Mallorca,

[Jose Gris](https://twitter.com/JoseGrisSEO) 😎

<br>

----
Si te son útiles mis colabs [invítame a un café](https://www.buymeacoffee.com/josegris)

---

In [None]:
# Change the keyword to whatever you want to analyse (e.g. keyword = "mejores cafeteras")
keyword = "fiebre"

!pip install bertopic
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Data kept for every scraped result
class pagina:
    """Plain record describing one scraped result page.

    Only the URL is known when the record is created; the remaining
    attributes start empty and are filled in by whoever scrapes the page.
    """

    def __init__(self, url: str) -> None:
        self.url: str = url          # address of the result page
        self.texto: str = ""         # text extracted from the page
        self.enlaces: list = []      # link entries collected from the page
        self.headings: list = []     # heading entries collected from the page

# Class that fetches the Google results page for a query and scrapes each result
class serp():
    """Fetch the Google results page for a query and scrape every result URL.

    Creating an instance runs the whole scrape immediately (network I/O)
    through ``start()``.

    Attributes:
        query: the search query with spaces replaced by ``+``.
        ok: True when the results page itself was fetched with status 200
            (even if fewer than 5 result pages could be scraped).
        incidendias: human-readable messages for every problem found.
        paginas: one ``pagina`` per result downloaded with status 200.
    """

    # Seconds to wait on a single HTTP request before giving up on it.
    TIMEOUT = 15

    def __init__(self, query):
        self.query = query.replace(" ", "+")
        self.ok = False
        self.incidendias = []
        self.paginas = []
        self.start()

    @staticmethod
    def _normaliza_enlace(link):
        """Return ``link`` without its text fragment, or None if it is not an http(s) URL."""
        # Featured snippets append a "#:~:text=..." fragment to the result URL.
        link = link.split("#:~:text")[0]
        return link if link.startswith("http") else None

    @staticmethod
    def _sin_repetidos(items):
        """Return the first occurrence of every item, keeping the original order."""
        unicos = []
        for item in items:
            # Membership test on the list (not a set) so unhashable items work too.
            if item not in unicos:
                unicos.append(item)
        return unicos

    def _scrapea_pagina(self, link, headers):
        """Download one result page and store it in ``paginas`` or log an incident."""
        try:
            resp = requests.get(link, headers=headers, timeout=self.TIMEOUT)
        except requests.RequestException as err:
            # One unreachable result must not abort the whole scrape.
            self.incidendias.append(f"Error de conexión: {err} url: {link}")
            return

        if resp.status_code != 200:
            self.incidendias.append(f"Status code: {resp.status_code} url: {link}")
            return

        # Parse only after the status check so error pages are never processed.
        soup = BeautifulSoup(resp.content, "html.parser")
        pag = pagina(link)

        # Text: every string node, each prefixed by a space. Documents without
        # a <body> fall back to the whole tree instead of raising.
        raiz = soup.body if soup.body is not None else soup
        pag.texto = "".join(" " + cadena for cadena in raiz.strings)

        # Links: anchor texts, skipping in-page anchors and wiki edit links.
        pag.enlaces = self._sin_repetidos(
            en.text
            for en in soup.find_all("a", href=True)
            if "#" not in en["href"] and "action=edit" not in en["href"]
        )

        # Headings: [tag name, stripped text] pairs. The pair itself is what
        # gets de-duplicated, so a repeated heading is stored only once.
        pag.headings = self._sin_repetidos(
            [heading.name, heading.text.strip()]
            for heading in soup.find_all(['h1', 'h2', 'h3', 'h4'])
        )

        self.paginas.append(pag)

    def start(self):
        """Request the results page, scrape each distinct result and print a report."""
        URL = "https://www.google.com/search?hl=es&gl=es&q=%s&oq=%s" % (self.query, self.query)
        print(URL)
        headers =  {"user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"}

        self.ok = False
        try:
            resp_serp = requests.get(URL, headers=headers, timeout=self.TIMEOUT)
        except requests.RequestException as err:
            self.incidendias.append(f"Error de conexión: {err} url: {URL}")
        else:
            if resp_serp.status_code == 200:
                soup_serp = BeautifulSoup(resp_serp.content, "html.parser")
                print("Procesando urls:")
                enlaces_vistos = set()
                for resultado in soup_serp.find_all("div", {"class" : "g"}):
                    anchors = resultado.find_all("a", href=True)
                    if not anchors:
                        continue
                    # The first anchor of a result block is taken as its URL.
                    link = self._normaliza_enlace(anchors[0]['href'])
                    if link is None or link in enlaces_vistos:
                        continue
                    print(link)
                    enlaces_vistos.add(link)
                    self._scrapea_pagina(link, headers)

                # Too few pages is reported but does not invalidate the scrape.
                if len(self.paginas) < 5:
                    self.incidendias.append("Menos de 5 resultados escrapeados")
                self.ok = True
            else:
                self.incidendias.append(f"Status code: {resp_serp.status_code} url: {URL}")

        # Report the outcome of the scrape
        print(str(len(self.paginas)) + " páginas escrapeadas correctamente")

        if self.incidendias:
            print("Incidencias")
            print("-----------")
            for incidencia in self.incidendias:
                print(incidencia)

import re
import nltk
# Download the NLTK data packages named 'punkt' and 'stopwords'
# (presumably the resources behind the nltk tokenizers and the stop-word
# list used further down in this cell).
nltk.download('punkt')
nltk.download('stopwords')
from bertopic import BERTopic

# Scrape the Google results for the keyword
scrap = serp(keyword)

if not scrap.ok:
    print("No se ha completado el scrap, se detiene el análisis => Sorry...")
    # quit() is an interactive-shell helper; SystemExit stops the run explicitly.
    raise SystemExit(1)

# Build the corpus: the text of every scraped page, each prefixed by a space
print("Empezamos a crear el modelo...")
corpus = "".join(" " + art.texto for art in scrap.paginas)

# Clean the text before processing it. Collapsing \s+ already covers newlines
# and tabs, so no separate passes are needed for them.
corpus = re.sub(r'\s+', ' ', corpus)
corpus = re.sub(r'http\S+', '', corpus)

# The scraped text is Spanish, so use the Spanish sentence model instead of
# the English default.
frases = nltk.sent_tokenize(corpus, language='spanish')

# Remove stop words. The list is loaded once into a set (it used to be
# reloaded for every token) and matching ignores case so that capitalised
# stop words at the start of a sentence ("La", "Para", ...) are dropped too.
from nltk.corpus import stopwords
stop_es = set(stopwords.words('spanish'))
tokenizado = []
for sent in frases:
    palabras = [
        w for w in nltk.word_tokenize(sent, language='spanish')
        if w.lower() not in stop_es
    ]
    # Sentences made only of stop words carry no signal; skip them.
    if palabras:
        tokenizado.append(" ".join(palabras))

if not tokenizado:
    print("No hay texto suficiente para crear el modelo, se detiene el análisis")
    raise SystemExit(1)

# Create the model.
# Other configurations that were tried: verbose=True, n_gram_range=(1,3),
# nr_topics="auto", calculate_probabilities=True. The original author's note
# says n-grams seemed to list the words both separately and together, so
# n_gram_range is better left unused.
model = BERTopic(language="spanish")

topics, probabilities = model.fit_transform(tokenizado)

# Print the results
freq = model.get_topic_info()
print("-----------")
print("Número de frases por tópico")
print("-----------")
print(freq)

# Print every topic with all of its keys (the chart only shows 5 keys per
# topic for clarity)
all_topics = model.get_topics()
for topico, palabras in all_topics.items():
    print("-----------")
    print(" => Tópico " + str(topico))
    df = pd.DataFrame(palabras)
    print(df)

# Show the result charts; the bar chart is asked to include every topic
fig1 = model.visualize_barchart(top_n_topics=len(all_topics))
fig1.show()


fig2 = model.visualize_topics()
fig2.show()


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 1.9 MB/s 
Collecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 9.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 6.1 MB/s 
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.7 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 29.5 MB/s 
Collecting sen

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]



-----------
Número de frases por tópico
-----------
    Topic  Count                                         Name
0       0    119            0_fiebre_infecciones_infección_la
1      -1    117           -1_fiebre_la_infección_información
2       1     97              1_temperatura_37_corporal_calor
3       2     53  2_sede_curriculum_especialista_departamento
4       3     40                        3_tiene_niños_hijo_si
5       4     33     4_examen_pacientes_detectar_inspeccionar
6       5     26           5_globalnotice_inglés_también_font
7       6     18                  6_médica_área_para_paciente
8       7     18                  7_líquidos_beber_agua_fríos
9       8     16      8_tomar_paracetamol_aspirina_ibuprofeno
10      9     16    9_síntomas_obtenga_meningitis_información
11     10     14       10_dolor_cabeza_intenso_articulaciones
12     11     12         11_hay_obtenga_información_modificar
13     12     11                        12_document_var_cm_c1
14     13     10  