In [1]:
# Montamos la unidad Drive para acceder a los archivos de Google Drive
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/TFM

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/11XUs_KemRo-9g_Wq5j0mL3xQwNLaexMX/TFM


In [2]:
# Importamos las librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Cargamos el archivo csv en un DataFrame
df = pd.read_csv("vaccination_all_tweets.csv")

In [3]:
# Cargamos la librería y las funciones necesarias
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import string


tweet= df['text']
# Mostramos la frase por pantalla
print("Frase a preprocesar:\n", tweet)

Frase a preprocesar:
 0         Same folks said daikon paste could treat a cyt...
1         While the world has been on the wrong side of ...
2         #coronavirus #SputnikV #AstraZeneca #PfizerBio...
3         Facts are immutable, Senator, even when you're...
4         Explain to me again why we need a vaccine @Bor...
                                ...                        
228202    45+ #URBAN #Bengaluru #CovidVaccine Availabili...
228203    18-44 #BBMP #Bengaluru #CovidVaccine Availabil...
228204    18-44 #URBAN #Bengaluru #CovidVaccine Availabi...
228205    They promote their Vaccines leaving out the st...
228206    45+ #URBAN #Bengaluru #CovidVaccine Availabili...
Name: text, Length: 228207, dtype: object


In [4]:
import re
# Limpiar texto
def clean_text(tweet):
    text = re.sub(r'http\S+', '', tweet)  # Eliminar URLs
    text = re.sub(r'@\w+', '', tweet)     # Eliminar menciones
    text = re.sub(r'#\w+', '', tweet)     # Eliminar hashtags
    text = re.sub(r'\d+', '', tweet)      # Eliminar números
    text = re.sub(r'\s+', ' ', tweet)     # Eliminar espacios adicionales
    text = text.lower()                   # Convertir a minúsculas
    return text

# Aplicar la función de limpieza a cada tweet
df['cleaned_tweet'] = df['text'].apply(clean_text)

# Verificar la limpieza correcta de los datos
print(df[['text', 'cleaned_tweet']].head())

                                                text  \
0  Same folks said daikon paste could treat a cyt...   
1  While the world has been on the wrong side of ...   
2  #coronavirus #SputnikV #AstraZeneca #PfizerBio...   
3  Facts are immutable, Senator, even when you're...   
4  Explain to me again why we need a vaccine @Bor...   

                                       cleaned_tweet  
0  same folks said daikon paste could treat a cyt...  
1  while the world has been on the wrong side of ...  
2  #coronavirus #sputnikv #astrazeneca #pfizerbio...  
3  facts are immutable, senator, even when you're...  
4  explain to me again why we need a vaccine @bor...  


In [5]:
import nltk
from nltk.tokenize import word_tokenize

# Descargar el paquete punkt si no lo has hecho ya
nltk.download('punkt')

# Suponiendo que `tweet` es una serie de pandas que contiene el texto que deseas tokenizar
# Convertir la serie de pandas en una cadena de texto
tweet_texto = tweet.values[0]  # Si `tweet` contiene solo un texto, extraerlo usando `.values[0]`

todos_tokens = []

# Iterar sobre cada tweet y tokenizarlo
for tweet in tweet:
    # Tokenizar el texto del tweet actual
    tokens = word_tokenize(tweet)
    # Agregar los tokens de este tweet a la lista de todos los tokens
    todos_tokens.extend(tokens)

# Mostrar algunos tokens de ejemplo
print("Algunos tokens: ", todos_tokens[:10])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Algunos tokens:  ['Same', 'folks', 'said', 'daikon', 'paste', 'could', 'treat', 'a', 'cytokine', 'storm']


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
import string
import pandas as pd

# Descargar los paquetes necesarios si no lo has hecho ya
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')

# Lista para almacenar los resultados de preprocesamiento de cada tweet
tweets_preprocesados = []

# Iterar sobre cada tweet y aplicar preprocesamiento
for tweet in df['text']:
    # Tokenizar el tweet
    tokens = word_tokenize(tweet)

    # Eliminar las stop words
    stop_words = set(stopwords.words('english'))
    tokens_nosw = [token for token in tokens if token.lower() not in stop_words]

    # Eliminar números y signos de puntuación
    filtered_tokens = [token for token in tokens_nosw if not token.isdigit() and not token in string.punctuation]

    # Convertir todos los tokens a minúsculas
    clean_tokens = [token.lower() for token in filtered_tokens]

    # Aplicar las técnicas de stemming y lematización
    stemmer = SnowballStemmer('spanish')
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [stemmer.stem(token) for token in clean_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in clean_tokens]

    # Seleccionar tokens finales (en este caso, los tokens stemmatizados)
    final_tokens = stemmed_tokens.copy()

    # Aplicar POS tagging
    pos_tags = pos_tag(final_tokens)

    # Agregar los resultados a la lista de tweets preprocesados
    tweets_preprocesados.append({
        'original': tweet,
        'sin_stopwords': tokens_nosw,
        'sin_numeros_puntuacion': filtered_tokens,
        'minusculas': clean_tokens,
        'stemming': stemmed_tokens,
        'lematizacion': lemmatized_tokens,
        'pos_tags': pos_tags
    })

# Convertir la lista de tweets preprocesados a un DataFrame
df_preprocesado = pd.DataFrame(tweets_preprocesados)

# Mostrar el DataFrame preprocesado
print(df_preprocesado)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                 original  \
0       Same folks said daikon paste could treat a cyt...   
1       While the world has been on the wrong side of ...   
2       #coronavirus #SputnikV #AstraZeneca #PfizerBio...   
3       Facts are immutable, Senator, even when you're...   
4       Explain to me again why we need a vaccine @Bor...   
...                                                   ...   
228202  45+ #URBAN #Bengaluru #CovidVaccine Availabili...   
228203  18-44 #BBMP #Bengaluru #CovidVaccine Availabil...   
228204  18-44 #URBAN #Bengaluru #CovidVaccine Availabi...   
228205  They promote their Vaccines leaving out the st...   
228206  45+ #URBAN #Bengaluru #CovidVaccine Availabili...   

                                            sin_stopwords  \
0       [folks, said, daikon, paste, could, treat, cyt...   
1       [world, wrong, side, history, year, ,, hopeful...   
2       [#, coronavirus, #, SputnikV, #, AstraZeneca, ...   
3       [Facts, immutab

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Aplicamos el método TF-IDF a las frases
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["text"])

# Mostramos el vocabulario (Corpus)
print("Vocabulario", tfidf.get_feature_names_out())

# Valor IDF para todas las palabras del vocabulario
print("IDF for all words in the vocabulary :\n", tfidf.idf_)

# Mostramos la representación TFIDF de algunos tokens
print('\nRepresentación TFIDF de "{}" es \n{}'
      .format(df["text"][0], tfidf_matrix[0].toarray()))
print('Representación TFIDF de "{}" es \n{}'
      .format(df["text"][1], tfidf_matrix[1].toarray()))
print('Representación TFIDF de "{}" es \n{}'
      .format(df["text"][2],tfidf_matrix[2].toarray()))

Vocabulario ['00' '000' '0000' ... '𝟳𝟮' '𝟻𝟺𝟾' '𝟿𝟶𝟿']
IDF for all words in the vocabulary :
 [ 5.99977462  5.66293092 11.95171841 ... 12.64486559 12.64486559
 12.64486559]

Representación TFIDF de "Same folks said daikon paste could treat a cytokine storm #PfizerBioNTech https://t.co/xeHhIMg1kF" es 
[[0. 0. 0. ... 0. 0. 0.]]
Representación TFIDF de "While the world has been on the wrong side of history this year, hopefully, the biggest vaccination effort we've ev… https://t.co/dlCHrZjkhm" es 
[[0. 0. 0. ... 0. 0. 0.]]
Representación TFIDF de "#coronavirus #SputnikV #AstraZeneca #PfizerBioNTech #Moderna #Covid_19 Russian vaccine is created to last 2-4 years… https://t.co/ieYlCKBr8P" es 
[[0. 0. 0. ... 0. 0. 0.]]


In [8]:
# Guardar el DataFrame procesado
df.to_csv('tweets_processed.csv', index=False)

# Crear y guardar el DataFrame de palabras clave (vocabulario)
vocab_df = pd.DataFrame(tfidf.get_feature_names_out(), columns=['keywords'])
vocab_df.to_csv('keywords.csv', index=False)
