# Preprocessing Tweets

In [26]:
import numpy as np
import pandas as pd
import emoji
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer   
from nltk.stem.wordnet import WordNetLemmatizer
import re

In [27]:
stopwords_eng =stopwords.words("english")
stopwords_es =stopwords.words("spanish")
stopwords = stopwords_eng + stopwords_es

def cleanTweets(s):
    #Function to clean tweets, for now i am keeping emojis and hashtags. Alternative version
    if type(s)==np.float:
        return ""
    #Demojize text
    s=emoji.demojize(s,delimiters=("", " "))
    
    #Remove new lines, etc.
    s = s.replace(r'<lb>', "\n")
    s = s.replace(r'<tab>', "\i")
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")
    s = s.replace("\n", " ")
    
    # markdown urls
    s = re.sub(r'\(https*://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https*://[^\s]*', "", s)
    #s = re.sub(r'_+', ' ', s)
    s = re.sub(r'"+', '"', s)
    #Remove punctuation    
    s = re.sub('[()!?]', ' ', s)
    s = re.sub('\[.*?\]',' ', s)
    # custom removals
    s = re.sub(r'@[A-Za-z0-9_]+', "@usermention", s) # replace mentions
    #s = re.sub(r':[^:]+','[emoji]',s) # remove demojized text
    
    #Split multiword hashtags into individual words - they could contain spin
    #s = re.sub(r'#(\w+)', rep, s) # split hashtags
    
    s=s.lower()
    
    #Remove stopwords
    s=s.split()
    s= [w for w in s if not s in stopwords]
    
    
    s=" ".join(word for word in s)
    
    return str(s)


stemmer_eng=SnowballStemmer("english")
stemmer_es=SnowballStemmer("spanish")
lem = WordNetLemmatizer()

def stem_lematize(s, modulation):
    tokens = re.split(r'\W+', s)
    stems = []
    for token in tokens:
        if modulation==1:
            eng=stemmer_es.stem(token)
            stems.append(stemmer_es.stem(eng))
        if modulation==2:
            stems.append(lem.lemmatize(token))
        if modulation==0:
            stems.append(tokens)
    s=" ".join(word for word in stems)
    return s

In [20]:
#Read-In Corpus
corpus_data=pd.read_csv("Data/all_tweets_filtered.csv")

  corpus_data=pd.read_csv("Data/all_tweets_filtered.csv")


In [21]:
corpus_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973904 entries, 0 to 973903
Data columns (total 22 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Unnamed: 0                             973904 non-null  int64  
 1   id                                     973904 non-null  int64  
 2   author_id                              973904 non-null  int64  
 3   created_at                             973904 non-null  object 
 4   text                                   973904 non-null  object 
 5   public_metrics.like_count              973904 non-null  int64  
 6   public_metrics.quote_count             973904 non-null  int64  
 7   public_metrics.reply_count             973904 non-null  int64  
 8   public_metrics.retweet_count           973904 non-null  int64  
 9   author.username                        973904 non-null  object 
 10  Country                                973904 non-null  

In [23]:
corpus_data['cleaned_text'] = [cleanTweets(text) for text in corpus_data['text']]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if type(s)==np.float:


In [28]:
corpus_data['stem_text'] = [stem_lematize(text, 1) for text in corpus_data['cleaned_text']]

In [34]:
corpus_data["text"].head(30)

0     Les recomiendo ver este hilo donde el capo de ...
1     Que mala noticia, el acuerdo permitirá abrir e...
2     Mayoría (o todos) los q hoy critican heroicame...
3     Lo de Abel debe ser aclarado ahora por la máxi...
4     #FelizSábado no olvidemos que esta semana tuvi...
5     Durante este año he recibido muchas denuncias ...
6     Leo en redes y escucho en los pasillos el #Vam...
7     Buena nueva! Acabamos de ampliar la capacidad ...
8     Una #NuevaConstitucionParaChile es el primer p...
9     Han sido días durísimos, pero hoy damos un tre...
10    La Catedral de #PuertoMontt fue construida en ...
11    Quienes dice #NoalaAsambleConstituyente como l...
12    Sólo un plebiscito garantizará cual es la opci...
13    Desde el primer día de movilizaciones hemos pr...
14    A unos amigos se les quemó parte de su casa en...
15    @SylviaEyzaguirr Desde el Partido Liberal hemo...
16    @JorgeUrzuaLira1 Jorge, condeno la violencia e...
17    14 movimientos políticos hemos dicho q una

In [32]:
corpus_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973904 entries, 0 to 973903
Data columns (total 24 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Unnamed: 0                             973904 non-null  int64  
 1   id                                     973904 non-null  int64  
 2   author_id                              973904 non-null  int64  
 3   created_at                             973904 non-null  object 
 4   text                                   973904 non-null  object 
 5   public_metrics.like_count              973904 non-null  int64  
 6   public_metrics.quote_count             973904 non-null  int64  
 7   public_metrics.reply_count             973904 non-null  int64  
 8   public_metrics.retweet_count           973904 non-null  int64  
 9   author.username                        973904 non-null  object 
 10  Country                                973904 non-null  

In [33]:
corpus_data.to_csv("all_tweets_preprocessed.csv") 