## Data Cleaning

### Imports

In [None]:
!pip install nltk

In [None]:
!pip install unidecode

In [None]:
!pip install clean-text

In [None]:
!pip install openpyxl

In [1]:
import unidecode 
import numpy as np
import pandas as pd
import re
from cleantext import clean
import nltk
from nltk.corpus import stopwords
from nltk.corpus import stopwords 
nltk.download('stopwords') 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Priscila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading file

In [2]:
read_file = pd.read_excel("Data/Tweet_Processed_DataCleaning_Test.xlsx")

In [3]:
read_file.to_csv ("Data/Tweet_Processed_DataCleaning_Test.csv", index = None, header=True)

In [4]:
cleaning_data = pd.read_csv('Data/Tweet_Processed_DataCleaning_Test.csv')
cleaning_data.head()

Unnamed: 0.1,Unnamed: 0,Date_,Time_,User,Tweet,Unnamed: 5,Unnamed: 6
0,0.0,2020-01-03 00:00:00,22:49:14,NonDucorDuco11,@EuEdsonDuarte @LiloVLOG @jairbolsonaro Exatam...,0.0,
1,1.0,2020-01-03 00:00:00,17:48:18,Circuito_D,A China fecha o primeiro laboratório do mundo ...,0.0,
2,2.0,2020-01-03 00:00:00,20:42:25,rafaelbboa,Janeiro: China mente sobre a % de mortos nos c...,0.0,
3,3.0,2020-01-03 00:00:00,15:18:53,DiabinhaBem,Nível de poluição na China cai drasticamente a...,0.0,
4,4.0,2020-01-03 00:00:00,19:20:35,therezafontoura,@eikebatista Os 19 que cruzam os oceanos traze...,0.0,


### Fixing dataframe

In [5]:
cleaning_data["Label"] = cleaning_data["Unnamed: 5"]

In [6]:
cleaning_data = cleaning_data[['Tweet', 'Label']]
cleaning_data.head()

Unnamed: 0,Tweet,Label
0,@EuEdsonDuarte @LiloVLOG @jairbolsonaro Exatam...,0.0
1,A China fecha o primeiro laboratório do mundo ...,0.0
2,Janeiro: China mente sobre a % de mortos nos c...,0.0
3,Nível de poluição na China cai drasticamente a...,0.0
4,@eikebatista Os 19 que cruzam os oceanos traze...,0.0


In [7]:
cleaning_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24200 entries, 0 to 24199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Tweet   24200 non-null  object 
 1   Label   24200 non-null  float64
dtypes: float64(1), object(1)
memory usage: 378.2+ KB


In [8]:
cleaning_data = cleaning_data.dropna(how='any',axis=0) 

In [9]:
cleaning_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24200 entries, 0 to 24199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Tweet   24200 non-null  object 
 1   Label   24200 non-null  float64
dtypes: float64(1), object(1)
memory usage: 567.2+ KB


### Regular Data Cleaning for NLP

In [10]:
data_nlp_cleaning = cleaning_data.copy()
data_nlp_cleaning.head()

Unnamed: 0,Tweet,Label
0,@EuEdsonDuarte @LiloVLOG @jairbolsonaro Exatam...,0.0
1,A China fecha o primeiro laboratório do mundo ...,0.0
2,Janeiro: China mente sobre a % de mortos nos c...,0.0
3,Nível de poluição na China cai drasticamente a...,0.0
4,@eikebatista Os 19 que cruzam os oceanos traze...,0.0


In [11]:
def remove_url(text):
    """
    This method will remove all ocurrences of urls in the tweets
    
    arguments:
        input_text: "text" of type "String". 
                    
    return:
        value: "text" after removal of all ocurrences of urls.        
    
    """
    
    remove_url = re.sub(r'http\S+', '', text)
    remove_com = re.sub(r"\ [A-Za-z]*\.com", " ", remove_url)
    return remove_com

In [12]:
def remove_accented_characters(text):
    """
    This method will remove all accented from the character in the tweets
    
    arguments:
        input_text: "text" of type "String". 
                    
    return:
        value: "text" with removed accented characters.        
        
    """
    text = unidecode.unidecode(text)
    return text

In [13]:
def lower_case_text(text):
    
    """
    The method will convert the tweet in lower case.
    
    arguments:
         input_text: "text" of type "String".
         
    return:
         value: text in lowercase
    
    """
    text = text.lower()
    return text

In [14]:
def remove_special_characters(text):
    """
    This method will remove specail characters from the tweets.   
    
    arguments:
         input_text: "text" of type "String".
         
    return:
        value: Text with removed special characters that don't require.
    
   """
    text_formatted = re.sub(r"[^a-zA-Z-,.?!]+", ' ', text) 
    return text_formatted

In [15]:
def remove_mention(text):
    """
    This method will remove mention from the tweet.
    
    arguments:
         input_text: "text" of type "String".
         
    return:
        value: Text after remove all mentions.
    
   """
    no_mention = re.sub("@[A-Za-z0-9_]+","", text)  
    return no_mention

In [16]:
#https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """
    This method will remove emojis from the tweet.
    
    arguments:
         input_text: "text" of type "String".
         
    return:
        value: Text after remove all emojis.
    
   """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [17]:
stoplist = stopwords.words('portuguese') 
stoplist = set(stoplist)
def remove_stopwords(text):
    """
    This method will remove stopwords from the tweet.
    
    arguments:
         input_text: "text" of type "String".
         
    return:
        value: Text after omitted all stopwords.
    
   """
    text = repr(text)
    No_StopWords = [word for word in word_tokenize(text) if word.lower() not in stoplist ]
    words_string = ' '.join(No_StopWords)    
    return words_string

In [18]:
for i in range(len(data_nlp_cleaning['Tweet'])):
    temp = data_nlp_cleaning.iloc[i, 0]
    #removing url
    temp = remove_url(temp)
    temp = remove_accented_characters(temp)
    temp = lower_case_text(temp)
    temp = remove_special_characters(temp)
    temp = remove_mention(temp)
    temp = remove_emoji(temp)
    data_nlp_cleaning['Tweet'][i] = temp
    #data_nlp_cleaning.loc[i, 0] = temp
    
data_nlp_cleaning.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_nlp_cleaning['Tweet'][i] = temp


Unnamed: 0,Tweet,Label
0,euedsonduarte lilovlog jairbolsonaro exatamen...,0.0
1,a china fecha o primeiro laboratorio do mundo ...,0.0
2,janeiro china mente sobre a de mortos nos caso...,0.0
3,nivel de poluicao na china cai drasticamente a...,0.0
4,eikebatista os que cruzam os oceanos trazem u...,0.0


In [19]:
data_nlp_cleaning.to_csv ("Data/Tweet_Processed_DataCleaning_Done.csv", index = None, header=True)