# Data Analysis - All dataset

<h2>Libraries</h2>

In [None]:
# Standard library
import re
from collections import Counter,OrderedDict

# Third-party
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

# Fetch each NLTK resource exactly once (nltk.download skips packages that are
# already up to date, but repeating the call still adds noise to the output).
# NOTE(review): recent NLTK releases load tokenizer data from 'punkt_tab';
# add it to this tuple if word_tokenize raises LookupError - confirm against
# the installed NLTK version.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Base Portuguese stopword list; removeStopwords() extends it.
stop = stopwords.words('portuguese')

<h2>Dataframe</h2>

In [None]:
def readCSV(path='tweet_all2.csv'):
    """Load the tweet export into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'tweet_all2.csv'
        Location of the ';;'-separated tweet file.

    Returns
    -------
    pandas.DataFrame
        The parsed rows; malformed lines are skipped (on_bad_lines='skip').

    Raises
    ------
    KeyError
        If the parsed file has no 'text' column, which every later step needs.
    """
    # A multi-character separator is interpreted as a regular expression and
    # is only supported by the python engine.
    # NOTE(review): confirm that lineterminator is honoured by the python
    # engine for this file; it is kept unchanged from the original call.
    tweets = pd.read_csv(path, sep=';;', on_bad_lines='skip', lineterminator='\r', engine='python')

    # Fail fast with a clear message instead of relying on a throw-away lookup.
    if 'text' not in tweets.columns:
        raise KeyError("'text' column not found in " + str(path))
    return tweets

<h2>Clear Tweets</h2>
<br>
Remove entire rows whose text contains any of the listed words, because those tweets relate to subjects that are not useful for this analysis. A row is also removed when the word appears as a hashtag.

In [None]:
def deleteTweetsIfcontainsWord(data=None, words=None):
    """Drop every tweet whose text contains one of the unwanted words.

    Matching is case-insensitive and substring-based, so a word is also caught
    inside a hashtag or inside a longer word.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a 'text' column. Defaults to the module-level `df`.
    words : list of str, optional
        Words to filter on. Defaults to the built-in off-topic list.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the remaining rows, with missing 'text' values
        replaced by ''. The input frame is left untouched.
    """
    if data is None:
        data = df  # module-level frame produced by readCSV()

    if words is None:
        # Specify the list of specific words you want to filter out
        words = ['trânsito', 'rodovia', 'tráfego', 'ecovias','TIM', 'tim', 'Kaysar' , 'novela', 'Bandeirantes', 'operacaobetalab', 'bbb', 'transito', 'trânsito', 'orcars', 'oscar', 'grammy', 'grammys', 'kaysar', 'timbetalab', ' masterchefbr', 'enem', 'futebol', ' bundesliga', 'corinthians']

    # Fill NaN text on a new frame rather than through a chained in-place
    # call, which is deprecated and has no effect under copy-on-write.
    filled = data.assign(text=data['text'].fillna(''))

    if not words:
        # An empty alternation would match (and therefore delete) every row.
        return filled.copy()

    # Escape each word so regex metacharacters are matched literally.
    pattern = '|'.join(re.escape(word) for word in words)
    mask = filled['text'].str.contains(pattern, case=False, regex=True, na=False)

    # .copy() detaches the result so later column assignments do not raise
    # SettingWithCopyWarning.
    return filled.loc[~mask].copy()

<h2>Clear Text</h2>
<br>
Remove specific elements from the text field, such as URLs, usernames, and tags.

In [None]:
def removeURL():
    """Strip http/https links from the 'text' column of the module-level `df_filtered`.

    The frame is modified in place and also returned.
    """
    # Pattern for a URL: scheme followed by one or more permitted characters
    link_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    text_without_links = df_filtered['text'].str.replace(link_regex, '', regex=True)
    df_filtered.loc[:, 'text'] = text_without_links
    return df_filtered
    
def removeListWords(data=None):
    """Remove filler tokens ('RT', 'q', 'vc', truncated links) from the 'text' column.

    Tokens are matched case-sensitively and only as whole tokens, so 'q' is
    removed but 'quero' is not. Surrounding whitespace is left as it is.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a 'text' column. Defaults to the module-level `df_filtered`.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    if data is None:
        data = df_filtered

    # Define a list of specific words you want to remove
    specific_words_to_remove = ['RT', 'q' , 'vc', 'htt...', 'http...']

    # Longest alternatives first so a shorter entry never pre-empts a longer one.
    ordered = sorted(specific_words_to_remove, key=len, reverse=True)

    # re.escape keeps the dots in 'htt...' literal. Look-arounds replace \b
    # because \b cannot match after a trailing '.', which made the truncated
    # link entries impossible to match at the end of a tweet.
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(word) for word in ordered) + r')(?!\w)'

    # Remove the specific words from the 'text' column using regular expressions
    data.loc[:, 'text'] = data['text'].str.replace(pattern, '', regex=True)
    return data

def removeUsername():
    """Delete @mentions from the 'text' column of the module-level `df_filtered`.

    The frame is modified in place and also returned.
    """
    # '@' followed by one or more word characters
    handle_regex = r'@\w+'

    text_without_mentions = df_filtered['text'].str.replace(handle_regex, '', regex=True)
    df_filtered.loc[:, 'text'] = text_without_mentions
    return df_filtered

<h2>Text Processing</h2>

In [None]:
def tokenization():
    """Add a 'tokens' column holding the NLTK word tokens of each tweet.

    Operates on the module-level `df_filtered`, which is modified in place and
    also returned.
    """
    # The stopword list used in this notebook is Portuguese, so tokenize with
    # the Portuguese Punkt model rather than word_tokenize's English default.
    df_filtered['tokens'] = df_filtered['text'].apply(
        lambda tweet: word_tokenize(tweet, language='portuguese')
    )
    return df_filtered
    
def printNToken(n):
    """Print the first `n` rows of the module-level `df_filtered`, showing text and tokens."""
    preview = df_filtered[['text', 'tokens']].head(n)
    print(preview)
    
def removeStopwords():
    """Add a 'filtered_tokens' column with stopwords and punctuation tokens removed.

    A token is dropped when its lower-cased form is in the module-level NLTK
    list `stop` or in the additional list below. Operates on the module-level
    `df_filtered`, which is modified in place and also returned.
    """
    # Additional list of stopwords
    additional_stopwords = ['a', 'à', 'agora', 'ainda', 'além', 'algo', 'algumas', 'alguns', 'ali', 'ano', 'anos', 'antes', 'ao', 'aos', 'apenas',
    'apoio', 'após', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'área', 'as', 'às', 'assim', 'até', 'atrás',
    'através', 'baixo', 'bastante', 'bem', 'boa', 'boas', 'bom', 'bons', 'breve', 'cá', 'cada', 'catorze', 'cedo', 'cento',
    'certamente', 'certeza', 'cinco', 'coisa', 'coisas', 'com', 'como', 'conselho', 'contra', 'contudo', 'corrente',
    'cuja', 'cujas', 'cujo', 'cujos', 'da', 'dá', 'dão', 'daquela', 'daquelas', 'daquele', 'daqueles', 'daqui', 'daquilo',
    'das', 'de', 'debaixo', 'dela', 'delas', 'dele', 'deles', 'dêm', 'demais', 'dentro', 'depois', 'desde', 'dessa',
    'dessas', 'desse', 'desses', 'desta', 'destas', 'deste', 'destes', 'deve', 'deverá', 'dez', 'dezanove', 'dezasseis',
    'dezassete', 'dezoito', 'dia', 'diante', 'diz', 'dizem', 'do', 'dona', 'donas', 'dono', 'donos', 'dos', 'doze', 'duas',
    'dúvida', 'e', 'ela', 'elas', 'ele', 'eles', 'em', 'embora', 'entre', 'então', 'entanto', 'era', 'eram', 'éramos',
    'é', 'essa', 'essas', 'esse', 'esses', 'esta', 'estamos', 'está', 'estão', 'estar', 'estará', 'estas', 'estás', 'estava',
    'estavam', 'este', 'estes', 'esteve', 'estive', 'estivemos', 'estiveram', 'estiveste', 'estivestes', 'estou', 'eu',
    'exemplo', 'falta', 'fará', 'favor', 'faz', 'fazeis', 'fazem', 'fazemos', 'fazer', 'fazes', 'fez', 'fim', 'final', 'foi',
    'fomos', 'for', 'fora', 'foram', 'formos', 'fosse', 'fossem', 'foste', 'fostes', 'fui', 'geral', 'grande', 'grandes',
    'grupo', 'hoje', 'hora', 'horas', 'ir', 'irá', 'isso', 'isto', 'já', 'lá', 'lado', 'ligado', 'local', 'logo', 'longe',
    'lugar', 'maior', 'maioria', 'maiorias', 'mais', 'mal', 'mas', 'máximo', 'me', 'meio', 'menor', 'menos', 'mês',
    'meses', 'meu', 'meus', 'mil', 'minha', 'minhas', 'momento', 'muito', 'muitos', 'na', 'nada', 'naquela', 'naquelas',
    'naquele', 'naqueles', 'nas', 'nem', 'nenhuma', 'nessa', 'nessas', 'nesse', 'nesses', 'nesta', 'nestas', 'neste', 'nestes',
    'ninguém', 'no', 'nos', 'nós', 'nossa', 'nossas', 'nosso', 'nossos', 'nova', 'novas', 'nove', 'novo', 'novos', 'num',
    'numa', 'número', 'nunca', 'nuns', 'o', 'obra', 'obrigada', 'obrigado', 'oitava', 'oitavo', 'oito', 'onde', 'ontem',
    'onze', 'os', 'ou', 'outra', 'outras', 'outro', 'outros', 'para', 'parece', 'parte', 'partir', 'paucas', 'pela', 'pelas',
    'pelo', 'pelos', 'perto', 'pode', 'pôde', 'podem', 'poderá', 'podia', 'pois', 'ponto', 'pontos', 'por', 'porque', 'porquê',
    'pouca', 'pouco', 'poucos', 'primeira', 'primeiras', 'primeiro', 'primeiros', 'própria', 'próprias', 'próprio', 'próprios',
    'quáis', 'qual', 'qualquer', 'quando', 'quanto', 'quarta', 'quarto', 'quatro', 'que', 'quem', 'quer', 'quê', 'quinta',
    'quinto', 'quinze', 'relação', 'sabe', 'são', 'se', 'segunda', 'segundo', 'sei', 'seis', 'seja', 'sejam', 'sempre', 'sendo',
    'ser', 'será', 'seu', 'seus', 'sexta', 'sexto', 'sim', 'sistema', 'sob', 'sobre', 'sois', 'somos', 'sou', 'sua', 'suas',
    'tal', 'talvez', 'também', 'tanta', 'tantas', 'tanto', 'tantos', 'te', 'tem', 'têm', 'temos', 'tendes', 'tenho', 'tens',
    'ter', 'terá', 'terão', 'terceira', 'terceiro', 'teu', 'teus', 'teve', 'ti', 'tido', 'tinha', 'tinham', 'tive', 'tivemos',
    'tiveram', 'tiveste', 'tivestes', 'toda', 'todas', 'todo', 'todos', 'trabalho', 'três', 'treze', 'tu', 'tua', 'tuas',
    'tudo', 'última', 'últimas', 'último', 'últimos', 'um', 'uma', 'umas', 'uns', 'usa', 'usar', 'vai', 'vais', 'valor',
    'veja', 'vem', 'vens', 'ver', 'verdade', 'verdadeiro', 'vez', 'vezes', 'viagem', 'vindo', 'vinte', 'você', 'vocês',
    'vos', 'vós', 'vossa', 'vossas', 'vosso', 'vossos', 'zero','.', '!' , '?', '“', '”','`','``',':','\\','\'\'',',','#','-','(',')','[',']',';','|','/']

    # Combine both lists into a set: membership is tested once per token, and
    # a hash lookup avoids scanning several hundred list entries each time.
    portuguese_stopwords = frozenset(stop).union(additional_stopwords)

    def keep_content_words(tokens):
        # Compare in lower case so capitalised stopwords are dropped too.
        return [word for word in tokens if word.lower() not in portuguese_stopwords]

    df_filtered['filtered_tokens'] = df_filtered['tokens'].apply(keep_content_words)
    return df_filtered

def printNRowsAfterStopword(n):
    """Print the first `n` rows of the module-level `df_filtered`, showing text and filtered tokens."""
    preview = df_filtered[['text', 'filtered_tokens']].head(n)
    print(preview)
    
def removePunct(data=None):
    """Remove punctuation marks from the 'text' column.

    When a 'filtered_tokens' column is present, a 'cleaned_tokens' column is
    also written: the same tokens with the punctuation stripped and tokens
    that become empty dropped.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a 'text' column. Defaults to the module-level `df_filtered`.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    if data is None:
        data = df_filtered

    # Define the punctuation marks you want to remove
    specific_words_to_remove = ['.', '!' , '?', '“', '”','`']

    # Escape every mark: an unescaped '?' makes the pattern fail to compile and
    # an unescaped '.' matches any character. No \b anchors are used, since a
    # word boundary cannot sit on both sides of a punctuation character.
    pattern = '(?:' + '|'.join(re.escape(mark) for mark in specific_words_to_remove) + ')'

    # Remove the punctuation from the 'text' column using regular expressions
    data.loc[:, 'text'] = data['text'].str.replace(pattern, '', regex=True)

    if 'filtered_tokens' in data.columns:
        def strip_marks(tokens):
            stripped = (re.sub(pattern, '', word) for word in tokens)
            return [word for word in stripped if word]

        data['cleaned_tokens'] = data['filtered_tokens'].apply(strip_marks)

    return data
    
def printNRowsAfterPunct(n, data=None):
    """Print the first `n` rows of the punctuation-cleaned data.

    Parameters
    ----------
    n : int
        Number of rows to show.
    data : pandas.DataFrame, optional
        Frame to preview. Defaults to the module-level `df_filtered`.
    """
    if data is None:
        data = df_filtered

    # Show 'cleaned_tokens' only when it exists, so the preview does not raise
    # KeyError on a frame that has no such column.
    columns = [name for name in ('text', 'cleaned_tokens') if name in data.columns]

    # Honour the caller's row count instead of a fixed 10.
    print(data[columns].head(n))

<h2>Text Analysis</h2>

In [None]:
def wordFrequency(top_n=None):
    """Print each token of 'filtered_tokens' with its count, most frequent first.

    Reads the module-level `df_filtered`.

    Parameters
    ----------
    top_n : int, optional
        Print only the `top_n` most frequent tokens. The default (None)
        prints every token.
    """
    # explode() emits NaN for rows whose token list is empty; drop those so
    # 'nan' is not counted as a word.
    all_tokens = df_filtered['filtered_tokens'].explode().dropna()

    # Compute word frequencies using FreqDist
    word_freq = FreqDist(all_tokens)

    # Print the most common words and their frequencies
    for word, freq in word_freq.most_common(top_n):
        print(f'{word}: {freq}')

<h2>Output information</h2>

In [None]:
def printNumberOfTweets(df):
    """Print how many rows (tweets) `df` holds."""
    tweet_count = len(df)
    print("Total amount of tweets", tweet_count)

def printListAttributes(data=None):
    """Print the column names of a frame.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to describe. Defaults to the module-level `df`.
    """
    if data is None:
        data = df

    # list() accepts a single iterable; the label is passed to print() as its
    # own argument, and it now names what is actually printed.
    print("List of attributes", data.columns.tolist())
    
def printHead(df,n):
    """Print the first `n` rows of `df`."""
    first_rows = df.head(n)
    print(first_rows)

<h2>Main</h2>

1) Read CSV

In [None]:
# Load the raw tweets; deleteTweetsIfcontainsWord() reads this module-level `df`.
df = readCSV()

2) Delete <i>Tweets</i> based on specific list of words

In [None]:
# Drop off-topic tweets; the cleaning helpers below read the module-level `df_filtered`.
df_filtered = deleteTweetsIfcontainsWord()

3) Clean Text field

In [None]:
# Each helper rewrites the 'text' column of `df_filtered` and returns the frame.
df_filtered = removeURL()
df_filtered = removeListWords()
df_filtered = removeUsername()

In [None]:
# Compare the row count before and after the off-topic filter.
printNumberOfTweets(df)
printNumberOfTweets(df_filtered)
#printHead(df_filtered,2)

4) Text processing 

In [None]:
# Tokenize first: removeStopwords() reads the 'tokens' column created here.
df_filtered = tokenization()
#printNToken(10)

# Adds the 'filtered_tokens' column that wordFrequency() reads.
df_filtered = removeStopwords()
#printNRowsAfterStopword(10)

5) Text Analysis

In [None]:
# Print the remaining tokens with their counts, most frequent first.
wordFrequency()