In [61]:
# Training the Naive Bayes model on the Training set

import snscrape.modules.twitter as sntwitter
import datetime as dt

# Importing the libraries
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import spacy

sns.set()  # use seaborn plotting style
nlp = spacy.load("pt_core_news_lg")


In [None]:
# Fazendo o Scraps dos tweets

def ScrapTweets(max_tweets: int, search_string: str):
    # Definindo o máximo de tweets
    maxTweets = max_tweets

    # Criando a lista para salvar os dados
    tweets_list = []

    # Usando o TwitterSearchScraper para fazer o scrape dos dados data e salvar em uma lista
    for i, tweet in enumerate(
        sntwitter.TwitterSearchScraper(search_string).get_items()
    ):
        if i > maxTweets:
            break
        tweets_list.append(
            [
                tweet.id,
                tweet.date,
                tweet.rawContent,
                tweet.lang,
            ]
        )
    # Criando um dataframe com os dados
    tweets_df = pd.DataFrame(
        tweets_list,
        columns=[
            "TweetId",
            "DataHora",
            "Texto",
            "Idioma",
        ],
    )
    # Adicionando a coluna de data, mês e hora
    tweets_df["dia"] = tweets_df.DataHora.dt.day
    tweets_df["mes"] = tweets_df.DataHora.dt.month
    tweets_df["ano"] = tweets_df.DataHora.dt.year

    return tweets_df

In [None]:
tweets_feliz = ScrapTweets(10000, "#feliz")
tweets_feliz["emocao"] = "feliz"

tweets_amor = ScrapTweets(10000, "#amo")
tweets_amor["emocao"] = "amor"

tweets_paixao = ScrapTweets(10000, "paixao")
tweets_paixao["emocao"] = "paixao"

tweets_alegria = ScrapTweets(10000, "#alegria") #
tweets_alegria["emocao"] = "alegria"

tweets_otimismo = ScrapTweets(10000, "#otimismo") #
tweets_otimismo["emocao"] = "otimismo"

tweets_confianca = ScrapTweets(10000, "#confiança") #
tweets_confianca["emocao"] = "confiança"

tweets_medo = ScrapTweets(10000, "#medo") #
tweets_medo["emocao"] = "medo"

tweets_nojo = ScrapTweets(10000, "#nojo") #
tweets_nojo["emocao"] = "nojo"

tweets_triste = ScrapTweets(10000, "#triste") #
tweets_triste["emocao"] = "triste"

tweets_chateado = ScrapTweets(5000, "#chateado")
tweets_chateado["emocao"] = "chateado"

tweets_chateada = ScrapTweets(5000, "#chateada")
tweets_chateada["emocao"] = "chateado"

tweets_raiva = ScrapTweets(10000, "#raiva") #
tweets_raiva["emocao"] = "raiva"

tweets_vergonha = ScrapTweets(10000, "#vergonha")
tweets_vergonha["emocao"] = "vergonha"


tweets = pd.concat(
    [
        tweets_feliz,
        tweets_amor,
        tweets_paixao,
        tweets_alegria,
        tweets_otimismo,
        tweets_confianca,
        tweets_medo,
        tweets_nojo,
        tweets_triste,
        tweets_chateado,
        tweets_chateada,
        tweets_raiva,
        tweets_vergonha
    ]
)

# Exportar dataframe para CSV
tweets.to_csv("text-query-tweets.csv", sep=",", index=False)

In [None]:
# # importing the dataset
# tweets = pd.read_csv("text-query-tweets.csv", sep=",")
# tweets.head()


In [38]:
# removendo duplicatas
tweets.drop_duplicates(subset="TweetId", keep=False, inplace=True)

# Filtrando o idimoa para português
tweets = tweets[tweets["Idioma"] == "pt"]

# Removendo colunas desnecessárias
tweets.drop(
    ["TweetId", "DataHora", "Idioma", "dia", "mes", "ano"], axis=1, inplace=True
)

tweets.reset_index(drop=True, inplace=True)
tweets.head()

Unnamed: 0,Texto,emocao
0,"@dizaoleao Estou de fora, #triste",triste
1,"Promessa não deu certo, partiu pra ameaça #tri...",triste
2,#Triste ver minha cidade dividida por causa do...,triste
3,"@Hugo_aguiar09 tô precisando, to meio #triste",triste
4,É engraçado quando você namora por muito tempo...,triste


In [39]:
def define_sentiment(sentiment):
    if sentiment in ["feliz", "amor", "paixao", "alegria", "otimismo", "confiança"]:
        return 1
    elif sentiment in ["medo", "nojo", "triste", "chateado", "raiva", "vergonha"]:
        return 0
    else:
        return np.nan

tweets['sentimento'] = tweets['emocao'].apply(define_sentiment)

In [77]:
# funções de tratamento de dados
def remove_usernames(tweet):
        tweet = re.sub(r'@\S+', '', tweet)
        tweet = re.sub(r'http\S+', '', tweet)
        return tweet

def form_sentence(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

def no_stopwords(tweet):
    clean_mess = [word for word in tweet.split() if word.lower() not in stopwords.words('portuguese')]
    clean_mess = ' '.join(clean_mess)
    return clean_mess


def normalization(tweet_list):
        doc = nlp(tweet_list)
        return ' '.join([token.lemma_ for token in doc if token.pos_ == 'VERB' or token.pos_ == 'NOUN' or token.pos_ == 'ADJ'])

# funçao analyser
def text_processing(tweet):
    new_tweet = remove_usernames(tweet)
    no_punc_tweet = form_sentence(new_tweet)
    no_stopwords_tweet = no_stopwords(no_punc_tweet)
    return normalization(no_stopwords_tweet)


In [81]:
tweets['Texto'] = tweets['Texto'].apply(text_processing)
tweets.head(20)

Unnamed: 0,Texto,emocao,sentimento
0,triste,triste,0
1,promessa dar certo partir ameaça triste revolt...,triste,0
2,Triste ver cidade dividir causa ego político c...,triste,0
3,precisar triste,triste,0
4,engraçar namorar tempo ficar solteiro pessoa a...,triste,0
5,triste panna cota Vim salivar,triste,0
6,cara triste,triste,0
7,triste querer dizer coisa saber heh,triste,0
8,inércia intelectual maioria povo gritante para...,triste,0
9,triste desilusão depressão dor decepção transt...,triste,0


In [82]:
# Buscando categorias
text_categories = tweets["sentimento"].unique()

# Separando os dados de treino e teste
X = tweets["Texto"]
y = tweets["sentimento"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("We have {} unique classes".format(len(text_categories)))
print("We have {} training samples".format(len(X_train)))
print("We have {} test samples".format(len(X_test)))


We have 2 unique classes
We have 128588 training samples
We have 32147 test samples


In [118]:
print(tweets['sentimento'].value_counts())
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

under_sampler = RandomUnderSampler(random_state=42)

X_train_res, y_train_res = under_sampler.fit_resample(X_train.values.reshape(-1, 1), y_train)
X_train_res = X_train_res.reshape(-1)
print(f"Training target statistics: {Counter(y_train_res)}")

X_test_res, y_test_res = under_sampler.fit_resample(X_test.values.reshape(-1, 1), y_test)
X_test_res = X_test_res.reshape(-1)
print(f"Training target statistics: {Counter(y_test_res)}")



0    142307
1     18428
Name: sentimento, dtype: int64
Training target statistics: Counter({0: 14709, 1: 14709})
Testing target statistics: Counter({0: 28428, 1: 3719})


In [119]:
text_clf = Pipeline(
    [
        ("vect", CountVectorizer(analyzer='word')),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

# tuned_parameters = {
#     'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__alpha': [1, 1e-1, 1e-2]
# }

# clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring='accuracy')
# clf.fit(X_train, y_train)
text_clf.fit(X_train_res, y_train_res)
predictions = text_clf.predict(X_test_res)
print(classification_report(predictions, y_test_res))
print(confusion_matrix(predictions, y_test_res))
print(accuracy_score(predictions, y_test_res))

# print(classification_report(y_test, clf.predict(X_test), digits=4))


              precision    recall  f1-score   support

           0       0.96      0.98      0.97     27694
           1       0.86      0.72      0.78      4453

    accuracy                           0.95     32147
   macro avg       0.91      0.85      0.88     32147
weighted avg       0.94      0.95      0.94     32147

[[27182   512]
 [ 1246  3207]]
0.9453137151211621


In [102]:
# def model_prediction (text):
#     return text_clf.predict([text_processing(text)])

In [None]:
from joblib import dump

dump(text_clf, "tweeter_multinomialNB_pt.joblib")
