In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC #LinearSVC should scale better to a higher number of samples than SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, Bidirectional
from keras.utils import np_utils
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from transformers import RobertaTokenizer, TFRobertaModel
from transformers import XLNetTokenizer, TFXLNetModel
from scipy.stats import pearsonr

pd.set_option('display.max_colwidth', 50)


In [2]:
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
# Load the `pt_core_news_lg` spaCy pipeline once; `nlp` is the shared pipeline
# object that the lemmatization and part-of-speech counting cells call later.
nlp = spacy.load("pt_core_news_lg")

In [4]:
# Load the spaCy language model for European Portuguese
#nlp = Portuguese()

In [5]:
def check(df):
    """Summarise every column of ``df``: dtype, distinct count and null count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the headers ``column``, ``dtypes``,
        ``nunique`` (number of distinct non-null values) and ``sum_null``
        (number of null entries). A frame without columns yields an empty
        summary that still carries these four headers.
    """
    summary_columns = ['column', 'dtypes', 'nunique', 'sum_null']
    rows = [
        [col, df[col].dtypes, df[col].nunique(), df[col].isnull().sum()]
        for col in df.columns
    ]
    # Hand the headers to the constructor rather than assigning ``.columns``
    # afterwards: the assignment raises a length-mismatch ValueError when
    # ``rows`` is empty (i.e. ``df`` has no columns).
    return pd.DataFrame(rows, columns=summary_columns)

Step 1: Load CSV file into a Pandas DataFrame and clean it (missing, null and duplicated data)

In [6]:
# Load data from CSV file and keep a 10,000-row random subsample.
import pandas as pd

# A fixed random_state makes the subsample (and everything derived from it)
# reproducible across kernel restarts; without it each run draws different rows.
# reset_index(drop=True) discards the original row labels so the sample is
# indexed 0..9999.
df = (
    pd.read_csv('Final_dataset_portuguese.csv')
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Text,Source,URL,Label
0,O SNS está um caos completo Onde estão os gran...,,https://www.direitapolitica.com/o-sns-esta-um-...,0
1,Está a circular no Facebook uma imagem que sup...,facebook,https://poligrafo.sapo.pt/fact-check/fact-chec...,1
2,"Covid-19: Espanha decreta ""fim da crise sanitá...",SIC Noticias,https://sicnoticias.pt/especiais/coronavirus/2...,1
3,Tem medo de dentistas? Conheça o SNAP-ON SMILE...,amilcar freitas,https://arquivo.pt/wayback/20160603024458mp_/h...,0
4,Cientistas descobrem a razão que leva os bebés...,SIC Noticias,https://sicnoticias.pt/mundo/2018-03-12-Cienti...,1


In [7]:
# Per-column overview of the sampled data: dtype, distinct values, null count.
check(df)

Unnamed: 0,column,dtypes,nunique,sum_null
0,Text,object,8938,0
1,Source,object,842,2006
2,URL,object,10000,0
3,Label,int64,2,0


In [8]:
#df.drop(columns=['Unnamed: 0'],inplace=True)

In [9]:
# delete missing data
#df.dropna(inplace=True)

Step 2: Preprocess the text data

Stopword Removal

In [10]:
# Build the Portuguese stop-word set once. Rebuilding it inside the function
# would re-read the corpus word list for every row passed through ``apply``.
PORTUGUESE_STOPWORDS = frozenset(stopwords.words('portuguese'))


def remove_stopwords(text):
    """Return ``text`` with Portuguese stop words removed.

    The text is tokenized with NLTK's ``word_tokenize`` using the Portuguese
    sentence model (the default model is English). Tokens whose lower-cased
    form is in NLTK's Portuguese stop-word list are dropped, and the remaining
    tokens (punctuation included) are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-joined tokens that are not stop words.
    """
    tokens = word_tokenize(text, language='portuguese')
    return " ".join(
        token for token in tokens if token.lower() not in PORTUGUESE_STOPWORDS
    )


df['text_preprocessed'] = df['Text'].apply(remove_stopwords)

In [11]:
#stop_words = set(stopwords.words('portuguese'))
#df['title'] = df['title'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))
#df['text_preprocessed'] = df['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

In [12]:
# Preview the first rows, now including the stop-word-filtered `text_preprocessed` column.
df.head()

Unnamed: 0,Text,Source,URL,Label,text_preprocessed
0,O SNS está um caos completo Onde estão os gran...,,https://www.direitapolitica.com/o-sns-esta-um-...,0,SNS caos completo Onde grandoleiros ? outro go...
1,Está a circular no Facebook uma imagem que sup...,facebook,https://poligrafo.sapo.pt/fact-check/fact-chec...,1,circular Facebook imagem supostamente demonstr...
2,"Covid-19: Espanha decreta ""fim da crise sanitá...",SIC Noticias,https://sicnoticias.pt/especiais/coronavirus/2...,1,Covid-19 : Espanha decreta `` fim crise sanitá...
3,Tem medo de dentistas? Conheça o SNAP-ON SMILE...,amilcar freitas,https://arquivo.pt/wayback/20160603024458mp_/h...,0,medo dentistas ? Conheça SNAP-ON SMILE Snap-On...
4,Cientistas descobrem a razão que leva os bebés...,SIC Noticias,https://sicnoticias.pt/mundo/2018-03-12-Cienti...,1,Cientistas descobrem razão leva bebés mexerem-...


Stemming

In [13]:
# stemmer = PorterStemmer()

# #df['title'] = df['title'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
# df['text_preprocessed'] = df['text_preprocessed'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))

# df.head()

Lemmatization

In [14]:
# Lemmatization helper built on the shared spaCy pipeline `nlp`.
def lemmatize_text(text):
    """Return ``text`` with each spaCy token replaced by its lemma.

    The text is run through ``nlp`` and the ``lemma_`` of every token is joined
    with single spaces, so the result is a re-tokenized, space-separated string.
    """
    return " ".join(token.lemma_ for token in nlp(text))

# Overwrites `text_preprocessed` in place with its lemmatized form.
df['text_preprocessed'] = df['text_preprocessed'].apply(lemmatize_text)

In [15]:
# Preview the first rows after `text_preprocessed` has been lemmatized.
df.head()

Unnamed: 0,Text,Source,URL,Label,text_preprocessed
0,O SNS está um caos completo Onde estão os gran...,,https://www.direitapolitica.com/o-sns-esta-um-...,0,SNS caos completo onde Grandoleiros ? outro go...
1,Está a circular no Facebook uma imagem que sup...,facebook,https://poligrafo.sapo.pt/fact-check/fact-chec...,1,circular Facebook imagem supostamente demonstr...
2,"Covid-19: Espanha decreta ""fim da crise sanitá...",SIC Noticias,https://sicnoticias.pt/especiais/coronavirus/2...,1,Covid-19 : Espanha decretar ` ` fim crise sani...
3,Tem medo de dentistas? Conheça o SNAP-ON SMILE...,amilcar freitas,https://arquivo.pt/wayback/20160603024458mp_/h...,0,medo dentista ? Conheça sNAP-ON SMILE Snap-On ...
4,Cientistas descobrem a razão que leva os bebés...,SIC Noticias,https://sicnoticias.pt/mundo/2018-03-12-Cienti...,1,cientista descobrer razão levar bebé mexerer s...


Ngrams

In [16]:
#from nltk import ngrams

#df['title_ngrams'] = df['title'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in word_tokenize(x)]))

Additional linguistic features

In [17]:
def count_words(text):
    """Return the number of NLTK ``word_tokenize`` tokens in ``text``."""
    return len(word_tokenize(text))


def _count_pos(text, tag):
    """Return how many spaCy tokens of ``text`` have coarse POS tag ``tag``."""
    return sum(1 for token in nlp(text) if token.pos_ == tag)


def count_nouns(text):
    """Return the number of tokens in ``text`` that spaCy tags as ``NOUN``."""
    return _count_pos(text, 'NOUN')


def count_verbs(text):
    """Return the number of tokens in ``text`` that spaCy tags as ``VERB``."""
    return _count_pos(text, 'VERB')


def count_adjectives(text):
    """Return the number of tokens in ``text`` that spaCy tags as ``ADJ``."""
    return _count_pos(text, 'ADJ')


def count_adverbs(text):
    """Return the number of tokens in ``text`` that spaCy tags as ``ADV``."""
    return _count_pos(text, 'ADV')


df['count_words_text'] = df['text_preprocessed'].apply(count_words)

# Parse every document exactly once and derive all four POS counts from that
# single parse. Applying the four count_* helpers column by column would run
# the full spaCy pipeline four times per row.
# NOTE(review): the tagger sees stop-word-stripped, lemmatized text here, which
# may reduce tagging accuracy compared with the raw `Text` column - confirm
# this is intended.
POS_COUNT_COLUMNS = {
    'num_nouns_text': 'NOUN',
    'num_verbs_text': 'VERB',
    'num_adj_text': 'ADJ',
    'num_adv_text': 'ADV',
}
pos_tags_per_doc = [
    [token.pos_ for token in doc]
    for doc in nlp.pipe(df['text_preprocessed'])
]
for column_name, tag in POS_COUNT_COLUMNS.items():
    df[column_name] = [doc_tags.count(tag) for doc_tags in pos_tags_per_doc]

Sentiment score

In [18]:
# NOTE(review): NLTK's VADER analyzer is built around an English lexicon, so
# scores computed on Portuguese text are presumably unreliable - confirm
# before relying on this feature.
sia = SentimentIntensityAnalyzer()


def _vader_score(text, key):
    """Return the ``key`` entry of VADER's ``polarity_scores`` for ``text``."""
    return sia.polarity_scores(text)[key]


def get_avg_sentiment_score(text):
    """Return VADER's ``compound`` score for ``text``."""
    return _vader_score(text, 'compound')


def get_pos_sentiment_score(text):
    """Return VADER's ``pos`` score for ``text``."""
    return _vader_score(text, 'pos')


def get_neu_sentiment_score(text):
    """Return VADER's ``neu`` score for ``text``."""
    return _vader_score(text, 'neu')


def get_neg_sentiment_score(text):
    """Return VADER's ``neg`` score for ``text``."""
    return _vader_score(text, 'neg')


# Only the compound score is stored as a feature column.
df['sentiment_avg_text'] = df['text_preprocessed'].apply(get_avg_sentiment_score)

Step 3: Create final feature column

In [19]:
# Concatenate the preprocessed text, the sentiment score, the linguistic counts
# and the source into one whitespace-separated string per row.
NUMERIC_FEATURE_COLUMNS = [
    'sentiment_avg_text',
    'count_words_text',
    'num_nouns_text',
    'num_verbs_text',
    'num_adj_text',
    'num_adv_text',
]


def _build_feature_text(row):
    """Return the space-joined feature string for one DataFrame row.

    Order: ``text_preprocessed``, each column in ``NUMERIC_FEATURE_COLUMNS``
    rendered with ``str``, then ``Source`` when it is present.
    """
    parts = [row['text_preprocessed']]
    parts.extend(str(row[column]) for column in NUMERIC_FEATURE_COLUMNS)
    # A missing Source is NaN; str(NaN) would inject the literal token "nan"
    # into the feature text, so missing sources are skipped instead.
    if pd.notna(row['Source']):
        parts.append(str(row['Source']))
    return ' '.join(parts)


df['features'] = df.apply(_build_feature_text, axis=1)

In [20]:
# Preview the first rows with the count, sentiment and combined `features` columns added.
df.head()

Unnamed: 0,Text,Source,URL,Label,text_preprocessed,count_words_text,num_nouns_text,num_verbs_text,num_adj_text,num_adv_text,sentiment_avg_text,features
0,O SNS está um caos completo Onde estão os gran...,,https://www.direitapolitica.com/o-sns-esta-um-...,0,SNS caos completo onde Grandoleiros ? outro go...,71,16,14,7,6,0.0,SNS caos completo onde Grandoleiros ? outro go...
1,Está a circular no Facebook uma imagem que sup...,facebook,https://poligrafo.sapo.pt/fact-check/fact-chec...,1,circular Facebook imagem supostamente demonstr...,108,33,25,11,3,-0.25,circular Facebook imagem supostamente demonstr...
2,"Covid-19: Espanha decreta ""fim da crise sanitá...",SIC Noticias,https://sicnoticias.pt/especiais/coronavirus/2...,1,Covid-19 : Espanha decretar ` ` fim crise sani...,405,113,60,48,11,0.3818,Covid-19 : Espanha decretar ` ` fim crise sani...
3,Tem medo de dentistas? Conheça o SNAP-ON SMILE...,amilcar freitas,https://arquivo.pt/wayback/20160603024458mp_/h...,0,medo dentista ? Conheça sNAP-ON SMILE Snap-On ...,352,103,48,68,14,0.9662,medo dentista ? Conheça sNAP-ON SMILE Snap-On ...
4,Cientistas descobrem a razão que leva os bebés...,SIC Noticias,https://sicnoticias.pt/mundo/2018-03-12-Cienti...,1,cientista descobrer razão levar bebé mexerer s...,213,71,50,37,10,0.296,cientista descobrer razão levar bebé mexerer s...


In [21]:
# Persist the DataFrame (original columns plus every derived column) to CSV;
# index=False keeps the row index out of the file.
df.to_csv("Final_preprocessed_dataset_portuguese.csv", index = False)