# Processing data for NLP

In [1]:
##Import libraries
import pandas as pd
import numpy as np

import ssl
# Workaround: make the ssl module's default HTTPS context factory point at the
# unverified-context factory, when this Python exposes one.
# NOTE(review): this switches off TLS certificate verification for code in this
# kernel that relies on ssl's default HTTPS context, not only for NLTK downloads.
# Presumably added to get nltk.download() working -- confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No ssl._create_unverified_context on this Python: keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords

### Defining functions: 
- Tokenize
- Lemmatize
- Stem
- Remove stopwords

In [2]:
def tokenize(s, language='portuguese'):
    """Split a text into word tokens with NLTK.

    Parameters
    ----------
    s : str
        Text to tokenize.
    language : str, default 'portuguese'
        Language name forwarded to ``word_tokenize``. The dataset is in
        Portuguese, so the Portuguese model is used by default instead of
        NLTK's own default ('english').

    Returns
    -------
    list of str
        The tokens, in order of appearance.
    """
    return word_tokenize(s, language=language)

def stemmer_words(l):
    """Stem every token of ``l`` with the Portuguese Snowball stemmer.

    Parameters
    ----------
    l : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    portuguese_stemmer = SnowballStemmer('portuguese')
    return list(map(portuguese_stemmer.stem, l))

def lemmatize(l):
    """Lemmatize every token of ``l`` with NLTK's ``WordNetLemmatizer``.

    Each token is lemmatized with the lemmatizer's default part of speech.

    NOTE(review): WordNet is an English lexical resource, so this is probably
    close to a no-op on Portuguese tokens -- confirm it is doing useful work.

    Parameters
    ----------
    l : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in l:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

def remove_stopwords(l, language='portuguese'):
    """Drop stopwords from a list of tokens.

    The stopword list is loaded once per call and turned into a set, so each
    token costs a single membership test. Only the stopwords of ``language``
    are used; calling ``stopwords.words()`` with no argument would pull in the
    lists of every language in the corpus and strip unrelated words too.

    Parameters
    ----------
    l : iterable of str
        Tokens to filter.
    language : str, default 'portuguese'
        Name of the NLTK stopword list to filter against.

    Returns
    -------
    list of str
        The tokens of ``l`` that are not stopwords, in their original order.
    """
    stop_set = set(stopwords.words(language))
    return [word for word in l if word not in stop_set]

### Importing dataset

In [3]:
# Load the news dataset, then discard the "Unnamed: 0" column that is read
# from the file, and preview the first rows.
news = pd.read_csv('news.tsv')
news = news.drop(columns=["Unnamed: 0"])
news.head(3)

Unnamed: 0,title,text,tag,date,author,url,rating,month,year
0,Corinthians publicou nota de repúdio sobre a...,corinthians noto oficial repúdio o realização ...,entretenimento,2021-02,Edgard Matsuki,boatos.org,0.0,Feb,2021
1,É falso que família de Ciro Gomes tenha 77 emp...,haver indício o família gomar participar negóc...,política,2020-07,Comprova,projetocomprova.com.br,0.0,Jul,2020
2,Ex chefe da comunicação da PF volta ao posto a...,o delegar polícia federal fabio ricardo ciavol...,política,2020-05,FelipeNériG1,g1.globo.com,1.0,May,2020


### Encoding tags

In [4]:
import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import LabelEncoder

In [5]:
# Encoder used below to turn the `tag` column into integer codes.
le = LabelEncoder()

In [6]:
# Fit the encoder on the tag values and store the resulting codes in `tagenc`.
news['tagenc'] = le.fit_transform(news['tag'].tolist())

In [7]:
# Indicator column for rating == 0.0 (taken from the one-hot encoding of
# `rating`), placed next to the encoded tag.
tags = pd.get_dummies(news['rating'])[0.0].to_frame()
tags['tagenc'] = news['tagenc']

In [8]:
# Correlation between the rating == 0.0 indicator and the encoded tag.
tags.corr()

Unnamed: 0,0.0,tagenc
0.0,1.0,-0.426596
tagenc,-0.426596,1.0


In [9]:
# Display the frame without the original `tag` column.
# NOTE(review): the result is not assigned, so `news` itself still contains
# `tag` in the cells below -- confirm whether the drop was meant to be kept.
news.drop(columns=['tag'])

Unnamed: 0,title,text,date,author,url,rating,month,year,tagenc
0,Corinthians publicou nota de repúdio sobre a...,corinthians noto oficial repúdio o realização ...,2021-02,Edgard Matsuki,boatos.org,0.0,Feb,2021,2
1,É falso que família de Ciro Gomes tenha 77 emp...,haver indício o família gomar participar negóc...,2020-07,Comprova,projetocomprova.com.br,0.0,Jul,2020,5
2,Ex chefe da comunicação da PF volta ao posto a...,o delegar polícia federal fabio ricardo ciavol...,2020-05,FelipeNériG1,g1.globo.com,1.0,May,2020,5
3,bolsonaro assinar decretar proibir o entrar vi...,quinzena setembro recortar jornal começar o co...,2019-09,gilmarlopes,e-farsas.com,0.0,Sep,2019,5
4,PT e esquerda estão abrindo buracos em estra...,diabólico bandido pt abrir buraco estrar feder...,2019-06,Raiane Gonoli,boatos.org,0.0,Jun,2019,5
...,...,...,...,...,...,...,...,...,...
11906,Deolane Bezerra esposa de MC Kevin é advog...,o advogar deolane bezerro fichar visitar presa...,2021-05,Raiane Gonoli,boatos.org,0.0,May,2021,2
11907,Corregedoria da PGR abre sindicância para apur...,o corregedora geral ministério público federal...,2022-05,Aguirre Talento,extra.globo.com,1.0,May,2022,0
11908,Após se aproximar de Bolsonaro governador do ...,o governador distrito federal ibaneis rocha md...,2020-04,Julia Lindnere Mateus,noticias.uol.com.br,1.0,Apr,2020,6
11909,Quando o crime só vem no fim Ou Atirando fle...,artigo o livrar suspeição merecer leitura aten...,2020-03,noticias,noticias.uol.com.br,1.0,Mar,2020,5


### Applying functions to text column

In [10]:
# Step 1: tokenize the `text` column into a new `text_processed` column.
news['text_processed'] = news['text'].apply(tokenize)

In [11]:
# Step 2: replace each token list with its stemmed version.
# This overwrites `text_processed`, so the cell must run exactly once after tokenizing.
news['text_processed'] = news['text_processed'].apply(stemmer_words)

In [12]:
# let's apply this to all of the news
#nltk.download('averaged_perceptron_tagger')

# unfortunately pos_tag and lemmatize use different codes for parts of speech 
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag selects the WordNet constant (J -> adjective, N -> noun,
    V -> verb, R -> adverb). Any other letter falls back to noun.

    NOTE(review): ``nltk.pos_tag`` is called without a language argument --
    presumably it tags as English; confirm before using it on Portuguese text.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()  # first letter of the POS tag

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised letter both map to noun.
    return wordnet.NOUN

In [13]:
# Step 3: lemmatize the (already stemmed) token lists.
# NOTE(review): `lemmatize` uses WordNetLemmatizer without the POS helper defined
# above, and WordNet is an English resource -- confirm this step changes anything
# for Portuguese stems.
news['text_processed'] = news['text_processed'].apply(lemmatize)

In [14]:
# Step 4: filter stopwords out of each token list.
# NOTE(review): tokens were stemmed in step 2, so stopwords whose stem differs
# from the listed form may no longer match -- consider filtering before stemming.
news['text_processed'] =news['text_processed'].apply(remove_stopwords)

## Creating a different set for validation and train-test

In [15]:
#saving a sample for validation

In [16]:
# Remove month and year again - leave only date.
# Re-bind instead of mutating in place and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
news = news.drop(columns=['month', 'year'], errors='ignore')

In [17]:
# Sanity check: (rows, columns) of the full frame before splitting.
news.shape

(11911, 9)

In [18]:
# Hold out the first 1000 rows as the validation set.
# NOTE(review): rows are taken in file order, not sampled at random -- confirm
# the file is already shuffled.
val_news = news.iloc[:1000]

In [19]:
# Sanity check: (rows, columns) of the validation split.
val_news.shape

(1000, 9)

In [20]:
# Persist the validation split.
# NOTE(review): to_csv is called with its default separator, so the file is
# comma-separated despite the .tsv name; the index is written as an extra first
# column. `text_processed` holds Python lists, which presumably end up as their
# string representation in the file -- confirm how downstream code parses them.
val_news.to_csv("val_news.tsv")

In [21]:
# Everything after the first 1000 rows is kept for the train/test work.
newsu = news.iloc[1000:]

In [22]:
# Count missing values per column in the train/test split.
newsu.isna().sum()

title             0
text              0
tag               0
date              0
author            0
url               0
rating            0
tagenc            0
text_processed    0
dtype: int64

In [23]:
# Class balance of the `rating` label in the train/test split.
newsu['rating'].value_counts()

1.0    5468
0.0    5443
Name: rating, dtype: int64

In [24]:
# Persist the train/test split.
# NOTE(review): to_csv is called with its default separator, so the file is
# comma-separated despite the .tsv name, and the index is written as an extra
# first column.
newsu.to_csv("newsu.tsv")