# Processing data for NLP

In [105]:
##Import libraries
import pandas as pd
import numpy as np

import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords

### Defining functions: 
- Tokenize
- Lemmatize
- Stemmize
- Remove stopwords

In [106]:
def tokenize(s):

    tokens = word_tokenize(s)
    return tokens

def stemmer_words(l):
    sb = SnowballStemmer('portuguese')
    stemmed = [sb.stem(word) for word in l] 
    
    return stemmed

def lemmatize(l):
    
    lemmatizer = WordNetLemmatizer() 
    lemmatized = [lemmatizer.lemmatize(word) for word in l]

    return lemmatized

def remove_stopwords(l):
    stopwords.words('portuguese')
    without_sw = [word for word in l if not word in stopwords.words()]
    " ".join(without_sw)

    return without_sw

### Importing dataset

In [107]:
news = pd.read_csv('news.tsv').drop("Unnamed: 0", axis = 1)
news.head(3)

Unnamed: 0,title,text,tag,date,author,url,rating,month,year
0,Corinthians publicou nota de repúdio sobre a...,corinthians noto oficial repúdio o realização ...,entretenimento,2021-02,Edgard Matsuki,boatos.org,0.0,Feb,2021
1,É falso que família de Ciro Gomes tenha 77 emp...,haver indício o família gomar participar negóc...,política,2020-07,Comprova,projetocomprova.com.br,0.0,Jul,2020
2,Ex chefe da comunicação da PF volta ao posto a...,o delegar polícia federal fabio ricardo ciavol...,política,2020-05,FelipeNériG1,g1.globo.com,1.0,May,2020


### Encoding tags

import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

news['tagenc']=le.fit_transform(news.tag.tolist())

tags=pd.DataFrame(pd.get_dummies(news.rating)[0.0])
tags['tagenc']=news['tagenc']

tags.corr()

In [108]:
news2 = news.drop(columns=['month', 'year','rating'])

In [128]:
def lower(s):
    s.lower()
    return s

In [129]:
for column in news2.columns:
    news2[column] = news2[column].apply(lower)

AttributeError: 'list' object has no attribute 'lower'

### Applying functions to all text columns

In [111]:
for column in news2.columns:
    news2[column] = news2[column].apply(tokenize)

In [112]:
for column in news2.columns:
    news2[column] = news2[column].apply(stemmer_words)

In [113]:
# let's apply this to the all the newsfeed 
#nltk.download('averaged_perceptron_tagger')

# unfortunately pos_tag and lemmatize use different codes for parts of speech 
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ, 
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist #default value if its not of the possibilties

In [114]:
for column in news2.columns:
    news2[column] = news2[column].apply(lemmatize)

#Wait for now, too slow to run
for column in news2.columns:
    news2[column] = news2[column].apply(remove_stopwords)

## Creating a different set for validation and train-test

In [115]:
news2['rating'] = news['rating']

In [116]:
#news2['tag'] = news['tag']

In [117]:
#saving a sample for validation

In [118]:
# Remove month and year again - leave only date
#news.drop(columns = ['month', 'year'], inplace = True)

In [119]:
news2.shape

(11911, 7)

In [120]:
val_news = news2.loc[0:999]

In [121]:
val_news.shape

(1000, 7)

In [122]:
val_news.to_csv("val_news2.tsv")

In [123]:
newsu2 = news2.loc[1000:]

In [124]:
newsu2.isna().sum()

title     0
text      0
tag       0
date      0
author    0
url       0
rating    0
dtype: int64

In [125]:
#newsu2['rating'].value_counts()

In [126]:
newsu2.to_csv("newsu2.tsv")

In [127]:
newsu2

Unnamed: 0,title,text,tag,date,author,url,rating
1000,"[weintraub, diz, que, alun, inscrit, decid, da...","[hor, anunc, pesquis, alun, inscrit, enem, opi...",[saúd],[2020-05],[educaca],[educacao.uol.com.br],1.0
1001,"[segund, do, da, coronavac, nã, está, relacion...","[do, escalon, comum, program, vacin, e, serv, ...",[saúd],[2021-01],[comprov],[projetocomprova.com.br],0.0
1002,"[prisã, na, cpi, é, suc, de, brasil, onde, a, ...","[o, ex, diretor, logíst, ministéri, saud, robe...",[brasil],[2021-07],[notic],[noticias.uol.com.br],0.0
1003,"[alvo, da, justic, ex, govern, ricard, coutinh...","[acus, particip, esquem, desvi, dinheir, públi...",[polít],[2020-09],"[raniery, soaresespecialp, estad]",[noticias.uol.com.br],1.0
1004,"[o, pap, francisc, hologram]","[o, víd, surg, o, red, social, o, seman, abril...",[entreten],[2020-04],[gilmarlop],[e-farsas.com],0.0
...,...,...,...,...,...,...,...
11906,"[deolan, bezerr, espos, de, mc, kevin, é, advo...","[o, advog, deolan, bezerr, fich, visit, pres, ...",[entreten],[2021-05],"[raian, gonol]",[boatos.org],0.0
11907,"[corregedor, da, pgr, abre, sindic, par, apur,...","[o, corregedor, geral, ministéri, públic, fede...",[brasil],[2022-05],"[aguirr, talent]",[extra.globo.com],1.0
11908,"[após, se, aproxim, de, bolsonar, govern, do, ...","[o, govern, distrit, federal, iban, roch, mdb,...",[saúd],[2020-04],"[jul, lindner, mateus]",[noticias.uol.com.br],1.0
11909,"[quand, o, crim, só, vem, no, fim, ou, atir, f...","[artig, o, livr, suspeiçã, merec, leitur, aten...",[polít],[2020-03],[notic],[noticias.uol.com.br],1.0
