In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../clean_dataset_no_stemming.csv")
display(df['clean_text'].head(5))
print(df.shape)

0    uk manufacturing sector continue face serious ...
1    climate change fight aids leading list concern...
2    shares europe leading reinsurers travel firms ...
3    shares india largest power producer national t...
4    luxury goods group lvmh sold loss making chris...
Name: clean_text, dtype: object

(2225, 5)


### Binarizando

In [5]:
new_target = {'sport': 1, 'business': 0, 'politics': 2, 'tech': 3, 'entertainment': 4}
df['binary_target'] = df['target'].map(new_target)
print(df['binary_target'].value_counts())

1    511
0    510
2    417
3    401
4    386
Name: binary_target, dtype: int64


### Balanceando

In [7]:
dataset_balanceado = df[(df['binary_target']==0) | (df['binary_target']==1)]
display(dataset_balanceado)

Unnamed: 0,filename,title,article,target,clean_text,binary_target
0,289.txt,UK economy facing 'major risks'\n,The UK manufacturing sector will continue to f...,business,uk manufacturing sector continue face serious ...,0
1,504.txt,Aids and climate top Davos agenda\n,Climate change and the fight against Aids are ...,business,climate change fight aids leading list concern...,0
2,262.txt,Asian quake hits European shares\n,Shares in Europe's leading reinsurers and trav...,business,shares europe leading reinsurers travel firms ...,0
3,276.txt,India power shares jump on debut\n,"Shares in India's largest power producer, Nati...",business,shares india largest power producer national t...,0
4,510.txt,Lacroix label bought by US firm\n,Luxury goods group LVMH has sold its loss-maki...,business,luxury goods group lvmh sold loss making chris...,0
...,...,...,...,...,...,...
1819,253.txt,Old Firm pair handed suspensions\n,Celtic's Henri Camara and Nacho Novo of Ranger...,sport,celtic henri camara nacho novo rangers suspend...,1
1820,247.txt,Real will finish abandoned match\n,Real Madrid and Real Socieded will play the fi...,sport,real madrid real socieded play final six minut...,1
1821,509.txt,Melzer shocks Agassi in San Jose\n,Second seed Andre Agassi suffered a comprehens...,sport,second seed andre agassi suffered comprehensiv...,1
1822,290.txt,O'Gara revels in Ireland victory\n,Ireland fly-half Ronan O'Gara hailed his side'...,sport,ireland fly half ronan ogara hailed side victo...,1


### Carregando modelo BERT

In [8]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Tokenizando

In [9]:
tokenized = dataset_balanceado['clean_text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors


### Padding

In [10]:
max_len = 0
counter = 0
max_possible_len = 512
for index,i in enumerate(tokenized.values):
    # cropping input to length 512
    if len(i) > max_possible_len:
        tokenized.values[index] = i[:max_possible_len-1]
        # get the length
    if len(i) > max_len:
        max_len = len(tokenized.values[index])
print(max_len)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

511


In [11]:
np.array(padded).shape

(1021, 511)

### Masking

In [12]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1021, 511)

### Deep Learning

In [None]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)