# Scraping news from the web

In [1]:
from data.articles import load_news_articles

In [2]:
# Scrape the front-page articles of the configured news sites (the progress
# output below shows one pass per site).
# NOTE(review): presumably returns a pandas DataFrame with a "text" column,
# since later cells index articles["text"] — confirm in data/articles.
articles = load_news_articles()

Loading articles from https://www.nrk.no/: 100%|██████████| 89/89 [00:41<00:00,  2.16it/s]
Loading articles from https://www.vg.no/: 100%|██████████| 190/190 [00:41<00:00,  4.60it/s]
Loading articles from https://www.aftenposten.no/: 100%|██████████| 71/71 [00:24<00:00,  2.88it/s]
Loading articles from https://www.nettavisen.no/: 100%|██████████| 26/26 [00:12<00:00,  2.07it/s]


### Loading tokenizer

In [3]:
from data.norec.dataloader import load_full_dataset
from data.dataloader import load_data

# Rebuild the training data pipeline; per the log output this cleans the text,
# fits the tokenizer and pads the sequences.
dataset = load_full_dataset()
data = load_data(dataset)

# Only the fitted tokenizer and the padding length are reused further down,
# so that scraped articles are encoded the same way as the data loaded here.
tokenizer, maxlen = data['tokenizer'], data['maxlen']

Cleaning text...
Fitting tokenizer...
Converting texts to sequences...
Padding sequences...
One-hot encoding labels...
Done!


In [4]:
def _normalise(raw):
    """Lower-case ``raw`` and keep only its purely alphabetic tokens.

    Tokens are the whitespace-separated pieces of the text; any token that
    contains a digit or punctuation character is dropped entirely.
    """
    # str.split() without arguments splits on every kind of whitespace,
    # newlines included, so no separate newline replacement is needed.
    tokens = raw.lower().split()
    return " ".join(token for token in tokens if token.isalpha())


articles["text"] = articles["text"].apply(_normalise)

# Predicting sentiment of news articles

In [5]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model


# Location of the trained classifier, relative to the notebook's working
# directory. NOTE: the recorded run of this notebook failed with
# "OSError: No file or directory found" for this path, so make sure the file
# exists (or pass a different ``model_path``) before running.
MODEL_PATH = '../training/models/best_model_0.7956791520118713.h5'

# Class names, indexed by the position of the largest value in the prediction.
# NOTE(review): assumes the model's outputs are ordered negative/neutral/positive
# — confirm against the label encoding used during training.
SENTIMENT_LABELS = ('negative', 'neutral', 'positive')

# Models already read from disk, keyed by path, so that each file is loaded at
# most once per kernel instead of once per article.
_MODEL_CACHE = {}


def _get_model(model_path):
    """Return the model stored at ``model_path``, loading it on first use only."""
    if model_path not in _MODEL_CACHE:
        _MODEL_CACHE[model_path] = load_model(model_path)
    return _MODEL_CACHE[model_path]


def get_sentiment(text, model_path=MODEL_PATH):
    """Predict the sentiment class of a single piece of text.

    Parameters
    ----------
    text : str
        The (already cleaned) text to classify.
    model_path : str, optional
        Path of the saved model to use. Defaults to ``MODEL_PATH``.

    Returns
    -------
    str
        One of ``'negative'``, ``'neutral'`` or ``'positive'``.

    Raises
    ------
    OSError
        Propagated from ``load_model`` when ``model_path`` does not exist.
    """
    # Reuse the cached model; loading it anew for every article is very slow.
    model = _get_model(model_path)

    # Encode the text with the module-level tokenizer and pad it to ``maxlen``.
    sequenced = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequenced, maxlen=maxlen)

    # verbose=0 keeps Keras from printing a progress bar for every article.
    prediction = model.predict(padded, verbose=0)
    return SENTIMENT_LABELS[int(np.argmax(prediction))]


# Classify every article: adds a 'sentiment' column holding the label that
# get_sentiment returns ('negative', 'neutral' or 'positive') for each text.
articles['sentiment'] = articles['text'].apply(get_sentiment)

OSError: No file or directory found at ../training/models/best_model_0.7956791520118713.h5

In [None]:
# Display the articles together with their predicted sentiment.
articles

In [None]:
# Count how many articles were assigned to each sentiment class
# (this shows a table of counts; nothing is plotted here).
articles.sentiment.value_counts()

In [None]:
# Print the text of the first article classified as positive.
# `articles[mask][0]` would look up a *column* labelled 0 (a KeyError here),
# so select the "text" column with .loc and take the first row by position.
positive_texts = articles.loc[articles.sentiment == 'positive', 'text']
# Guard against a class without any articles, where .iloc[0] would raise.
print(positive_texts.iloc[0] if not positive_texts.empty else 'No positive articles found.')

In [None]:
# Print the text of the first article classified as negative.
# `articles[mask][0]` would look up a *column* labelled 0 (a KeyError here),
# so select the "text" column with .loc and take the first row by position.
negative_texts = articles.loc[articles.sentiment == 'negative', 'text']
# Guard against a class without any articles, where .iloc[0] would raise.
print(negative_texts.iloc[0] if not negative_texts.empty else 'No negative articles found.')

In [None]:
# Print the text of the first article classified as neutral.
# `articles[mask][0]` would look up a *column* labelled 0 (a KeyError here),
# so select the "text" column with .loc and take the first row by position.
neutral_texts = articles.loc[articles.sentiment == 'neutral', 'text']
# Guard against a class without any articles, where .iloc[0] would raise.
print(neutral_texts.iloc[0] if not neutral_texts.empty else 'No neutral articles found.')