## Function to parse a URL

In [1]:
from newspaper import Article

In [2]:
# Sample article URL used by the first examples in this notebook.
url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'

In [50]:
def get_text_title(url):
    """Download the page at ``url`` and extract its headline and body.

    Args:
        url: Address of the article to fetch.

    Returns:
        A ``(title, text)`` tuple taken from the parsed ``Article``.
    """
    page = Article(url)
    # The article must be downloaded and parsed before its fields are read.
    page.download()
    page.parse()
    return page.title, page.text

In [13]:
# Fetch the title and body text of the sample article.
title, text = get_text_title(url)

## Preprocess

In [19]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

def rm_punctuation(tokenized):
    """Strip punctuation characters from every token.

    Each character listed in ``string.punctuation`` is deleted from each
    token; tokens that end up empty (i.e. were pure punctuation) are dropped.

    Args:
        tokenized: Iterable of string tokens.

    Returns:
        A new list of the non-empty, punctuation-free tokens in input order.
    """
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = (punct_pattern.sub(u'', tok) for tok in tokenized)
    return [tok for tok in stripped if tok != u'']

def rm_stopwords(words):
    """Drop English stop words from a list of tokens.

    Matching is exact and case-sensitive against NLTK's English stop-word
    list, exactly as before.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list containing the tokens that are not stop words, in order.
    """
    # Load the stop-word list once and use a set: the previous version
    # re-read the list and scanned it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stemming(words):
    """Reduce each token to its stem with the English Snowball stemmer.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the stemmed form of every token, in input order.
    """
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(token) for token in words]

def clean_text(text):
    """Run the full preprocessing pipeline on a piece of text.

    Steps, in order: tokenize, strip punctuation, remove stop words, stem.

    Args:
        text: Raw text to preprocess.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    stems = stemming(rm_stopwords(rm_punctuation(tokens)))
    return " ".join(stems)

In [22]:
# Quick check of the preprocessing pipeline on the article title.
clean_text(title)

'new year new law obamacar pot gun drone'

## Embedding

In [34]:
import pickle
from keras.preprocessing.sequence import pad_sequences

In [29]:
# Load the fitted tokenizer. The context manager closes the file handle,
# which the bare open(...) call previously left open.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("./data/tokenizer.pkl", 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

Using TensorFlow backend.


In [62]:
def word2indx(text, trained_tokenizer, max_len=2000):
    """Turn text into index sequences sized by ``pad_sequences``.

    Args:
        text: A single string, or a collection of strings.
        trained_tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Value passed as ``maxlen`` to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` applied to the tokenizer's sequences.
    """
    # A lone string is wrapped so the tokenizer always receives a list.
    texts = [text] if isinstance(text, str) else text
    return pad_sequences(trained_tokenizer.texts_to_sequences(texts), maxlen=max_len)

In [67]:
# Convert the article body into a padded index array and display it.
# NOTE(review): this indexes the raw `text`, whereas predict_news() runs
# clean_text() first; confirm which form the model expects.
inded = word2indx(text, tokenizer)
inded

array([[    0,     0,     0, ..., 18190,   691,     6]], dtype=int32)

## Make prediction

In [69]:
from keras.models import load_model

# Load the saved Keras model from disk.
model = load_model("./data/models/CNN_LSTM2")

In [101]:
def predict_from_index(model, index):
    """Print a verdict on whether the indexed article looks true or fake.

    Args:
        model: Object whose ``predict(index)`` returns an array that flattens
            to at least two values; element 0 is treated as the "true"
            score and element 1 as the "fake" score.
            (Assumes both are probabilities in [0, 1]; confirm against the
            model's output layer.)
        index: Padded index array passed straight to ``model.predict``.

    Returns:
        None. The verdict is printed to stdout.
    """
    prediction = model.predict(index).flatten()
    prob_true = float(prediction[0])
    prob_false = float(prediction[1])
    # Thresholds are compared on the 0-1 scale; percentages are used only
    # for display. Previously the values were scaled by 100 *before* being
    # compared with 0.7 / 0.5 / 0.3, so almost anything counted as "true".
    pct_true = prob_true * 100
    pct_false = prob_false * 100
    if prob_true >= 0.7:
        print("Likely a true news! (%0.3f percent confident)" % pct_true)
    elif prob_true >= 0.5:
        # The format argument was missing here, so "%0.3f" was printed as-is.
        print("Maybe a true news, not every sure (%0.3f percent confident)" % pct_true)
    elif prob_true > 0.3:
        # prob_true < 0.5 is already guaranteed by the branch above. The old
        # condition used bitwise `&`, which binds tighter than the
        # comparisons and raises TypeError on floats.
        print("This is mabye a fake news (%0.3f percent confident)" % pct_false)
    elif prob_false >= 0.7:
        print("Attention! Likely a fake news! (%0.3f percent confident)"% pct_false)
    else:
        print("We are %0.3f percent confident that this is a true news" % pct_true)

In [102]:
# Print the verdict for the index array built above.
predict_from_index(model, inded)

Likely a true news! (99.831 percent confident)


In [103]:
def predict_news(model, url, trained_tokenizer, max_len=2000):
    """Fetch an article by URL and print the model's verdict on it.

    Args:
        model: Model handed to ``predict_from_index``.
        url: Address of the article to classify.
        trained_tokenizer: Fitted tokenizer handed to ``word2indx``.
        max_len: Sequence length handed to ``word2indx``.

    Returns:
        None. The verdict is printed by ``predict_from_index``.
    """
    # Only the body text is classified; the title is fetched but not used.
    _title, body = get_text_title(url)
    padded = word2indx(clean_text(body), trained_tokenizer, max_len)
    predict_from_index(model, padded)

In [100]:
# End-to-end run: fetch, clean, index and classify the article at `url`.
predict_news(model, url, tokenizer)

This is likely to be a true news (98.860 percent confident)


## Try out

In [52]:
from src.prediction import predict_news, get_text_title
from keras.models import load_model
import pickle

In [13]:
# Load the saved model and the fitted tokenizer from disk.
model = load_model("./data/models/CNN_LSTM2")
# The context manager closes the file handle, which the bare open(...) call
# previously left open.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("./data/tokenizer.pkl", 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [59]:
# Fetch an article from a second site and inspect its extracted body text.
url = "http://abcnews.com.co/way-choose-best-karaoke-microphone/"
title, text = get_text_title(url)
text

'Micro karaoke is an indispensable device in the karaoke set; it plays a relatively important role to make great sound. However, in order to choose the best karaoke microphone or the best mic for YouTube, you should refer to the post which will help you find the best way to choose the best one. Most people prefer wireless microphones, because they are very convenient and they can move in the room without having to worry about the wire. However, when you buy this type of microphone we must pay attention to the following points, because it directly affects the problem of ear or microphone. Follow the instructions to choose the best wireless microphone set for you.\n\n1 Wireless microphone is only convenient in wide and non-entangled space\n\nAccording to many years’ experiences of supply and installation of karaoke, we find that with the narrow room area, the best solution is to choose a wired microphone, because the wireless mic usually has the phenomenon of howling in the narrow space.

## Fake news websites

247NewsMedia.com
http://www.rappler.com/nation/173832-cbcp-list-websites-fake-news

In [67]:
# Another article URL to classify.
url = "http://abcnews.com.co/specifics-tugboats-maritime-law/"

In [68]:
# Classify the article at `url`, using predict_news as imported from
# src.prediction earlier in this section.
predict_news(model, url, tokenizer)

Attention! Likely a fake news! (99.537 percent confident)


In [69]:
from keras.utils import plot_model

In [70]:
# Write a diagram of the model to ./data/model.png.
plot_model(model, to_file="./data/model.png")