In [1]:
import warnings
warnings.filterwarnings('ignore')

import ipywidgets as widgets
from IPython.display import display, clear_output

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import string
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import SGDClassifier

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\polin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\polin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# !jupyter nbextension enable --py widgetsnbextension --sys-prefix
# !jupyter serverextension enable voila --sys-prefix

In [4]:
def read_news_from_main_web_page(page="http://catdailynews.com/", timeout=10):
    """Scrape article titles and links from the news site's front page.

    Every ``<a>`` element that contains a ``<div>`` is treated as an article
    teaser: the text of that first ``<div>`` becomes the title and the anchor's
    ``href`` becomes the link. Anchors without an ``href`` are skipped so that
    the title and link lists stay aligned and every stored link can be fetched.

    Args:
        page: URL of the page to scrape. Defaults to the Cat Daily News home page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict of lists keyed by 'Article title', 'Link', 'text', 'cleaned_text',
        'Date', 'Categories' and 'Predicted categories'. Only the first two are
        filled here; the remaining lists are returned empty.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    news_data = {
        'Article title': [],
        'Link': [],
        'text': [],
        'cleaned_text': [],
        'Date': [],
        'Categories': [],
        'Predicted categories': [],
    }

    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(page, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into an empty table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    for anchor in soup.find_all('a'):
        link = anchor.get('href')
        if anchor.div and link:
            news_data['Article title'].append(anchor.div.text)
            news_data['Link'].append(link)
    return news_data

def remove_stopwords(text):
    """Blank out every stop word in ``text``.

    The text is split on single spaces; any token that is an exact
    (case-sensitive) member of the module-level ``eng_stopwords`` is replaced
    by a single space, all other tokens are kept unchanged. The tokens are then
    re-joined with single spaces, so removed words leave extra spaces behind.
    """
    kept_tokens = []
    for token in text.split(' '):
        kept_tokens.append(' ' if token in eng_stopwords else token)
    return " ".join(kept_tokens)

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character turned into a space.

    The set of characters replaced is exactly ``string.punctuation``; the length
    of the string is unchanged.
    """
    # One-to-one translation table: each punctuation character maps to ' '.
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(to_space)

def remove_numbers(text):
    """Return ``text`` with every digit character replaced by a single space.

    A character counts as a digit when ``str.isdigit`` says so, which also
    covers non-ASCII digits such as superscripts.
    """
    pieces = []
    for ch in text:
        pieces.append(' ' if ch.isdigit() else ch)
    return ''.join(pieces)

def remove_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into one space character.

    Tabs and newlines count as whitespace too. Leading and trailing runs are
    reduced to a single space rather than stripped.
    """
    whitespace_run = re.compile(r'\s+', flags=re.IGNORECASE)
    return whitespace_run.sub(' ', text)

def lemmatize_text(text):
    """Lower-case and lemmatize each space-delimited word of ``text``.

    Words are obtained by splitting on single spaces, passed (lower-cased) to
    the module-level ``lemma`` lemmatizer, and re-joined with single spaces.
    """
    lemmatized_words = []
    for word in text.split(' '):
        lemmatized_words.append(lemma.lemmatize(word.lower()))
    return " ".join(lemmatized_words)

def news_text_cleaning(text):
    """Run the full text-cleaning pipeline on a raw article body.

    The steps are applied in this fixed order: stop-word removal, punctuation
    removal, digit removal, whitespace collapsing, then lower-casing and
    lemmatization. The order matters: stop words are matched before the text
    is lower-cased.

    Returns:
        The cleaned text as a single string.
    """
    cleaning_steps = (
        remove_stopwords,
        remove_punctuation,
        remove_numbers,
        remove_multiple_spaces,
        lemmatize_text,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

def predict_humor(text):
    """Classify one cleaned article with the module-level ``humor_clf``.

    The text is wrapped in a one-element list before being passed to
    ``humor_clf.predict``; the classifier's return value is passed through
    unchanged (callers read element 0 of it).
    """
    single_document_batch = [text]
    return humor_clf.predict(single_document_batch)

def predict_other_categories(text):
    """Classify one cleaned article with the module-level ``other_categ_clf``.

    The text is wrapped in a one-element list before being passed to
    ``other_categ_clf.predict``; the classifier's return value is passed
    through unchanged (callers read element 0 of it).
    """
    single_document_batch = [text]
    return other_categ_clf.predict(single_document_batch)

def predict_categories(text):
    """Combine the humor and topic predictions for one cleaned article.

    Both classifiers are always run. The result is:
      * the topic label alone when the humor label is 'not Humor';
      * the humor label alone when it is 'Humor' and the topic is 'No category';
      * otherwise '<humor label>, <topic label>'.

    Returns:
        The combined label as a string.
    """
    humor_label = str(predict_humor(text)[0])
    topic_label = str(predict_other_categories(text)[0])

    if humor_label == 'not Humor':
        return topic_label
    if humor_label == 'Humor' and topic_label == 'No category':
        return humor_label
    return humor_label + ', ' + topic_label

# Shared WordNet lemmatizer instance; lemmatize_text calls lemma.lemmatize on each word.
lemma = WordNetLemmatizer()

# NLTK's English stop-word list, extended in place with a few typographic tokens
# (ellipses and guillemets). remove_stopwords tests membership against this list.
eng_stopwords = stopwords.words("english")
eng_stopwords.extend(['…', '«', '»', '...'])

# Load the two pre-trained classifiers from pickle files given by relative path.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so these
# files must come from a trusted source.
# NOTE(review): presumably fitted scikit-learn pipelines (see the commented-out
# sklearn imports in the import cell) — confirm, and keep the library version
# compatible with the one used to create the pickles.
with open("humor_clf.pkl", "rb") as file:
    humor_clf = pickle.load(file)
    
with open("other_categ_clf.pkl", "rb") as file:
    other_categ_clf = pickle.load(file)

def _extract_article_date(soup):
    """Return the date string found in a ``pull-left`` div, or '' if there is none."""
    date = ''
    for div in soup.find_all('div'):
        # Exact match: the div's class list must be precisely ['pull-left'].
        if div.get("class") == ['pull-left']:
            # Keep the text before the first double space, minus its last two
            # characters. If several divs match, the last one wins.
            date = div.text.split('  ')[0][:-2]
    return date


def _extract_article_categories(soup):
    """Return the capitalised tag names from all ``category-tag`` divs, comma-joined."""
    names = []
    for div in soup.find_all('div'):
        if div.get("class") == ['category-tag']:
            for anchor in div.find_all('a'):
                label = anchor.text
                # Guard against empty anchor text before indexing its first char.
                if label and label[0].isupper():
                    names.append(label)
    # Join once, after every div has been visited, so the accumulator stays a
    # list while it is being appended to.
    return ', '.join(names)


def predict_category(news_data, timeout=10):
    """Download every article in ``news_data['Link']`` and fill in the other columns.

    For each link the page is fetched and parsed, and one entry is appended to
    each of these lists in ``news_data``:
      * 'text' — the text of all ``<p>`` elements concatenated without separator;
      * 'cleaned_text' — that text after ``news_text_cleaning``;
      * 'Date' — the date string from the ``pull-left`` div ('' if missing);
      * 'Categories' — comma-separated site categories ('' if none found);
      * 'Predicted categories' — the result of ``predict_categories``.

    ``news_data`` is modified in place and also returned. Calling the function
    twice on the same dict appends a second set of entries.

    Args:
        news_data: dict of lists as produced by ``read_news_from_main_web_page``.
        timeout: Seconds to wait for each article request before giving up.

    Returns:
        The same ``news_data`` dict, with the per-article lists filled in.

    Raises:
        requests.RequestException: if an article cannot be fetched in time.
    """
    for link in news_data['Link']:
        # Without a timeout one unresponsive article would hang the whole run.
        page_html = requests.get(link, timeout=timeout).text
        soup = BeautifulSoup(page_html, 'lxml')

        text = ''.join(p.text for p in soup.find_all('p'))
        news_data['text'].append(text)
        text_cleaned = news_text_cleaning(text)
        news_data['cleaned_text'].append(text_cleaned)

        # Computed fresh for every article so a page without a date block can
        # neither raise nor inherit the previous article's date.
        news_data['Date'].append(_extract_article_date(soup))
        news_data['Categories'].append(_extract_article_categories(soup))

        news_data['Predicted categories'].append(predict_categories(text_cleaned))
    return news_data

def make_clickable(val):
    """Render ``val`` as an HTML link that opens in a new tab.

    The value is used both as the ``href`` and as the visible link text. Because
    it comes from a scraped web page, it is HTML-escaped (including quotes)
    before being interpolated, so it cannot break out of the attribute or
    inject markup into the rendered table.

    Args:
        val: The URL to link to; converted with ``str`` before escaping.

    Returns:
        An ``<a target="_blank" ...>`` element as a string.
    """
    import html  # local import keeps this function self-contained

    safe_val = html.escape(str(val), quote=True)
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(safe_val, safe_val)

def create_news_table_show(news_data):
    """Build the styled table that is shown to the user.

    ``news_data`` is turned into a DataFrame, reduced to the five display
    columns (raw and cleaned text are dropped), and the 'Link' column is
    formatted through ``make_clickable``.

    Returns:
        A pandas Styler for the display columns.
    """
    display_columns = ['Date', 'Article title', 'Link', 'Categories', 'Predicted categories']
    full_table = pd.DataFrame(news_data)
    styler = full_table[display_columns].style
    return styler.format({'Link': make_clickable})


In [5]:
# Image Widget

# Read both images as raw bytes. The context managers close the file handles
# as soon as the bytes are read instead of leaving them open for the lifetime
# of the kernel.
with open("cat_daily_news_main.jpg", "rb") as file:
    main_image = file.read()

with open("sleepy_kitten.jpg", "rb") as file:
    lovely_image = file.read()

# Banner image shown at the top of the page.
image_headline = widgets.Image(
    value=main_image,
    format='jpg',
    width='95%',
    height='500px'
)

# Smaller decorative photo, shown together with its credit label.
image_base = widgets.Image(
    value=lovely_image,
    format='jpg',
    width='350px',
)

label_base = widgets.Label(
    value='Photo by fanibani',
    style={'description_width': 'initial'}
)


# Containers that the layout cell arranges into rows.
vbox_headline_image = widgets.VBox([image_headline])
vbox_base_image = widgets.VBox([image_base, label_base])
# display(vbox_headline_image, vbox_base_image)

In [6]:
# button to load news

button_send = widgets.Button(
    description='Load news',
    tooltip='Send',
    style={'description_width': 'initial'},
)

# Output area that receives the news table (and any traceback) on each click.
output = widgets.Output()


def on_button_clicked(event):
    """Click handler: scrape the front page, classify each article, show the table.

    Everything is rendered inside ``output``; the previous content is cleared
    first so that repeated clicks replace the table instead of stacking tables.
    The ``event`` argument (supplied by the button) is not used.
    """
    with output:
        clear_output()
        scraped_news = read_news_from_main_web_page()
        display(create_news_table_show(predict_category(scraped_news)))


button_send.on_click(on_button_clicked)

# Button stacked above its output area.
vbox_result = widgets.VBox([button_send, output])
# display(vbox_result)

In [7]:
# news table widget

# def news_table_widget_creation(df):
    
#     out = widgets.Output(layout={'border': '1px solid black'})
#     with out:
#         display(df)
#     return out

# news_table = news_table_widget_creation(news_table_show)
# vbox_news_table = widgets.VBox([news_table])
# display(vbox_news_table)

In [8]:
# Top row: the headline banner image on its own.
level1 = widgets.HBox([vbox_headline_image])
# Bottom row: the decorative photo with its credit label, next to the
# "Load news" button and the output area that receives the table.
level2 = widgets.HBox([vbox_base_image, vbox_result])
# Alternative bottom-row layouts kept for reference (not executed):
# level2 = widgets.HBox([vbox_result])
# level2 = widgets.HBox([vbox_images, vbox_result])
# Render both rows, one below the other.
display(level1,level2)

HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00\xf0\x00\xf0\x00\x…

HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe1"\x98Exif\x00\x00II*\x00\x08\x00\x00\x00\x13\x00\x…

In [9]:
# conda list -e > requirements.txt
# !pip freeze > requirements.txt