In [1]:
import pandas as pd
import pickle
from bs4 import BeautifulSoup                          #scrape information from web pages
from goose3 import Goose                               #extract information from news articles
from collections import Counter                        
import string
from joblib import Parallel, delayed
import sys
from tqdm import tqdm                                    

# Lower-case words that features() drops from the extracted page text.
# NOTE(review): these look like publisher/site names, presumably removed so
# the source site does not leak into the text feature -- confirm.
stop_domains = [
    'buzzfeed',
    'clickhole',
    'cnn',
    'wikinews',
    'upworthy',
    'nytimes',
]


# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _heading_stats(soup, tag):
    """Return ``(count, total_len, avg_len, text)`` for all ``tag`` elements.

    ``total_len`` sums the stripped text length of each element, ``avg_len``
    is ``total_len / count`` (``0`` when there are no elements) and ``text``
    is the stripped texts joined with single spaces.
    """
    texts = [node.get_text().strip() for node in soup.find_all(tag)]
    count = len(texts)
    total_len = sum(len(text) for text in texts)
    avg_len = float(total_len) / count if count else 0
    return count, total_len, avg_len, ' '.join(texts)


def _clean_text(raw):
    """Strip punctuation, lower-case, and drop words listed in ``stop_domains``."""
    words = raw.translate(_PUNCTUATION_TABLE).lower().split()
    return ' '.join(word for word in words if word not in stop_domains)


def features(html):
    """Extract structural and textual features from one HTML document.

    Args:
        html: Raw HTML markup; it is handed to both BeautifulSoup (lxml
            parser) and Goose (``raw_html=``).

    Returns:
        A list of 18 values, in this order: ``sys.getsizeof(html)``,
        ``len(html)``, number of ``<a>``, ``<button>``, ``<input>``,
        ``<ul>``, ``<ol>`` elements, ``<ul>`` + ``<ol>`` combined, number of
        ``<h1>`` and ``<h2>`` elements, total h1 text length, total h2 text
        length, average h1 text length, average h2 text length, number of
        ``<img>`` elements, total number of tags, number of distinct tag
        names, and finally a cleaned text string built from the Goose meta
        description (when available) plus the h1 and h2 texts.

        If anything fails, the row ``[-1] * 17 + ["no data"]`` is returned
        instead so that a single bad page does not abort a batch run.
    """
    try:
        soup = BeautifulSoup(html, "lxml")

        # Release Goose's resources as soon as extraction is done; one
        # instance is created per document.
        with Goose() as g:
            try:
                goose_article = g.extract(raw_html=html)
            except (TypeError, IndexError):
                # Extraction failed with one of these two errors: fall back
                # to heading text only.
                goose_article = None

        meta_description = ''
        if goose_article is not None:
            # Guard against a missing description so it cannot turn the
            # whole row into the failure sentinel.
            meta_description = goose_article.meta_description or ''

        # Shallow in-memory size of the html object itself, not a byte
        # count of the document.
        size = sys.getsizeof(html)
        html_len = len(html)

        count_links = len(soup.find_all('a'))
        count_buttons = len(soup.find_all('button'))
        count_inputs = len(soup.find_all('input'))
        count_ul = len(soup.find_all('ul'))
        count_ol = len(soup.find_all('ol'))
        count_lists = count_ol + count_ul

        count_h1, total_h1_len, avg_h1_len, h1_text = _heading_stats(soup, 'h1')
        count_h2, total_h2_len, avg_h2_len, h2_text = _heading_stats(soup, 'h2')

        parser_data = _clean_text(meta_description + ' ' + h1_text + ' ' + h2_text)

        count_images = len(soup.find_all('img'))

        # Walk the tree once and reuse the names for both tag counts.
        tag_names = [node.name for node in soup.find_all()]
        count_tags = len(tag_names)
        count_unique_tags = len(set(tag_names))

        return [size, html_len, count_links, count_buttons,
                count_inputs, count_ul, count_ol, count_lists,
                count_h1, count_h2, total_h1_len, total_h2_len, avg_h1_len, avg_h2_len,
                count_images, count_tags, count_unique_tags,
                parser_data]
    except Exception:
        # Best-effort: any parsing/extraction error yields a sentinel row.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return [-1, -1, -1, -1,
                -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1,
                -1, -1, -1,
                "no data"]


# Column names for the feature tables; the order must match the list
# returned by features().
feature_columns = ["size", "html_len", "number_of_links", "number_of_buttons",
                   "number_of_inputs", "number_of_ul", "number_of_ol", "number_of_lists",
                   "number_of_h1", "number_of_h2", "total_h1_len", "total_h2_len",
                   "avg_h1_len", "avg_h2_len",
                   "number_of_images", "number_of_tags", "number_of_unique_tags",
                   "textdata"]

# NOTE(security): pickle.load can execute arbitrary code while loading; only
# use pickle files that were produced by a trusted step of this project.

# --- clickbait pages ---
with open('data/clickbait_html.pkl', 'rb') as pkl_file:
    clickbait_html = pickle.load(pkl_file)

# Extract features for every page in parallel (30 worker jobs).
clickbait_features = Parallel(n_jobs=30)(delayed(features)(html) for html in tqdm(clickbait_html))
clickbait_features = pd.DataFrame(clickbait_features, columns=feature_columns)
clickbait_features.to_csv('data/clickbait_website_features.csv', index=False, encoding='utf-8')

# --- non-clickbait pages ---
with open('data/non_clickbait_html.pkl', 'rb') as pkl_file:
    non_clickbait_html = pickle.load(pkl_file)

non_clickbait_features = Parallel(n_jobs=30)(delayed(features)(html) for html in tqdm(non_clickbait_html))
non_clickbait_features = pd.DataFrame(non_clickbait_features, columns=feature_columns)
non_clickbait_features.to_csv('data/non_clickbait_website_features.csv', index=False, encoding='utf-8')


100%|██████████████████████████████████████████████████████████████████████████████| 2137/2137 [02:09<00:00, 16.54it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3019/3019 [02:01<00:00, 24.80it/s]
