In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from scrapy.selector import Selector

import re, requests
import lxml

In [2]:
regex_bbc = r"(http(s?):\/\/(www)?[A-Za-z0-9\/\.]+)"
xpath_bbc = ['//div[@class="story-body__inner"]/p/text()',
            '//ul/li[@class="tags-list__tags"]/a/text()']

# Regex "&amp"
regex_decode = r"(\&amp)"

In [3]:
file_bbc = './datasets/bbchealth.txt'
columns_name = ['tweetid','date','title']

bbc = pd.read_csv(file_bbc, sep='|', header= None)
bbc.columns = columns_name
bbc.head()

Unnamed: 0,tweetid,date,title
0,585978391360221184,Thu Apr 09 01:31:50 +0000 2015,Breast cancer risk test devised http://bbc.in/...
1,585947808772960257,Wed Apr 08 23:30:18 +0000 2015,GP workload harming care - BMA poll http://bbc...
2,585947807816650752,Wed Apr 08 23:30:18 +0000 2015,Short people's 'heart risk greater' http://bbc...
3,585866060991078401,Wed Apr 08 18:05:28 +0000 2015,New approach against HIV 'promising' http://bb...
4,585794106170839041,Wed Apr 08 13:19:33 +0000 2015,Coalition 'undermined NHS' - doctors http://bb...


In [4]:
# Split Url out of content
def get_url(regex, sample):
    result = re.search(regex, sample)[0]
    
    return result

# Split text out of content
def split_txt_form_url(regex, sample):
    url = get_url(regex, sample)
    result = sample.strip(url)
    
    return result

# Scrapt content base on xpath command.
def scrapt_w_xpath(xpath_cmd):
    result = dict()
    
    # Content
    contents = xpath_selector.xpath(xpath_cmd[0]).extract()
    result.update({'Contents': contents})
    
    # Tags
    tags = xpath_selector.xpath(xpath_cmd[1]).extract()
    result.update({'Tags': tags})
    
    return result

In [5]:
bbc['url'] = bbc['title'].map(lambda row: get_url(regex_bbc, row))
bbc['title'] = bbc['title'].map(lambda row: split_txt_form_url(regex_bbc, row))
bbc.head()

Unnamed: 0,tweetid,date,title,url
0,585978391360221184,Thu Apr 09 01:31:50 +0000 2015,Breast cancer risk test devised,http://bbc.in/1CimpJF
1,585947808772960257,Wed Apr 08 23:30:18 +0000 2015,GP workload harming care - BMA poll,http://bbc.in/1ChTBRv
2,585947807816650752,Wed Apr 08 23:30:18 +0000 2015,Short people's 'heart risk greater',http://bbc.in/1ChTANp
3,585866060991078401,Wed Apr 08 18:05:28 +0000 2015,New approach against HIV 'promising',http://bbc.in/1E6jAjt
4,585794106170839041,Wed Apr 08 13:19:33 +0000 2015,oalition 'undermined NHS' - doctors,http://bbc.in/1CnLwK7


In [6]:
bbc_urls = bbc['url'].values

In [7]:
is_attempt = True
for key, url in enumerate(bbc_urls):
    
    # Start the connection
    res = requests.get(url)
    
    # Initialize XPath
    xpath_selector = Selector(text=res.text)
    
    # Get return content as text
    text_html = res.text

    # Check return header
    if res.status_code == 200:
        print("Header:  ", 200)
        soup = BeautifulSoup(res.content, 'lxml')
        xpath_selector = Selector(text=text_html)

        # Scrapt content with XPath
        list_scrapt = scrapt_w_xpath(xpath_bbc)
        
        count_xp_scrapt = len(list_scrapt)
        

        if count_xp_scrapt:
            
            # Add new columns
            if is_attempt:
                df_temp = pd.DataFrame(columns = list(list_scrapt.keys()))
                bbc = pd.concat([bbc, df_temp])
                print(bbc.columns)
                is_attempt = False
                
            # Add to dataFrame
            for k_, v_ in list_scrapt.items():
                bbc[k_].iloc[key] = v_
                
        else:
            print("Return list is emptied ! ({})".format(url))
    else:
        print("Nothing is archived because of {} return {} Errors".format(url, res.status_code))

Header:   200
Index(['Contents', 'Tags', 'date', 'title', 'tweetid', 'url'], dtype='object')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200
Header:   200


KeyboardInterrupt: 

In [None]:
# test_url = 'http://bbc.in/14GifAt'
# res = requests.get(test_url)
# author = soup.find('span',{'class':'byline__name'}).text
# author

In [8]:
bbc.head()

Unnamed: 0,Contents,Tags,date,title,tweetid,url
0,[Scientists have predicted the odds of women d...,"[Genetics, Cancer, Medical research]",Thu Apr 09 01:31:50 +0000 2015,Breast cancer risk test devised,5.859784e+17,http://bbc.in/1CimpJF
1,[Better funding and more time with patients ar...,"[Doctors, NHS]",Wed Apr 08 23:30:18 +0000 2015,GP workload harming care - BMA poll,5.859478e+17,http://bbc.in/1ChTBRv
2,"[The shorter you are, the greater your risk of...","[Genetics, Heart disease, Medical research]",Wed Apr 08 23:30:18 +0000 2015,Short people's 'heart risk greater',5.859478e+17,http://bbc.in/1ChTANp
3,[The first human trial of a new type of HIV th...,"[HIV & Aids, Medical research]",Wed Apr 08 18:05:28 +0000 2015,New approach against HIV 'promising',5.858661e+17,http://bbc.in/1E6jAjt
4,"[The government has ""undermined and weakened"" ...","[NHS, A&E]",Wed Apr 08 13:19:33 +0000 2015,oalition 'undermined NHS' - doctors,5.857941e+17,http://bbc.in/1CnLwK7


In [9]:
# cbc content
## //p/text()
## //div[@class='detailSummary']/text() --> Detail Summary

## //h2[@class='deck']/text() --> h2-sub-title
## //div[@class='story']/span --> Content
## //div[@class='source']/p/text() --> Author


## Internal "Comment" link

In [None]:
dictey = dict({'Title':'ewtwewe', 'tweetew':'leoho'})
type(list(dictey.keys()))