### From web, collect news based on company, source, start and end dates from google search results. 
### File name format, company_source_2016_w_1_title
### Input : company, source, year, week

1. Using google news archive to get related links (Google not provide API to access news archive search results)
1.1 Using market watch archive to get related news
2. From each link, get text part and title part and save them as desired file name format 



### Output :  company_source_2016_w_1_title

### Save news ino file with file name format company_source_SearchDate
### Text Name (URLs), TextRank, Positive Words Proportion, Negative Words Proportion 



In [None]:
import os
import sys
spark_home = os.environ['SPARK_HOME'] = '/home/cloudera/Downloads/spark-2.0.1-bin-hadoop2.6/'
if not spark_home:
    raise ValueError('SPARK_HOME enviroment variable is not set')
sys.path.insert(0,os.path.join(spark_home,'python'))
sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home,'python/pyspark/shell.py'))

In [18]:
w = raw_input("Input the English word: ") # For Python 3: use input() instead
with open('positive_words.txt') as f:
    found = False
    for line in f:
        if w.upper() in line: # Key line: check if `w` is in the line.
            print(line)
            found = True
    if not found:
        print('The translation cannot be found!')

Input the English word: strong 
The translation cannot be found!


In [3]:

import io
import nltk
import itertools
from operator import itemgetter
import networkx as nx
import os
import re
from nltk.corpus import stopwords


#apply syntactic filters based on POS tags
def filter_for_tags(tagged, tags=['NN', 'JJ', 'NNP']):
    return [item for item in tagged if item[1] in tags]

def normalize(tagged):
    return [(item[0].replace('.', ''), item[1]) for item in tagged]

def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in itertools.ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

def lDistance(firstString, secondString):
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
    if len(firstString) > len(secondString):
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
        distances = newDistances
    return distances[-1]

def buildGraph(nodes):
    "nodes - list of hashables that represents the nodes of the graph"
    gr = nx.Graph() #initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    #add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        levDistance = lDistance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=levDistance)

    return gr

def extractKeyphrases(text):
    # Removing stopwords
    #pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    #text = pattern.sub('', text)
   
    # tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    # assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]
    
    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    # this will be used to determine adjacent words in order to construct keyphrases with two words
    graph = buildGraph(word_set_list)
    # pageRank - initial value of 1.0, error tolerance of 0,0001, 
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    #print (calculated_page_rank)
    # most important words in ascending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
    # the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = len(word_set_list) / 3
    keyphrases = keyphrases[0:aThird+1]
    # take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent 
    # in the text and are selected as keywords, join them together
    modifiedKeyphrases = set([])
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith: 
                modifiedKeyphrases.add(firstWord)
            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point    
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)      
        i = i + 1
        j = j + 1
        
    return  calculated_page_rank, modifiedKeyphrases

def extractSentences(text):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important sentences in ascending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)

    return summary

def writeFiles(summary, keyphrases, fileName):
    "outputs the keyphrases and summaries to appropriate files"
    print "Generating output to " + 'TextRank-master/keywords/' + fileName
    keyphraseFile = io.open('TextRank-master/keywords/' + fileName, 'w')
    for keyphrase in keyphrases:
        keyphraseFile.write(keyphrase + '\n')
    keyphraseFile.close()

    print "Generating output to " + 'TextRank-master/summaries/' + fileName
    summaryFile = io.open('TextRank-master/summaries/' + fileName, 'w')
    summaryFile.write(summary)
    summaryFile.close()

    print "-"

def mainRunner():
    #nltk.download()
    #retrieve each of the articles
    articles = os.listdir("TextRank-master/articles")
    for article in articles:
        print 'Reading articles/' + article
        articleFile = io.open('TextRank-master/articles/' + article, 'r')
        text = articleFile.read()
        keyphrases = extractKeyphrases(text)
        summary = extractSentences(text)
        writeFiles(summary, keyphrases, article)

In [127]:
import requests
import re
import urllib
from bs4 import BeautifulSoup
import urllib2

#page = requests.get("https://www.google.com/search?cf=all&hl=en&pz=1&ned=us&tbm=nws&gl=us&as_q=IBM&as_occt=any&as_drrb=b&as_mindate=11%2F1%2F2016&as_maxdate=11%2F30%2F2016&tbs=cdr%3A1%2Ccd_min%3A11%2F1%2F2016%2Ccd_max%3A11%2F30%2F2016&as_nsrc=cnn&authuser=0")

page = requests.get("http://query.nytimes.com/search/sitesearch/?action=click&contentCollection&region=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/IBM/from20161101to20161130/allresults/1/allauthors/newest/")
soup = BeautifulSoup(page.content)
#print(soup)
#url = generate_GOOGLE_query_url('IBM', 'cnn', '11', '1', '2016', '11', '30', '2016')
#text = urllib.urlopen(url).read()
#soup = BeautifulSoup(text)

#links = soup.findAll("a")
#for link in  soup.find_all("a",href=re.compile("(?<=/url\?q=)(htt.*://.*)")):
for link in  soup.find_all('a', href=re.compile("(?<=/url\?q=)(htt.*://.*)")):
    tmp = re.split(":(?=http)",link["href"].replace("/url?q=",""))
    k = str(tmp).split('&sa')
    k = k[0][2:]
    print k

In [1]:
############## TEST #############

import urllib
import lxml.html
# financial news website with date
#connection = urllib.urlopen('http://www.marketwatch.com/search?q=IBM&m=Keyword&rpp=15&mp=2005&bd=true&bd=false&bdv=11%2F30%2F2016&rs=true')
connection = urllib.urlopen('https://www.bloomberg.com/quote/aapl:US')
dom =  lxml.html.fromstring(connection.read())
# //a/@href
for link in dom.xpath('//a/@href'): # select the url in href for all a tags(links)
    tmp = str(link)
    #if (("artiles/" in tmp) or ("press-releases/" in tmp)):
    if ("articles/" in tmp):
        print link
    elif ("press-releases/" in tmp):
        print link
    else:
        continue
    

http://www.bloomberg.com/news/articles/2017-01-04/ford-toyota-form-telematics-bloc-to-stymie-google-and-apple
http://www.bloomberg.com/news/articles/2017-01-04/in-apple-spotify-world-soundcloud-can-t-find-room
http://www.bloomberg.com/news/articles/2017-01-03/india-poised-to-reject-apple-s-demand-for-manufacturing-sops
http://www.bloomberg.com/news/articles/2016-12-30/apple-said-to-seek-lower-taxes-to-start-manufacturing-in-india-ixbnbvap
http://www.bloomberg.com/news/articles/2016-12-27/fitbit-gains-after-app-reaches-first-place-in-apple-store
http://www.bloomberg.com/news/articles/2016-12-22/popular-mobile-game-pokemon-go-lands-on-apple-watch
http://www.bloomberg.com/news/articles/2016-12-21/nokia-sues-apple-over-alleged-patent-infringement-in-products
http://www.bloomberg.com/news/articles/2016-12-21/apple-s-search-for-better-iphone-screens-leads-to-japan-s-rice-fields
http://www.bloomberg.com/news/articles/2016-12-21/eu-partly-wins-top-court-fight-over-spanish-m-a-tax-breaks
http:/

In [110]:
####### bloomberg businessweek latest news URLs extraction
import urllib
import lxml.html
# financial news website with date
#connection = urllib.urlopen('http://www.marketwatch.com/search?q=IBM&m=Keyword&rpp=15&mp=2005&bd=true&bd=false&bdv=11%2F30%2F2016&rs=true')
connection = urllib.urlopen('https://www.bloomberg.com/quote/aapl:US')
dom =  lxml.html.fromstring(connection.read())
# //a/@href
for link in dom.xpath('//a/@href'): # select the url in href for all a tags(links)
    tmp = str(link)
    #if (("artiles/" in tmp) or ("press-releases/" in tmp)):
    if ("articles/" in tmp):
        print link
    elif ("press-releases/" in tmp):
        print link
    else:
        continue
    

http://www.bloomberg.com/news/articles/2016-12-30/apple-said-to-seek-lower-taxes-to-start-manufacturing-in-india-ixbnbvap
http://www.bloomberg.com/news/articles/2016-12-27/fitbit-gains-after-app-reaches-first-place-in-apple-store
http://www.bloomberg.com/news/articles/2016-12-22/popular-mobile-game-pokemon-go-lands-on-apple-watch
http://www.bloomberg.com/news/articles/2016-12-21/nokia-sues-apple-over-alleged-patent-infringement-in-products
http://www.bloomberg.com/news/articles/2016-12-21/apple-s-search-for-better-iphone-screens-leads-to-japan-s-rice-fields
http://www.bloomberg.com/news/articles/2016-12-21/eu-partly-wins-top-court-fight-over-spanish-m-a-tax-breaks
http://www.bloomberg.com/news/articles/2016-12-20/your-evening-briefing
http://www.bloomberg.com/news/articles/2016-12-20/how-apple-alienated-mac-loyalists
http://www.bloomberg.com/press-releases/2016-12-22/nokia-nokia-expands-patent-litigation-against-apple-in-asia-europe-and-the-us
http://www.bloomberg.com/press-releases/20

In [124]:
####### Market watch latest news URLs extraction
import urllib
import lxml.html
connection = urllib.urlopen('http://www.marketwatch.com/search?q=apple&m=Keyword&rpp=15&mp=0&bd=true&bd=false&bdv=12%2F30%2F2016&rs=true')
dom =  lxml.html.fromstring(connection.read())
# //a/@href
for link in dom.xpath('//a/@href'): # select the url in href for all a tags(links)
    tmp = str(link)
    if ("story/" in tmp) and ("http" in tmp):
        print link
    elif ("story/" in tmp) and ("http" not in tmp):
        print "http://www.marketwatch.com" + tmp
    else:
        continue

http://www.marketwatch.com/story/10-things-not-to-buy-in-2017-2016-12-20
http://www.marketwatch.com/story/the-5-tech-trends-that-will-dominate-ces-and-2017-2016-12-29
http://www.marketwatch.com/story/amazon-alexa-gets-childs-request-really-wrong-offers-up-porn-2016-12-30
http://www.marketwatch.com/story/goldman-sachs-describes-two-different-trump-economies-so-which-is-right-2016-12-29
http://www.marketwatch.com/story/seven-highly-valued-tech-startups-that-could-ipo-in-2017-2016-12-30
http://www.marketwatch.com/story/5-things-you-should-absolutely-not-buy-in-2017-2016-12-30-12107041
http://www.marketwatch.com/story/the-2017-tech-shopping-spree-is-upon-us-2016-12-30
http://www.marketwatch.com/story/the-ipo-market-is-in-the-dumps-and-all-the-tax-cuts-in-the-world-wont-help-2016-12-27
http://www.marketwatch.com/story/foxconn-announces-plan-for-new-flat-panel-factory-in-china-2016-12-30
http://www.marketwatch.com/story/the-must-see-photos-of-2016-2016-12-16
http://www.marketwatch.com/story/

In [123]:
####### CNN Money news URLs extraction
import urllib
import lxml.html
connection = urllib.urlopen('http://money.cnn.com/quote/quote.html?symb=aapl')
dom =  lxml.html.fromstring(connection.read())
# //a/@href
ini_string = 'a'
for link in dom.xpath('//a/@href'): # select the url in href for all a tags(links)
    tmp = str(link)
    next_string =  tmp
    if (ini_string != next_string) and (("zacks.com" in tmp) or ("story/" in tmp)):
        print link
        ini_string = next_string
    else:
        continue


https://www.thestreet.com/story/13939355/1/jack-dorsey-asks-what-do-you-want-on-twitter-tech-roundup.html?puc=CNNMONEY&cm_ven=CNNMONEY
http://www.zacks.com/research-daily/99602/Top-Research-Reports-for-December-30,-2016?cid=CS-CNN-HL-99602
https://www.thestreet.com/story/13939313/1/dow-s-amp-p-500-and-nasdaq-close-out-2016-with-strong-gains.html?puc=CNNMONEY&cm_ven=CNNMONEY
https://www.thestreet.com/story/13939275/1/apple-reportedly-cutting-back-production-for-the-iphone-7.html?puc=CNNMONEY&cm_ven=CNNMONEY
https://www.thestreet.com/story/13939246/1/potential-apple-iphone-production-cut-not-a-happy-story-for-tim-cook-top-trader-says.html?puc=CNNMONEY&cm_ven=CNNMONEY
http://www.zacks.com/stock/news/244074/nintendo-to-launch-super-mario-runs-android-version-soon?cid=CS-CNN-HL-244074
http://www.zacks.com/stock/news/244067/apple-aapl-continues-to-increase-its-focus-on-india?cid=CS-CNN-HL-244067


In [114]:
####### Invenstor Guide URLs extraction
import urllib
import lxml.html
connection = urllib.urlopen('http://www.investorguide.com/stock.php?ticker=aapl')
dom =  lxml.html.fromstring(connection.read())
ini_string = 'a'
for link in dom.xpath('//a/@href'): # select the url in href for all a tags(links)
    tmp = str(link)
    next_string =  tmp
    if (ini_string != next_string) and ("http://" in tmp) and ("facebook.com" not in tmp) and ("twitter.com" not in tmp) and ("webfinanceinc.com" not in tmp):
        print link
        ini_string = next_string
    else:
        continue

http://www.forbes.com/sites/ewanspence/2016/12/30/apple-news-headlines-iphone-leak-rumor-airpods-macbook-pro/?utm_source=yahoo&utm_medium=partner&utm_campaign=yahootix&partner=yahootix
http://finance.yahoo.com/video/stocks-end-16-whisper-tech-230127024.html
http://sg.finance.yahoo.com/news/asia-stocks-dollar-subdued-last-012253513.html
http://www.investopedia.com/news/apple-cuts-iphone-production-stock-slips-aapl/?partner=YahooSA
http://uk.finance.yahoo.com/news/us-stocks-wall-st-ends-211832803.html


In [115]:
# Yahoo Finance URLs extraction
import urllib
import lxml.html
connection = urllib.urlopen('https://finance.yahoo.com/quote/AAPL/?p=AAPL')
dom =  lxml.html.fromstring(connection.read())
# //a/@href
path_pools = set()
path_pools = dom.xpath('//a/@href')
ini_string = 'a'
for link in path_pools: # select the url in href for all a tags(links)
    
    tmp = str(link)
    next_string =  tmp
    if (ini_string != next_string) and ("news/" in tmp) and (".html" in tmp):
        print "https://finance.yahoo.com"+ tmp
        ini_string = next_string
    else:
        continue
        

https://finance.yahoo.com/news/the-worst-tech-moments-of-2016-191120332.html
https://finance.yahoo.com/news/how-samsung-can-recover-from-its-explosive-2016-123708872.html
https://finance.yahoo.com/news/david-pogue-best-tech-ideas-for-2016-152642724.html
https://finance.yahoo.com/news/the-worst-tech-moments-of-2016-191120332.html
https://finance.yahoo.com/news/you-can-change-your-iphone-battery-151208103.html


In [4]:
### Get text from URL


import urllib
from bs4 import BeautifulSoup

#url = "https://finance.yahoo.com/news/you-can-change-your-iphone-battery-151208103.html"  
#url = "https://finance.yahoo.com/news/david-pogue-best-tech-ideas-for-2016-152642724.html"
#url = "https://finance.yahoo.com/news/the-worst-tech-moments-of-2016-191120332.html"

#url = "http://www.bloomberg.com/news/articles/2016-12-30/apple-said-to-seek-lower-taxes-to-start-manufacturing-in-india-ixbnbvap"
#url = "http://www.bloomberg.com/news/articles/2016-12-27/fitbit-gains-after-app-reaches-first-place-in-apple-store"
#url = "http://www.bloomberg.com/news/articles/2016-12-22/popular-mobile-game-pokemon-go-lands-on-apple-watch"

#url =  "http://uk.finance.yahoo.com/news/us-stocks-wall-st-ends-211832803.html"
#url = "http://www.investopedia.com/news/apple-cuts-iphone-production-stock-slips-aapl/?partner=YahooSA"
#url = "http://www.forbes.com/sites/ewanspence/2016/12/30/apple-news-headlines-iphone-leak-rumor-airpods-macbook-pro/?utm_source=yahoo&utm_medium=partner&utm_campaign=yahootix&partner=yahootix"   


#url = "http://www.zacks.com/research-daily/99602/Top-Research-Reports-for-December-30,-2016?cid=CS-CNN-HL-99602"
#url = "http://www.zacks.com/stock/news/244067/apple-aapl-continues-to-increase-its-focus-on-india?cid=CS-CNN-HL-244067"
#url = "http://www.zacks.com/stock/news/244074/nintendo-to-launch-super-mario-runs-android-version-soon?cid=CS-CNN-HL-244074"

#url = "http://www.marketwatch.com/story/apple-samsung-missed-opportunities-analysts-say-2016-12-28"
#url = "http://www.marketwatch.com/story/foxconn-announces-plan-for-new-flat-panel-factory-in-china-2016-12-30"
url = "http://www.marketwatch.com/story/5-things-you-should-absolutely-not-buy-in-2017-2016-12-30-12107041"


html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)

# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()    # rip it out

# get text
text = soup.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

#print(text)


### Then TextRank

page_rank, keyphrases = extractKeyphrases(text)

print(page_rank)
print ("="*100)
print(keyphrases)

{u'Canada': 0.0027923901972656795, u'end-of-day': 0.003962053795845371, u'SIX': 0.0027020136434710075, u'dollar': 0.002777976957366703, u'month': 0.002610717413813308, u'Take': 0.0025006148289729167, u"Figures'": 0.0032318707610021788, u'Shakers': 0.0029184022685576944, u'Today': 0.002636437054882979, u'Business': 0.0031820629116803122, u'Bonds': 0.00260940314531076, u'Moneyologist': 0.004395183605941991, u'Authors': 0.002982577148024729, u'Hedge': 0.002621103938356175, u'tethered': 0.003203048829622982, u'Nutting': 0.0029817031005832705, u'Iranian': 0.0029110201354603524, u'Obamacare': 0.003546463878484593, u'P': 0.0026986169308276147, u'join': 0.0024958107895927117, u'Emerging': 0.003224280760932025, u'presidency': 0.003740039001538832, u'Retire': 0.002653881842547782, u'Nasdaq': 0.0028282971006935173, u'Darrell': 0.0029461028583043333, u'Partner': 0.002853600046488528, u'Index': 0.0026421335229050774, u'Rex': 0.0025526469074649573, u'Spending': 0.00319435002845348, u'Conduct': 0.002

In [39]:

def is_positive(word):
    with open('positive_words.txt') as f:
        found = False
        for line in f:
            if word.upper() in line: # Key line: check if `w` is in the line.
                found = True
                return found 
            else:
                continue
        return False
    
def is_negative(word):
    with open('negative_words.txt') as f:
        found = False
        for line in f:
            if word.upper() in line: # Key line: check if `w` is in the line.
                found = True
                return found 
            else:
                continue
        return False

        
print is_positive('strong')
print is_negative('strong')

True
False


In [158]:
d = page_rank
positive_score = 0
positive_count = 0
negative_score = 0
negative_count = 0
su = 0
for k,v in d.iteritems(): 
    k = k.encode('ascii', 'ignore').decode('ascii')
    su = su + v 
    if k.isalpha() and (is_positive(str(k))):
        #print "%s - %f" % (str(k), v)
        positive_score = positive_score + v
        positive_count = positive_count + 1
        
    elif k.isalpha() and (is_negative(str(k))):
        negative_score = negative_score + v
        negative_count = negative_count + 1
                      
    else:
        continue
print su, positive_score, negative_score, positive_count, negative_count, \
(positive_score/negative_score), (float(positive_count)/float(negative_count)), 

1.0 0.0607698048068 0.116725740302 20 40 0.52062042742 0.5


### https://finance.yahoo.com/news/the-worst-tech-moments-of-2016-191120332.html
###  1.0 0.0439546986578 0.0932536308988 29 63 0.471 0.460 (Removing stopwords)
###  1.0 0.0510288480223 0.099382432745 35 69 0.513 0.507 (Keeping stopwords)

### https://finance.yahoo.com/news/you-can-change-your-iphone-battery-151208103.html
### 1.0 0.0641297563117 0.0905783838755 20 28 0.708 0.714 (Removing stopwords)
### 1.0 0.0636360849934 0.0951572991266 20 29 0.668 0.689 (Keeping stopwords)


### https://finance.yahoo.com/news/david-pogue-best-tech-ideas-for-2016-152642724.html
###  1.0 0.0735970225499 0.073067400756 41 41 1.01 1.0(Removing stopwords)
###  1.0 0.0720756394798 0.0844750723022 41 48 0.853 0.854 (Keeping stopwords)


# Finance Yahoo AVG Score, TextRank, WordCounts

In [147]:
import numpy as np
values = [0.513, 0.668, 0.853]
print np.mean(values)
print np.var(values)

0.678
0.0193166666667


### http://www.bloomberg.com/news/articles/2016-12-30/apple-said-to-seek-lower-taxes-to-start-manufacturing-in-india-ixbnbvap
###  1.0 0.0440125829349 0.0796177307027 17 31 0.552798761614 0.548387096774 (Removing stopwords)
###  1.0 0.0440401547541 0.0741848636141 17 29 0.593654185079 0.586206896552 (Keeping stopwords)

### http://www.bloomberg.com/news/articles/2016-12-27/fitbit-gains-after-app-reaches-first-place-in-apple-store
###  1.0 0.058847023895 0.0901091350112 19 29 0.653063908423 0.655172413793 (Removing stopwords)
###  1.0 0.0581116828732 0.0891047216396 19 29 0.652172879325 0.655172413793 (Keeping stopwords)


### http://www.bloomberg.com/news/articles/2016-12-22/popular-mobile-game-pokemon-go-lands-on-apple-watch
###  1.0 0.0585964579846 0.0744559625863 17 22 0.786994834923 0.772727272727 (Removing stopwords)
###  1.0 0.0575025080654 0.0731388825766 17 22 0.78620982492 0.772727272727 (Keeping stopwords)

# Business Week AVG Score, TextRank, WordCounts

In [148]:
import numpy as np
values = [0.593, 0.652, 0.786]
print np.mean(values)
print np.var(values)

0.677
0.00652066666667


### http://uk.finance.yahoo.com/news/us-stocks-wall-st-ends-211832803.html
###  1.0 0.0379488119665 0.115798503598 25 74 0.327714182717 0.337837837838 (Removing stopwords)
###  1.0 0.0410101494058 0.110881352086 27 70 0.369856144739 0.385714285714 (Keeping stopwords)

###  "http://www.investopedia.com/news/apple-cuts-iphone-production-stock-slips-aapl/?partner=YahooSA"
###   1.0 0.104955227033 0.144401024667 29 36 0.726831594684 0.805555555556 (Removing stopwords)
###   1.0 0.130676881843 0.145173399044 37 36 0.90014343333 1.02777777778 (Keeping stopwords)


###  http://www.forbes.com/sites/ewanspence/2016/12/30/apple-news-headlines-iphone-leak-rumor-airpods-macbook-pro/?utm_source=yahoo&utm_medium=partner&utm_campaign=yahootix&partner=yahootix"   
###   1.0 0.0459634062259 0.113101878063 11 27 0.406389416455 0.407407407407 (Removing stopwords)
###   1.0 0.0481751882425 0.11115760734 12 27 0.433395332947 0.444444444444 (Keeping stopwords)

# Investor Guid AVG Score, TextRank, WordCounts

In [149]:
import numpy as np
values = [0.369, 0.900, 0.433]
print np.mean(values)
print np.var(values)

0.567333333333
0.0560162222222


###  http://www.zacks.com/research-daily/99602/Top-Research-Reports-for-December-30,-2016?cid=CS-CNN-HL-99602
###  1.0 0.0663784490822 0.139212118822 39 81 0.476815162674 0.481481481481 (Removing stopwords)
###  1.0 0.0653785485007 0.134715337247 38 77 0.48530887304 0.493506493506 (Keeping stopwords)

###  http://www.zacks.com/stock/news/244067/apple-aapl-continues-to-increase-its-focus-on-india?cid=CS-CNN-HL-244067
###  1.0 0.0781299481476 0.119772608674 46 70 0.652318998581 0.657142857143  (Removing stopwords)
###  1.0 0.0742061396445 0.117341409811 43 67 0.632395160105 0.641791044776 (Keeping stopwords)


###  http://www.zacks.com/stock/news/244074/nintendo-to-launch-super-mario-runs-android-version-soon?cid=CS-CNN-HL-244074 
###   1.0 0.0818102029814 0.117408226028 48 66 0.696801286836 0.727272727273 (Removing stopwords)
###   1.0 0.0797614811481 0.113222596075 46 62 0.704466104059 0.741935483871 (Keeping stopwords)

# CNN Money AVG Score, TextRank, WordCounts



In [150]:
import numpy as np
values = [0.485, 0.632, 0.704]
print np.mean(values)
print np.var(values)

0.607
0.008306


###  http://www.marketwatch.com/story/apple-samsung-missed-opportunities-analysts-say-2016-12-28
###   (Removing stopwords)
###  1.0 0.0558185768987 0.125278151549 25 57 0.445557155885 0.438596491228 (Keeping stopwords)

### http://www.marketwatch.com/story/foxconn-announces-plan-for-new-flat-panel-factory-in-china-2016-12-30
###   (Removing stopwords)
###  1.0 0.0639546640095 0.110097104489 29 51 0.580893242436 0.56862745098 (Keeping stopwords)


###  http://www.marketwatch.com/story/5-things-you-should-absolutely-not-buy-in-2017-2016-12-30-12107041
###   (Removing stopwords)
###   1.0 0.0607698048068 0.116725740302 20 40 0.52062042742 0.5 (Keeping stopwords)



# Market Watch AVG Score, TextRank, WordCounts

In [159]:
import numpy as np
values = [0.445, 0.581, 0.521]
print np.mean(values)
print np.var(values)

0.515666666667
0.00309688888889


In [None]:
yahoo, 0.678, 0.019
businessweek 0.667, 0.006
Investor Guide 0.567, 0.056
cnn 0.706 0.008
marketwatch 0.515, 0.003