In [88]:
# Task: Build an AI-curated news feed for customers that shows only the articles relevant to the stocks
# they have invested in.


from xml.dom import minidom
import urllib
import pandas as pd

#Load the URL
def load(rssURL):
    """
    Fetch the document at ``rssURL`` and parse it into a minidom DOM tree.

    Args:
        rssURL: URL of the RSS/XML document to open with ``urlopen``.

    Returns:
        The ``xml.dom.minidom`` Document parsed from the response body.

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        xml.parsers.expat.ExpatError: if the response is not well-formed XML.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the connection once parsing has finished.
    with urllib.request.urlopen(rssURL) as response:
        return minidom.parse(response)

# Namespace URIs under which RSS elements are looked up; None means "no namespace".
DEFAULT_NAMESPACES = (
    None,  # RSS 0.91, 0.92, 0.93, 0.94, 2.0
    'http://purl.org/rss/1.0/',  # RSS 1.0
    'http://my.netscape.com/rdf/simple/0.9/',  # RSS 0.90
)

#Retrieve elements of rss by tag name
def getElementsByTagName(node, tagName, possibleNamespaces=DEFAULT_NAMESPACES):
    """
    Return the descendants of ``node`` named ``tagName`` in the first namespace that has any.

    The namespaces are tried in the order given; the lookup stops at the first
    one whose result is non-empty. An empty list is returned when none match.
    """
    # Lazy generator: later namespaces are only queried if the earlier ones were empty.
    lookups = (node.getElementsByTagNameNS(ns, tagName) for ns in possibleNamespaces)
    return next((found for found in lookups if len(found)), [])

def first(node, tagName, possibleNamespaces=DEFAULT_NAMESPACES):
    """
    Return the first descendant of ``node`` named ``tagName``, or None if there is none.

    The lookup is delegated to ``getElementsByTagName`` with the same namespaces.
    """
    matches = getElementsByTagName(node, tagName, possibleNamespaces)
    if len(matches) == 0:
        return None
    # A falsy first match is also reported as None.
    return matches[0] or None

def textOf(node):
    """
    Return the concatenated text held directly inside ``node``.

    Only the direct text and CDATA children contribute. Other child node types
    (nested elements, comments, ...) are skipped, so markup inside a title or
    description no longer raises AttributeError on the missing ``data`` attribute.

    Args:
        node: a DOM node, or None / any falsy value.

    Returns:
        str: the joined text, or "" when ``node`` is falsy or has no text children.
    """
    if not node:
        return ""
    # The node-type constants are class attributes available on every DOM node.
    text_types = (node.TEXT_NODE, node.CDATA_SECTION_NODE)
    return "".join(
        child.data for child in node.childNodes if child.nodeType in text_types
    )

# Relevant news sources. More sources can be added later.
# Each URL is a template whose '{}' placeholders are filled with a stock symbol.
rssURL1 = 'http://articlefeeds.nasdaq.com/nasdaq/symbols?symbol={}'
rssURL2 = 'https://www.marketwatch.com/rss/newsfinder/AllMarketWatchNews/?p=word&pv={}&t={}&dist=sr_rss'
rssSource = [rssURL1, rssURL2]

# Stock symbols of the companies the user has invested in.
symbols = [
    'AAPL',   # Apple
    'FB',     # Facebook
    'AMZN',   # Amazon
    'NFLX',   # Netflix
    'TSLA',   # Tesla
    'CSCO',   # Cisco
    'GOOGL',  # Google
    'BIDU',   # Baidu
    'BABA',   # Alibaba
    'TCEHY',  # Tencent
]

# Empty table that will receive one row per news item.
newsFeedDF = pd.DataFrame(columns=['symbol', 'Headline', 'NewsURL', 'Summary'])

#Populate the news feed from a variety of RSS news feeds
def populateNewsFeed(df):
    """
    Fill ``df`` in place with one row per RSS item, for every symbol and every feed.

    For each entry of ``symbols`` and each URL template in ``rssSource`` the feed is
    downloaded and each of its ``item`` elements becomes a row holding the symbol,
    headline, link and summary. Rows are written at integer labels counting up
    from 0. Nothing is returned.
    """
    next_row = 0
    for ticker in symbols:
        for url_template in rssSource:
            # The symbol is supplied twice because one template has two placeholders;
            # str.format ignores the surplus argument for the other.
            feed = load(url_template.format(ticker, ticker))
            for entry in getElementsByTagName(feed, 'item'):
                record = {
                    'symbol': ticker,
                    'Headline': textOf(first(entry, 'title')),
                    'NewsURL': textOf(first(entry, 'link')),
                    'Summary': textOf(first(entry, 'description')),
                }
                df.loc[next_row] = pd.Series(record)
                next_row += 1
populateNewsFeed(newsFeedDF)

# Remove duplicate entries from the collected news feed. Each article is uniquely
# identified by its URL, so rows are de-duplicated on NewsURL, keeping the first
# occurrence. The shape is printed before and after to show how many rows were dropped.
print(newsFeedDF.shape)
newsFeedDF.drop_duplicates(subset=['NewsURL'], keep='first', inplace=True)
print(newsFeedDF.shape)

#Relevant info is collected: each row of the feed now holds the symbol, headline, URL and summary of a unique article

(385, 4)
(335, 4)


In [None]:
# We need to find the sentiment of each news article and come up with a score to decide whether the news is positive or negative.
# The approach taken is to use dictionaries of positive and negative words and count
# how many of each occur in each article.
# We use the Sentiment Analysis of Financial Texts dictionary developed by Prof. Bill McDonald,
# professor of Finance at the University of Notre Dame, to identify the contextual meaning of financial text.

In [89]:
#Load the positive dictionary
def loadPositive():
    """
    Load the positive-word dictionary.

    Reads 'LM_Positive.csv' (relative path, one entry per line) and normalises
    each line by stripping surrounding whitespace and lowercasing it.

    Returns:
        list of str: the normalised lines, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The ``with`` block closes the file handle, which was previously left open.
    with open('LM_Positive.csv', "r") as dictionary_file:
        return [line.strip().lower() for line in dictionary_file]

In [90]:
#Load the negative dictionary
def loadNegative():
    """
    Load the negative-word dictionary.

    Reads 'LM_Negative.csv' (relative path, one entry per line) and normalises
    each line by stripping surrounding whitespace and lowercasing it.

    Returns:
        list of str: the normalised lines, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The ``with`` block closes the file handle, which was previously left open.
    with open('LM_Negative.csv', "r") as dictionary_file:
        return [line.strip().lower() for line in dictionary_file]

In [91]:
#Count negative words in the text
def countNeg(cleantext, negative):
    """
    Return how many tokens of ``cleantext`` occur in ``negative``.

    Every occurrence counts, so a repeated word is counted each time it appears.
    """
    return sum(1 for token in cleantext if token in negative)

#Count positive words in the text
def countPos(cleantext, positive):
    """
    Return how many tokens of ``cleantext`` occur in ``positive``.

    Every occurrence counts, so a repeated word is counted each time it appears.
    """
    return sum(1 for token in cleantext if token in positive)

In [92]:
#Get the sentiment by subtracting the number of negative words from the number of positive words
def getSentiment(cleantext, negative, positive):
    """
    Score a tokenised text as (positive word count) - (negative word count).

    Args:
        cleantext: iterable of word tokens to score.
        negative: collection of negative words, or None to load it from disk.
        positive: collection of positive words, or None to load it from disk.

    Returns:
        int: above zero when positive words dominate, below zero when negative
        words dominate, zero when they balance or none are found.
    """
    # Use the word lists supplied by the caller. They used to be discarded and
    # both dictionary files re-read on every call, i.e. once per scored article.
    if positive is None:
        positive = loadPositive()
    if negative is None:
        negative = loadNegative()
    return countPos(cleantext, positive) - countNeg(cleantext, negative)

In [93]:
#Apply the sentiment generator on Summary text to come up with sentiment score
def updateSentimentDataFrame(df):
    """
    Add tokenised text and a sentiment score to every row of ``df``.

    Two columns are written on ``df`` itself: 'text', the result of ``cleanText``
    on the 'Summary' column, and 'Sentiment_score', the result of ``getSentiment``
    on those tokens. The same DataFrame is returned.
    """
    positive_words = loadPositive()
    negative_words = loadNegative()

    def score(tokens):
        # getSentiment takes the negative list before the positive one.
        return getSentiment(tokens, negative_words, positive_words)

    df['text'] = df['Summary'].apply(cleanText)
    df['Sentiment_score'] = df['text'].apply(score)
    return df

In [94]:
#nltk task to do text preprocessing
def cleanText(text):
    """
    Lowercase ``text``, split it into word tokens and drop English stopwords.

    Tokens are the maximal runs of ``\\w`` characters, so punctuation and
    whitespace are discarded.

    Args:
        text: the string to preprocess.

    Returns:
        list of str: the remaining lowercase tokens, in their original order.
    """
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords

    # Fetch the stopword list once and keep it as a set. It used to be
    # re-evaluated inside the comprehension, once for every token.
    english_stopwords = set(stopwords.words('english'))

    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    return [word for word in tokens if word not in english_stopwords]

In [95]:
# Create a sentiment score for every article based on its summary text.
# updateSentimentDataFrame adds the 'text' and 'Sentiment_score' columns and returns the frame.
newsFeedDF = updateSentimentDataFrame(newsFeedDF)

In [96]:
# Show how many articles received each sentiment score.
print(newsFeedDF['Sentiment_score'].value_counts())

 0    181
-1     52
 1     41
-2     25
 2     16
-3      8
-4      4
-5      3
 3      2
-6      1
 4      1
 5      1
Name: Sentiment_score, dtype: int64


In [97]:
# The tokenised 'text' column was only needed to compute the score; remove it.
newsFeedDF = newsFeedDF.drop(columns=['text'])

In [98]:
# Preview the first rows of the scored news feed.
print(newsFeedDF.head())

  symbol                                           Headline  \
0   AAPL  Validea&#39;s Top Five Technology Stocks Based...   
1   AAPL  Big Technology&#39;s Spending Spree Is Great f...   
2   AAPL                    Better Buy: Apple vs. Microsoft   
3   AAPL  What Investors Need to Know About Intel&#39;s ...   
4   AAPL  Warren Buffett Is Buying These 2 Stocks -- Sho...   

                                             NewsURL  \
0  http://articlefeeds.nasdaq.com/~r/nasdaq/symbo...   
1  http://articlefeeds.nasdaq.com/~r/nasdaq/symbo...   
2  http://articlefeeds.nasdaq.com/~r/nasdaq/symbo...   
3  http://articlefeeds.nasdaq.com/~r/nasdaq/symbo...   
4  http://articlefeeds.nasdaq.com/~r/nasdaq/symbo...   

                                             Summary  Sentiment_score  
0  The following are the top rated Technology sto...                1  
1  For many investors the tech bubble of the late...               -2  
2  Technology kingpins Microsoft NASDAQ MSFT and ...                