In [1]:
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import os
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

In [2]:
nlp = spacy.load('en_core_web_lg', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [3]:
import sqlite3 as sq
conn = sq.Connection("news.db")

news_df =pd.read_sql("select * from news_table",conn)

In [4]:
news_df.head()

Unnamed: 0,id,published_date,url,outlet,content
0,1,2020-04-11T13:10:42,https://www.americanbankingnews.com/2020/04/11...,americanbankingnews,Krueger & Catalano Capital Partners LLC grew i...
1,2,2020-04-11T23:00:00,https://www.itnews.com.au/news/vale-vista-the-...,itnews,The thorn in <b>Microsofts</b> side will soon ...
2,3,2020-04-11T20:40:00,https://comicbook.com/gaming/2020/04/11/xbox-s...,comicbook,"According to a new report, <b>Microsoft</b> wi..."
3,4,2020-04-11T19:03:22,https://www.business-standard.com/article/news...,business-standard,Read more about <b>Microsoft</b> Teams rolls o...
4,5,2020-04-11T18:08:29,https://menafn.com/1100009316/India-Microsoft-...,menafn,(MENAFN - NewsBytes) <b>Microsoft</b> has roll...


In [5]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [6]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [7]:
news_df["clean_text"] = normalize_corpus(news_df["content"])

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [8]:
# create a basic pre-processed corpus, don't lowercase to get POS context
corpus = normalize_corpus(news_df['content'], text_lower_case=False, 
                          text_lemmatization=False, special_char_removal=False)

# demo for POS tagging for sample news headline
sentence = str(news_df.iloc[1].content)
sentence_nlp = nlp(sentence)

# POS tagging with Spacy 
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

# POS tagging with nltk
nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag
0,The,DT
1,thorn,NN
2,in,IN
3,<b>Microsofts</b>,JJ
4,side,NN
...,...,...
170,Microsoft,NNP
171,"Division,",NNP
172,wrote.,NN
173,Microsoft,NNP


In [9]:
from nltk.corpus import conll2000

data = conll2000.chunked_sents()
train_data = data[:10900]
test_data = data[10900:] 

In [10]:
from nltk.chunk.util import tree2conlltags, conlltags2tree

wtc = tree2conlltags(train_data[1])

In [11]:
def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]


def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff 

In [12]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

# define the chunker class
class NGramTagChunker(ChunkParserI):
    
  def __init__(self, train_sentences, 
               tagger_classes=[UnigramTagger, BigramTagger]):
    train_sent_tags = conll_tag_chunks(train_sentences)
    self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

  def parse(self, tagged_sentence):
    if not tagged_sentence: 
        return None
    pos_tags = [tag for word, tag in tagged_sentence]
    chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
    chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
    wpc_tags = [(word, pos_tag, chunk_tag) for ((word, pos_tag), chunk_tag)
                     in zip(tagged_sentence, chunk_tags)]
    return conlltags2tree(wpc_tags)
  
# train chunker model  
ntc = NGramTagChunker(train_data)

# evaluate chunker model performance
print(ntc.evaluate(test_data))

ChunkParse score:
    IOB Accuracy:  90.0%%
    Precision:     82.1%%
    Recall:        86.3%%
    F-Measure:     84.1%%


In [13]:
chunk_tree = ntc.parse(nltk_pos_tagged)


In [14]:
from nltk.parse.stanford import StanfordParser

scp = StanfordParser(path_to_jar='stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar='stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
                   
result = list(scp.raw_parse(sentence))


Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  after removing the cwd from sys.path.


In [15]:
dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'
for token in sentence_nlp:
    print(dependency_pattern.format(word=token.orth_, 
                                  w_type=token.dep_,
                                  left=[t.orth_ 
                                            for t 
                                            in token.lefts],
                                  right=[t.orth_ 
                                             for t 
                                             in token.rights]))

[]<---The[det]--->[]
--------
['The']<---thorn[nsubjpass]--->['in', '>']
--------
[]<---in[prep]--->['Microsofts</b']
--------
[]<---<[nmod]--->[]
--------
['<']<---b[dep]--->[]
--------
['b']<--->[nmod]--->[]
--------
['>']<---Microsofts</b[pobj]--->[]
--------
[]<--->[advmod]--->[]
--------
[]<---side[nsubjpass]--->[]
--------
[]<---will[aux]--->[]
--------
[]<---soon[advmod]--->[]
--------
[]<---be[auxpass]--->[]
--------
['thorn', 'side', 'will', 'soon', 'be']<---laid[ROOT]--->['to', 'issued', '.']
--------
[]<---to[prep]--->['Gates']
--------
[]<---rest.edict[amod]--->[]
--------
[]<---Bill[compound]--->[]
--------
['rest.edict', 'Bill']<---Gates[pobj]--->[]
--------
[]<---had[aux]--->[]
--------
['had']<---issued[advcl]--->['apply']
--------
[]<---to[aux]--->[]
--------
['to']<---apply[xcomp]--->['to']
--------
[]<---to[prep]--->['Vista']
--------
[]<---Vista[pobj]--->[',', 'had']
--------
[]<---,[punct]--->[]
--------
[]<---which[nsubj]--->[]
--------
[]<---simply[advmod]--->[]


In [16]:
from spacy import displacy

displacy.render(sentence_nlp, jupyter=True, 
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

In [17]:
from nltk.parse.stanford import StanfordDependencyParser
sdp = StanfordDependencyParser(path_to_jar='stanford-parser-full-2015-04-20/stanford-parser.jar',
                               path_to_models_jar='stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')    

result = list(sdp.raw_parse(sentence))  

# print the dependency tree
dep_tree = [parse.tree() for parse in result][0]

Please use [91mnltk.parse.corenlp.CoreNLPDependencyParser[0m instead.
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:

sentence = str(news_df.iloc[1].content)
sentence_nlp = nlp(sentence)

# print named entities in article
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

# visualize named entities
displacy.render(sentence_nlp, style='ent', jupyter=True)

[(rest.edict, 'PERSON'), (Bill, 'PERSON'), (Gates, 'PERSON'), (Vista, 'ORG'), (Windows, 'PRODUCT'), (XP, 'PRODUCT'), (Vistas, 'PRODUCT'), (Microsoft, 'ORG'), (512, 'QUANTITY'), (mb, 'QUANTITY'), (RAM, 'ORG'), (DirectXP, 'ORG'), (graphics, 'ORG'), (Vista, 'ORG'), (Home, 'ORG'), (Basic, 'ORG'), (Aero, 'ORG'), (Media, 'ORG'), (Microsoft, 'ORG'), (Vista, 'ORG'), (2100, 'MONEY'), (Microsoft, 'ORG'), (Mike, 'PERSON'), (Nash, 'PERSON'), (Jim, 'PERSON'), (Allchin, 'PERSON'), (the, 'DATE'), (very, 'DATE'), (earliest, 'DATE'), (days, 'DATE'), (Microsoft, 'ORG'), (Division, 'ORG'), (Microsoft, 'ORG')]


In [19]:
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text 
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None

entity_frame = pd.DataFrame(named_entities, 
                            columns=['Entity Name', 'Entity Type'])

In [20]:
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Entity Name,Microsoft,10,Microsofts,one,first,two,today,Amazon,Google,Windows,Excel,Apple,One,MSFT,2
Entity Type,ORG,CARDINAL,ORG,CARDINAL,ORDINAL,CARDINAL,DATE,ORG,ORG,PRODUCT,PRODUCT,ORG,CARDINAL,ORG,CARDINAL
Frequency,31404,3644,3333,2804,2022,1887,1340,1307,1265,1099,1045,1027,927,914,838


In [21]:
# get the top named entity types
top_entities = (entity_frame.groupby(by=['Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Entity Type,ORG,DATE,CARDINAL,PRODUCT,PERSON,GPE,MONEY,ORDINAL,NORP,PERCENT,WORK_OF_ART,FAC,TIME,LOC,EVENT
Frequency,119524,35887,31523,23483,21829,13121,5739,4255,3220,3080,2669,2276,2264,1431,1323


In [22]:
l= news_df["published_date"]
for i in range(len(l)):
    l[i]= l[i][:10]
news_df["published_date"] = l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
from datetime import datetime
k=news_df.outlet.unique()
len(k)

5658

In [24]:
k=news_df["outlet"].value_counts()

In [25]:
ll=[]
count=0
for i,j in k.items():
    if(j>10):
        count=count+1
        ll.append(i)

    

In [46]:
news_df = news_df[news_df.outlet.isin(ll)]
    
data = news_df.copy()

In [27]:
## text blob

from textblob import TextBlob

# compute sentiment scores (polarity) and labels
sentiment_scores_tb = [round(TextBlob(article).sentiment.polarity, 3) for article in news_df['clean_text']]
sentiment_category_tb = ['positive' if score > 0 
                             else 'negative' if score < 0 
                                 else 'neutral' 
                                     for score in sentiment_scores_tb]


news_df["sentiment_score"] = sentiment_scores_tb
news_df["sentiment_category"] = sentiment_category_tb  

In [47]:
from afinn import Afinn
af = Afinn()

# compute sentiment scores (polarity) and labels
sentiment_scores = [af.score(article) for article in data["clean_text"]]
sentiment_category = ['positive' if score > 0 
                          else 'negative' if score < 0 
                              else 'neutral' 
                                  for score in sentiment_scores]
    
    
data["sentiment_score"]=sentiment_scores
data["sentiment_category"]= sentiment_category


In [29]:
dd = pd.DataFrame()
df= pd.DataFrame()

dd["published_date"] = data["published_date"]
dd["sentiment_score"] = data["sentiment_score"]

df["published_date"] = news_df["published_date"]
df["sentiment_score"] = news_df["sentiment_score"]

In [30]:
dd =dd.groupby(dd.published_date)
dd=dd.mean()
df =df.groupby(df.published_date)
df=df.mean()


In [31]:
#dd.to_sql("afinn_news_csv",conn)
#df.to_sql("text_blob_news_csv",conn)
