In [14]:
import numpy as np
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import shutil
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime
import itertools
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import AffinityPropagation


In [15]:
#Creates a list of phrases to match
person_path = 'resources/person_list.txt'
persons = []

with open(person_path, 'r') as filehandle:
    persons = [current_place.rstrip() for current_place in filehandle.readlines()]
    

In [16]:
nlp = spacy.load("en_core_web_lg")
hnwi_patterns = list(nlp.pipe(persons))
matcher_hnwi = PhraseMatcher(nlp.vocab)
matcher_hnwi.add('HNWI', None, *hnwi_patterns)


def overlap_detect(s_1,e_1,start,end):
    return (start<=s_1<=end) or (start<=e_1<=end) or (s_1<=start<=end<=e_1)

def span_overlap_detect(span1,span2):
    s_1 = span1.start_char
    e_1 = span1.end_char
    start = span2.start_char
    end = span2.end_char
    return overlap_detect(s_1,e_1,start,end)

def span_discard(span, span_list):
    filtered_list = [s for s in span_list if not span_overlap_detect(span,s)]
    return filtered_list

def span_discard_list(s_list1, s_list2):
    filtered_list2 = [span_discard(s,s_list2) for s in s_list1]
    merged = list(itertools.chain.from_iterable(filtered_list2))
    merged = list(set(merged))
    return merged

def hnwi_component(doc):
    matches = matcher_hnwi(doc)
    orig_ents = doc.ents
    #print(orig_ents)
    spans = [Span(doc,start,end,label='HNWI') for match_id, start, end in matches]
    
    try:
        doc.ents = span_discard_list(spans,orig_ents)+spans
    except Exception as e:
        print(e)
        doc.ents = orig_ents
    
    return doc

nlp.add_pipe(hnwi_component, before = 'ner')

def get_key_list(d,val):
    items = d.items()
    rel_items = [item for item in items if item[1]==val]
    key_list = [item[0] for item in rel_items]
    r_d = {val:key_list}
    return r_d

def dict_to_df(d):
    vals = list(set(d.values()))
    dict_list = [get_key_list(d,val) for val in vals]
    dict_f = {list(di.keys())[0]:[','.join(list(di.values())[0])] for di in dict_list}
    df = pd.DataFrame(dict_f)
    return df

def text_to_df(text):
    doc = nlp(text)
    d = dict([(ent.text,ent.label_) for ent in doc.ents])
    df = dict_to_df(d)
    #print(text,df)
    return df

def get_vector(text):
    doc = nlp(text)
    vec = np.array(doc.vector)
    return vec



ValueError: Cannot create vectors table with dimension 0.
If you're using pre-trained vectors, are the vectors loaded?

In [59]:
feed_list = 'resources/rss_feed_list.txt'
dataset = 'out/out.csv'

In [60]:
def sentiment_analyzer_scores(sentence):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(sentence)
    return score['compound']

def url_to_df(src_url):
    resp = requests.get(src_url[1])
    soup = BeautifulSoup(resp.content, features = 'xml')
    items = soup.findAll('item')
    news_items = []
    for item in items:
        news_item = {}
        news_item['Title'] = item.title.text.replace("'","")
        #print(text_to_df(item.title.text.replace("'","")))
        news_item['Description'] = item.description.text
        news_item['PubDate'] = item.pubDate.text
        news_item['Source'] = src_url[0]
        news_item['Title_Sentiment'] = sentiment_analyzer_scores(item.title.text)
        news_item['Description_Sentiment'] = sentiment_analyzer_scores(item.description.text)
        news_item['vector'] = get_vector(item.title.text)
        #news_item['bsScore'] = bsfilter(item.title.text)
        news_items.append(news_item)
    df = pd.DataFrame(news_items)
    return df

def create_nlp_cols(df,col_name):
    df_list = [pd.DataFrame(df[i:i+1]).reset_index(drop=True) for i in range(len(df))]
    df_list = [pd.concat([df_list[i],text_to_df(df_list[i][col_name][0])],axis=1) for i in range(len(df))]
    df_final = pd.concat(df_list,axis=0,sort=False).reset_index(drop=True)
    return df_final

def get_daily_df():
    # define empty list
    urls = []
    sources = []
    # open file and read the content in a list
    with open(feed_list, 'r') as filehandle:
        urls = [url.rstrip().split('|')[1] for url in filehandle.readlines()]
    with open(feed_list, 'r') as filehandle:
        sources = [url.rstrip().split('|')[0] for url in filehandle.readlines()]
    src_url_pairs = list(zip(sources,urls))
    df_list = [url_to_df(src_url) for src_url in src_url_pairs]
    
    #df_list = [pd.concat([df,text_to_df(df['Title'][0])],axis=1) for df in df_list]
    df_final = pd.concat(df_list,axis=0).reset_index(drop=True)
    #df_final = df_final[df_final['bsScore']>=0.5]
    df_final = create_nlp_cols(df_final,'Title')
    names = list(set(list(df_final['HNWI'].dropna())))
    df_list = [create_time_line(df_final,name) for name in names]
    df_final = pd.concat(df_list,axis=0)
    #df = pd.read_csv('news_feeds.csv')
    #df_final = pd.concat([df,df_final],axis=0)
    df_final.to_csv(dataset)
    return df_final

def arr_match(arr1,arr2):
    if False in np.array(arr1==arr2):
        return False
    else:
        return True
    
def create_time_line(df,name):
    print(name)
    try:
        vectors = list(df[df['HNWI']==name]['vector'])
        clustering = AffinityPropagation(damping = 0.9).fit(vectors)
        tim_df = df[df['HNWI']==name]
        tim_df['Event_Id'] = clustering.labels_
        tim_df['Event_center'] = tim_df.apply(lambda row: clustering.cluster_centers_[row['Event_Id']],axis=1)
        tim_df['Event_Text'] = tim_df.apply(lambda row:row['Title'] if arr_match(row['vector'],row['Event_center']) else 0, axis=1)
        tim_df['PubDate'] = pd.to_datetime(tim_df['PubDate'])
        tim_df = tim_df.sort_values('PubDate')
        return tim_df
    except:
        return df
    




In [61]:
df = get_daily_df()

Tim Cook
Elon Musk
Bill Gates,Mukesh Ambani
Amancio Ortega,Juan Roig
Elon Musk,Bill Gates


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Mukesh Ambani,Jeff Bezos
Amancio Ortega
Jeff Bezos,Amancio Ortega
Bill Gates,Jeff Bezos
Jeff Bezos,Bill Gates
Bill Gates


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Bill Gates,Elon Musk
Amancio Ortega,Bill Gates
Mukesh Ambani,Elon Musk
Mukesh Ambani,Bill Gates
Elon Musk,Jeff Bezos
Jeff Bezos
Mukesh Ambani


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Ratan Tata
Juan Roig
Jeff Bezos,Tim Cook
Jeff Bezos,Elon Musk


In [62]:
df[df['Event_Text']!=0][['Event_Text', 'PubDate','HNWI']].to_csv('out/time_line.csv')