In [1]:
import pandas as pd
from newspaper import Article
import spacy
import plotly.express as px
import numpy as np
import pytextrank

In [2]:
# Load the small English spaCy model; swap the model name here to target
# another language or a larger/more accurate model.
nlp = spacy.load("en_core_web_sm")

# Append the PyTextRank "textrank" component to the spaCy pipeline so that
# documents produced by `nlp(...)` carry ranked phrases (read later through
# `doc._.phrases`). As the last expression of the cell, the returned
# component object is also echoed as the cell output.
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x214b9773700>

In [3]:
def get_article_text_and_title(url):
    """Fetch the article at ``url`` and return its body text and title.

    The page is downloaded and parsed with newspaper's ``Article``.

    Returns
    -------
    tuple
        ``(text, title)`` as exposed by the parsed article object.
    """
    parsed_article = Article(url)
    # The article must be downloaded before it can be parsed.
    parsed_article.download()
    parsed_article.parse()
    return parsed_article.text, parsed_article.title

In [4]:
def analyze_text_textrank(doc, rank_threshold=0.0):
    """Collect the TextRank phrases of ``doc`` whose rank meets a threshold.

    Parameters
    ----------
    doc : object exposing ``doc._.phrases``
        Each phrase must provide ``text``, ``rank`` and ``count`` attributes.
    rank_threshold : float, default 0.0
        Phrases with ``rank >= rank_threshold`` are kept.

    Returns
    -------
    dict
        Keys ``"text"``, ``"rank"`` and ``"count"`` (in that order), each
        mapping to a list with one entry per kept phrase, in the order the
        phrases appear in ``doc._.phrases``.
    """
    kept_phrases = [p for p in doc._.phrases if p.rank >= rank_threshold]

    # Column-oriented layout so the result can be fed straight to a DataFrame.
    return {
        "text": [p.text for p in kept_phrases],
        "rank": [p.rank for p in kept_phrases],
        "count": [p.count for p in kept_phrases],
    }

In [5]:
def create_df_with_bag_of_words_per_link(ex_df, text_col='article_keywords'):
    """Collapse the keywords of every article link into one bag-of-words string.

    Parameters
    ----------
    ex_df : pandas.DataFrame
        Must contain an 'article_link' column and the column named by
        ``text_col``; any other columns are ignored.
    text_col : str, default 'article_keywords'
        Name of the column holding the keyword strings to merge.

    Returns
    -------
    pandas.Series
        Despite the function name, the result is a Series (callers use
        ``.to_frame()`` when they need a DataFrame). It is indexed by
        'article_link' and named ``text_col``; each value is the link's
        distinct keywords, in first-seen order, joined by single spaces.
    """
    # unique() drops repeated keywords within a link while keeping their order.
    unique_keywords = ex_df.groupby('article_link')[text_col].unique()
    return unique_keywords.str.join(' ')

1. Run an Example

In [6]:
# Location of the CSV holding the posts that survived filtering.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook cannot be re-run on another machine without editing it.
path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\after_filtering_articles_reddit_and_crunch\filtered_articles.csv"

In [7]:
# Load the filtered posts; the later cells only use the 'link' and 'title' columns.
df  = pd.read_csv(path)

In [8]:
# Display the loaded frame as a quick visual sanity check.
df

Unnamed: 0.1,Unnamed: 0,link,title
0,0,https://www.globalconstructionreview.com/scien...,Scientists can now build structures with swarm...
1,1,https://spectrum.ieee.org/4d-printing-microscale,Micro 4D Printing Builds on Programmable Matter
2,2,https://interestingengineering.com/innovation/...,This startup 3D prints tiny homes from recycla...
3,4,https://www.theguardian.com/environment/2022/o...,‘A growing machine’: Scotland looks to vertica...
4,5,https://www.geekwire.com/2022/eviation-all-ele...,‘It was wonderful’: Eviation’s Alice electric ...
...,...,...,...
638,200,https://techcrunch.com/2020/11/20/onit-acquire...,Onit acquires legal startup McCarthyFinch to i...
639,201,https://techcrunch.com/2020/10/28/priori-serie...,Priori raises $6.3M to help large companies hi...
640,202,https://techcrunch.com/2018/08/17/klarity-uses...,Klarity uses AI to strip drudgery from contrac...
641,203,https://techcrunch.com/2018/04/12/helpself-use...,HelpSelf uses simple AI to help those in legal...


In [9]:
# Parallel lists: urls[i] and titles[i] come from the same row of df.
urls = df['link'].to_list()
titles= df['title'].to_list()

In [15]:
# Check whether this specific article link is present in the loaded data.
'https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028' in urls

True

In [17]:
# Find every position in `urls` holding this link (a list, in case it appears more than once).
[i for i, url in enumerate(urls) if url=='https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028']

[294]

In [18]:
# Show the link stored at position 294 of `urls`.
urls[294]

'https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028'

In [11]:
def plot_ranks(d):
    """Show a line plot of the rank scores stored in ``d['rank']``.

    Parameters
    ----------
    d : dict
        Must contain a ``'rank'`` entry holding a sequence of scores, e.g.
        the dictionary returned by ``analyze_text_textrank``.

    The x axis is the position of each score in the sequence and the y axis
    is the score itself. The figure is displayed; nothing is returned.
    """
    y_rank = d['rank']
    x_index = list(range(len(y_rank)))
    fig = px.line(
        x=x_index,
        y=y_rank,
        title="TextRank scores by phrase position",
        # With array inputs plotly names the axes "x" and "y"; give them
        # readable labels so the figure stands on its own.
        labels={"x": "phrase index", "y": "rank"},
    )
    fig.show()
    

In [19]:
# Rank the phrases of a single example title: the entry at index 294 of
# `titles`. The default rank_threshold (0.0) is used here.
rank_dict = analyze_text_textrank(doc = nlp(titles[294]))

Elbow plot for rank scores

In [20]:
# Plot the example title's rank scores to see where they level off.
plot_ranks(rank_dict)

In [26]:
# Minimum rank a phrase must reach to be kept; passed to
# analyze_text_textrank as `rank_threshold` in the cells below.
# NOTE(review): presumably chosen by eye from the elbow plot - confirm.
threshold_rank = 0.05

In [27]:
# Analyze every title with TextRank and gather the phrases whose rank is at
# least `threshold_rank` into one long table (one row per kept phrase).
#
# NOTE: the column names are kept unchanged for downstream compatibility, but
# they are misleading: 'article_keywords' holds the phrase text and
# 'title_keywords' holds the phrase's rank score.
cols_title_df = ['article_keywords', 'title_keywords', 'count']

# Collect the per-title frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated rows on every iteration.
title_frames = []
for url, title in zip(urls, titles):
    title_doc = nlp(title)
    title_dict = analyze_text_textrank(title_doc, threshold_rank)

    temp_title_df = pd.DataFrame.from_dict(title_dict)
    temp_title_df.columns = cols_title_df
    temp_title_df['article_link'] = url  # scalar is broadcast to every row
    title_frames.append(temp_title_df)

if title_frames:
    # The index is intentionally not reset, so it restarts at 0 per title.
    title_df = pd.concat(title_frames)
else:
    # No titles at all: still produce a frame with the expected columns.
    title_df = pd.DataFrame([], columns=cols_title_df + ['article_link'])

In [28]:
# Inspect the combined per-phrase table. The index repeats (0, 1, 2, ...) for
# each article because the per-title frames are concatenated without
# resetting it.
title_df

Unnamed: 0,article_keywords,title_keywords,count,article_link
0,flying drones,0.349876,1,https://www.globalconstructionreview.com/scien...
1,swarms,0.198159,1,https://www.globalconstructionreview.com/scien...
2,structures,0.174922,1,https://www.globalconstructionreview.com/scien...
3,Scientists,0.136302,1,https://www.globalconstructionreview.com/scien...
0,Programmable Matter,0.349876,1,https://spectrum.ieee.org/4d-printing-microscale
...,...,...,...,...
0,Better Produce,0.290647,1,https://singularityhub.com/2022/10/26/from-pit...
1,Softer Kale,0.171403,2,https://singularityhub.com/2022/10/26/from-pit...
2,CRISPR,0.164192,1,https://singularityhub.com/2022/10/26/from-pit...
3,Pitless Cherries,0.133561,2,https://singularityhub.com/2022/10/26/from-pit...


In [29]:
# Keep only the phrase text and the link, merge each link's phrases into one
# string, and turn the resulting Series into a one-column DataFrame.
out_title_df = create_df_with_bag_of_words_per_link(title_df[['article_keywords', 'article_link']]).to_frame()

In [30]:
# Save the per-link bag of words to CSV.
# NOTE(review): the recorded run of this cell raised PermissionError -
# presumably the target file was open in another program; confirm and re-run.
# The destination is also an absolute, machine-specific Windows path.
out_title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\titles_textrank.csv")

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Ravit\\Documents\\rnd\\horizon_scanning_lab\\articles\\analyze_Articles\\final_results\\titles_textrank.csv'

'The catastrophic threat of thawing permafrost hangs over us all'

In [42]:
# These titles were added manually because they were scraped improperly.
# sub_titles[i] is the corrected title for the link in sub_urls[i].
sub_titles = [
    'New approach to ‘cosmic magnet’ manufacturing could reduce reliance on rare earths in low-carbon technologies',
    'Russian University Vows to Build 7nm Chipmaking Tools',
    'New Form Of Cotton Can Grab Nearly Three And A Half Times Its Own Weight Of Water Out Of Thin Air',
]
sub_urls = [
    'https://www.cam.ac.uk/research/news/new-approach-to-cosmic-magnet-manufacturing-could-reduce-reliance-on-rare-earths-in-low-carbon',
    'https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028',
    'https://techcrunch.com/2013/01/22/new-form-of-cotton-can-grab-nearly-three-and-a-half-times-its-own-weight-of-water-out-of-thin-air/',
]

# NOTE: column names are kept for downstream compatibility, but
# 'article_keywords' holds the phrase text and 'title_keywords' its rank score.
cols_title_df = ['article_keywords', 'title_keywords', 'count']

# Collect the per-title frames and concatenate once at the end instead of
# growing the DataFrame with pd.concat on every iteration.
sub_title_frames = []
for url, title in zip(sub_urls, sub_titles):
    title_doc = nlp(title)
    title_dict = analyze_text_textrank(title_doc, threshold_rank)

    temp_title_df = pd.DataFrame.from_dict(title_dict)
    temp_title_df.columns = cols_title_df
    temp_title_df['article_link'] = url  # scalar is broadcast to every row
    sub_title_frames.append(temp_title_df)

if sub_title_frames:
    sub_title_df = pd.concat(sub_title_frames)
else:
    # Nothing to analyze: still produce a frame with the expected columns.
    sub_title_df = pd.DataFrame([], columns=cols_title_df + ['article_link'])

In [43]:
#The rows were added manually to the csv after saving
create_df_with_bag_of_words_per_link(sub_title_df).to_frame()['article_keywords'].to_list()

['Thin Air Water New Form Cotton Its Own Weight And A Half Times',
 'rare earths low-carbon technologies reliance New approach cosmic magnet’ manufacturing',
 'Chipmaking Tools Russian University Vows 7nm']