In [42]:
import pandas as pd
from newspaper import Article
import spacy
import plotly.express as px
import numpy as np
import pytextrank

In [43]:
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x13a290e0dc0>

In [44]:
def get_article_abstract(url):
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    title = article.title
    return text

In [45]:
def analyze_text_textrank(doc, rank_threshold=0.0):
    # examine the top-ranked phrases in the document
    rank_d = {"text":[], "rank":[], "count":[]}

    for phrase in doc._.phrases:
        if phrase.rank >= rank_threshold:
            rank_d['text'].append(phrase.text)
            rank_d['rank'].append(phrase.rank)
            rank_d['count'].append(phrase.count)
    
    return rank_d 

In [46]:
def create_df_with_bag_of_words_per_link(ex_df):
    "takes a df with 2 columns, 'article_link' and a text column either called 'article_keywords"
    "and returns a df with bag of words for each link"
    text_col = 'article_keywords'
    return ex_df.groupby('article_link')[text_col].unique().str.join(' ')

1. Run an Example

In [47]:
#read links from titles that were filtered with jaccard and textrank 
path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\titles_textrank_before_final_filtering.csv"

In [48]:
df  = pd.read_csv(path,encoding='cp1252')

In [49]:
df

Unnamed: 0,article_link,article_keywords
0,http://jhr.uwpress.org/content/early/2022/10/0...,Cumulative Lead Exposure Reduces Academic Achi...
1,http://www.exeter.ac.uk/news/research/title_93...,better adult blue spaces Positive childhood ex...
2,https://9to5mac.com/2022/10/25/apple-watch-blo...,Apple Watch blood oxygen sensor Apple Watch â€...
3,https://9to5mac.com/2022/10/27/apple-cant-make...,demand enough iPhone 14 Pros Apple Tim Cook iP...
4,https://aacrjournals.org/clincancerres/article...,Clinical Oncology Cancer Research Electronic N...
...,...,...
637,https://www.youtube.com/watch?v=mQ1ln7zxpA4,your browser
638,https://www.zdnet.com/article/criminals-are-us...,drones trucks GPS jammers them
639,https://www.zdnet.com/article/nasa-develops-a-...,"water the Moon NASA NASAs new tiny, high-power..."
640,https://www.zdnet.com/article/this-nasa-space-...,EV This NASA space tech NASA


In [50]:
urls = df['article_link'].to_list()


In [51]:
urls[0]

'http://jhr.uwpress.org/content/early/2022/10/03/jhr.0222-12169R2.abstract'

In [52]:
get_article_abstract(urls[0])

'Abstract\n\nWe study how ambient lead exposure impacts learning in elementary school by leveraging a natural experiment where a large national automotive racing organization switched from leaded to unleaded fuel. We find increased levels and duration of exposure to lead negatively affect academic performance, shift the entire academic performance distribution, and negatively impact both younger and older children. The average treated student in our setting has an expected income reduction of $5,200 in present value terms. Avoiding said treatment has an effect size similar to improving teacher value added by one-fourth of a standard deviation, reducing class size by 3 students, or increasing school spending per pupil by $750. The marginal impacts of lead are larger in impoverished, non-white counties, and among students with greater duration of exposure, even after controlling for the total quantity of exposure.'

In [53]:
#In case there's a website that's not accessible. Try to scrape from it another way
prob_websites = []

abstracts = {}
for i, url in enumerate(urls):
    try:
        abstracts[str(i)] = get_article_abstract(url)
    except:
        prob_websites.append(url)
        abstracts[str(i)] = []

In [54]:
prob_websites = list(set(prob_websites))

In [55]:
prob_websites

['https://www.sciencedirect.com/science/article/abs/pii/S009265662200126X',
 'https://thehill.com/policy/energy-environment/3685583-nasa-suggests-new-space-cooling-technology-could-charge-electric-cars-in-5-minutes/',
 'https://newsroom.unsw.edu.au/news/science-tech/engineers-light-way-toward-bionics-future?utm_source=reddit&utm_medium=social',
 'https://techcrunch.com/2022/03/21/to-raise-a-fund-this-agtech-outfit-built-a-content-company-first-now-it-has-60-million-to-put-to-work/',
 'https://www.newsweek.com/baby-alpaca-learns-walk-again-prosthetic-leg-adorable-clip-1749410?',
 'https://www.sciencedirect.com/science/article/abs/pii/S0025326X22007627?via%3Dihub',
 'https://www.sciencedirect.com/science/article/abs/pii/S0925492722001172',
 'https://techcrunch.com/2022/03/01/zero-systems-gets-12m-series-a-to-bring-automation-to-professional-services/',
 'https://thehill.com/policy/energy-environment/3669504-supreme-court-to-hear-case-that-could-have-massive-impact-on-us-water-quality/',


In [56]:
prob_websites[0].split('/')[2]

'www.sciencedirect.com'

In [57]:
prob_sources = [url.split('/')[2] for url in prob_websites]

In [58]:
prob_sources

['www.sciencedirect.com',
 'thehill.com',
 'newsroom.unsw.edu.au',
 'techcrunch.com',
 'www.newsweek.com',
 'www.sciencedirect.com',
 'www.sciencedirect.com',
 'techcrunch.com',
 'thehill.com',
 'www.sciencedirect.com',
 'apnews.com',
 'apnews.com',
 'www.cnbc.com',
 'www.sciencedirect.com',
 'www.lightreading.com',
 'www.wsj.com',
 'techcrunch.com',
 'www.abc.net.au',
 'eandt.theiet.org',
 'techcrunch.com',
 'onlinelibrary.wiley.com',
 'thehill.com',
 'www.uq.edu.au',
 'cosmosmagazine.com',
 'www.techdirt.com',
 'jamanetwork.com',
 'www.sciencedirect.com',
 'academic.oup.com',
 'techcrunch.com',
 'academic.oup.com',
 'www.techpowerup.com',
 'jamanetwork.com',
 'techcrunch.com',
 'onlinelibrary.wiley.com',
 'jamanetwork.com',
 'agupubs.onlinelibrary.wiley.com',
 'edition.cnn.com',
 'www.smithsonianmag.com',
 'www.sciencedirect.com',
 'edition.cnn.com',
 'www.techdirt.com',
 'techcrunch.com',
 'cleantechnica.com',
 'techcrunch.com',
 'www.wsj.com',
 'www.technologynetworks.com',
 'techc

In [78]:

abstracts = [abs.replace('\n','') for abs in abstracts]
abstracts = [abs.replace('\t','') for abs in abstracts]

In [101]:
abstracts_ = []

for abs in abstracts.values():
    if abs == []:
        abstracts_.append([])
    else:
        abs = abs.replace('\n','')
        abs = abs.replace('\t','')
        abs = abs.replace('Abstract','')
        abstracts_.append(abs)

In [102]:
#save prob urls to csv
prob_path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\tough_urls\prob_urls.csv"
pd.DataFrame(prob_websites, columns=['links']).to_csv(prob_path)

abs_path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\abstracts_full.csv"
abstracts_df = pd.DataFrame([], columns=['link', 'abstract'])
abstracts_df['link'] = urls
abstracts_df['abstract'] = list(abstracts_)
abstracts_df.to_csv(abs_path)

In [103]:
#SEARCH FOR ARTICLE LINKS
abstracts_df

Unnamed: 0,link,abstract
0,http://jhr.uwpress.org/content/early/2022/10/0...,We study how ambient lead exposure impacts lea...
1,http://www.exeter.ac.uk/news/research/title_93...,Positive childhood experiences of blue spaces ...
2,https://9to5mac.com/2022/10/25/apple-watch-blo...,A new validation study published this month pu...
3,https://9to5mac.com/2022/10/27/apple-cant-make...,Apple’s fiscal Q4 earnings are out and while t...
4,https://aacrjournals.org/clincancerres/article...,Combustible tobacco use has reached historic l...
...,...,...
637,https://www.youtube.com/watch?v=mQ1ln7zxpA4,
638,https://www.zdnet.com/article/criminals-are-us...,By PopTika -- ShutterstockSatellite navigation...
639,https://www.zdnet.com/article/nasa-develops-a-...,Image: NASA/Michael GiuntoPrevious technologie...
640,https://www.zdnet.com/article/this-nasa-space-...,Image: Getty Images/Jung GettyFully charging a...


In [104]:
#try to scrape abstracts from other sources too


['www.sciencedirect.com',
 'thehill.com',
 'newsroom.unsw.edu.au',
 'techcrunch.com',
 'www.newsweek.com',
 'www.sciencedirect.com',
 'www.sciencedirect.com',
 'techcrunch.com',
 'thehill.com',
 'www.sciencedirect.com',
 'apnews.com',
 'apnews.com',
 'www.cnbc.com',
 'www.sciencedirect.com',
 'www.lightreading.com',
 'www.wsj.com',
 'techcrunch.com',
 'www.abc.net.au',
 'eandt.theiet.org',
 'techcrunch.com',
 'onlinelibrary.wiley.com',
 'thehill.com',
 'www.uq.edu.au',
 'cosmosmagazine.com',
 'www.techdirt.com',
 'jamanetwork.com',
 'www.sciencedirect.com',
 'academic.oup.com',
 'techcrunch.com',
 'academic.oup.com',
 'www.techpowerup.com',
 'jamanetwork.com',
 'techcrunch.com',
 'onlinelibrary.wiley.com',
 'jamanetwork.com',
 'agupubs.onlinelibrary.wiley.com',
 'edition.cnn.com',
 'www.smithsonianmag.com',
 'www.sciencedirect.com',
 'edition.cnn.com',
 'www.techdirt.com',
 'techcrunch.com',
 'cleantechnica.com',
 'techcrunch.com',
 'www.wsj.com',
 'www.technologynetworks.com',
 'techc

In [31]:
#TODO EXTRACT ABSTRACTS FROM ARTICLES

In [51]:
df_abstracts = pd.DataFrame([abstracts]).T

In [52]:
#save abstracts to csv
abst_save_path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\raw_abstracts_before_filtering.csv"
df_abstracts.to_csv(abst_save_path)

In [53]:
def plot_ranks(d):
    y_rank = d['rank']
    x_index = list(np.arange(len(y_rank)))
    fig = px.line(x=x_index, y=y_rank)
    fig.show()
    

In [59]:
abst_list = df_abstracts[0].to_list()

In [83]:
#remove space and tabs
abst_list = [abs.replace('\n','').replace('\t', '').replace('/', ' ') for abs in abst_list]

In [85]:
abst_list[2]

'In July, an HIV-positive man became the first volunteer in a clinical trial aimed at using Crispr gene editing to snip the AIDS-causing virus out of his cells. For an hour, he was hooked up to an IV bag that pumped the experimental treatment directly into his bloodstream. The one-time infusion is designed to carry the gene-editing tools to the man’s infected cells to clear the virus.Later this month, the volunteer will stop taking the antiretroviral drugs he’s been on to keep the virus at undetectable levels. Then, investigators will wait 12 weeks to see if the virus rebounds. If not, they’ll consider the experiment a success. “What we’re trying to do is return the cell to a near-normal state,” says Daniel Dornbusch, CEO of Excision BioTherapeutics, the San Francisco-based biotech company that’s running the trial.The HIV virus attacks immune cells in the body called CD4 cells and hijacks their machinery to make copies of itself. But some HIV-infected cells can go dormant—sometimes for

In [68]:
rank_dict = analyze_text_textrank(doc = nlp(abst_list[4]))

In [69]:
rank_dict

{'text': ['many plant ailments',
  'plants',
  'new antibiotic compounds',
  'other antibiotic compounds',
  'fungal disease',
  'Dickeya solani',
  'new compounds',
  'Candida albicans',
  'dangerous infections',
  'Dr. Rita Monson',
  'Candida fungi',
  'Rita Monson',
  'potatoes',
  'Antibiotic resistance',
  'solanimycin',
  'disease',
  'existing antibiotics',
  'Most therapeutic antibiotics',
  'Candida',
  'detail',
  'Monson',
  'years',
  'plant and animal models',
  'plant-based microorganisms',
  'fungal competitors',
  'yeast',
  'Computer illustration',
  'human and plant health',
  'Dickeya',
  'Exciting new research',
  'Getty Images',
  'C. albicans',
  'soil microbes',
  'clinical settings',
  'genes',
  'Cambridge',
  'a new antifungal antibiotic',
  'control',
  'Antiobiotic resistance',
  'chemists',
  'genitals',
  'both plants',
  'science stories',
  'A',
  'agriculture',
  'The bacteria',
  'a bacteria',
  'the bacteria',
  'millions',
  'the bacterium’s genome'

Elbow plot for rank scores

In [70]:
plot_ranks(rank_dict)

In [26]:
threshold_rank = 0.05

In [27]:
#ANALYZE TITLES WITH TEXTRANK

cols_title_df = ['article_keywords', 'title_keywords', 'count']
title_df = pd.DataFrame([], columns=cols_title_df)


for url, title in zip(urls, titles):
     title_doc = nlp(title)
     title_dict = analyze_text_textrank(title_doc, threshold_rank)

     temp_title_df = pd.DataFrame.from_dict(title_dict)
     temp_title_df.columns = cols_title_df
     temp_title_df['article_link'] = [url]*len(temp_title_df)
     title_df = pd.concat([title_df, temp_title_df])

In [28]:
title_df

Unnamed: 0,article_keywords,title_keywords,count,article_link
0,flying drones,0.349876,1,https://www.globalconstructionreview.com/scien...
1,swarms,0.198159,1,https://www.globalconstructionreview.com/scien...
2,structures,0.174922,1,https://www.globalconstructionreview.com/scien...
3,Scientists,0.136302,1,https://www.globalconstructionreview.com/scien...
0,Programmable Matter,0.349876,1,https://spectrum.ieee.org/4d-printing-microscale
...,...,...,...,...
0,Better Produce,0.290647,1,https://singularityhub.com/2022/10/26/from-pit...
1,Softer Kale,0.171403,2,https://singularityhub.com/2022/10/26/from-pit...
2,CRISPR,0.164192,1,https://singularityhub.com/2022/10/26/from-pit...
3,Pitless Cherries,0.133561,2,https://singularityhub.com/2022/10/26/from-pit...


In [29]:
out_title_df = create_df_with_bag_of_words_per_link(title_df[['article_keywords', 'article_link']]).to_frame()

In [30]:
out_title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\titles_textrank.csv")

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Ravit\\Documents\\rnd\\horizon_scanning_lab\\articles\\analyze_Articles\\final_results\\titles_textrank.csv'

'The catastrophic threat of thawing permafrost hangs over us all'

In [42]:
#This titles are added manually due to imprpoer scrapping
sub_titles = ['New approach to ‘cosmic magnet’ manufacturing could reduce reliance on rare earths in low-carbon technologies', 'Russian University Vows to Build 7nm Chipmaking Tools', 'New Form Of Cotton Can Grab Nearly Three And A Half Times Its Own Weight Of Water Out Of Thin Air']
sub_urls = ['https://www.cam.ac.uk/research/news/new-approach-to-cosmic-magnet-manufacturing-could-reduce-reliance-on-rare-earths-in-low-carbon', 'https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028', 'https://techcrunch.com/2013/01/22/new-form-of-cotton-can-grab-nearly-three-and-a-half-times-its-own-weight-of-water-out-of-thin-air/']

cols_title_df = ['article_keywords', 'title_keywords', 'count']
sub_title_df = pd.DataFrame([], columns=cols_title_df)


for url, title in zip(sub_urls, sub_titles):
     title_doc = nlp(title)
     title_dict = analyze_text_textrank(title_doc, threshold_rank)

     temp_title_df = pd.DataFrame.from_dict(title_dict)
     temp_title_df.columns = cols_title_df
     temp_title_df['article_link'] = [url]*len(temp_title_df)
     sub_title_df = pd.concat([sub_title_df, temp_title_df])

In [41]:
#The rows were added manually to the csv after saving
create_df_with_bag_of_words_per_link(sub_title_df).to_frame()['article_keywords'].to_list()

['Thin Air Water New Form Cotton Its Own Weight And A Half Times',
 'rare earths low-carbon technologies reliance New approach cosmic magnet’ manufacturing',
 'Chipmaking Tools Russian University Vows 7nm']

In [21]:
#TODO FIGURE OUT WHY SCRAPPING ABSTRACTS IS NOT WORKING ANYMORE
'''
problematic_urls = []

cols_raw_texts = ['abstract', 'title', 'articlae_link']
raw_txt_df = pd.DataFrame([], columns=cols_raw_texts)

cols_title_df = ['article_keywords', 'title_keywords', 'count']
title_df = pd.DataFrame([], columns=cols_title_df)

cols_text_df = ['sentence', 'sentence_rank', 'count']
text_df = pd.DataFrame([], columns=cols_text_df)


for url in urls:
     try:
          text, title = get_article_text_and_title(url)
          temp_raw_df = pd.DataFrame([text, title, url], columns=cols_raw_texts)
          raw_txt_df = pd.concat([raw_txt_df, temp_raw_df])
     except:
          print("unable to extract url for ", url)
          problematic_urls.append(url)
          continue
   #  text_doc = nlp(text)
    # title_doc = nlp(title)
     #text_dict = analyze_text_textrank(text_doc, threshold_rank)
     #title_dict = analyze_text_textrank(title_doc, threshold_rank)

    # temp_text_df = pd.DataFrame.from_dict(text_dict)
     #temp_text_df.columns = cols_text_df
    # temp_text_df['article_link'] = [url]*len(temp_text_df)
    # text_df = pd.concat([text_df, temp_text_df])
 
     #temp_title_df = pd.DataFrame.from_dict(title_dict)
     #temp_title_df.columns = cols_title_df
     #temp_title_df['article_link'] = [url]*len(temp_title_df)
     #title_df = pd.concat([title_df, temp_title_df])'''

'\nproblematic_urls = []\n\ncols_raw_texts = [\'abstract\', \'title\', \'articlae_link\']\nraw_txt_df = pd.DataFrame([], columns=cols_raw_texts)\n\ncols_title_df = [\'article_keywords\', \'title_keywords\', \'count\']\ntitle_df = pd.DataFrame([], columns=cols_title_df)\n\ncols_text_df = [\'sentence\', \'sentence_rank\', \'count\']\ntext_df = pd.DataFrame([], columns=cols_text_df)\n\n\nfor url in urls:\n     try:\n          text, title = get_article_text_and_title(url)\n          temp_raw_df = pd.DataFrame([text, title, url], columns=cols_raw_texts)\n          raw_txt_df = pd.concat([raw_txt_df, temp_raw_df])\n     except:\n          print("unable to extract url for ", url)\n          problematic_urls.append(url)\n          continue\n   #  text_doc = nlp(text)\n    # title_doc = nlp(title)\n     #text_dict = analyze_text_textrank(text_doc, threshold_rank)\n     #title_dict = analyze_text_textrank(title_doc, threshold_rank)\n\n    # temp_text_df = pd.DataFrame.from_dict(text_dict)\n     

In [None]:
#title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_titles_textrank.csv")
#text_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_texts_textrank.csv")
#
#with open(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_problematic_articles.txt", 'w') as f:
 #   f.write(str(problematic_urls))

In [None]:
#crunch_title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_titles_textrank.csv")
#crunch_text_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_texts_textrank.csv")

#with open(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_problematic_articles.txt", 'w') as f:
 #   f.write(str(crunch_problematic_urls))