In [1]:
import pandas as pd
from newspaper import Article
import spacy
import plotly.express as px
import numpy as np
import pytextrank

In [2]:
# Load a spaCy model; pick one appropriate for the language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank to the spaCy pipeline. Documents produced by `nlp(...)` are
# later read through `doc._.phrases` in `analyze_text_textrank`.
# `add_pipe` is the last expression of the cell, so the component it returns
# is echoed as the cell output.
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x1ccb05bbf10>

In [3]:
def get_article_text_and_title(url):
    """Download and parse the article at ``url`` and return ``(text, title)``.

    The page is fetched and parsed through ``newspaper.Article``; any error
    raised by the download or parse step propagates to the caller.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    return fetched.text, fetched.title

In [4]:
def analyze_text_textrank(doc, rank_threshold=0.0):
    """Collect the phrases of ``doc`` whose rank is at least ``rank_threshold``.

    ``doc._.phrases`` is traversed once, in its existing order. The result is
    a dict of three parallel lists under the keys ``"text"``, ``"rank"`` and
    ``"count"``, holding the corresponding attribute of every kept phrase.
    """
    # Filter in a single pass so the phrase iterable is consumed only once.
    kept = [candidate for candidate in doc._.phrases
            if candidate.rank >= rank_threshold]

    return {
        "text": [candidate.text for candidate in kept],
        "rank": [candidate.rank for candidate in kept],
        "count": [candidate.count for candidate in kept],
    }

In [18]:
def create_df_with_bag_of_words_per_link(ex_df, text_col='article_keywords', link_col='article_link'):
    """Build one space-separated "bag of words" string per article link.

    Parameters
    ----------
    ex_df : pandas.DataFrame
        Frame containing at least the columns named by ``link_col`` and
        ``text_col``.
    text_col : str, default 'article_keywords'
        Column holding the keyword / phrase strings to aggregate.
    link_col : str, default 'article_link'
        Column identifying the article each row belongs to.

    Returns
    -------
    pandas.Series
        Indexed by the distinct values of ``link_col``. Each value is the
        group's unique ``text_col`` entries joined with a single space.
        Despite the function name, the result is a Series; call
        ``.to_frame()`` on it when a DataFrame is needed.
    """
    # Rows are de-duplicated within each link before joining, so a phrase
    # that appears several times for the same article is emitted once.
    unique_per_link = ex_df.groupby(link_col)[text_col].unique()
    return unique_per_link.str.join(' ')

1. Run an Example

In [6]:
# Location of the CSV with the already-filtered posts; it is read into `df` in the next cell.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs as-is on the original author's machine.
path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\after_filtering_articles_reddit_and_crunch\filtered_articles.csv"

In [7]:
# Load the filtered articles; later cells read the 'link' and 'title' columns.
# NOTE(review): the displayed frame also contains 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, presumably row indexes written out by earlier CSV exports - confirm.
df  = pd.read_csv(path)

In [8]:
df

Unnamed: 0.1,Unnamed: 0,link,title
0,0,https://www.globalconstructionreview.com/scien...,Scientists can now build structures with swarm...
1,1,https://spectrum.ieee.org/4d-printing-microscale,Micro 4D Printing Builds on Programmable Matter
2,2,https://interestingengineering.com/innovation/...,This startup 3D prints tiny homes from recycla...
3,4,https://www.theguardian.com/environment/2022/o...,‘A growing machine’: Scotland looks to vertica...
4,5,https://www.geekwire.com/2022/eviation-all-ele...,‘It was wonderful’: Eviation’s Alice electric ...
...,...,...,...
224,382,https://www.pcgamer.com/a-single-chip-has-mana...,A single chip has managed to transfer the enti...
225,383,https://www.express.co.uk/life-style/science-t...,WhatsApp down: Chat app hit by second huge out...
226,384,https://arstechnica.com/tech-policy/2022/10/co...,Comcast wants Internet users to pay more becau...
227,385,https://www.visaliatimesdelta.com/story/news/2...,"In high-poverty L.A. neighborhoods, the poor p..."


In [9]:
# Parallel lists of article links and titles; they are zipped together
# when the titles are run through TextRank further down.
urls = df['link'].to_list()
titles= df['title'].to_list()

In [10]:
def plot_ranks(d, title="Elbow plot for rank scores"):
    """Show a line plot of the rank scores stored in ``d['rank']``.

    Parameters
    ----------
    d : dict
        Mapping with a ``'rank'`` entry holding a sequence of scores, such as
        the dict returned by ``analyze_text_textrank``.
    title : str, default "Elbow plot for rank scores"
        Title drawn above the figure.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    y_rank = d['rank']
    # The x axis is simply the position of each score within the sequence.
    x_index = list(np.arange(len(y_rank)))
    fig = px.line(x=x_index, y=y_rank)
    # Without explicit labels plotly falls back to the generic "x" / "y".
    fig.update_layout(
        title=title,
        xaxis_title="phrase index",
        yaxis_title="rank score",
    )
    fig.show()
    

In [24]:
titles[20]

"Major breakthrough in cancer research: Papers reveal 'dark matter' that contributes to disease's growth"

In [25]:
# Rank the phrases of a single example title (index 20, displayed in the previous cell)
# with the default threshold of 0.0, so every phrase is kept for the elbow plot.
rank_dict = analyze_text_textrank(doc = nlp(titles[20]))

Elbow plot for rank scores

In [26]:
plot_ranks(rank_dict)

In [27]:
# Minimum rank a phrase must reach to be kept when analysing the titles.
# NOTE(review): presumably chosen by eye from the elbow plot above - confirm.
threshold_rank = 0.15

In [28]:
#ANALYZE TITLES WITH TEXTRANK

# Column names applied, in order, to the 'text', 'rank' and 'count' lists
# returned by analyze_text_textrank.
# NOTE(review): 'article_keywords' therefore holds phrases extracted from the
# *title*, and 'title_keywords' holds the numeric rank score. The names are
# kept unchanged because later cells select columns by these names.
cols_title_df = ['article_keywords', 'title_keywords', 'count']

# Collect one small frame per article and concatenate once at the end.
# Calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, and seeding it with an empty object-dtype frame degrades dtypes.
title_frames = []
for url, title in zip(urls, titles):
    title_doc = nlp(title)
    title_dict = analyze_text_textrank(title_doc, threshold_rank)

    temp_title_df = pd.DataFrame.from_dict(title_dict)
    temp_title_df.columns = cols_title_df
    temp_title_df['article_link'] = url

    # Titles with no phrase above the threshold contribute no rows; skipping
    # their empty frames keeps the numeric columns numeric after concat.
    if not temp_title_df.empty:
        title_frames.append(temp_title_df)

# Each per-article frame keeps its own 0..n-1 index, so index values repeat
# across articles in the combined frame.
if title_frames:
    title_df = pd.concat(title_frames)
else:
    # No phrases at all: still expose every column later cells rely on.
    title_df = pd.DataFrame([], columns=cols_title_df + ['article_link'])

In [29]:
title_df

Unnamed: 0,article_keywords,title_keywords,count,article_link
0,flying drones,0.349876,1,https://www.globalconstructionreview.com/scien...
1,swarms,0.198159,1,https://www.globalconstructionreview.com/scien...
2,structures,0.174922,1,https://www.globalconstructionreview.com/scien...
0,Programmable Matter,0.349876,1,https://spectrum.ieee.org/4d-printing-microscale
1,Micro 4D Printing Builds,0.268072,1,https://spectrum.ieee.org/4d-printing-microscale
...,...,...,...,...
1,Internet users,0.185230,1,https://arstechnica.com/tech-policy/2022/10/co...
0,internet service,0.240529,1,https://www.visaliatimesdelta.com/story/news/2...
0,Better Produce,0.290647,1,https://singularityhub.com/2022/10/26/from-pit...
1,Softer Kale,0.171403,2,https://singularityhub.com/2022/10/26/from-pit...


In [30]:
# Collapse the per-phrase rows into one space-joined string of unique phrases per
# article link, then turn the resulting Series into a single-column DataFrame.
out_title_df = create_df_with_bag_of_words_per_link(title_df[['article_keywords', 'article_link']]).to_frame()

In [31]:
# Write the per-link bag of words to CSV (the article link is written as the index column).
# NOTE(review): absolute, machine-specific Windows path - the target folder must exist.
out_title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\titles_textrank.csv")

In [21]:
#TODO FIGURE OUT WHY SCRAPING ABSTRACTS IS NOT WORKING ANYMORE
'''
problematic_urls = []

cols_raw_texts = ['abstract', 'title', 'articlae_link']
raw_txt_df = pd.DataFrame([], columns=cols_raw_texts)

cols_title_df = ['article_keywords', 'title_keywords', 'count']
title_df = pd.DataFrame([], columns=cols_title_df)

cols_text_df = ['sentence', 'sentence_rank', 'count']
text_df = pd.DataFrame([], columns=cols_text_df)


for url in urls:
     try:
          text, title = get_article_text_and_title(url)
          temp_raw_df = pd.DataFrame([text, title, url], columns=cols_raw_texts)
          raw_txt_df = pd.concat([raw_txt_df, temp_raw_df])
     except:
          print("unable to extract url for ", url)
          problematic_urls.append(url)
          continue
   #  text_doc = nlp(text)
    # title_doc = nlp(title)
     #text_dict = analyze_text_textrank(text_doc, threshold_rank)
     #title_dict = analyze_text_textrank(title_doc, threshold_rank)

    # temp_text_df = pd.DataFrame.from_dict(text_dict)
     #temp_text_df.columns = cols_text_df
    # temp_text_df['article_link'] = [url]*len(temp_text_df)
    # text_df = pd.concat([text_df, temp_text_df])
 
     #temp_title_df = pd.DataFrame.from_dict(title_dict)
     #temp_title_df.columns = cols_title_df
     #temp_title_df['article_link'] = [url]*len(temp_title_df)
     #title_df = pd.concat([title_df, temp_title_df])'''

'\nproblematic_urls = []\n\ncols_raw_texts = [\'abstract\', \'title\', \'articlae_link\']\nraw_txt_df = pd.DataFrame([], columns=cols_raw_texts)\n\ncols_title_df = [\'article_keywords\', \'title_keywords\', \'count\']\ntitle_df = pd.DataFrame([], columns=cols_title_df)\n\ncols_text_df = [\'sentence\', \'sentence_rank\', \'count\']\ntext_df = pd.DataFrame([], columns=cols_text_df)\n\n\nfor url in urls:\n     try:\n          text, title = get_article_text_and_title(url)\n          temp_raw_df = pd.DataFrame([text, title, url], columns=cols_raw_texts)\n          raw_txt_df = pd.concat([raw_txt_df, temp_raw_df])\n     except:\n          print("unable to extract url for ", url)\n          problematic_urls.append(url)\n          continue\n   #  text_doc = nlp(text)\n    # title_doc = nlp(title)\n     #text_dict = analyze_text_textrank(text_doc, threshold_rank)\n     #title_dict = analyze_text_textrank(title_doc, threshold_rank)\n\n    # temp_text_df = pd.DataFrame.from_dict(text_dict)\n     

In [None]:
#title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_titles_textrank.csv")
#text_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_texts_textrank.csv")
#
#with open(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_problematic_articles.txt", 'w') as f:
 #   f.write(str(problematic_urls))

In [None]:
#crunch_title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_titles_textrank.csv")
#crunch_text_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_texts_textrank.csv")

#with open(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_problematic_articles.txt", 'w') as f:
 #   f.write(str(crunch_problematic_urls))