In [2]:
import pandas as pd
from newspaper import Article
import spacy
import plotly.express as px
import numpy as np
import pytextrank

In [3]:
# Build the spaCy pipeline from the small English model; pick a different
# model name here for another language or a larger model.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

# Append the "textrank" component (supplied by the pytextrank import) so that
# parsed documents expose their ranked phrases through doc._.phrases.
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x21deafdc940>

In [4]:
# Sample article used for the single-URL walkthrough below.
# Note: the batch loops further down reuse the name `url` as their loop
# variable, so this value is overwritten once they run.
url = "https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028"


In [5]:
def get_article_text_and_title(url):
    """Download and parse the article at ``url`` with newspaper's ``Article``.

    Returns
    -------
    tuple
        ``(text, title)`` as exposed by the parsed article object.

    Any exception raised while downloading or parsing propagates to the
    caller; nothing is caught here.
    """
    page = Article(url)
    page.download()
    page.parse()
    return page.text, page.title

In [6]:
def analyze_text_textrank(doc, rank_threshold=0.0):
    """Collect the phrases of ``doc`` whose rank is at least ``rank_threshold``.

    Parameters
    ----------
    doc
        Object exposing an iterable ``doc._.phrases``; each phrase must have
        ``text``, ``rank`` and ``count`` attributes.
    rank_threshold : float, default 0.0
        Inclusive lower bound on ``phrase.rank``.

    Returns
    -------
    dict
        ``{"text": [...], "rank": [...], "count": [...]}`` with one entry per
        kept phrase, in the order the phrases appear in ``doc._.phrases``.
    """
    # Filter once, then split the surviving phrases into parallel columns.
    kept = [phrase for phrase in doc._.phrases if phrase.rank >= rank_threshold]
    return {
        "text": [phrase.text for phrase in kept],
        "rank": [phrase.rank for phrase in kept],
        "count": [phrase.count for phrase in kept],
    }

1. Run an Example

In [7]:
def plot_ranks(d):
    """Show an interactive line chart of rank scores (an elbow plot).

    Parameters
    ----------
    d : dict
        Mapping with a ``'rank'`` entry holding the scores to plot, in the
        order they should appear along the x-axis (for example the dict
        returned by ``analyze_text_textrank``).

    The x-axis is the position of each score inside ``d['rank']``. The figure
    is displayed via ``fig.show()``; nothing is returned.
    """
    y_rank = d['rank']
    x_index = list(range(len(y_rank)))
    fig = px.line(x=x_index, y=y_rank)
    # Title and axis labels so the figure can be read on its own.
    fig.update_layout(
        title="Rank score per phrase",
        xaxis_title="phrase index",
        yaxis_title="rank score",
    )
    fig.show()
    

In [8]:
# Fetch the body text and headline of the sample article.
text, title = get_article_text_and_title(url)

In [9]:
# Parse the sample text, then gather its ranked phrases. The default
# threshold of 0.0 keeps every phrase whose rank is >= 0.
sample_doc = nlp(text)
rank_dict = analyze_text_textrank(doc=sample_doc)

Elbow plot for rank scores

In [10]:
# Plot the sample article's rank scores by position to look for an elbow.
plot_ranks(rank_dict)

In [11]:
# Minimum rank a phrase must reach to be kept in the batch runs below.
# NOTE(review): presumably read off the elbow plot above — confirm.
threshold_rank = 0.05

In [12]:
# Load the two scraped link tables, one per source.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere without editing them.

# Reddit posts (after filtering)
path_reddit = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\Scrapers\reddit_RCS\first_iteration\final_results\after_cleaning\posts_after_filtering_with_titles.csv"
df_reddit = pd.read_csv(path_reddit)

# TechCrunch scraped article titles
path_techcrunch = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\Scrapers\techcrunch_RCS\scrapped_titles\techcrunch_scrapped_article_titles.csv"
df_techcrunch = pd.read_csv(path_techcrunch)

In [13]:
# Pull the article links out of each table as plain Python lists.
reddit_link_column = df_reddit['discussion_links']
crunch_link_column = df_techcrunch['link']

reddit_urls = reddit_link_column.tolist()
crunch_urls = crunch_link_column.tolist()

In [14]:
# Run TextRank over every Reddit-linked article (body text and title) and
# collect the phrases whose rank is at least `threshold_rank`.
#
# Results:
#   text_df          - one row per kept phrase from the article body
#   title_df         - one row per kept phrase from the article title
#   problematic_urls - links whose download/parse raised an exception
#
# NOTE(review): the TechCrunch cell further down repeats this logic line for
# line; a shared helper function would remove the duplication.
problematic_urls = []

# Column names are assigned positionally to the (text, rank, count) output of
# analyze_text_textrank.
# NOTE(review): for the title frame this means 'article_keywords' holds the
# phrase text and 'title_keywords' holds the rank score — the names look
# misleading but are kept because the saved CSVs already use them.
cols_title_df = ['article_keywords', 'title_keywords', 'count']
cols_text_df = ['sentence', 'sentence_rank', 'count']

# Per-article frames are gathered in lists and concatenated once after the
# loop; concatenating inside the loop re-copies all earlier rows every time.
# Each list starts with an empty frame so the result is still a valid frame
# with the expected columns when no article could be processed.
text_frames = [pd.DataFrame([], columns=cols_text_df)]
title_frames = [pd.DataFrame([], columns=cols_title_df)]

for url in reddit_urls:
    try:
        text, title = get_article_text_and_title(url)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running loop.
        print("unable to extract url for ", url)
        problematic_urls.append(url)
        continue

    text_doc = nlp(text)
    title_doc = nlp(title)
    text_dict = analyze_text_textrank(text_doc, threshold_rank)
    title_dict = analyze_text_textrank(title_doc, threshold_rank)

    temp_text_df = pd.DataFrame.from_dict(text_dict)
    temp_text_df.columns = cols_text_df
    temp_text_df['article_link'] = url  # scalar is broadcast to every row
    text_frames.append(temp_text_df)

    temp_title_df = pd.DataFrame.from_dict(title_dict)
    temp_title_df.columns = cols_title_df
    temp_title_df['article_link'] = url
    title_frames.append(temp_title_df)

# The per-article index is kept (no ignore_index), matching the row labels
# the original incremental concatenation produced.
text_df = pd.concat(text_frames)
title_df = pd.concat(title_frames)

unable to extract url for  https://www.newsweek.com/baby-alpaca-learns-walk-again-prosthetic-leg-adorable-clip-1749410?
unable to extract url for  https://academic.oup.com/humupd/article/28/4/457/6555833?login=false
unable to extract url for  https://www.fiercebiotech.com/research/ut-austin-scientists-design-safer-cas9-improved-crispr-gene-editing-accuracy
unable to extract url for  https://www.wsj.com/articles/metas-facebook-says-its-new-vr-headset-could-replace-workers-pcs-11665521118?st=w9xk0vs5je9lyo1&reflink=share_mobilewebshare
unable to extract url for  https://www.sciencedirect.com/science/article/pii/S1877050922012777
unable to extract url for  https://www.lightreading.com/security/the-cloud-and-5g-security-apocalypse-is-only-matter-of-time/d/d-id/781259
unable to extract url for  https://www.abc.net.au/news/science/2022-10-26/extreme-miyake-radiation-events-tree-rings-solar-storms/101563738
unable to extract url for  https://onlinelibrary.wiley.com/doi/full/10.1002/smj.3459
u

In [17]:
# Persist the Reddit results: two CSVs with the ranked phrases and a text file
# holding the string form of the list of links that could not be extracted.
reddit_titles_csv = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_titles_textrank.csv"
reddit_texts_csv = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_texts_textrank.csv"
reddit_problematic_txt = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\reddit_problematic_articles.txt"

title_df.to_csv(reddit_titles_csv)
text_df.to_csv(reddit_texts_csv)

with open(reddit_problematic_txt, 'w') as out_file:
    out_file.write(str(problematic_urls))

TechCrunch



In [22]:
# Run TextRank over every TechCrunch article (body text and title) and
# collect the phrases whose rank is at least `threshold_rank`.
#
# Results:
#   crunch_text_df          - one row per kept phrase from the article body
#   crunch_title_df         - one row per kept phrase from the article title
#   crunch_problematic_urls - links whose download/parse raised an exception
#
# NOTE(review): this repeats the Reddit cell above line for line; a shared
# helper function would remove the duplication.
crunch_problematic_urls = []

# Column names are assigned positionally to the (text, rank, count) output of
# analyze_text_textrank.
# NOTE(review): for the title frame this means 'article_keywords' holds the
# phrase text and 'title_keywords' holds the rank score — the names look
# misleading but are kept because the saved CSVs already use them.
cols_title_df = ['article_keywords', 'title_keywords', 'count']
cols_text_df = ['sentence', 'sentence_rank', 'count']

# Per-article frames are gathered in lists and concatenated once after the
# loop; concatenating inside the loop re-copies all earlier rows every time.
# Each list starts with an empty frame so the result is still a valid frame
# with the expected columns when no article could be processed.
crunch_text_frames = [pd.DataFrame([], columns=cols_text_df)]
crunch_title_frames = [pd.DataFrame([], columns=cols_title_df)]

for url in crunch_urls:
    try:
        text, title = get_article_text_and_title(url)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running loop.
        print("unable to extract url for ", url)
        crunch_problematic_urls.append(url)
        continue

    text_doc = nlp(text)
    title_doc = nlp(title)
    text_dict = analyze_text_textrank(text_doc, threshold_rank)
    title_dict = analyze_text_textrank(title_doc, threshold_rank)

    crunch_temp_text_df = pd.DataFrame.from_dict(text_dict)
    crunch_temp_text_df.columns = cols_text_df
    crunch_temp_text_df['article_link'] = url  # scalar is broadcast to every row
    crunch_text_frames.append(crunch_temp_text_df)

    crunch_temp_title_df = pd.DataFrame.from_dict(title_dict)
    crunch_temp_title_df.columns = cols_title_df
    crunch_temp_title_df['article_link'] = url
    crunch_title_frames.append(crunch_temp_title_df)

# The per-article index is kept (no ignore_index), matching the row labels
# the original incremental concatenation produced.
crunch_text_df = pd.concat(crunch_text_frames)
crunch_title_df = pd.concat(crunch_title_frames)

In [23]:
# Persist the TechCrunch results: two CSVs with the ranked phrases and a text
# file holding the string form of the list of links that could not be extracted.
crunch_titles_csv = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_titles_textrank.csv"
crunch_texts_csv = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_texts_textrank.csv"
crunch_problematic_txt = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\crunch_articles_analysis\crunch_problematic_articles.txt"

crunch_title_df.to_csv(crunch_titles_csv)
crunch_text_df.to_csv(crunch_texts_csv)

with open(crunch_problematic_txt, 'w') as out_file:
    out_file.write(str(crunch_problematic_urls))

In [25]:
# Display the combined TechCrunch body-text phrase table. The row labels
# repeat across articles because each article's own index was kept when the
# per-article frames were concatenated.
crunch_text_df

Unnamed: 0,sentence,sentence_rank,count,article_link
0,climate change,0.088581,2,https://techcrunch.com/2022/10/07/if-its-agtec...
1,biological crop protection solutions,0.081115,1,https://techcrunch.com/2022/10/07/if-its-agtec...
2,food security,0.081112,1,https://techcrunch.com/2022/10/07/if-its-agtec...
3,Indian food security,0.080866,1,https://techcrunch.com/2022/10/07/if-its-agtec...
4,food waste,0.078833,1,https://techcrunch.com/2022/10/07/if-its-agtec...
...,...,...,...,...
29,even different databases providers,0.056223,1,https://techcrunch.com/2016/07/19/legalist-is-...
30,10 different states,0.053295,1,https://techcrunch.com/2016/07/19/legalist-is-...
31,one main searchable database,0.051997,1,https://techcrunch.com/2016/07/19/legalist-is-...
32,"110,000 different lawyers",0.051709,1,https://techcrunch.com/2016/07/19/legalist-is-...
