In [1]:
import pandas as pd
from newspaper import Article
import spacy
import plotly.express as px
import numpy as np
import pytextrank

In [2]:
# Load a spaCy model; which one to use depends on language, scale, etc.
# "en_core_web_sm" must already be installed in the kernel's environment.
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank to the spaCy pipeline. analyze_text_textrank() reads
# `doc._.phrases`, which is presumably populated by this component -- confirm
# against the PyTextRank docs. Being the cell's last expression, the returned
# component object is echoed as the cell output.
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x25ef5f209a0>

In [3]:
# Sample article URL used by the single-article example cells below.
url = "https://www.tomshardware.com/news/lithograhy-tool-russia-7nm-2028"


In [4]:
def get_article_text_and_title(url):
    """Download and parse the article at `url`.

    Args:
        url: address of the article, passed straight to newspaper's `Article`.

    Returns:
        A `(text, title)` tuple taken from the parsed article.
    """
    page = Article(url)
    # The page is fetched first and parsed second; text and title are read
    # only after both steps have run.
    page.download()
    page.parse()
    return page.text, page.title

In [5]:
def analyze_text_textrank(doc, rank_threshold=0.0):
    """Collect the phrases of `doc` whose rank reaches `rank_threshold`.

    Args:
        doc: object exposing `doc._.phrases`, an iterable of phrases that
            each carry `text`, `rank` and `count` attributes.
        rank_threshold: minimum rank (inclusive) a phrase needs to be kept.

    Returns:
        A dict with the keys "text", "rank" and "count", each mapping to a
        list; the lists are aligned and keep the order of `doc._.phrases`.
    """
    # Filter once, then project each attribute into its own column list.
    kept = [p for p in doc._.phrases if p.rank >= rank_threshold]
    return {
        "text": [p.text for p in kept],
        "rank": [p.rank for p in kept],
        "count": [p.count for p in kept],
    }

1. Run an Example

In [6]:
def plot_ranks(d):
    """Show a line plot of phrase rank scores (the "elbow plot").

    Args:
        d: mapping with a 'rank' entry holding a sequence of rank scores,
            e.g. the dict returned by analyze_text_textrank(). Scores are
            plotted in the order given, against their position in the list.

    Returns:
        None. The figure is rendered with `fig.show()`.
    """
    y_rank = d['rank']
    # Plain integer positions are enough for the x axis.
    x_index = list(range(len(y_rank)))
    fig = px.line(
        x=x_index,
        y=y_rank,
        title="TextRank phrase scores",
        # When arrays are passed directly, plotly express names the axes
        # "x" and "y"; relabel them so the figure is readable on its own.
        labels={"x": "phrase index", "y": "rank score"},
    )
    fig.show()
    

In [7]:
# Download and parse the sample article referenced by `url`.
text, title = get_article_text_and_title(url)

In [8]:
# No threshold is passed, so the default rank_threshold=0.0 applies.
rank_dict = analyze_text_textrank(doc = nlp(text))

Elbow plot for rank scores

In [9]:
# Plot the rank scores of the sample article's phrases.
plot_ranks(rank_dict)

In [10]:
# Minimum phrase rank used when processing the article lists further down.
# NOTE(review): presumably read off the elbow plot above -- confirm.
threshold_rank = 0.05

In [11]:
# Locations of the scraped CSV files (reddit posts and TechCrunch titles).
# NOTE(review): these are absolute paths on one user's machine, so this cell
# fails anywhere else; consider a configurable data directory.
path_reddit = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\Scrapers\reddit_RCS\first_iteration\final_results\after_cleaning\posts_after_filtering_with_titles.csv"
path_techcrunch = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\Scrapers\techcrunch_RCS\scrapped_titles\techcrunch_scrapped_article_titles.csv"

# Read both files into DataFrames with pandas' default CSV settings.
df_reddit = pd.read_csv(path_reddit)
df_techcrunch = pd.read_csv(path_techcrunch)

In [12]:
# Pull the link column of each frame into a plain Python list.
reddit_urls = df_reddit['discussion_links'].to_list()
crunch_urls = df_techcrunch['link'].to_list()

In [30]:
# Rank the phrases of every reddit-linked article (body and title) and stack
# the per-article results into `text_df` and `title_df`.

# NOTE(review): the column lists below are applied positionally to the
# "text", "rank", "count" columns produced by analyze_text_textrank(), so for
# titles the phrase text lands in 'article_keywords' and the rank score in
# 'title_keywords'. The names are kept as-is because later cells may rely on
# them -- confirm whether they should be renamed.
cols_title_df = ['article_keywords', 'title_keywords', 'count']
title_df = pd.DataFrame([], columns=cols_title_df)

cols_text_df = ['sentence', 'sentence_rank', 'count']
text_df = pd.DataFrame([], columns=cols_text_df)

df_reddit_ranktext_res = pd.DataFrame()

# Collect the per-article frames and concatenate once at the end instead of
# re-copying the growing result on every iteration. The empty seed frames stay
# first so the column order matches and an empty URL list still works.
text_frames = [text_df]
title_frames = [title_df]

# NOTE(review): the loop reuses the names `url`, `text` and `title`, so it
# overwrites the values set by the single-article example cells above.
try:
    for url in reddit_urls:
        text, title = get_article_text_and_title(url)
        text_doc = nlp(text)
        title_doc = nlp(title)
        text_dict = analyze_text_textrank(text_doc, threshold_rank)
        title_dict = analyze_text_textrank(title_doc, threshold_rank)

        temp_text_df = pd.DataFrame.from_dict(text_dict)
        temp_text_df.columns = cols_text_df
        # A scalar is broadcast to every row of the frame.
        temp_text_df['article_link'] = url
        text_frames.append(temp_text_df)

        temp_title_df = pd.DataFrame.from_dict(title_dict)
        temp_title_df.columns = cols_title_df
        temp_title_df['article_link'] = url
        title_frames.append(temp_title_df)
finally:
    # Runs even if an article fails or the cell is interrupted, so the
    # articles processed so far are still available in text_df / title_df.
    # The per-article index (0..k) is kept, as no ignore_index is requested.
    text_df = pd.concat(text_frames)
    title_df = pd.concat(title_frames)

In [32]:
# Inspect the phrase table of the last article processed by the loop above.
temp_text_df


Unnamed: 0,sentence,sentence_rank,count,article_link
0,outlook dot com,0.170044,1,https://spectrum.ieee.org/4d-printing-microscale
1,3E69 C4A7 EC91 611B 5C12,0.151674,1,https://spectrum.ieee.org/4d-printing-microscale
2,edd dot gent,0.15004,1,https://spectrum.ieee.org/4d-printing-microscale
3,611B 5C12,0.143464,1,https://spectrum.ieee.org/4d-printing-microscale
4,India,0.12094,2,https://spectrum.ieee.org/4d-printing-microscale
5,bioscience,0.118011,1,https://spectrum.ieee.org/4d-printing-microscale
6,emerging technologies,0.107429,1,https://spectrum.ieee.org/4d-printing-microscale
7,Signal info,0.095036,1,https://spectrum.ieee.org/4d-printing-microscale
8,Bangalore,0.092654,2,https://spectrum.ieee.org/4d-printing-microscale
9,energy,0.090589,1,https://spectrum.ieee.org/4d-printing-microscale
