In [122]:
import pandas as pd
from newspaper import Article
import spacy
import plotly.express as px
import numpy as np
import pytextrank

In [123]:
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x259d8a8cdf0>

In [124]:
def analyze_text_textrank(doc, rank_threshold):
    # examine the top-ranked phrases in the document
    rank_d = {"text":[], "rank":[], "count":[]}

    for phrase in doc._.phrases:
        if phrase.rank >= rank_threshold:
            rank_d['text'].append(phrase.text)
            rank_d['rank'].append(phrase.rank)
            rank_d['count'].append(phrase.count)
    
    return rank_d 

1. Run an Example

In [125]:
path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\bertopic\abtracts_bertopic_clusters.csv"

In [126]:
df  = pd.read_csv(path)

In [127]:
df.columns

Index(['Unnamed: 0', 'Topic', 'label', 'link', 'abstract', 'titles'], dtype='object')

In [128]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [129]:
df

Unnamed: 0,Topic,label,link,abstract,titles
0,-1,-1_new_said_study_people,http://jhr.uwpress.org/content/early/2022/10/0...,We study how ambient lead exposure impacts lea...,A Thousand Cuts: Cumulative Lead Exposure Redu...
1,-1,-1_new_said_study_people,https://www.eurekalert.org/news-releases/967553,"RIVERSIDE, Calif. -- Thirdhand smoke, or THS, ...",Thirdhand smoke can trigger skin diseases
2,-1,-1_new_said_study_people,https://www.eurekalert.org/news-releases/965575,Deep Longevity bridges the gap between the con...,Being lonely and unhappy accelerates aging mor...
3,-1,-1_new_said_study_people,https://www.eurekalert.org/news-releases/958880,By subjecting a quantum computer’s qubits to q...,Strange new phase of matter created in quantum...
4,-1,-1_new_said_study_people,https://www.engadget.com/zipline-drone-deliver...,Zipline has teamed up with a healthcare provid...,Zipline drones will deliver medicine to commun...
...,...,...,...,...,...
586,12,12_women_health_pill_cycles,https://techcrunch.com/2019/09/29/badass-mille...,"Across the political, social and economic stag...",Badass millennial women are supercharging star...
587,12,12_women_health_pill_cycles,https://techcrunch.com/2020/08/28/femtech-pois...,The market for female-focused health products ...,Femtech poised for growth beyond fertility
588,12,12_women_health_pill_cycles,https://techcrunch.com/2020/02/09/hannah-seal-...,Index Fund’s portfolio is driving long-overdue...,Index Fundâs portfolio is driving long-overd...
589,12,12_women_health_pill_cycles,https://techcrunch.com/2021/05/25/acuitymd-rai...,In a world defined by tons of noise and little...,AcuityMD raises $7M to better track the evolvi...


Titles

In [130]:
titles_list = df['titles'].to_list()

In [131]:
def plot_ranks(d):
    y_rank = d['rank']
    x_index = list(np.arange(len(y_rank)))
    fig = px.line(x=x_index, y=y_rank)
    fig.show()
    

In [132]:
titles_list

['A Thousand Cuts: Cumulative Lead Exposure Reduces Academic Achievement',
 'Thirdhand smoke can trigger skin diseases',
 'Being lonely and unhappy accelerates aging more than smoking',
 'Strange new phase of matter created in quantum computer acts like it has two time dimensions',
 'Zipline drones will deliver medicine to communities in Utah',
 'T-Mobile will start charging a $35 fee on all new activations and upgrades',
 'Scientists may have found an affordable way to destroy forever chemicals',
 'Global emissions targets spell growth for CO2 tech sector',
 'The end of Appleâ\x80\x99s affair with China',
 'Nvidia says falling GPU prices are â\x80\x98a story of the pastâ\x80\x99',
 'Columbia Study Finds Mass School Shootings Not Caused by Mental Illness',
 "NASA's Webb Space Telescope Is So Good, We Might Need Improved Planetary Models",
 "Chipotle Mexican Grill will test robotic tortilla chip maker 'Chippy' in California restaurant",
 "Feds commit $50 million to for-profit nuclear fu

In [135]:
rank_dict = analyze_text_textrank(doc = nlp(titles_list[3]), rank_threshold=0)

Elbow plot for rank scores

In [136]:
plot_ranks(rank_dict)

In [137]:
threshold_rank = 0.05

In [138]:
#ANALYZE TITLES WITH TEXTRANK

cols_title_df = ['article_keywords', 'title_keywords', 'count']
title_rank_words = []


for title in titles_list:
     title_doc = nlp(title)
     title_dict = analyze_text_textrank(title_doc, threshold_rank)
     title_ranked = " ".join(title_dict['text'])
     title_rank_words.append(title_ranked)


In [139]:
df['title_textrank'] = title_rank_words

In [140]:
df

Unnamed: 0,Topic,label,link,abstract,titles,title_textrank
0,-1,-1_new_said_study_people,http://jhr.uwpress.org/content/early/2022/10/0...,We study how ambient lead exposure impacts lea...,A Thousand Cuts: Cumulative Lead Exposure Redu...,Cumulative Lead Exposure Reduces Academic Achi...
1,-1,-1_new_said_study_people,https://www.eurekalert.org/news-releases/967553,"RIVERSIDE, Calif. -- Thirdhand smoke, or THS, ...",Thirdhand smoke can trigger skin diseases,skin diseases Thirdhand smoke Thirdhand
2,-1,-1_new_said_study_people,https://www.eurekalert.org/news-releases/965575,Deep Longevity bridges the gap between the con...,Being lonely and unhappy accelerates aging mor...,Being lonely and unhappy accelerates
3,-1,-1_new_said_study_people,https://www.eurekalert.org/news-releases/958880,By subjecting a quantum computer’s qubits to q...,Strange new phase of matter created in quantum...,quantum computer Strange new phase two time di...
4,-1,-1_new_said_study_people,https://www.engadget.com/zipline-drone-deliver...,Zipline has teamed up with a healthcare provid...,Zipline drones will deliver medicine to commun...,Utah communities medicine Zipline drones Zipline
...,...,...,...,...,...,...
586,12,12_women_health_pill_cycles,https://techcrunch.com/2019/09/29/badass-mille...,"Across the political, social and economic stag...",Badass millennial women are supercharging star...,startup investments Badass millennial women
587,12,12_women_health_pill_cycles,https://techcrunch.com/2020/08/28/femtech-pois...,The market for female-focused health products ...,Femtech poised for growth beyond fertility,fertility growth Femtech
588,12,12_women_health_pill_cycles,https://techcrunch.com/2020/02/09/hannah-seal-...,Index Fund’s portfolio is driving long-overdue...,Index Fundâs portfolio is driving long-overd...,femcare Index Fundâs portfolio Fundâs long...
589,12,12_women_health_pill_cycles,https://techcrunch.com/2021/05/25/acuitymd-rai...,In a world defined by tons of noise and little...,AcuityMD raises $7M to better track the evolvi...,medical hardware the evolving world AcuityMD


Abstracts

In [141]:
abstracts_list = df['abstract'].to_list()

In [142]:
rank_dict = analyze_text_textrank(nlp(abstracts_list[3]), 0.07)

In [143]:
plot_ranks(rank_dict)

In [144]:
rank_dict['text'][:5]

['quantum computers',
 'quantum information',
 'study lead author Philipp Dumitrescu',
 'laser pulses',
 'quantum mechanics']

In [145]:
#ANALYZE ABSTRACTS WITH TEXTRANK
threshold_rank = 0.05

abstract_rank_words = []


for abstract in abstracts_list:
     abstract_doc = nlp(abstract)
     abstract_dict = analyze_text_textrank(abstract_doc, threshold_rank)
     abstract_ranked = " ".join(abstract_dict['text'][:5])
     abstract_rank_words.append(abstract_ranked)


In [146]:
abstract_rank_words

['academic performance exposure unleaded fuel present value terms class size',
 'THS exposure THS exposures acute THS exposure skin diseases inflammatory skin disease',
 'psychological aging vulnerable mental health health conditions health outcomes aging',
 'quantum computers quantum information study lead author Philipp Dumitrescu laser pulses quantum mechanics',
 'drone deliveries commercial drone deliveries Zipline deliveries Salt Lake Valley drones',
 'constant activation charges new activations Device Connection Charge customers all new postpaid activations',
 'PFAS molecules non-stick Teflon pans dental floss breaking news alerts PFAS',
 'carbon capture firms carbon emissions carbon technology firms carbon tech carbon capture',
 'Chinese firms Chinese companies China Chinese manufacturers Chinese consumers',
 'high GPU prices price points GPU prices price RTX',
 'mass school shootings mass school shootings risks future mass school shootings mass shootings mass murder',
 'JWST li

In [147]:
df['abstract_textrank'] = abstract_rank_words

In [148]:
df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\final_results\bertopic\article_bertopic_titles_textrank_final.csv")