In [98]:
import pandas as pd
import codecs
import os
import numpy as np
import pytextrank
import spacy
from bs4 import BeautifulSoup


In [99]:
# Load a spaCy model (which one depends on language, scale, etc.); every
# title and article body further down is parsed through this `nlp` pipeline.
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank to the spaCy pipeline so that parsed documents carry the
# ranked phrases read later through `doc._.phrases`.
# NOTE(review): the "textrank" factory is presumably registered by the
# `import pytextrank` in the imports cell — confirm before removing that import.
# As the last expression of the cell, the returned component is displayed.
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x270f8410d90>

In [101]:
def get_html_return_soup(html_path, parser=None):
    """Read a UTF-8 encoded HTML file and parse it with BeautifulSoup.

    Parameters
    ----------
    html_path : str or path-like
        Location of the HTML file on disk.
    parser : str, optional
        Parser name forwarded to ``BeautifulSoup`` as its second argument
        (for example ``"html.parser"``). When omitted, ``BeautifulSoup`` is
        called with the markup only, exactly as this function always did.

    Returns
    -------
    BeautifulSoup
        The parsed document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises; the
    # handle used to be left open for the lifetime of the kernel.
    with codecs.open(html_path, 'r', encoding="utf8") as f:
        html = f.read()

    if parser is None:
        return BeautifulSoup(html)
    return BeautifulSoup(html, parser)

In [102]:
def analyze_text_textrank(doc, rank_threshold=0.0):
    """Collect the ranked phrases of ``doc`` that reach ``rank_threshold``.

    Parameters
    ----------
    doc : object exposing ``doc._.phrases``
        Each phrase must provide ``text``, ``rank`` and ``count`` attributes.
    rank_threshold : float, default 0.0
        Phrases whose ``rank`` is below this value are left out.

    Returns
    -------
    dict
        Keys ``"text"``, ``"rank"`` and ``"count"``, each mapping to a list;
        the three lists are parallel and keep the order of ``doc._.phrases``.
    """
    # Filter once, then project every attribute out of the surviving phrases.
    kept = [candidate for candidate in doc._.phrases if candidate.rank >= rank_threshold]

    return {
        "text": [candidate.text for candidate in kept],
        "rank": [candidate.rank for candidate in kept],
        "count": [candidate.count for candidate in kept],
    }

In [115]:
# Folder that holds the saved article pages.
path = r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis"

# Full path of each saved article, built from its file name inside `path`.
html1_path, html2_path, html3_path = (
    os.path.join(path, file_name)
    for file_name in (
        "Harvard Engineers Invent a Solid-State Battery That Never Dies, It's a Game Changer.html",
        "In situ resource utilization of lunar soil for highly efficient extraterrestrial fuel and oxygen supply _ National Science Review _ Oxford Academic.html",
        "3.htm",
    )
)

In [116]:
# Parse the three saved pages, in the same order as their paths.
soup1, soup2, soup3 = map(
    get_html_return_soup,
    (html1_path, html2_path, html3_path),
)

In [117]:
# Article 1: the headline is the first <h1>; the body is the element tagged
# itemprop="articleBody".
title1 = soup1.find("h1").text
body1 = soup1.find("div", attrs={"itemprop": "articleBody"})
# Concatenate the text of each direct child of the body, trimming newlines
# first and then tabs from both ends of every piece.
text1 = "".join(child.text.strip('\n').strip('\t') for child in body1)

In [118]:
# Article 2: the headline is the first <h1>, with carriage returns, newlines
# and spaces trimmed from both ends (after that trim no newline can remain
# at either end, so one strip call is enough).
title2 = soup2.find("h1").text.strip('\r\n ')
# Only the abstract section is used as the article text.
body2 = soup2.find("section", attrs={"class": "abstract"})
text2 = "".join(child.text for child in body2)

In [119]:
# Article 3: the title is read from the `content` attribute of the <meta>
# tag at index 15 (the 16th <meta> tag in the page).
# NOTE(review): this positional index is tied to the exact saved file; a page
# with a different number/order of <meta> tags raises IndexError/KeyError or
# silently picks another tag. Presumably this is a title meta tag — confirm,
# and consider selecting it by attribute instead of position.
title3 = soup3.find_all("meta")[15]['content']

In [126]:
# Article 3: the text is read from the `content` attribute of the <meta>
# tag at index 16 (the 17th <meta> tag in the page).
# NOTE(review): presumably this is a description meta tag, i.e. a summary
# rather than the full article body — confirm. The positional index is tied
# to the exact saved file and breaks if the page's <meta> tags change.
text3 = soup3.find_all("meta")[16]['content']

In [11]:
# Minimum rank a phrase must reach to be kept; it is handed to
# analyze_text_textrank as `rank_threshold` in the extraction loop.
# NOTE(review): the extraction cell assigns the same value again, so this
# cell looks redundant — confirm before deleting it.
threshold_rank = 0.05

In [135]:
# Parallel lists: position i of every list describes the same article.
all_titles = [title1, title2, title3]
all_texts = [text1, text2, text3]
all_urls = [
    "https://www.autoevolution.com/news/harvard-engineers-invent-a-solid-state-battery-that-never-dies-it-s-a-game-changer-198518.html",
    "https://academic.oup.com/nsr/advance-article/doi/10.1093/nsr/nwac200/6712344?login=false",
    "https://www.fiercebiotech.com/research/ut-austin-scientists-design-safer-cas9-improved-crispr-gene-editing-accuracy",
]

In [137]:
# Phrases ranked below this value are dropped by analyze_text_textrank.
threshold_rank = 0.05

# Output labels, given in the order (phrase text, phrase rank, phrase count).
# NOTE(review): for titles this puts the phrase text under 'article_keywords'
# and the rank under 'title_keywords', which looks like a mislabel. The names
# are kept as they were so the exported CSV schema does not change.
cols_title_df = ['article_keywords', 'title_keywords', 'count']
cols_text_df = ['sentence', 'sentence_rank', 'count']


def _rank_dict_to_frame(rank_dict, columns, url):
    """Build one article's DataFrame from an analyze_text_textrank result.

    The 'text', 'rank' and 'count' lists are mapped explicitly onto the three
    labels in `columns` (instead of relying on dict key order), and every row
    is tagged with the article's URL in an 'article_link' column.
    """
    frame = pd.DataFrame(
        {label: rank_dict[key] for key, label in zip(("text", "rank", "count"), columns)}
    )
    frame['article_link'] = url
    return frame


def _stack_frames(frames, columns):
    """Concatenate per-article frames once, keeping each frame's own index.

    Empty frames are skipped so they cannot influence the result dtypes. When
    nothing is left, an empty frame with the expected columns is returned
    ('article_link' included only if at least one article was processed).
    """
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        return pd.concat(non_empty)
    extra = ['article_link'] if frames else []
    return pd.DataFrame([], columns=list(columns) + extra)


# Collect one frame per article and concatenate once after the loop; growing
# a DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
text_frames = []
title_frames = []

for url, title, text in zip(all_urls, all_titles, all_texts):
    text_doc = nlp(text)
    title_doc = nlp(title)
    text_dict = analyze_text_textrank(text_doc, threshold_rank)
    title_dict = analyze_text_textrank(title_doc, threshold_rank)

    temp_text_df = _rank_dict_to_frame(text_dict, cols_text_df, url)
    text_frames.append(temp_text_df)

    temp_title_df = _rank_dict_to_frame(title_dict, cols_title_df, url)
    title_frames.append(temp_title_df)

text_df = _stack_frames(text_frames, cols_text_df)
title_df = _stack_frames(title_frames, cols_title_df)

In [139]:
# Export the title-level and article-level phrase tables as CSV files.
# With to_csv's defaults the DataFrame index is written as the first column.
# NOTE(review): both targets are absolute paths on one machine and repeat the
# articles folder defined earlier in the notebook; the `resolved_prob_urls`
# folder presumably has to exist already — confirm.
title_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\resolved_prob_urls\titles2.csv")
text_df.to_csv(r"C:\Users\Ravit\Documents\rnd\horizon_scanning_lab\articles\analyze_Articles\reddit_articles_analysis\resolved_prob_urls\texts2.csv")
