In [14]:
import pandas as pd

# Remove pandas' display truncation: show every column, every row, and the
# full text of each cell when a frame is rendered.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

from tqdm import tqdm
# Register tqdm with pandas so that .progress_apply() (used below) is available
tqdm.pandas()

In [15]:
# lines=True: the file is read as one JSON object per line
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [16]:
# Preview the first three rows to see which columns were loaded
df.head(3)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters,U.S. NEWS,Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe,"American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video",U.S. NEWS,"He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.",Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02,23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23),COMEDY,"""Until you have a dog you don't understand what could be eaten.""",Elyse Wanshel,2022-09-23


In [17]:
# Keep only the two text columns. .copy() makes this an independent frame so
# the column assigned to it afterwards does not raise SettingWithCopyWarning.
df = df[['headline', 'short_description']].copy()

In [18]:
# Build a single text field per row: "<headline>. <short_description>"
df['content'] = df['headline'].str.cat(df['short_description'], sep='. ')

In [19]:
# Check that the merged `content` column looks as expected
df.head(2)

Unnamed: 0,headline,short_description,content
0,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters,Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters. Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
1,"American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video","He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.","American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video. He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles."


In [20]:
# take only the first 50 rows; .copy() so the keyphrase columns added later
# are written to an independent frame rather than a slice of the full one
df = df[:50].copy()

### Keyphrase extraction without POS pattern

In [21]:
from keybert import KeyBERT
# KeyBERT() with no arguments uses the library's default embedding model
model = KeyBERT()


def get_keyphrase_bert(text):
    """Return up to five KeyBERT keyphrases for ``text``, without their scores.

    Candidates are word n-grams of length one to three with English stop
    words removed. Uses the module-level ``model`` KeyBERT instance.
    """
    scored_phrases = model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        top_n=5,
    )
    # Each entry is indexed as (phrase, score); keep only the phrase.
    return [entry[0] for entry in scored_phrases]

In [22]:
# Run the n-gram keyphrase extractor on every row of the content column,
# with a tqdm progress bar (progress_apply)
df['keyphrase_without_pos'] = df['content'].progress_apply(get_keyphrase_bert)

100%|██████████| 50/50 [00:10<00:00,  4.68it/s]


In [23]:
# Inspect the extracted keyphrases next to the text they came from
df[['content', 'keyphrase_without_pos']].head(3)

Unnamed: 0,content,keyphrase_without_pos
0,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters. Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.,"[omicron targeted covid, targeted covid boosters, sleeves omicron targeted, covid boosters, covid boosters health]"
1,"American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video. He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.","[punching flight attendant, flyer charged banned, airlines flyer charged, punching flight, american airlines flyer]"
2,"23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23). ""Until you have a dog you don't understand what could be eaten.""","[tweets cats dogs, funniest tweets cats, tweets cats, 23 funniest tweets, cats dogs]"


### Keyphrase extraction using POS pattern

In [24]:
from keyphrase_vectorizers import KeyphraseCountVectorizer

from keybert import KeyBERT

# NOTE(review): this is a second KeyBERT instance, separate from `model`
# created in the earlier cell; both are built with default arguments, so
# they could presumably be shared - confirm before merging them.
kw_model = KeyBERT()

def get_keyPhrases_POS_BERT(text):
    """Return up to five KeyBERT keyphrases for ``text``, without their scores.

    Candidate phrases are produced by a ``KeyphraseCountVectorizer`` (built
    with its defaults on every call) instead of a fixed n-gram range. Uses
    the module-level ``kw_model`` KeyBERT instance.
    """
    scored_phrases = kw_model.extract_keywords(
        docs=text,
        vectorizer=KeyphraseCountVectorizer(),
        top_n=5,
    )
    # Iterate the results directly rather than by index; keep only the phrase
    # from each (phrase, score) entry.
    return [entry[0] for entry in scored_phrases]

In [25]:
# Run the POS-pattern keyphrase extractor on every row of the content column
df['keyphrase_with_pos'] = df['content'].progress_apply(get_keyPhrases_POS_BERT)

100%|██████████| 50/50 [00:26<00:00,  1.91it/s]


In [27]:
# Side-by-side comparison of the two extraction approaches on the same text
df[['content', 'keyphrase_without_pos', 'keyphrase_with_pos']].head(5)

Unnamed: 0,content,keyphrase_without_pos,keyphrase_with_pos
0,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters. Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.,"[omicron targeted covid, targeted covid boosters, sleeves omicron targeted, covid boosters, covid boosters health]","[targeted covid boosters, new boosters, omicron, sleeves, doses]"
1,"American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video. He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.","[punching flight attendant, flyer charged banned, airlines flyer charged, punching flight, american airlines flyer]","[punching flight attendant, american airlines flyer charged, aircraft, passengers, confrontation]"
2,"23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23). ""Until you have a dog you don't understand what could be eaten.""","[tweets cats dogs, funniest tweets cats, tweets cats, 23 funniest tweets, cats dogs]","[funniest tweets, cats, dogs, dog, week]"
3,"The Funniest Tweets From Parents This Week (Sept. 17-23). ""Accidentally put grown-up toothpaste on my toddler’s toothbrush and he screamed like I was cleaning his teeth with a Carolina Reaper dipped in Tabasco sauce.""","[funniest tweets parents, toddler toothbrush screamed, toothpaste toddler, toothbrush screamed, grown toothpaste toddler]","[funniest tweets, tabasco sauce, toothpaste, teeth, parents]"
4,Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer. Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.,"[watcher loses lawsuit, cops black bird, black bird watcher, amy cooper accused, bird watcher loses]","[amy cooper, black bird, lawsuit, watcher, woman]"
