In [1]:
import pandas as pd

In [2]:
df = pd.read_json(r"News_Category_Dataset_v3.json", lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
df.shape

(209527, 6)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [5]:
(df.isnull().sum()/len(df))*100

link                 0.0
headline             0.0
category             0.0
short_description    0.0
authors              0.0
date                 0.0
dtype: float64

In [6]:
df.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

We will apply masking on Headline, Short Description, and Authors

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [8]:
def mask(text):
    doc = nlp(text)
    masked = []
    for token in doc:
        if token.ent_type_:
            masked.append(f"[{token.ent_type_}]")
        else:
            masked.append(token.text)
    return " ".join(masked)

In [9]:
df_sample = df.sample(10, random_state=42)

In [10]:
try:
    df_sample["masked_authors"] = df_sample["authors"].apply(mask)
    print("Done with masking authors")
except Exception as e:
    print(e)

Done with masking authors


In [11]:
df_sample[["masked_authors", "authors"]].head()

Unnamed: 0,masked_authors,authors
128310,"[PERSON] [PERSON] , [ORG] , [NORP] / Chief Cur...","Matt Murrie, ContributorEdupreneur, Cofounder/..."
139983,,
42339,"[ORG] [ORG] [ORG] [ORG] [ORG] , [ORG] [ORG] [ORG]","Michael McLaughlin & Josh Morgan, The Huffingt..."
131494,,
163649,[PERSON] [PERSON],Melissa Cronin


In [12]:
try:
    df_sample["masked_headline"] = df_sample["headline"].apply(mask)
    print("Done with masking headline")
except Exception as e:
    print(e)

Done with masking headline


In [13]:
df_sample[["masked_headline", "headline"]].head()

Unnamed: 0,masked_headline,headline
128310,What If We Were All Family Generation Changers ?,What If We Were All Family Generation Changers?
139983,Firestorm At [ORG] [ORG] [ORG] [ORG] [ORG],Firestorm At AOL Over Employee Benefit Cuts
42339,[ORG] [ORG] [ORG] [ORG] As Deadline Passes To ...,Dakota Access Protesters Arrested As Deadline ...
131494,[CARDINAL] Glimpse Of These Baby Kit Foxes And...,One Glimpse Of These Baby Kit Foxes And You'll...
163649,[ORG] [ORG] [ORG] [ORG] [ORG] [ORG] [ORG] [ORG...,"Mens' Sweat Pheromone, Androstadienone, Influe..."


In [14]:
try:
    df_sample["masked_description"] = df_sample["short_description"].apply(mask)
    print("Done with masking description")
except Exception as e:
    print(e)

Done with masking description


In [15]:
df_sample[["masked_description", "short_description"]].head()

Unnamed: 0,masked_description,short_description
128310,"What if , in doing so , we wo n't just create ...","What if, in doing so, we won't just create new..."
139983,It should have been [DATE] [DATE] [DATE] for [...,It should have been a glorious week for AOL ch...
42339,A few protesters who refused to leave remained...,A few protesters who refused to leave remained...
131494,,
163649,Scientists did n't know if humans played that ...,Scientists didn't know if humans played that g...


In [16]:
df_sample.head()

Unnamed: 0,link,headline,category,short_description,authors,date,masked_authors,masked_headline,masked_description
128310,https://www.huffingtonpost.com/entry/what-if-w...,What If We Were All Family Generation Changers?,IMPACT,"What if, in doing so, we won't just create new...","Matt Murrie, ContributorEdupreneur, Cofounder/...",2014-06-20,"[PERSON] [PERSON] , [ORG] , [NORP] / Chief Cur...",What If We Were All Family Generation Changers ?,"What if , in doing so , we wo n't just create ..."
139983,https://www.huffingtonpost.comhttp://www.washi...,Firestorm At AOL Over Employee Benefit Cuts,BUSINESS,It should have been a glorious week for AOL ch...,,2014-02-08,,Firestorm At [ORG] [ORG] [ORG] [ORG] [ORG],It should have been [DATE] [DATE] [DATE] for [...
42339,https://www.huffingtonpost.com/entry/time-runs...,Dakota Access Protesters Arrested As Deadline ...,POLITICS,A few protesters who refused to leave remained...,"Michael McLaughlin & Josh Morgan, The Huffingt...",2017-02-22,"[ORG] [ORG] [ORG] [ORG] [ORG] , [ORG] [ORG] [ORG]",[ORG] [ORG] [ORG] [ORG] As Deadline Passes To ...,A few protesters who refused to leave remained...
131494,https://www.huffingtonpost.com/entry/one-glimp...,One Glimpse Of These Baby Kit Foxes And You'll...,GREEN,,,2014-05-14,,[CARDINAL] Glimpse Of These Baby Kit Foxes And...,
163649,https://www.huffingtonpost.com/entry/mens-swea...,"Mens' Sweat Pheromone, Androstadienone, Influe...",SCIENCE,Scientists didn't know if humans played that g...,Melissa Cronin,2013-06-02,[PERSON] [PERSON],[ORG] [ORG] [ORG] [ORG] [ORG] [ORG] [ORG] [ORG...,Scientists did n't know if humans played that ...


In [19]:
def display_comparison(index):
    print("🔹 ORIGINAL TEXT:\n")
    print(f"📰 Headline         : {df_sample.loc[index, 'headline']}")
    print(f"📄 Description      : {df_sample.loc[index, 'short_description']}")
    print(f"✍️  Authors          : {df_sample.loc[index, 'authors']}")
    
    print("\n🔒 MASKED TEXT:\n")
    print(f"📰 Headline         : {df_sample.loc[index, 'masked_headline']}")
    print(f"📄 Description      : {df_sample.loc[index, 'masked_description']}")
    print(f"✍️  Authors          : {df_sample.loc[index, 'masked_authors']}")


In [22]:
display_comparison(139983)

🔹 ORIGINAL TEXT:

📰 Headline         : Firestorm At AOL Over Employee Benefit Cuts
📄 Description      : It should have been a glorious week for AOL chief executive Tim Armstrong. His company’s quarterly earnings, announced Thursday
✍️  Authors          : 

🔒 MASKED TEXT:

📰 Headline         : Firestorm At [ORG] [ORG] [ORG] [ORG] [ORG]
📄 Description      : It should have been [DATE] [DATE] [DATE] for [ORG] chief executive [PERSON] [PERSON] . His company ’s [DATE] earnings , announced [DATE]
✍️  Authors          : 
