In [1]:
import spacy

In [2]:
#all stop words in english
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
len(STOP_WORDS)

326

In [4]:
type(STOP_WORDS) #set objects can't be indexed; you can't do STOP_WORDS[0]

set

In [5]:
# Load the "en_core_web_sm" spaCy pipeline once; the resulting `nlp` callable is
# what the later cells (and `preprocess`) use to turn raw text into tokens.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Run the pipeline on a sample sentence, then list every token next to its
# stop-word flag and its part-of-speech tag.
doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
    columns = [token.text, " | ", token.is_stop, " | ", token.pos_]
    print(*columns)

We  |  True  |  PRON
just  |  True  |  ADV
opened  |  False  |  VERB
our  |  True  |  PRON
wings  |  False  |  NOUN
,  |  False  |  PUNCT
the  |  True  |  DET
flying  |  False  |  VERB
part  |  True  |  NOUN
is  |  True  |  AUX
coming  |  False  |  VERB
soon  |  False  |  ADV


In [7]:
# Show only the tokens flagged as stop words; everything else is skipped.
for token in doc:
    if not token.is_stop:
        continue
    print(token.text)

We
just
our
the
part
is


In [8]:
#function for preprocessing (typically used to remove stop words, perform stemming, lemmatization, etc.)
def preprocess(text):
    """Tokenize ``text`` and return the tokens that are worth keeping.

    The text is run through the module-level ``nlp`` pipeline. Tokens flagged
    as stop words or as punctuation are discarded.

    Returns:
        list: the text of each remaining token, in document order.
    """
    def is_content(tok):
        # Equivalent to "not a stop word and not punctuation".
        return not (tok.is_stop or tok.is_punct)

    return [tok.text for tok in filter(is_content, nlp(text))]

In [9]:
preprocess("We just opened our wings, the flying part is coming soon")

['opened', 'wings', 'flying', 'coming', 'soon']

In [10]:
preprocess("The other is not other but your divine brother")

['divine', 'brother']

In [11]:
preprocess("Musk wants time to prepare for a trial over his")

['Musk', 'wants', 'time', 'prepare', 'trial']

In [12]:
#how to apply this stop-word removal preprocessing to a pandas DataFrame
import pandas as pd

In [13]:
# lines=True tells pandas the file holds one JSON object per line.
# NOTE(review): this is an absolute, machine-specific path — consider reading it
# from a configurable data directory so the notebook runs elsewhere.
df = pd.read_json(path_or_buf="/Users/raghavraahul/Downloads/combined.json", lines=True)

In [14]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [15]:
df.shape

(13087, 6)

In [16]:
type(df.topics)

pandas.core.series.Series

In [17]:
type(df.topics[4])

list

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13087 entries, 0 to 13086
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12810 non-null  object
 1   title       13087 non-null  object
 2   contents    13087 non-null  object
 3   date        13087 non-null  object
 4   topics      13087 non-null  object
 5   components  13087 non-null  object
dtypes: object(6)
memory usage: 613.6+ KB


In [19]:
df.describe()

Unnamed: 0,id,title,contents,date,topics,components
count,12810,13087,13087,13087,13087,13087
unique,12672,12887,13080,2400,253,810
top,13-526,Northern California Real Estate Investor Agree...,"WASHINGTON – ING Bank N.V., a financial inst...",2018-04-13T00:00:00-04:00,[],[Criminal Division]
freq,3,8,2,20,8399,2680


In [20]:
# Drop every row whose `topics` entry has length zero, then preview the result.
df = df[df["topics"].str.len().ne(0)]
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [21]:
df.shape

(4688, 6)

In [22]:
#preprocessing the contents column: first check how long one raw entry is
#(.iloc[4] is positional — the fifth row, whose index label is 23, not 4)
len(df["contents"].iloc[4])

5504

In [23]:
def preprocess(text):
    """Remove stop words and punctuation from ``text`` and return one string.

    The text is tokenized with the module-level ``nlp`` pipeline. Tokens that
    are stop words or punctuation are skipped; the text of every other token
    is kept in its original order.

    Returns:
        str: the kept token texts joined by single spaces (a string, not a list).
    """
    kept_tokens = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_tokens.append(tok.text)
    return " ".join(kept_tokens)



In [24]:
type(df["contents"].iloc[0])

str

In [25]:
# Work on the first 100 rows only, so the spaCy pass over `contents` stays quick.
# The explicit .copy() detaches this sample from the filtered frame it was taken
# from; without it, adding a column to the slice is the pattern that triggers
# pandas' SettingWithCopyWarning.
df = df.head(100).copy()

# Store the stop-word-free, punctuation-free text next to the original column.
df["contents_new"] = df["contents"].apply(preprocess)
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...


In [28]:
len(df["contents"].iloc[4])

5504

In [29]:
len(df["contents_new"].iloc[4])

4217

In [30]:
df["contents"].iloc[4][:300]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that the'

In [31]:
df["contents_new"].iloc[4][:300]

'21st Century Oncology Inc. certain subsidiaries affiliates agreed pay $ 26 million government resolve self disclosure relating submission false attestations company use electronic health records software separate allegations violated False Claims Act submitting causing submission claims certain serv'