## Stop Words

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
print("Length of stop words", len(STOP_WORDS))
STOP_WORDS

Length of stop words 326


{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

We would like to remove these stop words in preprocessing stage to bring in more efficieny in computation.

In [5]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [34]:
def preprocess(text):
    doc = nlp(text)
    
    no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
    
    return " ".join(no_stop_words)
        

In [35]:
preprocess("The other is not other but your divine brother.")

'divine brother'

### Handling a dataset from kaggle

In [12]:
import pandas as pd

df = pd.read_json("doj_press.json", lines=True)

df.shape

(13087, 6)

In [14]:
df.head(6)

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
5,,105 Individuals Indicted for Violating the RIC...,A nine count federal indictment was unsealed t...,2015-07-22T00:00:00-04:00,[],[USAO - Puerto Rico]


In [16]:
type(df.topics[1])

list

In [17]:
df.describe()

Unnamed: 0,id,title,contents,date,topics,components
count,12810,13087,13087,13087,13087,13087
unique,12672,12887,13080,2400,253,810
top,13-526,Northern California Real Estate Investor Agree...,"WASHINGTON – ING Bank N.V., a financial inst...",2018-04-13T00:00:00-04:00,[],[Criminal Division]
freq,3,8,2,20,8399,2680


In [18]:
# filter out all the rows not having topics

df = df[df['topics'].str.len() != 0]

df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [19]:
df.shape

(4688, 6)

In [20]:
df = df.head(100)

In [21]:
df.shape

(100, 6)

In [25]:
df["contents"]

4      The U.S. Department of Justice, the U.S. Envir...
7      A 131-count criminal indictment was unsealed t...
19     The United States Attorney’s Office for the Mi...
22     21st Century Oncology LLC, has agreed to pay $...
23     21st Century Oncology Inc. and certain of its ...
                             ...                        
316    Doctor Hid Millions in Secret Accounts in Pana...
318    Defendant Concealed Bank Accounts in Panama an...
321    An Alaskan couple was charged in federal court...
322    A husband and wife pleaded guilty yesterday to...
324    A resident of Big Lake, Alaska was indicted on...
Name: contents, Length: 100, dtype: object

In [31]:
df.contents.iloc[0]

"The U.S. Department of Justice, the U.S. Environmental Protection Agency (EPA), and the Rhode Island Department of Environmental Management (RIDEM) announced today that two subsidiaries of Stanley Black & Decker Inc.—Emhart Industries Inc. and Black & Decker Inc.—have agreed to clean up dioxin contaminated sediment and soil at the Centredale Manor Restoration Project Superfund Site in North Providence and Johnston, Rhode Island.\xa0 “We are pleased to reach a resolution through collaborative work with the responsible parties, EPA, and other stakeholders,” said\xa0Acting Assistant Attorney General Jeffrey H. Wood for the Justice Department's\xa0Environment and Natural Resources Division . “Today’s settlement ends protracted litigation and allows for important work to get underway to restore a healthy environment for citizens living in and around the Centredale Manor Site and the Woonasquatucket River.” “This settlement demonstrates the tremendous progress we are achieving working with 

In [33]:
# before removing stop words
len(df.contents.iloc[0])

6286

In [36]:
df["contents_new"] = df["contents"].apply(preprocess)

In [37]:
df.head(6)

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...
26,18-961,24 Defendants Sentenced in Multimillion Dolla...,Twenty-one members of a massive India-based fr...,2018-07-20T00:00:00-04:00,"[Consumer Protection, Elder Justice]","[Criminal Division, USAO - Texas, Southern]",members massive India based fraud money launde...


In [39]:
len(df.contents_new.iloc[0]) # this was 6286 not its 4574

4574