NLTK stands for Natural Language ToolKit.
Word_tokenize returns a list of words and punctuations in a string.

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ravitripathi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('nyt_metadata.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42625 entries, 0 to 42624
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        42625 non-null  int64  
 1   abstract          42623 non-null  object 
 2   web_url           42625 non-null  object 
 3   snippet           42468 non-null  object 
 4   lead_paragraph    42487 non-null  object 
 5   print_section     25726 non-null  object 
 6   print_page        25726 non-null  float64
 7   source            42625 non-null  object 
 8   multimedia        42625 non-null  object 
 9   headline          42625 non-null  object 
 10  keywords          42625 non-null  object 
 11  pub_date          42625 non-null  object 
 12  document_type     42625 non-null  object 
 13  news_desk         42624 non-null  object 
 14  section_name      42619 non-null  object 
 15  subsection_name   16890 non-null  object 
 16  byline            42625 non-null  object

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,...,pub_date,document_type,news_desk,section_name,subsection_name,byline,type_of_material,_id,word_count,uri
0,1813,"Economic hardship, climate change, political i...",https://www.nytimes.com/2023/05/14/us/migrants...,"Economic hardship, climate change, political i...",Relative quiet has prevailed along the souther...,A,14.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Title 42 Is Gone, but Not the Condit...",...,2023-05-15 01:24:42+00:00,article,National,U.S.,,"{'original': 'By Miriam Jordan', 'person': [{'...",News,nyt://article/3d95da14-0c64-59c6-bae2-02b151ad...,1217,nyt://article/3d95da14-0c64-59c6-bae2-02b151ad...
1,1814,It’s election night in America. Stay away from...,https://www.nytimes.com/2023/05/14/arts/televi...,It’s election night in America. Stay away from...,"The day before Logan Roy died, he delivered a ...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': '‘Succession’ Season 4, Episode 8 Rec...",...,2023-05-15 02:01:05+00:00,article,Culture,Arts,Television,"{'original': 'By Noel Murray', 'person': [{'fi...",News,nyt://article/17f6f628-2939-541b-a0e8-5c503fa6...,1495,nyt://article/17f6f628-2939-541b-a0e8-5c503fa6...
2,1815,"Tom is stressed in dress shoes, Shiv hides ben...",https://www.nytimes.com/2023/05/14/style/succe...,"Tom is stressed in dress shoes, Shiv hides ben...",This article contains spoilers for Episode 8 o...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': '‘Succession’ Style, Episode 8: Some ...",...,2023-05-15 02:15:04+00:00,article,Styles,Style,,"{'original': 'By The Styles Desk', 'person': [...",News,nyt://article/70773662-4815-5d40-8460-b438aa44...,665,nyt://article/70773662-4815-5d40-8460-b438aa44...
3,1816,"No corrections appeared in print on Monday, Ma...",https://www.nytimes.com/2023/05/14/pageoneplus...,"No corrections appeared in print on Monday, Ma...",Errors are corrected during the press run when...,,,The New York Times,[],"{'main': 'No Corrections: May 15, 2023', 'kick...",...,2023-05-15 03:55:48+00:00,article,Corrections,Corrections,,"{'original': '', 'person': [], 'organization':...",News,nyt://article/199d026e-1372-51a3-adf0-c82abeeb...,52,nyt://article/199d026e-1372-51a3-adf0-c82abeeb...
4,1817,"Quotation of the Day for Monday, May 15, 2023.",https://www.nytimes.com/2023/05/14/pageoneplus...,"Quotation of the Day for Monday, May 15, 2023.","“For me, it was time to give back the love the...",A,2.0,The New York Times,[],{'main': 'Quotation of the Day: When Your Cham...,...,2023-05-15 03:55:57+00:00,article,Summary,Corrections,,"{'original': '', 'person': [], 'organization':...",News,nyt://article/5f4b7ea7-88f4-5178-884f-ae28530b...,42,nyt://article/5f4b7ea7-88f4-5178-884f-ae28530b...


In [5]:
# All the documents in the corpus are articles
len(data.document_type) == len([x for x in data['document_type'] if x=='article'])

True

In [6]:
# Creating a copy of our data. We will make changes to this copy and ensure our original data can be accessed if needed
data_copy = data

Let us see the list of stop words. We will remove these stop words from our corpus as they usually do not capture the essence of the text

In [7]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{"won't", 'him', 'wouldn', 'hadn', 'weren', 'but', 'as', 'out', "you're", 'ourselves', "wouldn't", 'myself', 'to', 'once', 'here', 'won', 'whom', 'me', 'herself', 'over', 'hers', 'where', 'very', 'didn', 'she', 'can', 'most', 'should', 'off', 'during', 'between', "haven't", 'because', "mightn't", 'then', 'haven', "you'll", "that'll", 'does', 'some', "needn't", "isn't", 'yourself', "it's", 'if', 'mightn', 'aren', 'more', 'while', 'themselves', 'how', 'needn', 'couldn', 'his', 'shan', 'shouldn', 'down', 'doesn', 'only', 'through', "weren't", 'up', 'further', 'above', 'our', 'what', 'being', 're', 'are', 'your', 'doing', 'y', 'when', 'my', "doesn't", 'against', 'it', "couldn't", 'such', 'below', 'again', "shouldn't", 'will', 'about', "you've", 'their', 'there', 'that', "wasn't", 'under', 'we', 'in', "she's", 'not', "you'd", 'her', 'theirs', 'now', 'is', 'than', 'each', 'or', 'same', 'and', 'have', 'them', "aren't", 'd', 'himself', 'after', 'few', 'having', 'did', 'don', 't', "mustn't", 'j

In [8]:
# Tokenizing the lead paragraphs of the first article. Just to see how word_tokenize works
para = word_tokenize(data_copy.lead_paragraph[0])
para

['Relative',
 'quiet',
 'has',
 'prevailed',
 'along',
 'the',
 'southern',
 'U.S.',
 'border',
 'since',
 'Friday',
 ',',
 'despite',
 'widespread',
 'fears',
 'that',
 'ending',
 'a',
 'pandemic-era',
 'policy',
 'to',
 'immediately',
 'expel',
 'most',
 'migrants',
 ',',
 'even',
 'asylum',
 'seekers',
 ',',
 'would',
 'set',
 'off',
 'a',
 'stampede',
 'from',
 'Mexico',
 '.']

In [9]:
# Removing stop words from the first lead paragraph and seeing the result
para = [word for word in para if not word.lower() in stop_words]
para

['Relative',
 'quiet',
 'prevailed',
 'along',
 'southern',
 'U.S.',
 'border',
 'since',
 'Friday',
 ',',
 'despite',
 'widespread',
 'fears',
 'ending',
 'pandemic-era',
 'policy',
 'immediately',
 'expel',
 'migrants',
 ',',
 'even',
 'asylum',
 'seekers',
 ',',
 'would',
 'set',
 'stampede',
 'Mexico',
 '.']

In [10]:
# Tokenizing and removing stop words from lead paragraphs of each article 
for i in range(len(data_copy.lead_paragraph)):
    word_tokens = word_tokenize(data_copy.lead_paragraph[i])
    data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]

['Relative', 'quiet', 'has', 'prevailed', 'along', 'the', 'southern', 'U.S.', 'border', 'since', 'Friday', ',', 'despite', 'widespread', 'fears', 'that', 'ending', 'a', 'pandemic-era', 'policy', 'to', 'immediately', 'expel', 'most', 'migrants', ',', 'even', 'asylum', 'seekers', ',', 'would', 'set', 'off', 'a', 'stampede', 'from', 'Mexico', '.']
['The', 'day', 'before', 'Logan', 'Roy', 'died', ',', 'he', 'delivered', 'a', 'fiery', 'call', 'to', 'arms', 'to', 'his', 'ATN', 'staff', ',', 'letting', 'them', 'know', 'what', 'he', 'expected', 'from', 'the', 'network', 'going', 'forward', '.', 'The', 'speech', 'was', 'an', 'angrier', 'variation', 'of', 'the', 'populist', 'spiel', 'he', 'had', 'given', 'many', 'times', 'before', ',', 'in', 'which', 'he', 'insisted', 'that', 'the', 'news', 'should', 'always', 'be', 'frank', 'and', 'unpretentious', '.', 'He', 'wanted', 'his', 'anchors', 'to', 'tell', 'their', 'viewers', '“', 'truthful', '”', 'things', 'they', 'had', 'never', 'heard', 'anyone', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://panda

['The', 'European', 'Commission', 'has', 'formally', 'asked', 'Greece', 'to', 'begin', 'an', 'investigation', 'into', 'a', 'New', 'York', 'Times', 'report', 'based', 'on', 'exclusive', 'footage', 'showing', 'the', 'country', '’', 's', 'Coast', 'Guard', 'abandoning', 'migrants', 'in', 'the', 'Aegean', 'Sea', 'last', 'month', ',', 'a', 'top', 'official', 'said', 'on', 'Monday', '.']
['Spring', 'has', 'been', 'bashful', 'this', 'year', 'in', 'the', 'Northeast', ',', 'beaming', 'with', 'sunshine', 'one', 'minute', ',', 'only', 'to', 'cover', 'up', 'with', 'sheets', 'of', 'clouds', 'and', 'dampening', 'rain', 'the', 'next', '.', 'But', ',', 'as', 'Kay', 'Chun', 'reminds', 'us', ',', 'summer', 'is', 'just', 'around', 'the', 'corner', ',', '“', 'ready', 'to', 'encourage', 'breezy', 'communal', 'dining', 'under', 'the', 'glow', 'of', 'the', 'sun', 'and', 'the', 'grill', ',', '”', 'she', 'writes', 'in', 'her', 'lovely', 'article', 'in', 'The', 'New', 'York', 'Times', '.']
['Some', '252', 'milli

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [w for w in word_tokens if not w.lower() in stop_words]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://panda

TypeError: expected string or bytes-like object, got 'float'

In [11]:
# Checking if above code worked for the 1st lead paragraph
data_copy.lead_paragraph[0]

['Relative',
 'quiet',
 'prevailed',
 'along',
 'southern',
 'U.S.',
 'border',
 'since',
 'Friday',
 ',',
 'despite',
 'widespread',
 'fears',
 'ending',
 'pandemic-era',
 'policy',
 'immediately',
 'expel',
 'migrants',
 ',',
 'even',
 'asylum',
 'seekers',
 ',',
 'would',
 'set',
 'stampede',
 'Mexico',
 '.']

In [12]:
# Let us now lemmatize each lead paragraph

wnl = WordNetLemmatizer()

for i in range(len(data_copy.lead_paragraph)):
        data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.lead_paragraph[i] = [wnl.lemmatize(word) for word in data_copy.lead_paragraph[i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

TypeError: 'float' object is not iterable

In [13]:
# You can see that fears has changed to fear, migrants to migrant etc.
data_copy.lead_paragraph[0]

['Relative',
 'quiet',
 'prevailed',
 'along',
 'southern',
 'U.S.',
 'border',
 'since',
 'Friday',
 ',',
 'despite',
 'widespread',
 'fear',
 'ending',
 'pandemic-era',
 'policy',
 'immediately',
 'expel',
 'migrant',
 ',',
 'even',
 'asylum',
 'seeker',
 ',',
 'would',
 'set',
 'stampede',
 'Mexico',
 '.']