In [1]:
## import packages

import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this notebook relies on, not just the stopword
# list: WordNetLemmatizer reads the 'wordnet' corpus and raises LookupError
# when it is missing. 'omw-1.4' is included because some NLTK releases also
# need it to load WordNet. Packages that are already present are skipped.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
## load data

# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the backslash-separated form only resolves on Windows.
article_files = ['data/articles1.csv', 'data/articles2.csv', 'data/articles3.csv']

# Read the three parts and stack them under one fresh 0..n-1 index
# (ignore_index=True replaces the separate reset_index(drop=True) step).
# The per-file frames are never bound to names, so nothing needs deleting.
df = pd.concat([pd.read_csv(path) for path in article_files], ignore_index=True)

print(df.shape)
df.head()

(142570, 10)


Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [3]:
## data columns

# Show the column labels of the combined frame.
df.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [4]:
## only keeping necessary columns

# Narrow the frame to the four fields that the remaining cells work with.
keep_columns = ['title', 'content', 'author', 'publication']
df = df.loc[:, keep_columns]
print(df.shape)
df.head()

(142570, 4)


Unnamed: 0,title,content,author,publication
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,Carl Hulse,New York Times
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",Benjamin Mueller and Al Baker,New York Times
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...",Margalit Fox,New York Times
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...",William McDonald,New York Times
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...",Choe Sang-Hun,New York Times


In [5]:
## missing values

# Count the null entries in each column.
df.isna().sum()

title              2
content            0
author         15876
publication        0
dtype: int64

In [6]:
## filling missing values

# Placeholder text per column; each listed column has its nulls replaced.
placeholders = {'title': 'No Title', 'author': 'Unknown'}
for column_name, placeholder in placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)

In [7]:
## maximum length of title and content

# Longest title / article body, measured in characters.
max_title_length = max(len(title) for title in df['title'])
max_content_length = max(len(body) for body in df['content'])

print('Maximum title length: ', max_title_length)
print('Maximum content length: ', max_content_length)

Maximum title length:  231
Maximum content length:  292586


In [8]:
## sample title and content

# Pull the row labelled 0 once, then print its title and body.
sample_title = df.loc[0, 'title']
sample_content = df.loc[0, 'content']

print(f'Title: {sample_title}')
print("-------------------------------------------")
print(f'Content: {sample_content}')

Title: House Republicans Fret About Winning Their Health Care Suit - The New York Times
-------------------------------------------
Content: WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in t

In [9]:
# stopwords after removing gender pronouns

# The word list gets its own name. Rebinding `stopwords` would shadow the
# imported nltk.corpus module, and running this cell a second time would then
# fail with AttributeError because a list has no .words().
english_stopwords = stopwords.words('english')
gender_pronouns = ['he', 'she', 'her', 'his', 'him', 'hers', 'himself', 'herself', "she's", "he's", "she'd", "he'd", "she'll", "he'll"]

# Set used by text_preprocessing: English stopwords minus the pronouns above,
# so those pronouns are kept in the cleaned text.
stopwords_ = set(english_stopwords) - set(gender_pronouns)

In [10]:
## function definition for text preprocessing title and content

def text_preprocessing(text, stop_words=None):
    """Normalise raw text into a space-separated string of lowercase words.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, stopwords are
    dropped, and the surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. A missing value (None or a float NaN) yields ''.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords_`` set
        is used.

    Returns
    -------
    str
        The cleaned text; '' when nothing survives the filtering.
    """
    # Missing cells reach .apply() as None or NaN (NaN is the only float that
    # is not equal to itself); treat them as empty text instead of failing
    # on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = stopwords_

    # Apostrophes, digits and punctuation all become separators, so a
    # contraction such as "she's" is split into the tokens "she" and "s".
    words = re.sub('[^a-zA-Z]', ' ', text.lower()).split()
    return ' '.join(word for word in words if word not in stop_words)

In [11]:
## text preprocessing title and content

# Clean each text column in turn and report when it is finished.
preprocessing_steps = (
    ('title', 'Title text preprocessing completed.'),
    ('content', 'Content text preprocessing completed.'),
)
for column_name, done_message in preprocessing_steps:
    df[column_name] = df[column_name].apply(text_preprocessing)
    print(done_message)

Title text preprocessing completed.
Content text preprocessing completed.


In [12]:
## missing values after text preprocessing

# Re-count the null entries in each column after cleaning.
df.isna().sum()

title          0
content        0
author         0
publication    0
dtype: int64

In [13]:
## lemmatize the text

# One shared lemmatizer instance, reused for every call below.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of `text`.

    Every word is passed to ``lemmatizer.lemmatize`` with its default
    arguments and the results are joined back with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Lemmatize every article body. Results are collected in a plain list and
# assigned as one column at the end. Writing through
# df['content_lemmatized'][i] is chained assignment: pandas flags it with a
# warning, and it stops updating the frame once copy-on-write is in effect.
lemmatized_contents = []

for i, content in enumerate(df['content']):
    lemmatized_contents.append(lemmatize_text(content))
    if i % 10000 == 0:
        print(i, end=' ')  # coarse progress indicator

df['content_lemmatized'] = lemmatized_contents

df.head()

0 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 

Unnamed: 0,title,content,author,publication,content_lemmatized
0,house republicans fret winning health care sui...,washington congressional republicans new fear ...,Carl Hulse,New York Times,washington congressional republican new fear c...
1,rift officers residents killings persist south...,bullet shells get counted blood dries votive c...,Benjamin Mueller and Al Baker,New York Times,bullet shell get counted blood dry votive cand...
2,tyrus wong bambi artist thwarted racial bias d...,walt disney bambi opened critics praised spare...,Margalit Fox,New York Times,walt disney bambi opened critic praised spare ...
3,among deaths heavy toll pop music new york times,death may great equalizer necessarily evenhand...,William McDonald,New York Times,death may great equalizer necessarily evenhand...
4,kim jong un says north korea preparing test lo...,seoul south korea north korea leader kim said ...,Choe Sang-Hun,New York Times,seoul south korea north korea leader kim said ...


In [14]:
## saving preprocessed data to csv

# The forward slash keeps this relative path valid on Windows, Linux and
# macOS; the backslash form only resolves on Windows.
# index=False leaves the row index out of the file.
df.to_csv('data/articles_preprocessed.csv', index=False)