In [1]:
import re
import unicodedata

import contractions
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords


# Load the raw news-category dataset and report its dimensions and column names.
df = pd.read_csv('data/News_Category_Dataset_v3.csv')
n_rows, n_cols = df.shape
print(f'The data has {n_rows} rows and {n_cols} columns')
column_names = list(df.columns)
print(f'The data has the following features:\n{column_names}')

The data has 209527 rows and 8 columns
The data has the following features:
['Unnamed: 0', 'headline', 'category', 'short_description', 'authors', 'date', 'headline_length', 'short_description_length']


In [2]:
# Target column: category.
# The count is computed outside the f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
# dropna=False keeps the result equal to len(unique()), which counts NaN too.
n_classes = df['category'].nunique(dropna=False)
print(f'Number of classes in category: {n_classes}')

Number of classes in category: 42


In [3]:
# Number of rows per original category label, most frequent first.
raw_category_counts = df['category'].value_counts()
raw_category_counts

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

- Some categories are closely related, so they are grouped using the following mapping:

In [4]:
# Coarse label -> list of original dataset categories merged into it.
mapping = {
    "POLITICS": ["POLITICS"],
    "ENTERTAINMENT": ["ENTERTAINMENT", "MEDIA", "COMEDY"],
    "VOICES": ["QUEER VOICES", "BLACK VOICES", "LATINO VOICES"],
    "FOOD & DRINK": ["FOOD & DRINK", "TASTE"],
    "BUSINESS": ["BUSINESS"],
    "SPORTS": ["SPORTS"],
    "HOME & LIVING": ["HOME & LIVING"],
    "NEWS": [
        "THE WORLDPOST",
        "CRIME",
        "WORLD NEWS",
        "WEIRD NEWS",
        "WORLDPOST",
        "FIFTY",
        "GOOD NEWS",
        "U.S. NEWS",
    ],
    "FAMILY": ["WEDDINGS", "DIVORCE", "PARENTING", "PARENTS"],
    "IMPACT": ["IMPACT"],
    "ENVIRONMENT": ["GREEN", "ENVIRONMENT"],
    "LIFE": ["WOMEN", "RELIGION", "MONEY", "TRAVEL"],
    "SCIENCE": ["SCIENCE", "TECH"],
    "ARTS & CULTURE": ["ARTS", "ARTS & CULTURE", "CULTURE & ARTS"],
    "EDUCATION": ["COLLEGE", "EDUCATION"],
    "STYLE & BEAUTY": ["STYLE & BEAUTY", "STYLE"],
    "WELLNESS": ["WELLNESS", "HEALTHY LIVING"],
}

# Invert to original category -> coarse label and relabel in a single pass.
# Labels that are not listed in the mapping are left as they are.
raw_to_group = {
    raw_label: group
    for group, members in mapping.items()
    for raw_label in members
}
df['category'] = df['category'].map(lambda label: raw_to_group.get(label, label))

In [5]:
# Class count after merging related categories.
# Computed outside the f-string: reusing the enclosing quote character inside
# a replacement field is a SyntaxError before Python 3.12.
# dropna=False keeps the result equal to len(unique()), which counts NaN too.
n_classes_relabeled = df['category'].nunique(dropna=False)
print(f'Number of classes in category after relabeling: {n_classes_relabeled}')

Number of classes in category after relabeling: 17


In [6]:
# Dropping attributes that are not used as model inputs.
# 'Unnamed: 0' is the name pandas gives an unnamed CSV column
# (presumably a previously saved index - verify against the source file).
non_ml = ['Unnamed: 0', 'authors', 'date']

# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed.
df = df.drop(columns=non_ml, errors='ignore')
display(df.head(3))
print(f'The data has the following features after dropping non-ml attributes:\n{df.columns.tolist()}')

Unnamed: 0,headline,category,short_description,headline_length,short_description_length
0,Over 4 Million Americans Roll Up Sleeves For O...,NEWS,Health experts said it is too early to predict...,76,154
1,"American Airlines Flyer Charged, Banned For Li...",NEWS,He was subdued by passengers and crew when he ...,89,159
2,23 Of The Funniest Tweets About Cats And Dogs ...,ENTERTAINMENT,"""Until you have a dog you don't understand wha...",69,64


The data has the following features after dropping non-ml attributes:
['headline', 'category', 'short_description', 'headline_length', 'short_description_length']


In [7]:
# Missing value report: number of null entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

headline                        6
category                        0
short_description           19712
headline_length                 0
short_description_length        0
dtype: int64

In [8]:
# Remove every row that has a missing value in any column (modifies df in place),
# then confirm that no nulls remain and report the new dimensions.
df.dropna(inplace=True)
remaining_nulls = df.isna().sum()
display(remaining_nulls)
row_count, column_count = df.shape
print(f'The data has {row_count} rows and {column_count} columns after dropping nulls')

headline                    0
category                    0
short_description           0
headline_length             0
short_description_length    0
dtype: int64

The data has 189814 rows and 5 columns after dropping nulls


In [9]:
# Number of rows per category at this point in the notebook, most frequent first.
grouped_category_counts = df['category'].value_counts()
grouped_category_counts

category
POLITICS          32441
WELLNESS          23208
ENTERTAINMENT     21820
FAMILY            19425
NEWS              16809
LIFE              16238
STYLE & BEAUTY    11369
VOICES            10802
FOOD & DRINK       8271
BUSINESS           5132
SPORTS             4414
HOME & LIVING      4317
SCIENCE            3906
ENVIRONMENT        3488
ARTS & CULTURE     3265
IMPACT             3086
EDUCATION          1823
Name: count, dtype: int64

In [10]:
# Creating the content column by combining headline and short description.
# The explicit space separator matters: plain concatenation glues the last
# word of the headline to the first word of the description, producing a
# single bogus token at every headline/description boundary.
df['content'] = df['headline'] + ' ' + df['short_description']

# The two source columns are now redundant.
df = df.drop(columns=['headline', 'short_description'])
df.head(3)

Unnamed: 0,category,headline_length,short_description_length,content
0,NEWS,76,154,Over 4 Million Americans Roll Up Sleeves For O...
1,NEWS,89,159,"American Airlines Flyer Charged, Banned For Li..."
2,ENTERTAINMENT,69,64,23 Of The Funniest Tweets About Cats And Dogs ...


## Content processing 

In [11]:
# Load NLTK's English stop-word list. On an environment where the corpus has
# not been fetched yet NLTK raises LookupError, so download it once and retry.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    english_stopwords = stopwords.words('english')

# Words that may still carry useful signal (negations, question words,
# temporal/positional words); they are kept in the text rather than filtered.
words_to_keep = {
    'few', 'once', 'same', 'below', 'above', 'during', 'over', 'after',
    'most', 'before', 'just', 'against', 'very', 'no', 'which', 'where',
    'what', 'nor', 'whom', 'why', 'when', 'down', 'but', 'not',
}

# Set difference instead of repeated .remove(): .remove() raises KeyError
# when a word is absent from the installed stop-word list.
stop_words = set(english_stopwords) - words_to_keep

def preprocess_text(text):
    '''
    Normalise one raw document into a space-separated string of tokens.

    Steps, in order: lower-case, strip diacritics, split digits from letters
    and remove the digits, expand contractions, replace punctuation/special
    characters with spaces, and drop every token found in the module-level
    ``stop_words`` set.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and NaN are treated as missing; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or an empty string for missing input.
    '''
    # Missing values (None / float NaN) have no text to process; NaN is the
    # only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lower case
    text = str(text).lower()

    # Strip diacritics (e.g. "café" -> "cafe"). Without this the punctuation
    # filter below turns the accented letter into a space and splits the word.
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Add a space between numbers and letters (e.g. 5mg to 5 mg, 17yo to 17 yo)
    text = re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Expand contractions (e.g. "don't" to "do not")
    text = contractions.fix(text)

    # NOTE(review): the expansion may reintroduce capital letters (e.g. an
    # expansion containing "I") - confirm against the installed contractions
    # version. Lower-casing again is harmless either way and keeps the
    # lower-case stop-word lookup below reliable.
    text = text.lower()

    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Drop stop words; split()/join also collapses repeated whitespace
    return ' '.join(word for word in text.split() if word not in stop_words)

In [12]:
# Clean every document; the function is handed to apply directly.
df['content_processed'] = df['content'].apply(preprocess_text)
df.head(3)

Unnamed: 0,category,headline_length,short_description_length,content,content_processed
0,NEWS,76,154,Over 4 Million Americans Roll Up Sleeves For O...,over million americans roll sleeves omicron ta...
1,NEWS,89,159,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charged banned life af...
2,ENTERTAINMENT,69,64,23 Of The Funniest Tweets About Cats And Dogs ...,funniest tweets cats dogs week sept dog not un...


In [13]:
# Persist the processed frame; the row index is not written to the file.
processed_csv_path = 'data/processed_data.csv'
df.to_csv(processed_csv_path, index=False)