# Understanding the data

## Importing libraries

In [1]:
import pandas as pd
import sys
from pathlib import Path
import os
from sklearn.model_selection import train_test_split

In [2]:
# Project root = parent of the notebook's working directory. It is kept as a
# plain string because later cells interpolate it into f-string file paths.
path = str(Path.cwd().parent)
print(path)
# Make the project root importable from this notebook.
sys.path.insert(1, path)

/Users/saideepbunny/Projects/HuffPost-News-classification


## Reading the data

In [3]:
# Load the dataset and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- TODO confirm).
df = pd.read_csv(f'{path}/data/News_Category_Dataset_v3.csv').drop(columns='Unnamed: 0')

# Quick overview: dimensions, column names, then the first ten records.
print(f'The data has {len(df)} rows and {len(df.columns)} columns')
print(f'The data has the following features:\n{list(df.columns)}', end='\n\n')
df.head(10)

The data has 209527 rows and 7 columns
The data has the following features:
['headline', 'category', 'short_description', 'authors', 'date', 'headline_length', 'short_description_length']



Unnamed: 0,headline,category,short_description,authors,date,headline_length,short_description_length
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,76,154
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,89,159
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,69,64
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,56,159
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,77,156
5,Cleaner Was Dead In Belk Bathroom For 4 Days B...,U.S. NEWS,The 63-year-old woman was seen working at the ...,,2022-09-22,70,162
6,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22,67,119
7,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22,65,116
8,How A New Documentary Captures The Complexity ...,CULTURE & ARTS,"In ""Mija,"" director Isabel Castro combined mus...",Marina Fang,2022-09-22,76,148
9,Biden At UN To Call Russian War An Affront To ...,WORLD NEWS,White House officials say the crux of the pres...,"Aamer Madhani, AP",2022-09-21,60,148


In [4]:
# Target variable: `category`.
# nunique(dropna=False) counts NaN as a distinct value, exactly like len(unique()).
print(f'Number of classes in category: {df["category"].nunique(dropna=False)}')

Number of classes in category: 42


In [5]:
# Number of records per original category label (value_counts sorts by
# frequency, most common first).
df['category'].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

- Some categories are closely related, so they are grouped using the following mapping:

## Grouping similar NEWS categories

In [6]:
# Coarse label -> original category labels that are folded into it.
mapping = {
    "POLITICS" : ["POLITICS"],
    "ENTERTAINMENT" : ["ENTERTAINMENT", "MEDIA", "COMEDY"],
    "VOICES" : ["QUEER VOICES", "BLACK VOICES", "LATINO VOICES"],
    "FOOD & DRINK" : ["FOOD & DRINK", "TASTE"],
    "BUSINESS" : ["BUSINESS"],
    "SPORTS" : ["SPORTS"],
    "HOME & LIVING" : ["HOME & LIVING"],
    "NEWS" : ["THE WORLDPOST", "CRIME", "WORLD NEWS", "WEIRD NEWS", "WORLDPOST", "FIFTY", "GOOD NEWS", "U.S. NEWS"],
    "FAMILY" : ["WEDDINGS", "DIVORCE", "PARENTING", "PARENTS"],
    "IMPACT" : ["IMPACT"],
    "ENVIRONMENT" : ["GREEN", "ENVIRONMENT"],
    "LIFE" : ["WOMEN", "RELIGION", "MONEY", "TRAVEL"],
    "SCIENCE" : ["SCIENCE", "TECH"],
    "ARTS & CULTURE" : ["ARTS", "ARTS & CULTURE", "CULTURE & ARTS"],
    "EDUCATION" : ["COLLEGE", "EDUCATION"],
    "STYLE & BEAUTY" : ["STYLE & BEAUTY", "STYLE"],
    "WELLNESS" : ["WELLNESS", "HEALTHY LIVING"]
}

# Invert to {original label: coarse label} so the whole column is relabeled in
# a single vectorized lookup instead of one full-column scan per group.
category_lookup = {
    original: grouped
    for grouped, originals in mapping.items()
    for original in originals
}

# A label listed under two groups would silently end up in whichever group
# comes last; fail loudly instead of producing an order-dependent result.
assert len(category_lookup) == sum(len(originals) for originals in mapping.values()), \
    'a category is assigned to more than one group in `mapping`'

# Labels absent from the lookup (and NaN) map to NaN; fillna restores the
# original value so unmapped rows are left untouched.
df['category'] = df['category'].map(category_lookup).fillna(df['category'])

In [7]:
# Summary after grouping: number of distinct labels (NaN counted as a value,
# as with len(unique())), a blank line, then per-label record counts.
print(
    f'Number of classes in category after relabeling: {df["category"].nunique(dropna=False)}',
    end='\n\n',
)

df['category'].value_counts()

Number of classes in category after relabeling: 17



category
POLITICS          35602
ENTERTAINMENT     25706
WELLNESS          24639
NEWS              20057
FAMILY            19825
LIFE              17805
STYLE & BEAUTY    12068
VOICES            12060
FOOD & DRINK       8436
BUSINESS           5992
SPORTS             5077
HOME & LIVING      4320
SCIENCE            4310
ENVIRONMENT        4066
ARTS & CULTURE     3922
IMPACT             3484
EDUCATION          2158
Name: count, dtype: int64

## Missing value report

### Missing values

In [8]:
# Missing-value report: number of NaN entries in each column.
df.isna().sum()

headline                        6
category                        0
short_description           19712
authors                     37418
date                            0
headline_length                 0
short_description_length        0
dtype: int64

### Dropping records with null values in headline and short_description

In [9]:
# Remove rows where `headline` or `short_description` is missing. Other
# columns are not checked, so they may still contain nulls afterwards.
print(f'The data has {df.shape[0]} records before dropping nulls')
# Reassign instead of inplace=True: in-place mutation has no performance
# benefit and makes the frame's state harder to follow across cells.
df = df.dropna(subset=['headline', 'short_description'])
display(df.isna().sum())
print(f'The data has {df.shape[0]} records after dropping nulls')

The data has 209527 records before dropping nulls


headline                        0
category                        0
short_description               0
authors                     32955
date                            0
headline_length                 0
short_description_length        0
dtype: int64

The data has 189814 records after dropping nulls


## Selecting 3 categories to simplify the problem

In [10]:
# Keep only the three chosen classes. The copy() detaches the result from
# `df`, so later edits to `final_df` cannot raise SettingWithCopy warnings.
final_df = df.loc[
    lambda frame: frame['category'].isin(['POLITICS', 'ENTERTAINMENT', 'WELLNESS'])
].copy()
final_df

Unnamed: 0,headline,category,short_description,authors,date,headline_length,short_description_length
2,23 Of The Funniest Tweets About Cats And Dogs ...,ENTERTAINMENT,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,69,64
20,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv...",,2022-09-20,60,166
21,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19,59,49
24,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,An annual celebration took on a different feel...,Jonathan Nicholson,2022-09-19,101,89
28,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT,"The ""Avatar"" director said aspects of his 2009...",Ben Blanchet,2022-09-18,67,121
...,...,...,...,...,...,...,...
209486,Daily Show Correspondent Clip Of The Week: Al ...,ENTERTAINMENT,"If you're like us, by the time Monday rolls ar...",,2012-01-28,92,123
209487,Mitt Romney Madness: Florida Edition (VIDEO),ENTERTAINMENT,The apparent madness that gripped Mitt Romney ...,Ben Craw,2012-01-28,44,133
209488,7 Amazing Name Generators (PHOTOS),ENTERTAINMENT,Let's be honest: most of our names are pretty ...,Seena Vali,2012-01-28,34,121
209512,"Sundance, Ice-T, and Shades of the American Ra...",ENTERTAINMENT,Representation of the collective diaspora has ...,"Courtney Garcia, Contributor\nI tell stories a...",2012-01-28,58,204


## Preparing train and test data

In [11]:
# 80/20 train-test split, stratified on `category` so both partitions keep
# the same class proportions; random_state pins the shuffle for repeatability.
train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
    stratify=final_df['category'],
)

In [12]:
# Class distribution of the training partition.
train_df['category'].value_counts()

category
POLITICS         25953
WELLNESS         18566
ENTERTAINMENT    17456
Name: count, dtype: int64

In [13]:
# Class distribution of the test partition.
test_df['category'].value_counts()

category
POLITICS         6488
WELLNESS         4642
ENTERTAINMENT    4364
Name: count, dtype: int64

## Writing train and test data

In [14]:
# Persist both partitions under <project root>/data as JSON.
# orient='records' writes a list of row objects; the index is not included.
train_df.to_json(path_or_buf=f'{path}/data/train_data.json', orient='records')
test_df.to_json(path_or_buf=f'{path}/data/test_data.json', orient='records')