In [None]:
# Importing the relevant packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [9]:
# This dataset contains the topics already marked and so I would like to use this as training set for my model.


filename = "~/Documents/mygithub/bu_dsc/data/external/News_Category_Dataset_v2.json"
test_file = "~/Documents/mygithub/bu_dsc/data/external/global-issues.csv"

data = pd.read_json(filename, lines = True)
#display the first few rows of data
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [None]:
'''
Data Description
The data consists of a collection of news where each news entry is represented by the following columns:

category: Category of the news (string)
headline: Headline of the news (string)
author: Authors and Contributors or Associations (string)
short_description: Short description of the news' content (string)
date: Date of the published news. (date)
The goal is to classify the news category given only the headline. An accuracy of 90% on the test set is aimed.
'''

In [12]:
# Inspecting the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [13]:
# Checking the unique elements
data.nunique()

category                 41
headline             199344
authors               27993
link                 200812
short_description    178353
date                   2309
dtype: int64

In [15]:
print('Unique category: ', data.category.unique())

Unique category:  ['CRIME' 'ENTERTAINMENT' 'WORLD NEWS' 'IMPACT' 'POLITICS' 'WEIRD NEWS'
 'BLACK VOICES' 'WOMEN' 'COMEDY' 'QUEER VOICES' 'SPORTS' 'BUSINESS'
 'TRAVEL' 'MEDIA' 'TECH' 'RELIGION' 'SCIENCE' 'LATINO VOICES' 'EDUCATION'
 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'WELLNESS' 'PARENTING' 'HOME & LIVING' 'STYLE & BEAUTY' 'DIVORCE'
 'WEDDINGS' 'FOOD & DRINK' 'MONEY' 'ENVIRONMENT' 'CULTURE & ARTS']


In [16]:
# Note thate there is the category THE WORLDPOST and WORLDPOST. It is the same category, so it needs to be substituted.
data.category = data.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)

In [18]:
# Printing out the new categories
# The category count is now showing 40
print('Unique category: ', data.category.unique())
data.nunique()

Unique category:  ['CRIME' 'ENTERTAINMENT' 'WORLD NEWS' 'IMPACT' 'POLITICS' 'WEIRD NEWS'
 'BLACK VOICES' 'WOMEN' 'COMEDY' 'QUEER VOICES' 'SPORTS' 'BUSINESS'
 'TRAVEL' 'MEDIA' 'TECH' 'RELIGION' 'SCIENCE' 'LATINO VOICES' 'EDUCATION'
 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'WORLDPOST' 'GOOD NEWS' 'FIFTY' 'ARTS' 'WELLNESS'
 'PARENTING' 'HOME & LIVING' 'STYLE & BEAUTY' 'DIVORCE' 'WEDDINGS'
 'FOOD & DRINK' 'MONEY' 'ENVIRONMENT' 'CULTURE & ARTS']


category                 40
headline             199344
authors               27993
link                 200812
short_description    178353
date                   2309
dtype: int64

In [19]:
# do specific authors write for a specific categories?
t = data[['category', 'authors']].groupby(['authors'])
t.head()

Unnamed: 0,category,authors
0,CRIME,Melissa Jeltsen
1,ENTERTAINMENT,Andy McDonald
2,ENTERTAINMENT,Ron Dicker
3,ENTERTAINMENT,Ron Dicker
4,ENTERTAINMENT,Ron Dicker
...,...,...
200836,BUSINESS,"Peter S. Goodman, Contributor\nExecutive Busin..."
200837,BUSINESS,"Ernan Roman, Contributor\nPresident"
200838,ENTERTAINMENT,"Courtney Garcia, Contributor\nI tell stories a..."
200843,TECH,"Mateo Gutierrez, Contributor\nArtist"


In [20]:
# Make text corpus
# Getting the data for analysis
# combining the headline with the short description
text_corpus = pd.DataFrame({
    'text':data.headline + data.short_description,
    'label':data.category
})

text_corpus.head()

Unnamed: 0,text,label
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT


In [24]:
# check for empty strings
print(text_corpus.text.apply(len).value_counts().sort_index())
# There are 5 rows with an empty string (length = 0). They can be removed.

3        2
4        4
5        6
6        7
7       13
        ..
1300     1
1362     1
1375     1
1424     1
1486     1
Name: text, Length: 714, dtype: int64


In [23]:
# remove rows with empty text corpus
text_corpus = text_corpus[text_corpus.text.apply(len) > 0]

In [25]:
text_corpus.head()

Unnamed: 0,text,label
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT


In [27]:
# Encoding the label
encoder = LabelEncoder()
text_corpus.label = encoder.fit_transform(text_corpus.label)

text_corpus.head()

Unnamed: 0,text,label
0,There Were 2 Mass Shootings In Texas Last Week...,6
1,Will Smith Joins Diplo And Nicky Jam For The 2...,10
2,Hugh Grant Marries For The First Time At Age 5...,10
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,10
4,Julianna Margulies Uses Donald Trump Poop Bags...,10


In [28]:
#Convert text to lowercase and romove punctuation
#define a function to clean the text
#import regular expressions library
import re

def clean_text(text):
    """
    Remove punctuations and special characters, makes lower case
    Args: text
    Output: text
    """
    text=text.lower() #makes text lowercase
    text=re.sub('\\d|\\W+|_',' ',text) #removes extra white space
    text=re.sub('[^a-zA-Z]'," ", text) #removes any non-alphabetic characters
    return text

In [29]:
#import word tokenizer from NLTK
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def tokenize_and_remove_stop_words(txt):
    """
    takes in a sentence, tokenizes the words into a list,
    and then removes stop words from the tokenized list
    """
    stop_words = stopwords.words('english')
    txt_token = word_tokenize(txt)
    txt_no_stopwords = [word for word in txt_token if word not in stop_words]
    return txt_no_stopwords

In [30]:
#Apply NLTK's PorterStemmer
#define a function to stem the words
from nltk.stem.porter import PorterStemmer

def stem_text(word_list):

    porter = PorterStemmer()
    return [porter.stem(word) for word in word_list]

In [31]:
## Applying the cleaning function for the text corpus

#apply text cleaning function
text_corpus['clean_desc'] = text_corpus['text'].apply(clean_text)
#apply tokenizing
text_corpus['tokenized_desc'] = text_corpus['clean_desc'].apply(tokenize_and_remove_stop_words)
#apply PorterStemmer function
text_corpus['txt_stemmed_desc'] = text_corpus['tokenized_desc'].apply(stem_text)
#put the text back together (untokenize)
text_corpus['final_desc'] = text_corpus['txt_stemmed_desc'].apply(lambda text: ' '.join(text))
#view the pre-processed text
print('Show the dimension of the new dataframe: ', text_corpus.shape)
text_corpus.info()

Show the dimension of the new dataframe:  (200848, 6)


Unnamed: 0,text,label,clean_desc,tokenized_desc,txt_stemmed_desc,final_desc
0,There Were 2 Mass Shootings In Texas Last Week...,6,there were mass shootings in texas last week...,"[mass, shootings, texas, last, week, tvshe, le...","[mass, shoot, texa, last, week, tvshe, left, h...",mass shoot texa last week tvshe left husband k...
1,Will Smith Joins Diplo And Nicky Jam For The 2...,10,will smith joins diplo and nicky jam for the ...,"[smith, joins, diplo, nicky, jam, world, cup, ...","[smith, join, diplo, nicki, jam, world, cup, o...",smith join diplo nicki jam world cup offici so...
2,Hugh Grant Marries For The First Time At Age 5...,10,hugh grant marries for the first time at age ...,"[hugh, grant, marries, first, time, age, actor...","[hugh, grant, marri, first, time, age, actor, ...",hugh grant marri first time age actor longtim ...
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,10,jim carrey blasts castrato adam schiff and dem...,"[jim, carrey, blasts, castrato, adam, schiff, ...","[jim, carrey, blast, castrato, adam, schiff, d...",jim carrey blast castrato adam schiff democrat...
4,Julianna Margulies Uses Donald Trump Poop Bags...,10,julianna margulies uses donald trump poop bags...,"[julianna, margulies, uses, donald, trump, poo...","[julianna, marguli, use, donald, trump, poop, ...",julianna marguli use donald trump poop bag pic...


In [32]:
# make sure that after cleaning the text there are no empty strings
text_corpus = text_corpus[text_corpus.final_desc.apply(len) > 0]

In [34]:
text_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200837 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   text              200837 non-null  object
 1   label             200837 non-null  int64 
 2   clean_desc        200837 non-null  object
 3   tokenized_desc    200837 non-null  object
 4   txt_stemmed_desc  200837 non-null  object
 5   final_desc        200837 non-null  object
dtypes: int64(1), object(5)
memory usage: 10.7+ MB


In [36]:
# Applying the Neural Network using SCikit Learn
# Using sample data set for the model creation
# Get the target name
from sklearn.model_selection import train_test_split
# Creating the features from the data set
features, category = text_corpus.final_desc, text_corpus.label
# Make test and training split (20:80)
features_train,features_test,target_train, target_test = train_test_split(features,category, random_state=1000, test_size = 0.2)

print('Features-Training Set: ',len(features_train))
print('Features-Test Set: ',len(features_test))
print('Target: Training Set: ',len(target_train))
print('Target: Test Set: ',len(target_test))

Features-Training Set:  160669
Features-Test Set:  40168
Target: Training Set:  160669
Target: Test Set:  40168


In [38]:
# Train the model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.naive_bayes import MultinomialNB

classifier = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', MultinomialNB())
])

text_clf = classifier.fit(features_train, target_train)

In [39]:
# Performance of NB Classifier
# Using the Test Set
import numpy as np
predicted = text_clf.predict(features_test)
np.mean(predicted == target_test)

0.3690748854809799

In [40]:
# Saving the Model

import joblib
from joblib import dump, load

model_path="/home/arindam/Documents/mygithub/bu_dsc/models"
model_name="news_classify_NN_sklearn.pkl"
filename = model_path + "/" + model_name 
# print(filename)
joblib.dump(text_clf, filename)

['/home/arindam/Documents/mygithub/bu_dsc/models/news_classify_NN_sklearn.pkl']