In [1]:
import pyLDAvis.gensim
import spacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from spacy.tokens import DocBin
import spacy
from tqdm import tqdm
import warnings


warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)


  """
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def randint(low, high=None, size=None, dtype=onp.int):  # pylint: disable=missing-function-docstring
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-

# Analysis
> 1. Load data and remove duplicated
> 2. LDA (Latent Dirichlet Allocation) to analysis subtopics within the data

In [2]:
#data module
import pandas as pd
from spacy.cli import download


def download_spacy_data():
    download("en_core_web_sm")

    
def validate_data(df):
    # check is as expected and described
    for category in df.category.unique():
        assert category in ['business', 'entertainment', 'politics', 'sport', 'tech']
    assert not any(df.duplicated())


def remove_duplicates(df):
    if any(df.duplicated()):
        dup = df.duplicated().value_counts()
        df = df[~df.duplicated()]
        print(f'{dup[True]} duplicated articles removed from dataset')
    return df

In [3]:
# read the data
df = pd.read_csv('../input/newsgroup20bbcnews/bbc-text.csv')
df = remove_duplicates(df)
validate_data(df)

99 duplicated articles removed from dataset


In [4]:
# summarise
df.describe()

Unnamed: 0,category,text
count,2126,2126
unique,5,2126
top,sport,tv future in the hands of viewers with home th...
freq,504,1


In [5]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_noun_chunks")

# custom stop words
add_stop_words = ['say', 's', 'mr', 'Mr', 'said', 'says', 'saying', 'today', 'be', 'I']
for stopword in add_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

articles, article = [], []
doc_bin = DocBin()
dictionary = Dictionary()
corpus = []

# clean data
print('Processing articles using spaCy, removing punctuation, stop words, numbers and lemminizing')
for doc in tqdm(nlp.pipe(df.text.values, disable=["tok2vec"])):
    doc_bin.add(doc)
    article = [
        token.lemma_ for token in doc 
        if not token.is_stop 
        and not token.is_punct 
        and not token.like_num
        and not token.is_space]
    dictionary.add_documents([article])
    corpus.append(dictionary.doc2bow(article))

print('Building LDA model')
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
lda_model.show_topics()

Processing articles using spaCy, removing punctuation, stop words, numbers and lemminizing


2126it [01:46, 20.01it/s]


Building LDA model


[(0,
  '0.007*"p2p" + 0.007*"music" + 0.006*"uwb" + 0.005*"year" + 0.005*"uk" + 0.005*"new" + 0.004*"sales" + 0.003*"prices" + 0.003*"growth" + 0.003*"networks"'),
 (1,
  '0.006*"people" + 0.006*"mobile" + 0.006*"tv" + 0.005*"technology" + 0.004*"year" + 0.004*"high" + 0.004*"new" + 0.004*"digital" + 0.004*"like" + 0.004*"world"'),
 (2,
  '0.008*"people" + 0.005*"$" + 0.004*"information" + 0.004*"t" + 0.004*"year" + 0.004*"£" + 0.003*"new" + 0.003*"world" + 0.003*"children" + 0.003*"way"'),
 (3,
  '0.010*"m" + 0.010*"$" + 0.007*"sales" + 0.007*"year" + 0.006*"new" + 0.005*"tv" + 0.004*"£" + 0.004*"uk" + 0.004*"company" + 0.003*"people"'),
 (4,
  '0.006*"year" + 0.006*"$" + 0.004*"bank" + 0.004*"new" + 0.003*"m" + 0.003*"dollar" + 0.003*"china" + 0.003*"£" + 0.003*"t" + 0.003*"time"'),
 (5,
  '0.006*"dallaglio" + 0.006*"games" + 0.005*"game" + 0.005*"year" + 0.005*"england" + 0.004*"juninho" + 0.004*"man" + 0.004*"uk" + 0.003*"t" + 0.003*"new"'),
 (6,
  '0.008*"o" + 0.008*"wales" + 0.00

In [6]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)