In [1]:
# ignore warnings that might clutter the output, ensuring that the results are clean and readable
import warnings
warnings.filterwarnings('ignore')

# ensure compatibility with Python 2 for division, print function, and unicode literals, making the code more forward-compatible with Python 3
from __future__ import division, print_function, unicode_literals

import matplotlib.pyplot as plt
%matplotlib inline

import os
import numpy as np
import pandas as pd
import math

from IPython.display import display, HTML

Dataset: https://www.kaggle.com/snapcrack/all-the-news

In [2]:
# Load the first 1,000 rows of the articles CSV.
import random
path = './data/articles1.csv'

# The CSV's unnamed leading column is read as 'Unnamed: 0'; expose it as 'index'.
df = (
    pd.read_csv(path, skiprows=0, nrows=1000)
      .rename(columns={'Unnamed: 0': 'index'})
)

df.head()

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


The English stopword list originally used was built by Gerard Salton and Chris Buckley for the experimental SMART information retrieval system at Cornell University. It was available here: http://www.lextek.com/manuals/onix/stopwords2.html

This notebook now uses [Ranks NL's full list of stopwords](https://www.ranks.nl/) instead.

In [3]:
# Run once to download the stopwords & wordnet corpus
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

In [4]:
# Preprocessing data

# Each article body (the 'content' column) is reduced to lowercase alphabetic tokens,
# filtered against the stopword list, lemmatized, and re-joined into one string.
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Stopwords (common words like "the", "is", etc.) are removed from the tokenized texts to reduce noise in the data.
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))
stop_words_path = './data/stopwords_en'
# One stopword per line; blank lines are skipped. The explicit encoding keeps the
# read independent of the platform's default locale.
with open(stop_words_path, 'r', encoding='utf-8') as file:
    stop_words = {line.strip() for line in file if line.strip()}

# Anything that is not an ASCII letter (digits, punctuation, markup, accents) becomes a space.
NON_LETTERS = re.compile('[^a-zA-Z]')

# Built once and shared by every article instead of being re-created per row.
lem = WordNetLemmatizer()


def preprocess_text(text):
    """Clean one article body and return it as a space-joined token string.

    Steps: replace every non-letter with a space, lowercase, split on
    whitespace, drop tokens found in ``stop_words``, lemmatize what is left.

    ``lemmatize`` is called without a part-of-speech tag, which NLTK treats as
    a noun lookup: plural nouns collapse to the singular ("republicans" ->
    "republican") while verb forms such as "leaving" are kept as they are.

    Non-string input (e.g. a missing value) yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    # Replacing non-letters first already removes every character a later
    # tag- or digit-stripping pass could match, so no such pass is needed.
    tokens = NON_LETTERS.sub(' ', text).lower().split()
    return ' '.join(lem.lemmatize(word) for word in tokens if word not in stop_words)


# Iterate over the column values directly so the result does not depend on the
# DataFrame's index labels.
corpus = [preprocess_text(content) for content in df['content']]

corpus[:1]

['washington congressional republican fear health care lawsuit obama administration win incoming trump administration choose longer defend executive branch suit challenge administration authority spend billion dollar health insurance subsidy american handing house republican big victory issue sudden loss disputed subsidy conceivably health care program implode leaving million people access health insurance republican prepared replacement lead chaos insurance market spur political backlash republican gain full control government stave outcome republican find awkward position appropriating huge sum temporarily prop obama health care law angering conservative voter demanding law year twist donald trump administration worried preserving executive branch prerogative choose fight republican ally house central question dispute eager avoid ugly political pileup republican capitol hill trump transition team gaming handle lawsuit election limbo late february united state court appeal district co

In [5]:
# Split every cleaned article back into its list of tokens.
corpus_list = [article.split() for article in corpus]

print(corpus_list[0][:10])

['washington', 'congressional', 'republican', 'fear', 'health', 'care', 'lawsuit', 'obama', 'administration', 'win']


# Automated Keyword Extraction

> Automatic identification of terms that best describe the subject of a document. Key phrases, key terms, key segments or just keywords are the terminology which is used for defining the terms that represent the most relevant information contained in the document.

Method used: TfidfVectorizer

Reference: http://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.XLX1EUPhXeQ

## TfidfVectorizer Overview

`TfidfVectorizer` is used to compute the word counts or term frequency (TF), inverse document frequency (IDF), and the product of the two, TF-IDF values, all at once.

### Term Frequency (TF)

Term Frequency (TF) gives us the frequency of the word in each document in the corpus. It is defined as the ratio of the number of times the word appears in a document compared to the total number of words in that document. The formula for TF is:

$$
\text{TF}(word) = \frac{\text{Number of times the word appears in a document}}{\text{Total number of words in the document}} = tf_{i,j} = \frac{n_{i,j}}{\sum_k n_{k,j}}
$$

TF increases as the number of occurrences of that word within the document increases, summarizing how often a given word appears within a document. Each document has its own TF.

### Inverse Document Frequency (IDF)

Inverse Document Frequency (IDF) measures how rare a word is across all documents in the corpus, and thereby down-weights words that appear in many documents. The formula for IDF is:

$$
\text{IDF}(word) = \log\left(\frac{\text{Total number of documents}}{\text{Number of documents containing the word}}\right) = idf_i = \log\left(\frac{N}{df_i}\right)
$$

The lower the IDF value of a word, the less unique it is to any particular document. Words that occur rarely in the corpus have a high IDF score.

### TF-IDF Calculation

TF-IDF values are the product of TF and IDF values and highlight the words that are more interesting, i.e., frequent in a document but not across documents. The formula for calculating TF-IDF is:

$$
\text{TF-IDF}(word) = \text{TF}(word) \times \text{IDF}(word) = w_{i,j} = tf_{i,j} \times \log\left(\frac{N}{df_i}\right)
$$

where:
- $tf_{i,j}$ is the term frequency of term $i$ in document $j$ (its number of occurrences $n_{i,j}$ divided by the total number of words in the document),
- $df_i$ is the document frequency, the number of documents containing term $i$,
- $N$ is the total number of documents in the corpus.

`TfidfVectorizer` tokenizes documents, learns the vocabulary and inverse document frequency weightings, and allows encoding new documents. This emphasizes words that are more relevant to a specific document in the corpus, providing a way to extract meaningful words from texts.

### In simple terms

Term Frequency (TF) is like counting how many times each highlighted word appears in an article to understand its importance in that specific context.    

Inverse Document Frequency (IDF) helps to reduce the weight of words that appear too frequently across all articles, making common words less dominant in the analysis.    

Combining TF and IDF gives a score that helps to identify words that are important in an individual article but not just commonly used across all articles.    

By sorting words in each article based on their TF-IDF scores, the most defining terms of each article are identified, giving a quick glance at what the article is likely about.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix 

# TfidfVectorizer - Convert a collection of raw documents to a matrix of TF-IDF features.
# Note on the thresholds: scikit-learn reads an integer max_df/min_df as an absolute
# document count and a float as a proportion of documents, so max_df=1 means
# "at most one document" while min_df=0.05 means "at least 5% of the documents".
tv = TfidfVectorizer(
    max_df=1,
    min_df=0.05,
    stop_words=list(stop_words),
    max_features=10000,
    ngram_range=(1, 3),
    smooth_idf=True,
    use_idf=True,
)

# One list of keywords per article, filled in by the extraction loop.
kw = []

# Order a sparse vector's (column, value) pairs from highest to lowest value.
def sort_coo(coo_matrix):
    """Return the ``(col, data)`` pairs of ``coo_matrix`` in descending order.

    Pairs are ranked by value first and by column index second, both
    descending, so ties on value are broken by the larger column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

# Keep the first `topn` sorted entries and map each feature name to its rounded score.
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Build a ``{feature name: score}`` dict from the best-ranked entries.

    Parameters
    ----------
    feature_names : sequence indexable by the column indices in ``sorted_items``.
    sorted_items : list of ``(column index, score)`` pairs, best first.
    topn : number of leading pairs to keep (default 10).

    Returns
    -------
    dict whose insertion order follows ``sorted_items``; scores are rounded
    to three decimals.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }


# Extract up to 100 keywords for every article.
#
# The loop walks `corpus` directly rather than using the values of the CSV's
# 'index' column as list positions: those values only line up with positions
# when the column happens to run 0..n-1.
#
# NOTE(review): the vectorizer is re-fitted on each article in isolation, so every
# term has the same document frequency and the IDF factor is constant; the ranking
# is effectively normalised n-gram frequency within the article. Fit once on the
# whole corpus if cross-document weighting is intended.
kw = []
for document in corpus:
    if not document.strip():
        # Nothing left after preprocessing; fitting would fail on an empty vocabulary.
        kw.append([])
        continue

    # fit_transform - Learn vocabulary and idf, return document-term matrix.
    tfidf_matrix = tv.fit_transform([document])
    # get_feature_names_out - Get output feature names for transformation.
    feature_names = tv.get_feature_names_out()

    sorted_items = sort_coo(tfidf_matrix.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn=100)
    # keywords = {'republican': 0.296, 'house republican': 0.276, ...} <- for the first article
    #            {'precinct': 0.586, 'detective': 0.403, ...} <- for the second article

    kw.append(list(keywords.keys()))

df['keywords'] = kw
df[['id', 'title', 'content', 'keywords']].head()

Unnamed: 0,id,title,content,keywords
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr..."
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north..."


# Topic Modelling

> Statistical modeling for discovering the abstract “topics” that occur in a collection of documents

Methods used: LDA (Latent Dirichlet Allocation), NMF (Non-negative Matrix Factorization) and LSA (Latent Semantic Analysis)

Reference: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

## Latent Dirichlet Allocation (LDA)

Latent Dirichlet Allocation (LDA) aims to find topics that documents belong to, based on the words they contain. LDA assumes that documents with similar topics use a similar group of words, enabling the mapping of documents to a probability distribution over latent topics, with topics themselves being distributions over words.

### The Process

Let's suppose we have D documents using a vocabulary of V word types. Each document consists of N word tokens (documents can be truncated or padded to this length). We assume K topics, requiring a K-dimensional vector to represent the topic distribution for each document.

Each topic has a V-dimensional multinomial $\varphi_k$ over words, with a common symmetric Dirichlet prior $\beta$.

For each topic ( k ), where ( k = 1...K ):

1. Draw a multinomial over words, ( $\varphi_k$ ), from a Dirichlet distribution with parameter ( $\beta$ ):
 
$$
\varphi_k \sim \text{Dir}(\beta)
$$

For each document ( d ), where ( d = 1...D ):

1. Draw a multinomial over topics, ( $\theta_d$ ), from a Dirichlet distribution with parameter ( $\alpha$ ):

$$
\theta_d \sim \text{Dir}(\alpha)
$$

2. For each word ( $w_{N_d}$ ) in document ( d ):
    - Draw a topic ( $Z_{N_d}$ ) from a Multinomial distribution parameterized by ( $\theta_d$ ):

        $$
        Z_{N_d} \sim \text{Mult}(\theta_d)
        $$

        with $Z_{N_d} \in \{1, \dots, K\}$
        

    - Given topic ( $Z_{N_d}$ ), draw a word ( $w_{N_d}$ ) from a Multinomial distribution parameterized by ( $\varphi_{Z_{N_d}}$ ):

$$
w_{N_d} \sim \text{Mult}(\varphi_{Z_{N_d}})
$$


The above steps describe the generative process for LDA, where documents are represented as mixtures of topics, each of which is characterized by a distribution over words.

### In simpler terms

LDA starts with the idea that each article can cover multiple topics to some degree. For example, an article might mostly be about Technology, but it also mentions some Political aspects.  

It reads through all the articles and notices that certain words tend to appear together often. Words like "server," "software," and "programming" might cluster together, suggesting they form a "Technology" topic.    

For each article, LDA decides the mix of topics it contains. So, it might determine that one particular article is 60% about Technology, 30% about Business, and 10% about Education, based on the words it uses and their frequencies.    

The goal for LDA is to be able to recreate the list of articles using the topics it has identified. It adjusts the topics and their distributions within each article until it can do this as accurately as possible.    

LDA is a tool for topic modeling rather than document categorization. It assumes that a document is a mixture of topics, and a topic is a mixture of words. This means that LDA doesn't sort entire documents into one category. Instead, it breaks down each document into a blend of various topics. Each topic is characterized by a distribution of words that are likely to be found together.    

Topic modeling with LDA is about discovering the hidden thematic structure within documents. LDA tells us the degree to which certain themes are present in each document, without necessarily putting a single label on the document as a whole.

Reference: [GeeksForGeeks: Latent Dirichlet Allocation](https://www.geeksforgeeks.org/latent-dirichlet-allocation/)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 5
n_top_words = 10

# Explicit object dtype so that individual cells can hold a list of words.
df['lda_topics'] = pd.Series('', index=df.index, dtype=object)

count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = count_vectorizer.fit_transform(corpus)

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10, learning_method='online', random_state=0)
lda.fit(dtm)

# NOTE(review): the top words of topic k are stored in the row labelled k, so the
# first n_topics rows list the corpus-wide topics; they are not the topic of the
# article in that row. All other rows keep the empty string.
count_feature_names = count_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so the reversed tail holds the n_top_words largest weights.
    top_words = [count_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    # Single-step .at assignment: chained `df.col[label] = ...` writes to a
    # temporary object and is not guaranteed to reach `df`.
    df.at[topic_idx, 'lda_topics'] = top_words

df[['id', 'title', 'keywords', 'lda_topics']].head()

Unnamed: 0,id,title,keywords,lda_topics
0,17283,House Republicans Fret About Winning Their Hea...,"[house, republican, administration, health, tr...","[republican, health, care, democrat, senate, l..."
1,17284,Rift Between Officers and Residents as Killing...,"[precinct, detective, police, officer, year, f...","[time, year, people, trump, woman, day, family..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","[wong, artist, disney, father, work, tyrus, ch...","[year, city, time, people, film, street, york,..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","[death, died, year, time, people, star, music,...","[trump, state, president, united, year, countr..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"[north, korea, missile, ballistic, test, north...","[china, chinese, beijing, xi, kong, hong, jamm..."


## Non-negative Matrix Factorization (NMF)

Non-negative Matrix Factorization (NMF) is a linear-algebraic model that reduces the dimensionality of a vector for simpler representation. This method is particularly useful when there are many ambiguous features or if the features have weak predictability. The workings of NMF are explained below.

### The Process

Given an original matrix $A$, we can obtain two non-negative matrices $W$ and $H$ such that $A \approx WH$. NMF has an inherent clustering property, such that $W$ and $H$ represent the following information about $A$:

- $A$ (Document-word matrix) - input that contains which words appear in which documents.
- $W$ (Coefficient matrix) - the membership weights for the topics in each document (one row per document).
- $H$ (Basis vectors) - the topics (clusters) discovered from the documents, each row giving the word weights of one topic; scikit-learn exposes this matrix as `components_`.

We calculate $W$ and $H$ by optimizing over an objective function (like the EM algorithm), updating both $W$ and $H$ iteratively until convergence.

$$
\frac{1}{2} \| A - WH \|_F^2 = \frac{1}{2} \sum_{i=1}^{n} \sum_{j=1}^{m} (A_{ij} - (WH)_{ij})^2
$$

In this function:
- ( $\| A - WH \|_F^2$ ) represents the Frobenius norm of the matrix difference between A and the product WH, squared.
- A is the original non-negative matrix we want to decompose.
- W and H are the matrices we're solving for, representing the basis and coefficient matrices, respectively.
- n and m are the dimensions of A, with i indexing rows and j indexing columns.
- ( $A_{ij}$ ) is the element of matrix A at row i and column j.
- ( $(WH)_{ij}$ ) is the element of the product matrix WH at row i and column j.

In the objective function for NMF, we measure the error of reconstruction between $A$ and the product of its 
factors $W$ and $H$, based on Euclidean Distance.

$$
W_{ic} \leftarrow W_{ic} \frac{(AH^T)_{ic}}{(WHH^T)_{ic}}
$$

$$
H_{cj} \leftarrow H_{cj} \frac{(W^TA)_{cj}}{(W^TWH)_{cj}}
$$

In these rules:
- ( $W_{ic}$ ) and ( $H_{cj}$ ) are the elements of matrices W and H at row i, column c, and row c, column j, respectively.
- $AH^T$, $WHH^T$, $W^TA$, and $W^TWH$ are ordinary matrix products of A, W, H and their transposes; the division and the multiplication by the current value are applied element-wise.
- The left arrow $\leftarrow$ denotes the updating of the matrix elements with the new values after each iteration.

Using the objective function, the update rules for W and H can be derived. The updated values are calculated in parallel operations, and using the new W and H, we re-calculate the reconstruction error, repeating this process until convergence.

### In Simpler Terms

NMF starts by looking for patterns in the usage of words across all the articles. It tries to figure out if certain words often appear together, which might suggest they are part of a common topic. For instance, words like "election," "vote," and "campaign" often appear together and might form a "Politics" topic.

It then makes a list of topics (though it doesn't know what to call them yet) based on these patterns. Each topic is a mix of words with certain weights. In the "Politics" topic, words like "election" might have a high weight, while unrelated words like "tennis" would have a very low or zero weight.

Now, NMF takes each article and breaks it down into a mix of these topics. It figures out that an article might be mostly about "Politics," but also has a bit of "Economy" and "International Relations" in it.

The goal of NMF is to be able to recreate each article from the list of topics it has created. It does this by combining the topics in different proportions. NMF keeps adjusting the weights of words in topics and the mix of topics in articles until the recreated articles are as close as possible to the original ones.

In [8]:
from sklearn.decomposition import NMF

n_topics = 5
n_top_words = 10

# Explicit object dtype so that individual cells can hold a list of words.
df['nmf_topics'] = pd.Series('', index=df.index, dtype=object)

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)

# NOTE(review): the top words of topic k are stored in the row labelled k, so the
# first n_topics rows list the corpus-wide topics; they are not the topic of the
# article in that row. All other rows keep the empty string.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so the reversed tail holds the n_top_words largest weights.
    top_words = [tfidf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    # Single-step .at assignment: chained `df.col[label] = ...` writes to a
    # temporary object and is not guaranteed to reach `df`.
    df.at[topic_idx, 'nmf_topics'] = top_words

df[['id', 'title', 'keywords', 'nmf_topics']].head()

Unnamed: 0,id,title,keywords,nmf_topics
0,17283,House Republicans Fret About Winning Their Hea...,"[house, republican, administration, health, tr...","[trump, president, russia, intelligence, russi..."
1,17284,Rift Between Officers and Residents as Killing...,"[precinct, detective, police, officer, year, f...","[woman, time, year, people, city, life, school..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","[wong, artist, disney, father, work, tyrus, ch...","[republican, health, senate, democrat, care, l..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","[death, died, year, time, people, star, music,...","[order, state, united, refugee, judge, court, ..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"[north, korea, missile, ballistic, test, north...","[china, trade, european, chinese, britain, com..."


## Latent Semantic Analysis (LSA) or Latent Semantic Indexing (LSI)


In LSA, also known as Latent Semantic Indexing (LSI), we start by creating a document-term matrix A of size ( m x n ), where m is the number of documents and n is the number of words in the vocabulary. This matrix is typically very sparse, meaning that a lot of the values in the matrix are zero, and noisy, which indicates the presence of less informative variations in the data.

### Dimensionality Reduction with Truncated SVD

To address the sparsity and noise, we apply dimensionality reduction, specifically using truncated Singular Value Decomposition (SVD). SVD decomposes the matrix A into three distinct matrices:

$$
A = U \cdot S \cdot V^T
$$

- S is a diagonal matrix containing the singular values of A. These values are arranged in descending order, and they represent the importance or 'strength' of each latent semantic factor.
- U is the document-topic matrix with each row representing a document as a combination of topics.
- $V^T$ is the transpose of $V$, the term-topic matrix in which each row represents a term as a combination of topics.

### Interpretation of the Factors

- The singular values in S give us insight into the structure of our data. By selecting a subset of the largest singular values and their corresponding vectors in U and V, we can approximate the original matrix A with a lower-dimensional representation.
- The matrices U and $V^T$ provide a new way to look at the documents and terms, respectively. Instead of seeing them in the high-dimensional space of terms or documents, they are now viewed in the lower-dimensional space of topics, which can be more informative and less redundant.

In practice, LSA using truncated SVD helps to uncover the latent thematic structures in the text data, thereby improving the efficiency of information retrieval, text mining, and other natural language processing tasks.

### In simple terms

LSA looks at all the words in the articles and counts how often each word appears in each article. Then, it starts to analyze these counts to find patterns. For example, if the word "apple" often appears with words like "pie", "crust", and "bake," it might decide that these words are part of a "Baking" topic. It does this for all the words, grouping them into topics based on how often they appear together across different articles. It removes the less helpful words and simplifies the topics, making it easier to see the major themes.



In [9]:
# LSA: Similar to NMF, but it uses a method called Singular Value Decomposition (SVD) to reduce the complexity of the data, making it easier to analyze the relationships between words and topics.
from sklearn.decomposition import TruncatedSVD

n_topics = 5
n_top_words = 10

# Explicit object dtype so that individual cells can hold a list of words.
df['lsa_topics'] = pd.Series('', index=df.index, dtype=object)

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# A fixed seed keeps the randomized SVD solver reproducible between runs,
# in line with the seeded LDA and NMF models.
lsa = TruncatedSVD(n_components=n_topics, random_state=0)
lsa.fit(tfidf)

# NOTE(review): the top words of component k are stored in the row labelled k, so
# the first n_topics rows list the corpus-wide components; they are not the topic
# of the article in that row. All other rows keep the empty string.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lsa.components_):
    # argsort is ascending, so the reversed tail holds the n_top_words largest loadings.
    top_words = [tfidf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    # Single-step .at assignment: chained `df.col[label] = ...` writes to a
    # temporary object and is not guaranteed to reach `df`.
    df.at[topic_idx, 'lsa_topics'] = top_words

df[['id', 'title', 'keywords', 'lsa_topics']].head()

Unnamed: 0,id,title,keywords,lsa_topics
0,17283,House Republicans Fret About Winning Their Hea...,"[house, republican, administration, health, tr...","[trump, president, state, republican, united, ..."
1,17284,Rift Between Officers and Residents as Killing...,"[precinct, detective, police, officer, year, f...","[trump, republican, democrat, senate, senator,..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","[wong, artist, disney, father, work, tyrus, ch...","[republican, health, care, senate, law, democr..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","[death, died, year, time, people, star, music,...","[order, state, judge, united, refugee, court, ..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"[north, korea, missile, ballistic, test, north...","[china, trade, chinese, european, britain, com..."


In [10]:
# Side-by-side view of the keyword list and the topic columns produced by
# the three topic modelling methods (LDA, NMF and LSA).
comparison_columns = ['title', 'keywords', 'lda_topics', 'nmf_topics', 'lsa_topics']
comparison_preview = df[comparison_columns].head()

# Lift the column-width cap so the long lists are shown in full.
with pd.option_context('display.max_colwidth', None):
    display(comparison_preview)


Unnamed: 0,title,keywords,lda_topics,nmf_topics,lsa_topics
0,House Republicans Fret About Winning Their Health Care Suit - The New York Times,"[house, republican, administration, health, trump, house republican, care, subsidy, health care, trump administration, spending, obama, insurance, executive branch, executive, congress, branch, law, health insurance, case, will, white house, white, transition, team, sue, standing, required, program, power, obama administration, matter, lawyer, judge, fight, constitution, congressional, collyer, appeal, well, view health care, view health, view, victory, trump transition, transition team, time, suit, sue executive branch, sue executive, standing sue, spending power, sought, set, receive, quick, provide, precedent, political, month, money, loss, legal, lead, lawsuit, judge collyer, issue, insurer, insurance subsidy, initially, health insurance subsidy, health care program, health care law, fight house, executive branch spending, dispute, department, decision, court, cost, choose, care program, care law, branch spending, billion, authority, year twist donald, year twist, year, worried preserving executive, worried preserving, worried, won house republican, won house, won, win incoming trump, win incoming, win, will provide future, will provide]","[republican, health, care, democrat, senate, law, senator, dr, congress, house]","[trump, president, russia, intelligence, russian, white, obama, news, house, spicer]","[trump, president, state, republican, united, order, people, year, obama, house]"
1,Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times,"[precinct, detective, police, officer, year, fernandez, city, bronx, crime, people, case, murder, squad, department, manhattan, drug, complaint, call, victim, percent, neighborhood, help, door, precinct detective, men, violent, house, community, young, work, report, homicide, gang, family, young men, york, time, street, sergeant, hallway, going, felony, day, witness, three, substation, staffing, shooting, robbery, precinct year, police department, lopuzzo, long, east, detective precinct, daughter, brooklyn, arrest, apartment, wife, violent crime, unit, summer, south bronx, south, son, side, sergeant lopuzzo, resident, rate, queen, policing, note, lower, island, investigator, inspector, gun, building, betances house, betances, answer, will, wanted, violence, video, threat, station house, station, staten island, staten, school, resource, protect, problem, park, office, number, mother, month]","[time, year, people, trump, woman, day, family, president, life, school]","[woman, time, year, people, city, life, school, family, film, black]","[trump, republican, democrat, senate, senator, president, house, health, nominee, care]"
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial Bias, Dies at 106 - The New York Times","[wong, artist, disney, father, work, tyrus, chinese, year, art, yeo, studio, gen yeo, gen, bambi, state, island, film, united state, united, paper, son, painting, museum, los angeles, los, life, landscape, immigration, drawing, canemaker, animation, angeles, war, school, san francisco, san, painter, mother, immigrant, francisco, family, child, background, angel island, angel, yow, young, worked, water, warner brother, warner, walt disney, walt, village, tai yow, tai, sky, retrospective, relative, recognition, recalled, quality, public, paint, otis, opened, month, long, lived, joined, interview, institute, house, hollywood studio, hollywood, group, encouraged, death, chinese immigrant, brother, book, bambi wong, arrival, animated, afterward, yeo father, working, wong recalled tyrus, wong recalled, window, well, water paper paint, water paper, united state citizen, tyrus wong, tree, traveled, trained, tom, time]","[year, city, time, people, film, street, york, food, water, neanderthal]","[republican, health, senate, democrat, care, law, senator, insurance, repeal, congress]","[republican, health, care, senate, law, democrat, insurance, senator, repeal, affordable]"
3,"Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times","[death, died, year, time, people, star, music, life, day, war, career, woman, national, man, led, league, game, stage, question, prime, pop, palmer, left, john, great, giant, george, generation, figure, era, coming, blue, alan, ago, zsa, younger, young, york, william, white, voice, vietnam war, vietnam, tv, three, television, tale, student, sport, son, snape, scene, saved, russell, roster, ring, reynolds, revolution, ralph, public, prince, prime time, practically, power, pop music, phyllis, photographer, paul, patty duke, patty, pat, news, nazi, movie, month, memory, major league, major, lost, leading, lake, illness, hung, hockey, hand, hall fame, hall, growing, gratitude, glenn, garry, french, fisher, fan, fame, executive, entire, emerson, early, duke]","[trump, state, president, united, year, country, american, government, company, official]","[order, state, united, refugee, judge, court, country, iran, ban, visa]","[order, state, judge, united, refugee, court, law, iran, visa, country]"
4,Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times,"[north, korea, missile, ballistic, test, north korea, kim, ballistic missile, south, rocket, nuclear, united, south korea, intercontinental ballistic, intercontinental, year, united state, trump, state, country, weapon, warhead, speech, intercontinental ballistic missile, icbm, technology, sunday, region, reach, program, nuclear warhead, month, launch, conduct, analyst region, analyst, will, time, testing, test trump, test coming month, test coming, success testing, success, speech kim, satellite, sanction north, sanction, rocket launch, range, preparation, powerful nuclear, powerful, official, nuclear weapon, nuclear test, military, long, intercontinental ballistic rocket, including, inauguration, ground test, ground, final, engine, earth, conducted, conduct test, coming month, coming, claimed, cheong, ballistic rocket, year speech kim, year speech, year increasingly harsh, year increasingly, year day speech, year day, year country cleared, year country, year complete icbm, year complete, will test trump, will test, will sworn jan, will sworn, weapon test coming, weapon test, weapon program long, weapon program, weapon country intercontinental, weapon country, weapon ballistic missile, weapon ballistic, warhead united state, warhead united, warhead small fit, warhead small, warhead pound reach]","[china, chinese, beijing, xi, kong, hong, jammeh, prison, battery, taiwan]","[china, trade, european, chinese, britain, company, united, state, union, trump]","[china, trade, chinese, european, britain, company, health, tax, market, mexico]"


!!! SOMETHING IS WRONG WITH TOPIC MODELLING OUTPUT !!!

Maybe because we originally used SMART information retrieval's stopwords list which gave us this output-

Using LDA:    
Topic 0:    
republican team transition branch executive law    
Topic 1:    
trump administration transition office obama house    
Topic 2:    
house obama white court legal state    
Topic 3:    
court appeal state legal office white    
Topic 4:    
care health insurance law appeal republican    

Using NMF:    
Topic 0:    
care health law insurance house appeal    
Topic 1:    
house republican white trump congressional appeal    
Topic 2:    
court state appeal white house branch    
Topic 3:    
trump transition team office administration health    
Topic 4:    
obama white administration team health appeal    

Using LSA/SVD:    
Topic 0:    
care health law insurance obama transition    
Topic 1:    
house trump white obama transition republican     
Topic 2:    
court state appeal congressional obama team     
Topic 3:    
trump transition team office administration law    
Topic 4:    
obama administration white transition team law    

# Document Classification

> Assign a document to one or more classes or categories

Methods Used: Jaccard Similarity & Cosine Similarity

Reference: https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50

In [11]:
def file_to_list(file):
    """Read a word-list file from ./data and return its lines as a list.

    Parameters
    ----------
    file : str
        File name relative to the ``./data/`` directory.

    Returns
    -------
    list of str
        One entry per line of the file, with the trailing newline removed.
    """
    path = './data/' + file
    # The context manager closes the handle as soon as the lines are read;
    # a bare open() inside the comprehension left it to the garbage collector.
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]

# Reference word lists, one per category, loaded in the order
# politics, business, entertainment, sports, technology.
p_list, b_list, e_list, s_list, t_list = [
    file_to_list(category_file)
    for category_file in ('politics', 'business', 'entertainment', 'sports', 'technology')
]

def file_to_str(file):
    """Read a word-list file from ./data and return it as one space-joined string.

    Parameters
    ----------
    file : str
        File name relative to the ``./data/`` directory.

    Returns
    -------
    str
        All lines of the file, stripped of their trailing newline and joined
        with single spaces.
    """
    path = './data/' + file
    # The context manager closes the handle as soon as the lines are read;
    # a bare open() inside the comprehension left it to the garbage collector.
    with open(path) as handle:
        return ' '.join(line.rstrip('\n') for line in handle)

# The same category files, each flattened into a single space-joined string
# (read in the order business, politics, entertainment, sports, technology).
b_str, p_str, e_str, s_str, t_str = [
    file_to_str(category_file)
    for category_file in ('business', 'politics', 'entertainment', 'sports', 'technology')
]

## Jaccard Similarity

The Jaccard similarity measures the similarity between two sets of data to see which members are shared and distinct. The Jaccard similarity is calculated by dividing the number of observations in both sets by the number of observations in either set. In other words, the Jaccard similarity can be computed as the size of the intersection divided by the size of the union of two sets. This can be written in set notation using the intersection $(A \cap B)$ and the union $(A \cup B)$ of two sets:

$$
J(A, B) = \frac{|A \cap B|}{|A \cup B|}
$$

where $|A \cap B|$ gives the number of members shared between both sets and $|A \cup B|$ gives the total number of members in both sets (shared and un-shared). The Jaccard Similarity will be 0 if the two sets don't share any values and 1 if the two sets are identical. The set may contain either numerical values or strings.

Additionally, this function can be used to find the dissimilarity between two sets by calculating $d(A,B) = 1 - J(A,B)$.


Reference: [Jaccard Similarity](https://www.learndatasci.com/glossary/jaccard-similarity/)

In [12]:
# Placeholder column for the Jaccard-based category; the loop further down in this cell fills it in.
df['category_js'] = ''

def jaccard_sim(str1, str2):
    """Return the Jaccard similarity of two collections of tokens.

    Both arguments are converted to sets, so duplicates are ignored. Pass
    iterables of tokens (e.g. lists of keywords): a plain string would be
    split into its individual characters by ``set()``.

    Parameters
    ----------
    str1, str2 : iterable of hashable
        The two collections to compare.

    Returns
    -------
    float
        Size of the intersection divided by the size of the union, in
        [0.0, 1.0]. Returns 0.0 when both collections are empty, instead of
        raising ``ZeroDivisionError``.
    """
    a = set(str1)
    b = set(str2)
    union_size = len(a | b)
    if union_size == 0:
        # Nothing to compare: treat two empty collections as "no overlap".
        return 0.0
    return len(a & b) / float(union_size)
     
# Category label -> reference word list. The order matters: when several
# categories tie for the best score, the first one listed here wins.
jaccard_categories = [
    ('politics', p_list),
    ('business', b_list),
    ('entertainment', e_list),
    ('sports', s_list),
    ('tech', t_list),
]


def best_jaccard_category(keywords):
    """Return the label of the category whose word list is most Jaccard-similar to `keywords`."""
    scores = [jaccard_sim(keywords, words) for _, words in jaccard_categories]
    # list.index returns the first position of the maximum, which gives the
    # politics > business > entertainment > sports > tech tie-break order.
    return jaccard_categories[scores.index(max(scores))][0]


# Assign the whole column in one step. Writing cell by cell through
# `df['category_js'][x] = ...` is chained assignment, which updates a temporary
# object under pandas Copy-on-Write and would leave `df` unchanged.
df['category_js'] = df['keywords'].apply(best_jaccard_category)

df[['title', 'content', 'keywords', 'lda_topics', 'nmf_topics', 'lsa_topics', 'category_js']].head()

Unnamed: 0,title,content,keywords,lda_topics,nmf_topics,lsa_topics,category_js
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr...","[republican, health, care, democrat, senate, l...","[trump, president, russia, intelligence, russi...","[trump, president, state, republican, united, ...",politics
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f...","[time, year, people, trump, woman, day, family...","[woman, time, year, people, city, life, school...","[trump, republican, democrat, senate, senator,...",politics
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch...","[year, city, time, people, film, street, york,...","[republican, health, senate, democrat, care, l...","[republican, health, care, senate, law, democr...",entertainment
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,...","[trump, state, president, united, year, countr...","[order, state, united, refugee, judge, court, ...","[order, state, judge, united, refugee, court, ...",entertainment
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north...","[china, chinese, beijing, xi, kong, hong, jamm...","[china, trade, european, chinese, britain, com...","[china, trade, chinese, european, britain, com...",politics


## Cosine Similarity

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them.

The equation for Cosine Similarity for two vectors $A$ and $B$ is given by:

$$
\text{similarity} = \cos(\theta) = \frac{A \cdot B}{\|A\|\|B\|} = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^2} \sqrt{\sum_{i=1}^{n} B_i^2}},
$$

where:
- $A \cdot B$ is the dot product of the vectors A and B,
- $\|A\|$ and $\|B\|$ are the Euclidean norms (or magnitudes) of the vectors A and B,
- $A_i$ and $B_i$ are the components of vectors A and B respectively.

This metric is particularly useful in various applications such as text analysis and information retrieval.


To calculate cosine similarity, sentences need to be converted into vectors. This can be done using the bag of words model with either TF (term frequency) or TF-IDF (term frequency-inverse document frequency). TF is suitable for general text similarity, while TF-IDF is beneficial for search query relevance.

As mentioned earlier, the TF-IDF value increases proportionally to the number of times a word appears in the document, offset by the frequency of the word across the dataset. The calculated TF-IDF values are then normalized by the Euclidean norm to ensure that each row vector has a length of 1.

The resulting normalized TF-IDF matrix should have the shape of ( n x m ). A cosine similarity matrix ( n x n ) can then be obtained by multiplying the TF-IDF matrix by its transpose ( m x n ).

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Labels and reference texts are parallel lists: categories[k] describes category_strs[k].
categories = ['politics', 'business', 'entertainment', 'sports', 'tech']
category_strs = [p_str, b_str, e_str, s_str, t_str]

# Each article is represented by its keywords joined into one string.
keyword_docs = df['keywords'].apply(' '.join).tolist()

# Fit on articles and category texts together so both share one vocabulary.
vectorizer = TfidfVectorizer()
vectorizer.fit(keyword_docs + category_strs)

category_vectors = vectorizer.transform(category_strs)
doc_vectors = vectorizer.transform(keyword_docs)

# One (n_articles x n_categories) similarity matrix instead of one call per row.
similarity_matrix = cosine_similarity(doc_vectors, category_vectors)
best_category_idx = similarity_matrix.argmax(axis=1)

# Assign the column positionally, row by row in order. The earlier
# `df.at[i, ...]` used the enumerate counter as a row label, which is only
# correct while the index is exactly 0..n-1.
df['category_cs'] = [categories[k] for k in best_category_idx]

df[['title', 'content', 'keywords', 'lda_topics', 'nmf_topics', 'lsa_topics', 'category_js', 'category_cs']].head()


Unnamed: 0,title,content,keywords,lda_topics,nmf_topics,lsa_topics,category_js,category_cs
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr...","[republican, health, care, democrat, senate, l...","[trump, president, russia, intelligence, russi...","[trump, president, state, republican, united, ...",politics,politics
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f...","[time, year, people, trump, woman, day, family...","[woman, time, year, people, city, life, school...","[trump, republican, democrat, senate, senator,...",politics,tech
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch...","[year, city, time, people, film, street, york,...","[republican, health, senate, democrat, care, l...","[republican, health, care, senate, law, democr...",entertainment,entertainment
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,...","[trump, state, president, united, year, countr...","[order, state, united, refugee, judge, court, ...","[order, state, judge, united, refugee, court, ...",entertainment,entertainment
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north...","[china, chinese, beijing, xi, kong, hong, jamm...","[china, trade, european, chinese, britain, com...","[china, trade, chinese, european, britain, com...",politics,tech


In [14]:
# Articles on which the Jaccard-based and cosine-based classifiers disagree.
disagreement_mask = df['category_js'] != df['category_cs']
mismatches = df[disagreement_mask]
print("Total articles mapped to differnt categories with the 2 methods:", len(mismatches))

review_columns = ['title', 'author', 'keywords', 'lda_topics', 'nmf_topics', 'lsa_topics', 'category_js', 'category_cs']

# Lift the column-width cap so the long keyword lists are shown in full.
with pd.option_context('display.max_colwidth', None):
    display(mismatches[review_columns].head())

Total articles mapped to differnt categories with the 2 methods: 533


Unnamed: 0,title,author,keywords,lda_topics,nmf_topics,lsa_topics,category_js,category_cs
1,Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times,Benjamin Mueller and Al Baker,"[precinct, detective, police, officer, year, fernandez, city, bronx, crime, people, case, murder, squad, department, manhattan, drug, complaint, call, victim, percent, neighborhood, help, door, precinct detective, men, violent, house, community, young, work, report, homicide, gang, family, young men, york, time, street, sergeant, hallway, going, felony, day, witness, three, substation, staffing, shooting, robbery, precinct year, police department, lopuzzo, long, east, detective precinct, daughter, brooklyn, arrest, apartment, wife, violent crime, unit, summer, south bronx, south, son, side, sergeant lopuzzo, resident, rate, queen, policing, note, lower, island, investigator, inspector, gun, building, betances house, betances, answer, will, wanted, violence, video, threat, station house, station, staten island, staten, school, resource, protect, problem, park, office, number, mother, month]","[time, year, people, trump, woman, day, family, president, life, school]","[woman, time, year, people, city, life, school, family, film, black]","[trump, republican, democrat, senate, senator, president, house, health, nominee, care]",politics,tech
4,Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times,Choe Sang-Hun,"[north, korea, missile, ballistic, test, north korea, kim, ballistic missile, south, rocket, nuclear, united, south korea, intercontinental ballistic, intercontinental, year, united state, trump, state, country, weapon, warhead, speech, intercontinental ballistic missile, icbm, technology, sunday, region, reach, program, nuclear warhead, month, launch, conduct, analyst region, analyst, will, time, testing, test trump, test coming month, test coming, success testing, success, speech kim, satellite, sanction north, sanction, rocket launch, range, preparation, powerful nuclear, powerful, official, nuclear weapon, nuclear test, military, long, intercontinental ballistic rocket, including, inauguration, ground test, ground, final, engine, earth, conducted, conduct test, coming month, coming, claimed, cheong, ballistic rocket, year speech kim, year speech, year increasingly harsh, year increasingly, year day speech, year day, year country cleared, year country, year complete icbm, year complete, will test trump, will test, will sworn jan, will sworn, weapon test coming, weapon test, weapon program long, weapon program, weapon country intercontinental, weapon country, weapon ballistic missile, weapon ballistic, warhead united state, warhead united, warhead small fit, warhead small, warhead pound reach]","[china, chinese, beijing, xi, kong, hong, jammeh, prison, battery, taiwan]","[china, trade, european, chinese, britain, company, united, state, union, trump]","[china, trade, chinese, european, britain, company, health, tax, market, mexico]",politics,tech
6,Taiwan’s President Accuses China of Renewed Intimidation - The New York Times,Javier C. Hernández,"[tsai, taiwan, china, beijing, trump, island, will, relation, military, washington, visit, united state, united, state, policy, face, asia, washington tsai, transit, study, step, sought, sending, region, pressure, people, path, independence, glaser, diplomatic, criticized china, criticized, confrontation, center, avoid, american, ally, administration, wrote email tsai, wrote email, wrote, will island bargaining, will island, will exercise restraint, will exercise, will course revert, will course, will bow pressure, will bow, west coast africa, west coast, west, well incoming trump, well incoming, well, week china stepped, week china, week, weapon island long, weapon island, weapon, water island dispatching, water island, water, washington tsai understood, washington tsai reassure, washington beijing decade, washington beijing, warning visit embolden, warning visit, warning, vowed avoid confrontation, vowed avoid, vowed, visit unofficial transit, visit unofficial, visit embolden independence, visit embolden, visit central america, visit central, vision align well, vision align, vision, unofficial transit transit, unofficial transit, unofficial, united state taiwan, united state domestic, united state block, understood maintain balance, understood maintain, understood, underpinned relation washington, underpinned relation, underpinned, tsai vowed avoid, tsai vowed, tsai vision align, tsai vision, tsai understood maintain]",,,,politics,sports
7,"After ‘The Biggest Loser,’ Their Bodies Fought to Regain Weight - The New York Times",Gina Kolata,"[weight, calorie, pound, people, contestant, dr, metabolism, year, day, eat, body, obesity, cahill, researcher, lose, weight loss, loss, leptin, keep, maintain, lost, study, hall, drug, calorie day, will, started, size, season, normal, loser, lose weight, level, keep weight, exercise, dr hall, biggest loser, biggest, weight pound, time, rest, maintain weight, hunger, hour, hormone, fat, control, began, work, week, struggle, small, regained, number, medical, maintain weight loss, life, help, hard, group, finding, fight, ended, diet, diabetes, burning, biology, amount weight, amount, weight gain, weight cahill, urge eat, urge, university, treat, three, slower, slowed, scientist, scale, rosenbaum, regain, reason, reality, project, proietto, problem, person, people lose, obesity researcher, national, month, minute, metabolism slowed, metabolic, measure, lost weight, looked, long, lack]",,,,tech,business
9,Calling on Angels While Enduring the Trials of Job - The New York Times,Andy Newman,"[mu, mu iz, iz, family, angel, jos, time, child, zoraida, york, thing, rent, life, jr, jos jr, jesus, father, son, care, cancer, apartment, work, time neediest case, time neediest, summer, started, sr, son jesus, sibling, radiation, program, painting, paid, neediest case fund, neediest case, neediest, mu iz child, mother, maria, lower jaw, lower, jos sr, jaw, iz child, house, gift, fund, food, family paid, depression, depressed, college, case fund, case, built, bronx, bill, began, zoraida severely depressed, zoraida severely, zoraida mu iz, zoraida mu, zoraida grandfather father, zoraida grandfather, zoraida early life, zoraida early, zaro bakery manhattan, zaro bakery, zaro, younger son jesus, younger son, younger, york time neediest, york time, york started life, york started, york organization supported, york organization, york film academy, york film, year time family, year time, year, working care determined, working care, working, worked construction skill, worked construction, worked, work stopped working, work stopped, work save mu, work save, withdrew support zoraida, withdrew support, withdrew, will radiation work, will radiation, will, westchester avenue bronx]",,,,business,entertainment
