# Imports

In [1]:
import json
import math
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Load the English stopwords.
# The corpus reader is imported under a private alias so that re-running this
# cell does not call .words() on the already-rebound `stopwords` name, and the
# words are stored in a frozenset so the `word not in stopwords` test in
# preprocess_text is a constant-time lookup instead of a list scan.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = frozenset(_stopwords_corpus.words("english"))

# Part 1 - Analysis

## a. News Headlines Text Processing

### Load the data

In [3]:
data_file_path = 'NewsCategoryDataset_2017_2022.json'
# The file holds one JSON object per line, so each line is parsed on its own.
# Blank lines (including trailing newlines at the end of the file) are skipped
# rather than being turned into empty array elements.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm for this dataset.
with open(data_file_path, 'r', encoding='utf-8') as file:
    json_records = [json.loads(line) for line in file if line.strip()]
print('Number of records: ', len(json_records))

Number of records:  47146


### Extract texts from records

In [4]:
# Collect the headline, short description and category of every record
headlines, short_descriptions, categories = [], [], []
for record in json_records:
    headlines.append(record['headline'])
    short_descriptions.append(record['short_description'])
    categories.append(record['category'])
# Combined text: headline and short description separated by a single space
headline_desc = [' '.join((head, desc)) for head, desc in zip(headlines, short_descriptions)]

### Preprocess texts
This function takes a text and returns a list of stemmed tokens after lowercasing the text and removing stopwords, punctuation, and numbers.

In [5]:
def preprocess_text(text):
    """Return the list of stemmed tokens of *text*.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (punctuation, numbers) or that appear in the
    module-level ``stopwords`` collection are dropped, and the remaining
    tokens are stemmed with a Porter stemmer.
    """
    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / numbers and stopwords
        if not token.isalpha() or token in stopwords:
            continue
        stems.append(porter.stem(token))
    return stems
 

In [6]:
# Run the preprocessing over every headline, every short description,
# and every combined headline + description text
headlines_processed = list(map(preprocess_text, headlines))
short_descriptions_processed = list(map(preprocess_text, short_descriptions))
headline_desc_processed = list(map(preprocess_text, headline_desc))

## b. Terms weights using TF-IDF

### Term Frequency (TF)
This function takes a document and returns a dictionary of terms and their normalized frequencies in the document

In [7]:
def doc_term_freq(doc):
    """Return a Counter mapping each term of *doc* to its normalized frequency.

    The raw count of a term is divided by the product of the highest raw
    count in the document and the number of tokens in the document.
    An empty document yields an empty Counter.
    """
    raw_counts = Counter(doc)
    # Nothing to normalize for an empty document
    if not raw_counts:
        return raw_counts
    # Common denominator: highest term count times document length
    scale = max(raw_counts.values()) * len(doc)
    return Counter({term: count / scale for term, count in raw_counts.items()})

### Inverse Document Frequency (IDF)
This function takes a list of documents and a set of terms, and returns a dictionary mapping each term to its IDF value.

In [8]:
def inverse_doc_freq(docs, term_set):
    """Return a dict mapping every term of *term_set* to its IDF value.

    The IDF of a term is ``log(number of documents / number of documents
    containing the term)``.

    Args:
        docs: list of documents, each document being an iterable of terms.
        term_set: iterable of the terms to compute the IDF for.

    Returns:
        dict of term -> IDF value. A term that occurs in no document gets an
        IDF of 0.0 (it carries no weight) instead of raising
        ZeroDivisionError.
    """
    # Document frequency: each document counts at most once per term
    docs_freq = Counter(term for doc in docs for term in set(doc))
    docs_len = len(docs)
    idf_values = {}
    for term in term_set:
        containing = docs_freq[term]
        idf_values[term] = math.log(docs_len / containing) if containing else 0.0
    return idf_values

### TF-IDF matrix
This function takes a list of documents and returns a TF-IDF matrix

In [9]:
def tfidf_matrix(docs):
    """Return the TF-IDF matrix of *docs* as a dense DataFrame.

    Args:
        docs: list of documents, each document being a list of terms.

    Returns:
        pandas DataFrame with one row per document (index 0..len(docs)-1) and
        one column per distinct term; each cell holds the term's normalized
        frequency in the document (``doc_term_freq``) multiplied by the
        term's IDF (``inverse_doc_freq``), and 0.0 when the term does not
        occur in the document.
    """
    # Get the set of terms in all documents and fix a column position for each
    term_set = set(term for doc in docs for term in doc)
    terms = list(term_set)
    column_of = {term: col for col, term in enumerate(terms)}
    # Compute IDF for each term
    idf_values = inverse_doc_freq(docs, term_set)
    # Fill only the cells of terms that actually occur in each document,
    # instead of scanning every document once per vocabulary term
    weights = np.zeros((len(docs), len(terms)), dtype=float)
    for row, doc in enumerate(docs):
        for term, tf in doc_term_freq(doc).items():
            weights[row, column_of[term]] = tf * idf_values[term]
    return pd.DataFrame(weights, index=range(len(docs)), columns=terms)

In [None]:
# Compute TF-IDF matrix for the headlines and short descriptions combined texts
# (one row per document, one column per term, as built by tfidf_matrix)
headlines_desc_tfidf = tfidf_matrix(headline_desc_processed)

In [54]:
# Export TF-IDF matrix to csv because it takes a long time to compute;
# later cells can then reload it from disk instead of recomputing it
headlines_desc_tfidf.to_csv('headlines_desc_tfidf.csv')

In [11]:
# Load TF-IDF matrix from csv
# index_col=0 uses the first CSV column as the row index instead of as a term column
headlines_desc_tfidf = pd.read_csv('headlines_desc_tfidf.csv', index_col=0)

### Highest weighted n% of the terms per document

In [13]:
# Extract top n% highest-weighted terms from a document (row), return a list of tuples (term, weight)
def extract_top_terms(row, percentage):
    """Return the highest-weighted terms of one document.

    Args:
        row: pandas Series of term weights for one document, indexed by term.
        percentage: share (in percent) of the row's entries to keep.

    Returns:
        List of ``(term, weight)`` tuples sorted by descending weight. At
        most ``percentage`` percent of the row's entries are returned, capped
        at 20. Only terms with a positive weight are considered, so terms
        that do not occur in the document are never reported as keywords.
    """
    # Number of terms to extract: n% of the row's entries, at most 20, never negative
    num_terms = max(0, min(20, int(len(row) * percentage / 100)))
    # Drop zero-weight entries: they are vocabulary terms absent from this
    # document and would otherwise pad the keyword list of short documents
    present = row[row > 0]
    # Sort terms by their weights in descending order and keep the top num_terms
    top_terms = present.sort_values(ascending=False).head(num_terms)
    return list(zip(top_terms.index.tolist(), top_terms.tolist()))

In [14]:
# Percentage passed to extract_top_terms for every document
n = 0.1
# Keywords per document: row label -> list of (term, weight) tuples
document_terms = {}
for doc_id, row in headlines_desc_tfidf.iterrows():
    document_terms[doc_id] = extract_top_terms(row, n)

# Build the JSON payload: the article's metadata followed by its keywords
record_fields = ('headline', 'short_description', 'category', 'link', 'date', 'authors')
docs_detail = {}
for doc, keywords in document_terms.items():
    record = json_records[doc]
    entry = {field: record[field] for field in record_fields}
    entry['keywords'] = keywords
    docs_detail[doc] = entry

# Write the payload where the web application reads it from
with open('./static/data/docs_details.json', 'w') as out_file:
    json.dump(docs_detail, out_file, indent=4)

## c. Highest weighted n% of the terms per category

In [15]:
# Compute average term weights for each category.
# Document positions are grouped by category in a single pass, then the
# column means of each group are taken with one vectorized DataFrame.mean
# call instead of one .loc lookup per (category, term) pair.
category_rows = {}
for i, category in enumerate(categories):
    category_rows.setdefault(category, []).append(i)

category_avg_term_weights = {}
for category, headlines_indices in category_rows.items():
    # term -> mean TF-IDF weight over the documents of this category
    category_avg_term_weights[category] = headlines_desc_tfidf.loc[headlines_indices].mean(axis=0).to_dict()

In [16]:
# Percentage of the terms kept as keywords for each category
n = 3
# Category -> list of its n% highest-weighted (term, average weight) pairs
category_terms = {}
for category, term_weights in category_avg_term_weights.items():
    ranked = sorted(term_weights.items(), key=lambda pair: pair[1], reverse=True)
    keep = int(len(ranked) * n / 100)
    category_terms[category] = ranked[:keep]

# Build the JSON payload: every article of the category plus the category keywords
category_details = {}
for category, keywords in category_terms.items():
    articles = [article for article in json_records if article['category'] == category]
    category_details[category] = {'articles': articles, 'keywords': keywords}

# Write the payload where the web application reads it from
with open('./static/data/category_details.json', 'w') as out_file:
    json.dump(category_details, out_file, indent=4)

## d. K-means clustering

### Cosine similarity
This function takes two arrays and returns the cosine similarity between the vectors represented by the arrays

In [17]:
def cos_sim(doc1, doc2):
    """Return the pairwise cosine similarities between the rows of two matrices.

    Args:
        doc1: 2-D array-like, one vector per row.
        doc2: 2-D array-like with the same number of columns as *doc1*.

    Returns:
        numpy array of shape ``(len(doc1), len(doc2))`` whose ``[i, j]`` entry
        is the cosine similarity between row ``i`` of *doc1* and row ``j`` of
        *doc2*. A row whose norm is zero (or not a number) is treated as the
        zero vector, so its similarity to everything is 0.0 instead of NaN.
    """
    def _unit_rows(matrix):
        # Scale every row to unit length; rows without a positive norm stay all-zero
        matrix = np.asarray(matrix, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return np.divide(matrix, norms, out=np.zeros_like(matrix), where=norms > 0)

    # The dot product of unit vectors is their cosine similarity
    return np.dot(_unit_rows(doc1), _unit_rows(doc2).T)

### K-means clustering
This function takes a dataframe and the number of clusters and returns the clusters of the documents

In [18]:
def k_means(df, k, max_iter=100, random_state=None):
    """Cluster the rows of *df* into *k* clusters using cosine similarity.

    Centroids start as *k* randomly sampled rows. On every iteration each row
    is assigned to the centroid it is most similar to (``cos_sim``) and every
    centroid is replaced by the mean of its assigned rows. The loop stops when
    the assignment no longer changes or after *max_iter* iterations.

    Args:
        df: DataFrame with one row per document and one column per feature.
            A 'Cluster' column holding the final assignment is added to it
            (or overwritten); an existing 'Cluster' column is not used as a
            feature.
        k: number of clusters; must not exceed the number of rows.
        max_iter: maximum number of iterations (at least 1).
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
            the initial centroids reproducible.

    Returns:
        numpy array with the cluster index (0..k-1) of every row.

    Raises:
        ValueError: if *max_iter* is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")
    # Leave out a 'Cluster' column left by a previous run on the same dataframe
    features = df.drop(columns='Cluster', errors='ignore')
    # Initialize centroids with k random documents (as a float copy, updated in place below)
    centroids = features.sample(n=k, random_state=random_state).values.astype(float)
    # Convert dataframe to numpy array
    values = features.values
    clusters = None
    for iteration in range(max_iter):
        print("Iteration: ", iteration)
        # Compute cosine similarity between each document and each centroid
        sim_scores = cos_sim(values, centroids)
        # Assign each document to the cluster of the centroid with the highest similarity
        new_clusters = np.argmax(sim_scores, axis=1)
        # If the clusters didn't change, stop
        if clusters is not None and np.array_equal(new_clusters, clusters):
            break
        clusters = new_clusters
        # Compute new centroids; a cluster that lost all its documents keeps
        # its previous centroid, because the mean of an empty selection is NaN
        # and a NaN similarity would win every argmax
        for i in range(k):
            members = values[clusters == i]
            if len(members):
                centroids[i] = members.mean(axis=0)
    # Record the final assignment on the caller's dataframe
    df['Cluster'] = clusters
    return clusters

In [19]:
# Cluster a copy of the TF-IDF matrix: k_means adds a 'Cluster' column to the
# dataframe it is given, and headlines_desc_tfidf must keep only term columns
# NOTE(review): k=41 is presumably the number of news categories — confirm
data = headlines_desc_tfidf.copy()
clusters = k_means(data, 41)

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

## e. Highest weighted n% of the terms per cluster

In [20]:
# Compute average term weights for each cluster.
# Document positions are grouped by cluster in a single pass, then the column
# means of each group are taken with one vectorized DataFrame.mean call
# instead of one .loc lookup per (cluster, term) pair.
cluster_avg_term_weights = {}
clust_indices = {}
for i, cluster in enumerate(clusters):
    clust_indices.setdefault(cluster, []).append(i)

for cluster, headlines_indices in clust_indices.items():
    # term -> mean TF-IDF weight over the documents of this cluster
    cluster_avg_term_weights[cluster] = headlines_desc_tfidf.loc[headlines_indices].mean(axis=0).to_dict()

In [21]:
# Percentage of the terms kept as keywords for each cluster
n = 3
# Cluster -> list of its n% highest-weighted (term, average weight) pairs
cluster_terms = {}
for cluster, term_weights in cluster_avg_term_weights.items():
    ranked = sorted(term_weights.items(), key=lambda pair: pair[1], reverse=True)
    keep = int(len(ranked) * n / 100)
    cluster_terms[cluster] = ranked[:keep]

# Build the JSON payload: every article of the cluster plus the cluster keywords.
# Cluster ids are converted to plain ints for the output keys.
cluster_details = {}
for cluster, keywords in cluster_terms.items():
    articles = [json_records[i] for i in clust_indices[cluster]]
    cluster_details[int(cluster)] = {'articles': articles, 'keywords': keywords}

# Write the payload where the web application reads it from
with open('./static/data/cluster_details.json', 'w') as out_file:
    json.dump(cluster_details, out_file, indent=4)

# Part 2 - Web Application

To run the web application, run the following cell (or copy the command, without the leading `!`, into a terminal), then open http://localhost:5001/ in the browser.


In [22]:
!FLASK_APP=flask_app.py flask run --port=5001

 * Serving Flask app 'flask_app.py'
 * Debug mode: off
 * Running on http://127.0.0.1:5001
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [13/Jan/2024 13:33:56] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:56] "GET /static/css/styles.css HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:56] "GET /static/css/docs_det.css HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:56] "GET /static/js/category_bubble.js HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:56] "GET /static/js/cluster_bubble.js HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:57] "GET /static/data/cluster_details.json HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:57] "GET /static/data/docs_details.json HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:33:57] "GET /static/data/category_details.json HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:34:16] "GET /cat_details?category=QUEER%20VOICES HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2024 13:34:16] "GET /static/css/details.css HTTP/1.1" 200 -
127.0.0.1 - - [

## The home page of the web application has three tabs:
   1. News Articles: shows the news headlines, their details (link, date, authors, and short description) and the corresponding keyword cloud. Details can be shown by clicking on the news headline. This tab is the default tab.
      
   2. Categories: shows a bubble chart of the categories. The size of each bubble represents the number of articles in the category. When clicking on a bubble, it directs to the category page, showing all the articles in the category and their keywords in two different tabs. When clicking on a news headline, it directs to the news article page.
      
   3. Clusters: shows a bubble chart of the clusters. The size of each bubble represents the number of articles in the cluster. When clicking on a bubble, it directs to the cluster page, showing all the articles in the cluster and their keywords in two different tabs. When clicking on a news headline, it directs to the news article page.
 
Notes:
1. It might take a few seconds to load the home page because it loads the data from the json files.
2. For each article's keyword cloud, at most 20 keywords are shown, to reduce the size of the exported JSON file and thus the loading time.

## Below are some screenshots of the web application: 
 

### Home page - News Articles tab (default tab)
 ![screenshots/home_page_article_list.png](screenshots/home_page_article_list.png)
 
### Home page - Article details
 ![screenshots/article_details.png](screenshots/article_details.png)

### Home page - Categories tab
 ![screenshots/categories_page.png](screenshots/categories_page.png)

### Home page - Clusters tab
 ![screenshots/clusters_page.png](screenshots/clusters_page.png)

### Category page - Articles tab
 ![screenshots/category_article_list.png](screenshots/category_article_list.png)

### Category page - Keywords tab
![screenshots/category_wordcloud.png](screenshots/category_wordcloud.png)

### Cluster page - Articles tab
![screenshots/cluster_article_list.png](screenshots/cluster_article_list.png)

### Cluster page - Keywords tab
![screenshots/cluster_wordcloud.png](screenshots/cluster_wordcloud.png)