In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from textwrap import wrap
from textblob import TextBlob

from pytrends.request import TrendReq
from pytrends.exceptions import ResponseError

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS
from scipy.cluster.hierarchy import dendrogram

import re
import string
import time
from PIL import Image
from collections import Counter

import plotly.express as px

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words as nltk_words
from nltk.corpus import stopwords

nltk.download('stopwords', download_dir='./resources')
nltk.download('punkt', download_dir='./resources')
nltk.download('wordnet', download_dir='./resources')
nltk.download('omw-1.4', download_dir='./resources')
nltk.download('words', download_dir='./resources')
nltk.data.path.append('./resources')

## Trends limits:
By default, the quota is 100 requests per 100 seconds per user, and it can be raised to a maximum of 1,000. Independently of that quota, the API accepts at most 10 requests per second per user.

In [None]:
!ls && ls ./Data

### Configs...

In [None]:
TRENDS = True

# Exploratory Data Analysis

In [None]:
data = pd.read_csv('../input/bbc-news/bbc_news.csv')
data

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
# Parse the publication timestamps. `infer_datetime_format` is deprecated since
# pandas 2.0 (format inference is the default there), so the flag is dropped.
data['pubDate'] = pd.to_datetime(data['pubDate'])
data

## Remove duplicates

In [None]:
data = data.drop_duplicates(subset=['title']).drop_duplicates(subset=['guid']).drop_duplicates(subset=['description']).reset_index(drop=True)
data

In [None]:
STOPWORDS = stopwords.words('english')
pd.Series(STOPWORDS)

## Remove stopwords and count frequencies for single words

In [None]:
sns.set_style('darkgrid')

In [None]:
data['combined'] = data['title'] + ' ' + data['description']
data['combined']

In [None]:
top_N = 50
# Lower-case every article, drop punctuation and join everything into one string.
corpus_text = data['combined'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.cat(sep=' ')
# A set gives constant-time membership checks; scanning the STOPWORDS list for
# every token is needlessly slow on the full corpus.
stopword_set = set(STOPWORDS)
words = [w for w in nltk.tokenize.word_tokenize(corpus_text) if w not in stopword_set]
word_dist = nltk.FreqDist(words)
# The top_N most frequent non-stop-words with their counts.
freq_title = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
freq_title

In [None]:
plt.figure(figsize=(18, 12))
sns.barplot(data=freq_title, x='Word', y='Frequency')
plt.xticks(rotation=90)

## Looking for the most popular 2-grams and 3-grams

In [None]:
def find_ngrams(input_list, n):
    """Return all consecutive n-grams of ``input_list`` as a list of tuples.

    The sequence is paired with copies of itself shifted by 0..n-1 positions;
    zipping those copies yields one tuple per window and stops at the shortest
    copy, so sequences shorter than ``n`` (or ``n <= 0``) produce an empty list.
    """
    shifted_views = [input_list[offset:] for offset in range(n)]
    return [gram for gram in zip(*shifted_views)]

In [None]:
bigrams = find_ngrams(words, 2)
bigrams

In [None]:
bigrams_dist = nltk.FreqDist(bigrams)
bigram_freq_title = pd.DataFrame(bigrams_dist.most_common(top_N),
                    columns=['N-gram', 'Frequency'])
bigram_freq_title

In [None]:
plt.figure(figsize=(18, 12))
sns.barplot(data=bigram_freq_title, x='N-gram', y='Frequency')
plt.xticks(rotation=90)

In [None]:
trigrams = find_ngrams(words, 3)
trigrams

In [None]:
trigrams_dist = nltk.FreqDist(trigrams)
trigram_freq_title = pd.DataFrame(trigrams_dist.most_common(top_N),
                    columns=['N-gram', 'Frequency'])
trigram_freq_title

In [None]:
plt.figure(figsize=(18, 12))
sns.barplot(data=trigram_freq_title, x='N-gram', y='Frequency')
plt.xticks(rotation=90)

# Cleaning data (removing punctuation)

In [None]:
# lower and remove punctuation
data['combined_cleaned'] = data['combined'].str.lower().apply(lambda x: re.sub(f'[{re.escape(string.punctuation)}]', '', x))
data

In [None]:
# remove redundant spaces: collapse the whitespace runs left behind by the
# punctuation removal into single spaces, then trim both ends
data['combined_cleaned'] = data['combined_cleaned'].str.replace(r'\s+', ' ', regex=True).str.strip()
data

In [None]:
data['combined_cleaned'][0], data['combined_cleaned'][1]

In [None]:
data['combined_cleaned'].apply(len).describe()

# Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('civilians')

In [None]:
data['combined_cleaned']=data['combined_cleaned'].apply(lambda x: ' '.join(map(lemmatizer.lemmatize, filter(lambda w: w not in STOPWORDS, nltk.tokenize.word_tokenize(x)))))
data['combined'][100], data['combined_cleaned'][100]

## Normalizing data

In [None]:
cv = CountVectorizer(analyzer='word')
cv_data = cv.fit_transform(data['combined_cleaned'])
dtm_data = pd.DataFrame(cv_data.toarray(), columns=cv.get_feature_names_out())
dtm_data.index = data.index
dtm_data.head(5)

# Wordclouds

In [None]:
# Render a word cloud from word frequencies
def generate_wordcloud(data, title, bg='black', colormap='Dark2', mask=None):
    """Draw and show a word cloud built from ``data``.

    data     -- word frequencies, passed to ``WordCloud.generate_from_frequencies``
    title    -- figure title; wrapped at 60 characters per line
    bg       -- background colour of the cloud
    colormap -- matplotlib colormap name used for the words
    mask     -- optional mask array handed to ``WordCloud``
    """
    cloud = WordCloud(
        width=1600,
        height=1080,
        max_words=150,
        colormap=colormap,
        mask=mask,
        background_color=bg,
    )
    cloud_image = cloud.generate_from_frequencies(data)

    plt.figure(figsize=(16,12))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    wrapped_title = '\n'.join(wrap(title,60))
    plt.title(wrapped_title, fontsize=26)
    plt.show()

In [None]:
!mkdir ./assets

In [None]:
assets = ['100.png', '500.png', 'cloud.png', 'thumb-down.png', 'thumb-up.png']

In [None]:
import requests

assets_url = 'https://raw.githubusercontent.com/StepanTita/fds-final-project/main/assets/'
for asset in assets:
    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(assets_url + asset, timeout=30)
    print(response)
    # Fail loudly on 4xx/5xx instead of saving an error page as an image file
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed
    with open(f'./assets/{asset}', "wb") as asset_file:
        asset_file.write(response.content)

In [None]:
!ls ./assets

In [None]:
ASSETS_PATH = './assets'

In [None]:
f100_mask = np.array(Image.open(ASSETS_PATH + '/100.png'))

In [None]:
# Index labels of the 100 most recently published articles (newest first);
# without the slice the "latest 100" cloud would cover the whole dataset
latest100indexes = data.sort_values(by=['pubDate'], ascending=False).index[:100]

## Latest 100 titles

In [None]:
generate_wordcloud(dtm_data.loc[latest100indexes].T.sum(axis=1), 'Latest 100 titles', mask=f100_mask)

## Latest 500 titles

In [None]:
f500_mask = np.array(Image.open(ASSETS_PATH + '/500.png'))

In [None]:
# Index labels of the 500 most recently published articles (newest first);
# without the slice the "latest 500" cloud would cover the whole dataset
latest500indexes = data.sort_values(by=['pubDate'], ascending=False).index[:500]

In [None]:
generate_wordcloud(dtm_data.loc[latest500indexes].T.sum(axis=1), 'Latest 500 titles', mask=f500_mask)

In [None]:
data['combined_polarity']=data['combined_cleaned'].apply(lambda x: TextBlob(x).sentiment.polarity)
data

In [None]:
data.describe()

In [None]:
# Every 10th of the 200 lowest-polarity rows, so the chart stays readable
title_polarity_sorted = data['combined_polarity'].sort_values(ascending=True).copy()[:200:10]

plt.figure(figsize=(16,12))
plt.xlabel('Polarity')
plt.ylabel('Title')
plt.title('Polarity of Titles')
polarity_graph=plt.barh(np.arange(len(title_polarity_sorted.index)), title_polarity_sorted, color='crimson')

for bar, polarity_idx in zip(polarity_graph, title_polarity_sorted.index):
    # For a horizontal bar the vertical centre is y + height / 2; the width is
    # the polarity value itself and must not be used to position the text.
    bar_center = bar.get_y() + bar.get_height() / 2
    plt.text(0.005, bar_center, str(data['combined_cleaned'][polarity_idx]), va='center', fontsize=13, color='black')
    plt.text(bar.get_width() + 0.003, bar_center, '{:.2f}'.format(title_polarity_sorted[polarity_idx]), va='center', fontsize=13, color='white')

plt.yticks([])
plt.show()

In [None]:
title_polarity_sorted

In [None]:
thumb_down_mask = np.array(Image.open(ASSETS_PATH + '/thumb-up.png').transpose(method=Image.FLIP_TOP_BOTTOM))
thumb_down_mask-=1
thumb_down_mask[thumb_down_mask == 0] = 255

In [None]:
# Build the cloud from all 200 lowest-polarity rows; `title_polarity_sorted`
# only holds every 10th of them (20 rows) for the bar chart.
most_negative_indexes = data['combined_polarity'].sort_values(ascending=True).index[:200]
generate_wordcloud(dtm_data.loc[most_negative_indexes].T.sum(axis=1), 'Most negative 200', colormap='Reds', mask=thumb_down_mask)

In [None]:
# Every 10th of the 200 highest-polarity rows, so the chart stays readable
title_polarity_sorted = data['combined_polarity'].sort_values(ascending=True).copy()[-200::10]

plt.figure(figsize=(16,8))
plt.xlabel('Polarity')
plt.ylabel('Title')
plt.title('Polarity of Titles')
polarity_graph=plt.barh(np.arange(len(title_polarity_sorted.index)), title_polarity_sorted, color='lightgreen')

for bar, polarity_idx in zip(polarity_graph, title_polarity_sorted.index):
    # For a horizontal bar the vertical centre is y + height / 2; the width is
    # the polarity value itself and must not be used to position the text.
    bar_center = bar.get_y() + bar.get_height() / 2
    plt.text(0.005, bar_center, str(data['title'][polarity_idx]), va='center', fontsize=13, color='white')
    plt.text(bar.get_width() + 0.003, bar_center, '{:.2f}'.format(title_polarity_sorted[polarity_idx]), va='center', fontsize=13, color='black')

plt.yticks([])
plt.show()

In [None]:
thumb_up_mask = np.array(Image.open(ASSETS_PATH + '/thumb-up.png'))
thumb_up_mask-=1
thumb_up_mask[thumb_up_mask == 0] = 255

In [None]:
# Build the cloud from all 200 highest-polarity rows; `title_polarity_sorted`
# only holds every 10th of them (20 rows) for the bar chart.
most_positive_indexes = data['combined_polarity'].sort_values(ascending=True).index[-200:]
generate_wordcloud(dtm_data.loc[most_positive_indexes].T.sum(axis=1), 'Most positive 200', colormap='Greens', mask=thumb_up_mask)

# Lemmatization and filtering

In [None]:
word_freq_data_normalized = dtm_data.T
word_freq_data_normalized

In [None]:
word_freq_data = pd.DataFrame(word_freq_data_normalized.sum(axis=1).reset_index()).rename(columns={'index': 'word', 0: 'freq'})
word_freq_data

In [None]:
# Lemmatize every entry of the NLTK word list so that it is comparable with
# the lemmatized words stored in the frequency frame.
nltk_lems = [lemmatizer.lemmatize(word) for word in nltk_words.words()]
# Hold the lemmatized NLTK words in their own frame.
df_nltk_words = pd.DataFrame()
df_nltk_words['word'] = nltk_lems
# Keep only the rows whose word occurs in the lemmatized NLTK word list.
word_freq_data = word_freq_data[word_freq_data['word'].isin(df_nltk_words['word'])]
word_freq_data

In [None]:
TRIM_QUANTILE = 0.7
word_freq_data[word_freq_data['freq'] >= word_freq_data['freq'].quantile(TRIM_QUANTILE)]

# TFiDF Vectorization

In [None]:
stopwords = list(word_freq_data[word_freq_data['freq'] < word_freq_data['freq'].quantile(TRIM_QUANTILE)]['word'])
vec_vocab = list(word_freq_data[word_freq_data['freq'] >= word_freq_data['freq'].quantile(TRIM_QUANTILE)]['word'])

In [None]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords, vocabulary=vec_vocab)
tfidf_vectorizer

In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_cleaned'])
tfidf_matrix

In [None]:
tfidf_data = pd.DataFrame(tfidf_matrix.toarray())
tfidf_data

In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names

In [None]:
tfidf_data.columns = feature_names
tfidf_data

# TruncatedSVD

### During the experiments, the following configurations were tried:

* 1000 components - 10 clusters

* 100 components - 10 clusters

* 50 components - 10 clusters

* 100 components - 5 clusters

* 100 components - 15 clusters

In [None]:
tsvd = TruncatedSVD(n_components=100, random_state=37)
vec_matrix_tsvd = tsvd.fit_transform(tfidf_matrix)
vec_matrix_tsvd

In [None]:
pd.DataFrame(vec_matrix_tsvd)

# Clustering

## KMeans

In [None]:
clst10 = KMeans(n_clusters=10, verbose=0, random_state=37)
clst10.fit(vec_matrix_tsvd)

In [None]:
data['label'] = clst10.labels_
data

In [None]:
labels_count_data = data[['description', 'label']].groupby(['label', 'description']).count()
labels_count_data

In [None]:
assign_ratio = labels_count_data.reset_index()['label'].value_counts() / labels_count_data.shape[0]
assign_ratio

In [None]:
assign_ratio.sum()

In [None]:
flat_labels_count_data = labels_count_data.reset_index()
# Attach each cluster's share of the articles to all of its rows
flat_labels_count_data['ratio'] = flat_labels_count_data['label'].map(assign_ratio)
# Show five descriptions per cluster. `GroupBy.first(5)` does not limit rows:
# its first positional argument is `numeric_only`.
flat_labels_count_data.groupby('label').head(5).set_index(['label', 'ratio', 'description'])

In [None]:
tfidf_data

In [None]:
def to_viz_data(data, tfidf_data, labels):
    """Collect the characteristic words of every cluster with their TF-IDF score.

    For each label, the highest-scoring word of every document in that cluster
    is taken, each such word is listed once, and its score is the maximum
    TF-IDF it reaches inside the cluster. Words are ordered by descending score
    and limited to 200 per cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'label' column, row-aligned with ``tfidf_data``.
    tfidf_data : pandas.DataFrame
        Document-term frame (one row per document, one column per word).
    labels : iterable
        Cluster labels to include.

    Returns
    -------
    pandas.DataFrame
        Columns 'word', 'tfidf', 'label'; empty when no cluster has any
        non-zero row.
    """
    frames = []
    for label in labels:
        cluster_tfidf = tfidf_data[data['label'] == label]
        # Rows without any vocabulary term are all zeros; idxmax would report
        # the first column for them, which is not a meaningful top word.
        cluster_tfidf = cluster_tfidf[(cluster_tfidf != 0).any(axis=1)]
        if cluster_tfidf.empty:
            continue
        # unique(): a word that tops several documents must appear only once
        top_words = cluster_tfidf.idxmax(axis=1).unique().tolist()
        # Score inside the cluster and put the strongest words first, so that a
        # later head(k) really selects the top k.
        scores = cluster_tfidf[top_words].max().sort_values(ascending=False)[:200]
        frames.append(pd.DataFrame({
            'word': scores.index,
            'tfidf': scores.to_numpy(),
            'label': label,
        }))
    if not frames:
        return pd.DataFrame(columns=['word', 'tfidf', 'label'])
    viz_data = pd.concat(frames)
    viz_data.columns = ['word', 'tfidf', 'label']
    return viz_data

In [None]:
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
!mkdir ./Data

In [None]:
viz_data.to_csv('./Data/combined-kmeans-clusters.csv', index=False)

In [None]:
tfidf_data

## Visualizing the results

In [None]:
viz_data.sort_values('label').groupby('label').head(10).reset_index(drop=True)

In [None]:
def visualize_clusters(viz_data):
    """Plot the first 10 words of every cluster as text sized by TF-IDF score.

    Parameters
    ----------
    viz_data : pandas.DataFrame
        Frame with the columns 'word', 'tfidf' and 'label'. A copy is taken, so
        the caller's frame is not modified.
    """
    viz_data = viz_data.copy()
    # Shift the labels by |min| so a negative smallest label becomes index 0
    viz_data['label'] += np.abs(viz_data['label'].min())
    viz_data = viz_data.sort_values('label').groupby('label').head(10)
    fig = px.scatter(viz_data, x='label', y='tfidf', color='label', text='word', hover_data=['label', 'tfidf', 'word'])
    fig.update_layout(
        font=dict(
            family='Courier New, monospace',
            size=18,  # Set the font size here
            color='black'
        ),
        xaxis = dict(
            tickmode = 'linear',
            tick0 = 0,
            dtick = 1
        ),
        yaxis = dict(
            tickmode = 'linear',
            tick0 = 0,
            dtick = 0.05
        )
    )
    # Text only, with a per-point font size proportional to the TF-IDF score
    fig.update_traces(mode='text', selector=dict(type='scatter'), textfont_size=(viz_data['tfidf'] * 25).values)

    palette = np.array(px.colors.qualitative.Dark24)

    def _colour_trace(trace):
        # Wrap around the palette so more labels than palette entries do not
        # raise IndexError; asarray also guards against a tuple being treated
        # as a multi-dimensional index.
        colour_idx = np.asarray(trace.marker.color).astype(int) % len(palette)
        trace.update(textfont_color=palette[colour_idx], textposition='bottom center')

    fig.for_each_trace(_colour_trace)
    fig.show()

In [None]:
visualize_clusters(viz_data)

## Infer real results

In [None]:
def infer_topic(data, max_retries=5):
    """Look up the top related Trends topics for every keyword in ``data['word']``.

    Relies on a module-level ``trends`` client that is not defined in this
    block (NOTE(review): presumably a ``pytrends`` ``TrendReq`` instance —
    confirm where it is created).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'word' column holding the keywords to query.
    max_retries : int, default 5
        Attempts per keyword before the keyword is skipped.

    Returns
    -------
    pandas.DataFrame
        One row per keyword that yielded topics, with ``topic_title_<i>`` and
        ``topic_type_<i>`` columns for up to three topics; empty when nothing
        was found.
    """
    topic_rows = []
    for kw in data['word']:
        print('Starting: ', kw)

        related_topics = None
        delay = 10.0
        for attempt in range(max_retries):
            try:
                # Built inside the retry loop so a failure here is retried too
                trends.build_payload(kw_list=[kw])
                top_topics = trends.related_topics()[kw]['top']
                # NOTE(review): 'top' is presumably None/empty when Trends has
                # no data for the keyword — verify against pytrends.
                if top_topics is not None and not top_topics.empty:
                    related_topics = top_topics[['topic_title', 'topic_type']][:3].T
                break
            except ResponseError as e:
                print('Retrying:', e)
            except Exception as e:
                # The original caught an undefined `Error`, which itself raised
                # NameError as soon as any other exception occurred.
                print('Bad situation...', e)
            if attempt + 1 < max_retries:
                time.sleep(delay)
                # Real exponential backoff, capped; squaring 1.0001 never grew
                delay = min(delay * 2, 300.0)
                print('Using backoff: ', delay)

        if related_topics is None:
            print('No related topics for: ', kw)
        else:
            n = min(len(related_topics.loc['topic_title', :]), len(related_topics.loc['topic_type', :]))
            tmp = pd.DataFrame(pd.concat([related_topics.loc['topic_title', :], related_topics.loc['topic_type', :]])).T
            tmp.columns = [f'topic_title_{c}' for c in tmp.columns[:n]] + [f'topic_type_{c}' for c in tmp.columns[n:]]
            # Collected in a list: DataFrame.append was removed in pandas 2.0
            topic_rows.append(tmp)

        print('Finished: ', kw)
        print('*' * 20)
        time.sleep(1)
    return pd.concat(topic_rows) if topic_rows else pd.DataFrame()

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering ``model``.

    Reads ``model.children_``, ``model.distances_`` and ``model.labels_``,
    assembles a linkage matrix from them and passes it, together with
    ``kwargs``, to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples below each merge node: a child index smaller
    # than leaf_total is a single leaf, anything else points at an earlier merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        size = 0
        for node in pair:
            size += 1 if node < leaf_total else subtree_sizes[node - leaf_total]
        subtree_sizes[row] = size

    # Columns: the two merged children, the merge distance, the sample count
    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

## Agglomerative (ward)

In [None]:
clst_agg_ward = AgglomerativeClustering(n_clusters=10, linkage='ward')
clst_agg_ward.fit(vec_matrix_tsvd)

In [None]:
data['label'] = clst_agg_ward.labels_
data

In [None]:
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
viz_data.to_csv('./Data/combined-agg-ward-clusters.csv', index=False)

In [None]:
visualize_clusters(viz_data)

In [None]:
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='ward')
model.fit(vec_matrix_tsvd)
plt.figure(figsize=(18, 12))
plot_dendrogram(model, truncate_mode='level', p=5)
plt.xlabel('Dendogram agglomerative ward')
plt.show()

## Agglomerative (complete)

In [None]:
clst_agg_avg = AgglomerativeClustering(n_clusters=10, linkage='complete')
clst_agg_avg.fit(vec_matrix_tsvd)

In [None]:
data['label'] = clst_agg_avg.labels_
data

In [None]:
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
viz_data.to_csv('./Data/combined-agg-complete-clusters.csv', index=False)

In [None]:
visualize_clusters(viz_data)

In [None]:
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='complete')
model.fit(vec_matrix_tsvd)
plt.figure(figsize=(18, 12))
plot_dendrogram(model, truncate_mode='level', p=5)
plt.xlabel('Dendogram agglomerative complete')
plt.show()

## DBSCAN

In [None]:
dbscan = DBSCAN(eps=0.5)
dbscan.fit(vec_matrix_tsvd)

In [None]:
data['label'] = dbscan.labels_
data

In [None]:
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
viz_data.to_csv('./Data/combined-dbscan-clusters.csv', index=False)

In [None]:
visualize_clusters(viz_data)

## OPTICS

In [None]:
optics = OPTICS(min_samples=10)
optics.fit(vec_matrix_tsvd)

In [None]:
data['label'] = optics.labels_
data

In [None]:
assign_ratio = data['label'].value_counts() / data.shape[0]
assign_ratio

In [None]:
viz_data = to_viz_data(data, tfidf_data, data['label'].unique())
viz_data

In [None]:
viz_data.to_csv('./Data/combined-optics-clusters.csv', index=False)

In [None]:
visualize_clusters(viz_data)