In [None]:
# Mount Google Drive under /content/gdrive so later cells can read the
# database and stop-word file from it. force_remount=True is passed so the
# mount is redone even if the path is already mounted.
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Standard library.
import sqlite3

# Third-party libraries used throughout the notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Display settings: show up to 500 rows and never truncate cell text.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

In [None]:
# Open the SQLite database stored on the mounted Drive. The connection stays
# open because later cells run further queries through `cnx`.
cnx = sqlite3.connect('/content/gdrive/MyDrive/data.db')

# Raw contents of the in_category table.
in_category = pd.read_sql_query('SELECT * FROM in_category', cnx)

# All articles, with the Date column parsed into timezone-aware UTC timestamps.
df = pd.read_sql_query('SELECT * FROM Article', cnx)
df = df.assign(Date=pd.to_datetime(df['Date'], utc=True))

In [None]:
# Join every article to its categories through the In_category table.
# SELECT * over the three joined tables returns the join keys (articleID,
# categoryID) once per table, so df_test contains duplicate column names.
df_test = pd.read_sql_query("""SELECT * from Article 
INNER JOIN In_category ON In_category.articleID = Article.articleID 
INNER JOIN Category ON Category.categoryID = In_category.categoryID""", cnx)

In [None]:
# Load the In_category table and display its (rows, columns) shape.
# NOTE(review): SQLite table names are case-insensitive, so this looks like
# the same table that an earlier cell loads into `in_category` — confirm.
df_categories = pd.read_sql_query('SELECT * from In_category', cnx)
df_categories.shape

In [None]:
import re

# Build the text used for similarity and clustering: title + description,
# lower-cased, punctuation replaced by spaces, stop words removed.
# Missing titles/descriptions are treated as empty strings; otherwise a single
# NaN makes the whole concatenation NaN, which astype(str) turns into "nan".
df['Content'] = df['Title'].fillna('') + " " + df['Description'].fillna('')
df['Content'] = df['Content'].str.lower()
# regex=True must be explicit: since pandas 2.0, Series.str.replace treats a
# string pattern as a literal by default and rejects compiled patterns.
df['Content'] = df['Content'].str.replace(r'[^\w\s]', ' ', regex=True)

with open('/content/gdrive/MyDrive/stopwords-hr.txt', 'r', encoding='utf-8') as f:
  # Skip blank lines: an empty alternative in the pattern below would match at
  # every word boundary and pre-empt the stop words listed after it.
  stopwords = [x.strip() for x in f.readlines() if x.strip()]

# Escape each word so regex metacharacters in the list are matched literally.
replace = re.compile(r'\b(' + '|'.join(re.escape(word) for word in stopwords) + r')\b')

df['Content'] = df['Content'].str.replace(replace, '', regex=True)
# Collapse the whitespace runs left behind by the removals above.
df['Content'] = df['Content'].str.replace(r'\s{2,}', ' ', regex=True)
df['Content'] = df['Content'].astype(str)

In [None]:
# Pick one random article (no seed, so a different row on every run).
sample = df.sample(1)
# Keep its index label; a later cell fetches the same row with df.loc[sample_idx].
sample_idx = sample.index.item()
sample

In [None]:
# Print the 'Content' column of the sampled row (a one-element Series).
print(sample['Content'])

In [None]:
# Count how many descriptions contain fewer than 5 whitespace runs (a rough
# proxy for very short descriptions). Missing descriptions give a NaN count,
# which .lt(5) reports as False. The pattern is a raw string so that `\s` is
# not parsed as an (invalid) string escape.
df['Description'].str.count(r'\s+').lt(5).value_counts()

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' resource, older ones from 'punkt'; request both so the
# cell works with either (a resource unknown to the installed version is
# reported by nltk.download without raising).
for punkt_resource in ('punkt', 'punkt_tab'):
  nltk.download(punkt_resource)


def preprocess(text):
  """Split `text` into a list of tokens using NLTK's `word_tokenize`.

  Used as the tokenizer callback of the TF-IDF vectorizer.
  """
  return nltk.word_tokenize(text)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the stop-word list (one word per line). The context manager closes the
# file even if reading fails, and the encoding is stated explicitly so that
# non-ASCII words are decoded the same way on every platform.
with open('/content/gdrive/MyDrive/stopwords-hr.txt', encoding='utf-8') as stopwords_file:
  stop_words = stopwords_file.read().splitlines()

# TF-IDF vectorizer that tokenizes with `preprocess` and drops the stop words.
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=stop_words)

In [None]:
def compute_similarity(a, b):
  """Return the TF-IDF similarity between two texts.

  The module-level `vectorizer` is re-fitted on just these two documents, so
  the vocabulary and IDF weights come from the pair alone and any earlier fit
  of `vectorizer` is replaced.

  Args:
    a: First document (str).
    b: Second document (str).

  Returns:
    The dot product of the two TF-IDF rows. With TfidfVectorizer's default
    L2 row normalisation this is the cosine similarity of the two documents.
  """
  tfidf = vectorizer.fit_transform([a, b])
  # `@` is always a matrix product, whereas `*` is element-wise for SciPy
  # sparse arrays. Indexing the sparse product directly avoids densifying it.
  return (tfidf @ tfidf.T)[0, 1]

In [None]:
def wrapper_compute_similarity(a, b):
  """Row-level adapter: compare the 'Content' fields of two records.

  Both values are converted with str() before being handed to
  `compute_similarity`, whose result is returned unchanged.
  """
  text_a = str(a['Content'])
  text_b = str(b['Content'])
  return compute_similarity(text_a, text_b)

In [None]:
# Work on a random subset of at most 10000 articles. The size is capped at
# len(df) because DataFrame.sample raises ValueError when asked for more rows
# than exist (sampling is without replacement).
df_subset = df.sample(min(10000, len(df)))
# Reference article chosen earlier, fetched by its index label.
specific_row = df.loc[sample_idx]
# Similarity of every subset row to the reference article.
result = df_subset.apply(wrapper_compute_similarity, args=(specific_row,), axis=1)

In [None]:
# Order the similarity scores from highest to lowest.
result = result.sort_values(ascending=False)

In [None]:
# Display the 'Content' of the sampled reference row.
sample["Content"]

In [None]:
# First five entries of `result` (the top matches once it has been sorted in
# descending order).
df_best = result[:5]
# `result` carries df's index labels (df.sample keeps them), so the rows must
# be looked up by label with .loc; .iloc is positional and only agrees while
# df has a default RangeIndex. .assign builds a new frame, which avoids the
# SettingWithCopyWarning raised by writing a column into a slice of df.
df_selected = df.loc[df_best.index].assign(Similarity=df_best)

In [None]:
# Display the first five similarity scores taken from `result`.
df_best

In [None]:
# Display the article rows looked up for the entries of df_best.
df_selected

In [None]:
# Re-fit the TF-IDF vectorizer on the subset's 'Content' column and keep the
# resulting document-term matrix in X (one row per article in df_subset).
X = vectorizer.fit_transform(df_subset['Content'])

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def _plot_score_curve(cluster_counts, scores, score_name, filename):
  """Plot one score-vs-cluster-count curve, save it to `filename` and show it."""
  fig, ax = plt.subplots()
  ax.plot(cluster_counts, scores, '-o', label=score_name)
  ax.set_xlabel("Number of clusters")
  ax.set_ylabel(score_name)
  ax.set_title("TF-IDF")
  fig.savefig(filename, bbox_inches='tight')
  plt.show()


def generate_metrics(X, max_clusters, random_state=None):
  """Run k-means for a range of cluster counts and plot two quality scores.

  For every n_clusters in range(2, max_clusters) a KMeans model is fitted on
  X, and the silhouette and Davies-Bouldin scores of its labelling are
  recorded. Both curves are plotted, saved as 'silhouette.png' and
  'davies_bouldin.png' in the working directory, and shown.

  Args:
    X: Feature matrix, one row per sample. May be a SciPy sparse matrix (as
      returned by the TF-IDF vectorizer) or a dense array.
    max_clusters: Exclusive upper bound on the number of clusters tried; the
      largest count evaluated is max_clusters - 1.
    random_state: Optional seed forwarded to KMeans for reproducible
      clusterings. The default (None) keeps the unseeded behaviour.

  Returns:
    dict mapping each tried cluster count to the array of cluster labels that
    KMeans assigned to the rows of X.
  """
  range_n_clusters = range(2, max_clusters)
  # The Davies-Bouldin score is computed on a dense array; the conversion is
  # loop-invariant, so do it once instead of on every iteration.
  X_dense = X.toarray() if hasattr(X, "toarray") else X
  silhouette_scores = []
  davis_bouldin_scores = []
  labels_per_cluster_num = {}
  for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # fit_predict returns the training labels (kmeans.labels_); a separate
    # predict(X) pass over the same data is redundant.
    labels = kmeans.fit_predict(X)
    silhouette_scores.append(silhouette_score(X, labels))
    davis_bouldin_scores.append(davies_bouldin_score(X_dense, labels))
    labels_per_cluster_num[n_clusters] = labels

  _plot_score_curve(range_n_clusters, silhouette_scores, "Silhouette score", 'silhouette.png')
  _plot_score_curve(range_n_clusters, davis_bouldin_scores, "Davies-Bouldin score", 'davies_bouldin.png')

  return labels_per_cluster_num


In [None]:
!pip -q install umap-learn
from sklearn.decomposition import PCA
from bokeh.plotting import figure, show
from bokeh.palettes import Category20_20 as palette
from bokeh.io import output_notebook
import itertools
import umap
from sklearn.manifold import TSNE

def _show_cluster_scatter(frame, x_col, y_col, title, n_clusters):
  """Show one Bokeh scatter plot of `frame`, one colour per cluster id."""
  colors = itertools.cycle(palette)
  # Bokeh 3 removed the plot_width/plot_height arguments; width/height are
  # the supported names.
  p = figure(width=600, height=450, title=title, tooltips="@Title")
  for i in range(n_clusters):
    p.scatter(x_col, y_col, source=frame[frame['cluster'] == i], color=next(colors))
  show(p)


def generate_visuals(X, labels_per_cluster_num, chosen_cluster_num):
  """Plot 2-D PCA, t-SNE and UMAP projections of X coloured by cluster.

  Side effects: writes the chosen labelling and the projected coordinates
  into the module-level `df_subset` (columns 'cluster', 'x0_pca', 'x1_pca',
  'x0_tsne', 'x1_tsne', 'x0_umap', 'x1_umap') and displays three Bokeh plots
  inline. Hovering a point shows the row's 'Title'.

  Args:
    X: Feature matrix whose rows correspond to the rows of `df_subset`; may
      be a SciPy sparse matrix or a dense array.
    labels_per_cluster_num: dict mapping a cluster count to the label array
      for the rows of X.
    chosen_cluster_num: Key of `labels_per_cluster_num` to visualise; also
      the number of cluster ids (0 .. chosen_cluster_num - 1) that are drawn.
  """
  # Configure Bokeh to display plots inline in the notebook.
  output_notebook()
  df_subset['cluster'] = labels_per_cluster_num[chosen_cluster_num]

  # All three reducers are fed the dense matrix; convert it only once.
  X_dense = X.toarray() if hasattr(X, "toarray") else X

  # (plot title, column suffix, reducer); only PCA is seeded.
  projections = (
    ("PCA", "pca", PCA(n_components=2, random_state=42)),
    ("TSNE", "tsne", TSNE(n_components=2, learning_rate='auto', init='random', perplexity=30)),
    ("UMAP", "umap", umap.UMAP()),
  )
  for title, suffix, reducer in projections:
    vecs = reducer.fit_transform(X_dense)
    x_col = 'x0_' + suffix
    y_col = 'x1_' + suffix
    df_subset[x_col] = vecs[:, 0]
    df_subset[y_col] = vecs[:, 1]
    _show_cluster_scatter(df_subset, x_col, y_col, title, chosen_cluster_num)

In [None]:
# Evaluate k-means for cluster counts 2..19 (generate_metrics iterates over
# range(2, 20)), plot both scores and keep the labels for every count.
labels_per_cluster_num = generate_metrics(X, 20)

In [None]:
# Visualise the 16-cluster labelling with PCA, t-SNE and UMAP projections.
generate_visuals(X, labels_per_cluster_num, 16)

In [None]:
# Visualise the 8-cluster labelling with PCA, t-SNE and UMAP projections.
generate_visuals(X, labels_per_cluster_num, 8)

In [None]:
!pip install -U sentence-transformers
from sentence_transformers import util

def get_example_neighs(example_idx, embds, n_neighs=5):
    """Return an example article and its nearest neighbours by cosine similarity.

    Article texts are read from the module-level `df_subset`, so row i of
    `embds` must describe the article at position i of `df_subset`.

    Args:
        example_idx: Row position of the example in `embds` / `df_subset`.
        embds: 2-D embeddings (e.g. a dense TF-IDF array), one row per article.
        n_neighs: Number of neighbours to return (default 5).

    Returns:
        DataFrame with columns "Article" and "Cosine Similarity". The first
        row is the example itself, followed by the neighbours in order of
        decreasing similarity.
    """
    # Only the example's row of the similarity matrix is needed, so compare
    # that single vector against all rows instead of building the N x N matrix.
    example_neighs = util.cos_sim(embds[example_idx], embds).numpy()[0]

    # Rank every row by similarity and drop the example by position. Slicing
    # off the last element would assume the example always ranks first, which
    # fails for duplicate articles or an all-zero vector.
    self_pos = example_idx % len(example_neighs)
    ranked = np.argsort(example_neighs)[::-1]
    neighbours = ranked[ranked != self_pos][:n_neighs]
    indices = [example_idx] + [int(idx) for idx in neighbours]

    out_df = pd.DataFrame({
        "Article": [df_subset.iloc[idx]['Content'] for idx in indices],
        "Cosine Similarity": example_neighs[indices],
    })

    return out_df

In [None]:
# Show the cleaned text of the subset row at position 103.
df_subset.iloc[103]['Content']

In [None]:
# Nearest neighbours of subset row 103, using the dense TF-IDF matrix as embeddings.
get_example_neighs(103,X.toarray())