In [1]:
# relative to this notebook,  adding src folder to sys.path
src_path = os.path.abspath(os.path.join('..'))
src = os.path.join(src_path,"src")
if src not in sys.path:
    sys.path.append(src)

In [2]:
import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# https://towardsdatascience.com/a-friendly-introduction-to-text-clustering-fa996bcefd04

In [3]:
data_in = os.path.join(src_path, 'data','02_intermediate','ArticleDataset.json')
article_complete = pd.read_json(data_in)
article_small = article_complete.head(1000)


In [4]:
# name the columns
article_small.columns = ['date','heading','content','link','empty']

# cut down columns
articles = article_small[['heading','content']]


In [6]:
# Data cleaning
# https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f

import spacy

# , disable=['parser', 'tagger', 'ner']
nlp = spacy.load("en_core_web_sm")
stops = spacy.lang.en.stop_words.STOP_WORDS


def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)


articles['Text_After_Clean'] = articles['content'].apply(normalize, lowercase=True, remove_stopwords=True)

In [9]:
vec = TfidfVectorizer()
vec.fit(articles.content.values)
features = vec.transform(articles.Text_After_Clean.values)        # <--- feature matrix

# try dim reduction using LSI
# TODO

# DBSCAN model, tune the parameter
dbscan = DBSCAN(eps=0.8, min_samples=2)
db1 = dbscan.fit(features)

labels = db1.labels_

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("number of clusters:",n_clusters)

# identify core points
#core_samples = np.zeros_like(labels,dtype =bool)
#core_samples[dbscan.core_sample_indices_] = True
#print(core_samples)

# metric, Sillhouette score or BIC

number of clusters: 9
