In [1]:
import numpy as np
import pandas as pd 
import nltk
import re
import string
import scipy.sparse as sp
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the raw article dump; the bare trailing expression renders the frame as the cell output
ARTICLES_CSV = 'articles_raw.csv'
raw_articles_data = pd.read_csv(ARTICLES_CSV)
raw_articles_data

Unnamed: 0,id,title,text,date
0,1247638,Triple W to Feature DFree at CES 2019 - First ...,jan. according to the u. s. national institute...,2019-01-03
1,1247639,Pundi X Coin Review: Should You Invest in the ...,the npxs token developer aim to have ethereum ...,2019-01-06
2,1247641,Pundi X 1-Day Trading Volume Reaches $1.04 Mil...,pundi x (currency:npxs) traded 2. u dollar dur...,2019-01-02
3,1247642,Pundi X Hits Market Capitalization of $74.35 M...,pundi x (currency:npxs) traded 0. pm eastern o...,2019-01-04
4,1247643,Pundi X (NPXS) Market Cap Reaches $73.89 Million,pundi x (currency:npxs) traded down 0. u dolla...,2019-01-03
...,...,...,...,...
11182,1388717,,microsoft for the mixed reality partnership pr...,2019-01-02
11183,1388718,,ar (ar) app that convert any room to a virtual...,2019-01-09
11184,1388738,Intercellular is an Educational Experience Tak...,there nothing like a good educational vr (vr) ...,2019-02-13
11185,1388746,Wonderfall Mixes VR and Actual Reality For An ...,"it got a lot of vr and ar exhibit right now, n...",2019-01-30


In [4]:
# Create function to process and tokenize raw texts

# Translation table built once instead of on every call: apostrophes are
# deleted outright (so "don't" stays one token, "dont"), every other
# punctuation character is replaced by a space.
_PUNCT_TABLE = {ord(ch): ' ' for ch in string.punctuation}
_PUNCT_TABLE[ord("'")] = None


def preprocess(text, stopwords={}, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Normalize a raw text and return its list of cleaned tokens.

    Steps, in order: lower-case, blank out ``t.co`` short links, drop
    possessive ``'s``, strip punctuation, blank out non-ASCII runs, tokenize,
    remove stopwords, lemmatize, remove stopwords again, and keep only purely
    alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw document text.
    stopwords : container of str
        Words to discard; only membership tests are performed on it, so the
        shared default is never mutated.
    lemmatizer : object with a ``lemmatize(word)`` method
        Defaults to a single shared NLTK ``WordNetLemmatizer``.

    Returns
    -------
    list of str
        Lemmatized alphabetic tokens with stopwords removed.
    """
    # Lower case
    text = text.lower()
    # Handle URL (dot escaped so it only matches the literal "t.co" host)
    text = re.sub(r"https?://t\.co/\w{10}", ' ', text)
    # Deal with "'s"
    text = re.sub(r"'s", "", text)
    # Delete remaining apostrophes and turn all other punctuation into spaces
    text = text.translate(_PUNCT_TABLE)
    # Handle unicode
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Split the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords BEFORE lemmatizing: the WordNet lemmatizer treats words
    # as nouns by default and rewrites e.g. "was" -> "wa" and "has" -> "ha",
    # which would otherwise no longer match the stopword list.
    tokens = [word for word in tokens if word not in stopwords]
    # Lemmatize the text
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    # Remove lemmas that are stopwords, and anything containing digits
    return [word for word in lemmas if word not in stopwords and word.isalpha()]

In [5]:
# Fetch the NLTK resources this notebook relies on: stopword list, WordNet and Punkt
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# NLTK's English stopwords, plus any extra words added to extra_stopwords
extra_stopwords = set()
stopwords = extra_stopwords.union(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shamita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shamita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shamita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Clean every article's text and keep it as one space-joined string of tokens
articles = [
    ' '.join(preprocess(raw_articles_data['text'][row], stopwords))
    for row in range(raw_articles_data.shape[0])
]

In [7]:
# Create function to build a sparse TFIDF matrix
def tfidf(docs):
    """Build a sparse TF-IDF matrix from whitespace-tokenized documents.

    The term frequency is the raw count of a word in a document and the
    inverse document frequency is ``log(n_docs / n_docs_containing_word)``.
    The matrix is assembled directly in CSR form, so no dense
    ``n_docs x n_vocab`` array is ever allocated.

    Parameters
    ----------
    docs : sequence of str
        Documents whose tokens are separated by whitespace.

    Returns
    -------
    tuple
        ``(matrix, vocab)`` where ``matrix`` is a ``scipy.sparse.csr_matrix``
        of shape ``(len(docs), len(vocab))`` and ``vocab`` is the sorted list
        of distinct words; column ``j`` of the matrix corresponds to
        ``vocab[j]``.
    """
    document_words = [doc.split() for doc in docs]
    # A set comprehension avoids the quadratic cost of sum(list_of_lists, [])
    vocab = sorted({word for words in document_words for word in words})
    vocab_dict = {word: col for col, word in enumerate(vocab)}

    # Assemble the CSR arrays row by row from per-document word counts
    indptr = [0]
    indices = []
    data = []
    for words in document_words:
        counts = {}
        for word in words:
            col = vocab_dict[word]
            counts[col] = counts.get(col, 0) + 1
        for col in sorted(counts):
            indices.append(col)
            data.append(counts[col])
        indptr.append(len(indices))

    X_tfidf = sp.csr_matrix(
        (
            np.asarray(data, dtype=float),
            np.asarray(indices, dtype=np.int64),
            np.asarray(indptr, dtype=np.int64),
        ),
        shape=(len(document_words), len(vocab)),
    )

    # Each stored entry is one (document, word) pair, so counting the column
    # indices gives the number of documents containing each word.
    doc_freq = np.bincount(X_tfidf.indices, minlength=len(vocab))
    idf = np.log(X_tfidf.shape[0] / doc_freq)

    # Scale every stored count by the idf of its column
    X_tfidf.data *= idf[X_tfidf.indices]
    # Words present in every document have idf == 0; drop those explicit zeros
    X_tfidf.eliminate_zeros()

    return (X_tfidf, vocab)

In [None]:
# Compute a TFIDF matrix for the texts and normalize the matrix
# (the original call was misspelled `tfihdf`, which raises NameError)
X_tfidf, words = tfidf(articles)
tf_idf_norm = normalize(X_tfidf)

In [None]:
# Create function to select the optimal k value of k-means clustering with silhouette method
def plot_sil(df, kmax=30, random_state=0xCAFE):
    """Plot the silhouette score of k-means for every k from 2 to ``kmax``.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    kmax : int
        Largest number of clusters to try (inclusive).
    random_state : int
        Seed passed to every KMeans fit so the curve is reproducible across
        runs.

    Returns
    -------
    None
        The curve is drawn with matplotlib and shown immediately.
    """
    sil = []
    for k in range(2, kmax+1):
        # `algorithm` is left at its default: the value 'auto' was deprecated in
        # scikit-learn 1.1 and later removed. `n_init=10` pins the number of
        # restarts, whose default changed in scikit-learn 1.4.
        kmeans = KMeans(n_clusters=k, max_iter=600, n_init=10, random_state=random_state).fit(df)
        labels = kmeans.labels_
        sil.append(silhouette_score(df, labels, metric='euclidean'))

    plt.plot(range(2, kmax+1), sil)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Silhouette Method')
    plt.grid()
    plt.show()

In [None]:
# Reduce the normalized TFIDF matrix with truncated SVD before clustering.
# NOTE(review): `svd_matrix` was used here but never defined anywhere in the
# notebook, so a fresh Restart & Run All failed at this cell. The 100
# components mirror the second-stage reduction further down — confirm this
# matches the cell that was lost.
random_state=0xCAFE
svd_matrix = TruncatedSVD(n_components=100, algorithm='randomized', random_state=random_state).fit_transform(tf_idf_norm)

# Determine the optimal k value by identifying the highest score
plot_sil(svd_matrix, kmax=30)

In [None]:
# Create function to cluster the texts and plot the position of the texts in 2D space 
def plot_and_predict(df, n_clusters=2, random_state=0xCAFE):
    """Cluster the rows of ``df`` with k-means and scatter-plot them in 2D.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    n_clusters : int
        Number of k-means clusters.
    random_state : int
        Seed used for BOTH the k-means fit and the 2-component SVD
        projection, so labels and plot are reproducible across runs.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row of ``df``.
    """
    # `algorithm` is left at its default: the value 'auto' was deprecated in
    # scikit-learn 1.1 and later removed. `n_init=10` pins the number of
    # restarts, whose default changed in scikit-learn 1.4.
    kmeans = KMeans(n_clusters=n_clusters, max_iter=600, n_init=10, random_state=random_state)
    prediction = kmeans.fit_predict(df)

    # Project onto two SVD components purely for visualisation
    plot_matrix = TruncatedSVD(n_components=2, algorithm='randomized', random_state=random_state).fit_transform(df)

    fig, ax = plt.subplots(figsize=(12,6))
    for c in range(n_clusters):
        members = plot_matrix[prediction==c]
        ax.scatter(members[:,0], members[:,1], label=c, alpha=0.8, edgecolors='none')

    ax.legend()
    ax.grid(True)
    plt.show()

    return prediction

In [None]:
# Split the SVD-reduced articles into two k-means clusters and plot them in 2D
prediction = plot_and_predict(df=svd_matrix, n_clusters=2)

In [None]:
# Display the raw article texts assigned to each of the two cluster labels
n_clusters = 2
for i in range(n_clusters):
    print('%d, The raw texts in group %d:'%(i,i))
    result = raw_articles_data.loc[prediction==i, 'text']
    display(result)

In [None]:
# Second pass: keep only the articles that received cluster label 0
in_cluster_0 = prediction == 0
index = np.flatnonzero(in_cluster_0)
new_raw_articles_data = raw_articles_data.iloc[index].reset_index(drop=True)

# Re-project just those rows of the normalized TFIDF matrix onto 100 SVD components
tf_idf_norm_step2 = tf_idf_norm[in_cluster_0]
step2_svd = TruncatedSVD(n_components=100, algorithm='randomized', random_state=0xCAFE)
svd_matrix_step2 = step2_svd.fit_transform(tf_idf_norm_step2)

# Plot silhouette scores for k = 2..30 on the reduced subset to pick k
random_state=0xCAFE
plot_sil(svd_matrix_step2, kmax=30)

In [None]:
# Split the label-0 subset into two k-means clusters and plot them in 2D
prediction2 = plot_and_predict(df=svd_matrix_step2, n_clusters=2)

In [None]:
# Display the raw texts of each second-pass cluster (rows with first-pass label 0 only)
n_clusters = 2
for i in range(n_clusters):
    print('%d, The raw texts in group %d:'%(i,i))
    result = raw_articles_data.loc[prediction==0, 'text'][prediction2==i]
    display(result)

In [None]:
# Find cluster centers
# The seed is now actually passed to KMeans (it was assigned but unused), so
# the centers are reproducible. `algorithm` is left at its default because the
# value 'auto' was deprecated in scikit-learn 1.1 and later removed; `n_init=10`
# pins the number of restarts, whose default changed in scikit-learn 1.4.
random_state = 0xCAFE
kmeans = KMeans(n_clusters=2, max_iter=600, n_init=10, random_state=random_state)
fitted = kmeans.fit(svd_matrix_step2)
pred = kmeans.predict(svd_matrix_step2)
centers = fitted.cluster_centers_

In [None]:
# For each cluster center, find the row of svd_matrix_step2 nearest to it;
# the article nearest to the first center (closest[0]) is the representative
closest, _ = pairwise_distances_argmin_min(centers, svd_matrix_step2)
representative_article = new_raw_articles_data.loc[closest[0], 'text']
new_raw_articles_data.loc[closest[0], 'title']

In [None]:
# Top-5 keywords: the five vocabulary words with the largest TFIDF weight
# in the representative article's row, highest first
tf_idf_array = tf_idf_norm_step2.toarray()
representative_row = tf_idf_array[closest[0]]
top5_index = np.argsort(representative_row)[-5:][::-1]
vocabulary = np.array(words)
top5_words = list(vocabulary[top5_index.tolist()])
top5_words

In [None]:
# Top-10 documents: rank articles by the sum of their TFIDF weights over the
# top-5 keyword columns, highest first
keyword_scores = tf_idf_array[:, top5_index]
sum_tfidf = keyword_scores.sum(axis=1)
top10_index = np.argsort(sum_tfidf)[-10:][::-1]
top_articles_q3 = np.array(new_raw_articles_data.loc[top10_index, 'text']).tolist()
new_raw_articles_data.loc[top10_index, 'title']

In [None]:
# Top-10 documents: rank articles by cosine similarity of their full TFIDF row
# to the representative article, highest first
# NOTE(review): the full pairwise matrix is computed but only row closest[0] is read
c_matrix = cosine_similarity(tf_idf_norm_step2)
array1 = np.asarray(c_matrix).squeeze()[closest[0]]
# Positions of articles with strictly positive similarity
array2 = np.flatnonzero(array1 > 0)
top10_index2 = np.argsort(array1[array2])[-10:][::-1]
# top10_index2 indexes into array2, so map back to article rows
top10_rows_q4 = array2[top10_index2]
top_articles_q4 = np.array(new_raw_articles_data.loc[top10_rows_q4, 'text']).tolist()
new_raw_articles_data.loc[top10_rows_q4, 'title']

In [None]:
# Top-10 documents: same ranking as above, but cosine similarity is computed
# on the top-5 keyword columns only
# NOTE(review): the full pairwise matrix is computed but only row closest[0] is read
c_matrix = cosine_similarity(tf_idf_norm_step2[:,top5_index])
array1 = np.asarray(c_matrix).squeeze()[closest[0]]
# Positions of articles with strictly positive similarity
array2 = np.flatnonzero(array1 > 0)
top10_index3 = np.argsort(array1[array2])[-10:][::-1]
# top10_index3 indexes into array2, so map back to article rows
top10_rows_q5 = array2[top10_index3]
top_articles_q5 = np.array(new_raw_articles_data.loc[top10_rows_q5, 'text']).tolist()
new_raw_articles_data.loc[top10_rows_q5, 'title']

In [None]:
# Collect the results for this category in a fixed order:
# [representative text, top-5 keywords, then the three top-10 document lists]
Results_Dict = {
    'Category 1': [
        representative_article,
        top5_words,
        top_articles_q3,
        top_articles_q4,
        top_articles_q5,
    ],
}

In [None]:
# Slot 0: text of the representative article
Results_Dict['Category 1'][0]

In [None]:
# Slot 1: the top-5 keywords
Results_Dict['Category 1'][1]

In [None]:
# Slot 2: top-10 article texts ranked by summed TFIDF over the top-5 keywords
Results_Dict['Category 1'][2]

In [None]:
66. 