In [0]:
import random
random.seed(42)

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords

import numpy as np

from matplotlib import pyplot as plt

from sklearn import metrics, preprocessing
import sklearn.cluster as clst
from sklearn.cluster import KMeans as KM
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle
import warnings; warnings.simplefilter('ignore')

In [0]:
# Load the 20 Newsgroups corpus. subset='all' requests the whole corpus
# rather than only the training split (despite the variable name), and the
# fixed random_state keeps the shuffled document order reproducible.
twenty_train = fetch_20newsgroups(subset='all', categories=None, shuffle=True,
                                  random_state=42)
print(twenty_train.keys())

# Numbers of components (r) tried by the dimensionality-reduction sweep.
rs = [1, 2, 3, 5, 20, 50, 100, 150, 200, 250, 300]

# Number of k-means clusters; set to match the 20 newsgroup categories.
num_clusters = 20

# Extract TF-IDF features: raw term counts first (English stop words removed,
# terms with a document frequency below 3 ignored), then TF-IDF re-weighting.
vectorizer = CountVectorizer(min_df=3, stop_words='english')
tfidf_transformer = TfidfTransformer()
X_train_20 = vectorizer.fit_transform(twenty_train.data)
X_train_tfidf_20 = tfidf_transformer.fit_transform(X_train_20)
print(X_train_tfidf_20.shape)

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])
(18846, 52295)


In [0]:
def dim_red_svd(r):
    """Reduce the TF-IDF matrix to ``r`` components with truncated SVD.

    Reads the module-level ``X_train_tfidf_20`` matrix; the decomposition is
    seeded with ``random_state=42`` so repeated calls give the same result.

    Parameters
    ----------
    r : int
        Number of components to keep.

    Returns
    -------
    The fitted-and-transformed matrix produced by
    ``TruncatedSVD.fit_transform``.
    """
    reducer = TruncatedSVD(n_components=r, random_state=42)
    return reducer.fit_transform(X_train_tfidf_20)

def dim_red_nmf(r):
    """Reduce the TF-IDF matrix to ``r`` components with NMF.

    Reads the module-level ``X_train_tfidf_20`` matrix; the factorization is
    seeded with ``random_state=42`` so repeated calls give the same result.

    Parameters
    ----------
    r : int
        Number of components to keep.

    Returns
    -------
    The fitted-and-transformed matrix produced by ``NMF.fit_transform``.
    """
    factorizer = NMF(n_components=r, random_state=42)
    return factorizer.fit_transform(X_train_tfidf_20)

def scores(km):
    """Evaluate a fitted clustering against the newsgroup labels.

    Compares ``km.labels_`` with the module-level ``twenty_train.target``.

    Parameters
    ----------
    km : object exposing a ``labels_`` attribute (e.g. a fitted KMeans).

    Returns
    -------
    list
        ``[contingency matrix, homogeneity, completeness, V-measure,
        adjusted Rand index, adjusted mutual information]`` in that order.
    """
    truth = twenty_train.target
    assigned = km.labels_

    # The contingency matrix comes first; the scalar metrics follow in the
    # order that print_scores and the sweep loop (index 3 = V-measure) expect.
    results = [metrics.cluster.contingency_matrix(truth, assigned)]
    scalar_metrics = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    for metric in scalar_metrics:
        results.append(metric(truth, assigned))
    return results

def print_scores(scores_list):
    """Print the five scalar clustering metrics, one ``name value`` per line.

    Parameters
    ----------
    scores_list : sequence
        Metrics in the order returned by ``scores``: index 0 holds the
        contingency matrix, which is deliberately not printed; indices 1-5
        hold the scalar scores.

    Raises
    ------
    IndexError
        If ``scores_list`` has fewer than six entries.
    """
    scalar_names = ('homogeneity_score', 'completeness_score',
                    'v_measure_score', 'adjusted_rand_score',
                    'adjusted_mutual_info_score')
    # start=1 skips the contingency matrix stored at position 0.
    for position, name in enumerate(scalar_names, start=1):
        print(name + ' ' + str(scores_list[position]))

def apply_kmeans(X_train_dim_red):
    """Cluster the given feature matrix with k-means.

    Uses the module-level ``num_clusters`` as the number of clusters, 30
    initializations of up to 1000 iterations each, and a fixed
    ``random_state=0`` for reproducibility.

    Parameters
    ----------
    X_train_dim_red : array-like
        Feature matrix to cluster (one row per document).

    Returns
    -------
    tuple
        ``(km, X_kmeans)``: the fitted estimator and the cluster index
        predicted for every row of ``X_train_dim_red``.
    """
    # ``n_jobs`` is intentionally not passed: the parameter was deprecated in
    # scikit-learn 0.23 and removed in 1.0, where supplying it raises
    # TypeError. Parallelism is handled internally by the library.
    km = KM(n_clusters=num_clusters, random_state=0,
            max_iter=1000, n_init=30)
    km.fit(X_train_dim_red)
    X_kmeans = km.predict(X_train_dim_red)
    return km, X_kmeans

def plain(X_dim_red):
    """Identity transform: hand the input back untouched.

    Exists so the untransformed baseline can sit in the same list of
    transform callables as the other post-processing functions.
    """
    return X_dim_red

def normalize(X_dim_red):
    """Standardize the features with ``sklearn.preprocessing.scale``.

    ``scale`` is called with its default arguments.

    Parameters
    ----------
    X_dim_red : array-like
        Feature matrix to standardize.

    Returns
    -------
    The scaled matrix returned by ``preprocessing.scale``.
    """
    standardized = preprocessing.scale(X_dim_red)
    return standardized

def log_transform(X_dim_red, c=0.01):
    """Apply a sign-preserving logarithmic transform element-wise.

    Computes ``sign(x) * (log10(|x| + c) - log10(c))``. Subtracting
    ``log10(c)`` maps 0 to exactly 0, and multiplying by the sign keeps
    negative inputs negative.

    Parameters
    ----------
    X_dim_red : array-like
        Values to transform.
    c : float, optional
        Positive offset added before taking the logarithm so that zero
        entries stay finite. Defaults to 0.01.

    Returns
    -------
    numpy.ndarray
        Transformed values with the same shape as the input.

    Raises
    ------
    ValueError
        If ``c`` is not strictly positive (``log10(c)`` would be undefined).
    """
    if c <= 0:
        raise ValueError("c must be strictly positive, got {!r}".format(c))
    # np.abs (rather than the builtin abs) also accepts plain sequences.
    magnitude = np.log10(np.abs(X_dim_red) + c) - np.log10(c)
    return np.sign(X_dim_red) * magnitude

def log_normalize(X_dim_red):
    """Standardize first, then apply the logarithmic transform.

    Equivalent to ``log_transform(normalize(X_dim_red))``.
    """
    standardized = normalize(X_dim_red)
    return log_transform(standardized)

def normalize_log(X_dim_red):
    """Apply the logarithmic transform first, then standardize.

    Equivalent to ``normalize(log_transform(X_dim_red))``.
    """
    log_scaled = log_transform(X_dim_red)
    return normalize(log_scaled)

In [0]:
# Dimensionality-reduction functions and post-reduction transforms that the
# sweep combines.
dim_techs = [dim_red_svd, dim_red_nmf]
transforms = [plain, normalize, log_transform, log_normalize, normalize_log]

# One entry per evaluated configuration, each in the format returned by
# scores().
scores_log = []

# Best result seen so far for each reduction technique, keyed by function
# name: [best V-measure, "<r> <transform name>"].
best_tech = {tech.__name__: [0, ''] for tech in dim_techs}

In [0]:
# Sweep every (reduction technique, r, transform) combination: reduce once
# per (technique, r), then transform, cluster and score each variant.
for dim_tech in dim_techs:
    for r in rs:
        X_dim_red = dim_tech(r)
        for transform in transforms:
            X_trans = transform(X_dim_red)
            km, _ = apply_kmeans(X_trans)
            scores_list = scores(km)
            # Index 3 of the scores() result is the V-measure, which is the
            # criterion used to pick the best configuration per technique.
            v_measure = scores_list[3]
            if best_tech[dim_tech.__name__][0] < v_measure:
                best_tech[dim_tech.__name__][0] = v_measure
                best_tech[dim_tech.__name__][1] = str(r) + ' ' + transform.__name__
            scores_log.append(scores_list)
            print(dim_tech.__name__ + ' ' + str(r) + ' ' + transform.__name__)
            print_scores(scores_list)
            print('\n')
        print('\n')
    print('\n')
    # A separating space before 'Score is' keeps the transform name and the
    # score label from running together in the printed summary.
    print('Best technique: ' + best_tech[dim_tech.__name__][1] + ' Score is ' + str(best_tech[dim_tech.__name__][0]))
    print('\n\n\n')

dim_red_svd 1 plain
homogeneity_score 0.02779545671734885
completeness_score 0.030289905047271613
v_measure_score 0.0289891194317001
adjusted_rand_score 0.0058970947589372024
adjusted_mutual_info_score 0.024627566992484234


dim_red_svd 1 normalize
homogeneity_score 0.02779545671734885
completeness_score 0.030289905047271613
v_measure_score 0.0289891194317001
adjusted_rand_score 0.0058970947589372024
adjusted_mutual_info_score 0.024627566992484234


dim_red_svd 1 log_transform
homogeneity_score 0.027700279498944567
completeness_score 0.030887906630044655
v_measure_score 0.02920737791423757
adjusted_rand_score 0.0057301769189110315
adjusted_mutual_info_score 0.0245177307686322


dim_red_svd 1 log_normalize
homogeneity_score 0.027182357883646285
completeness_score 0.028815748829267213
v_measure_score 0.027975231426235838
adjusted_rand_score 0.008879181141351064
adjusted_mutual_info_score 0.02403223780121439


dim_red_svd 1 normalize_log
homogeneity_score 0.027700279498944567
completeness

dim_red_svd 150 log_transform
homogeneity_score 0.4224179479331435
completeness_score 0.4576390850613666
v_measure_score 0.43932371643652257
adjusted_rand_score 0.22950270621153734
adjusted_mutual_info_score 0.4205485736064781


dim_red_svd 150 log_normalize
homogeneity_score 0.39072226283656203
completeness_score 0.41282011357517845
v_measure_score 0.4014673367715211
adjusted_rand_score 0.21840375573526866
adjusted_mutual_info_score 0.3887542324515173


dim_red_svd 150 normalize_log
homogeneity_score 0.3980794671635033
completeness_score 0.4533374219031277
v_measure_score 0.42391529149558704
adjusted_rand_score 0.19297157511880977
adjusted_mutual_info_score 0.3961309105280443




dim_red_svd 200 plain
homogeneity_score 0.23606881736175844
completeness_score 0.40927913535216204
v_measure_score 0.2994292956136741
adjusted_rand_score 0.06023279975644719
adjusted_mutual_info_score 0.2335192223745728


dim_red_svd 200 normalize
homogeneity_score 0.1493709612226473
completeness_score 0.2537

dim_red_nmf 5 log_normalize
homogeneity_score 0.2789982455985024
completeness_score 0.3047802694483108
v_measure_score 0.2913199382210863
adjusted_rand_score 0.1297529419419758
adjusted_mutual_info_score 0.27666274302127725


dim_red_nmf 5 normalize_log
homogeneity_score 0.30932091333637957
completeness_score 0.31284585823845973
v_measure_score 0.3110734003324497
adjusted_rand_score 0.1220918891386104
adjusted_mutual_info_score 0.307091804598393




dim_red_nmf 20 plain
homogeneity_score 0.2833502674383202
completeness_score 0.3732318438209899
v_measure_score 0.32213897073844555
adjusted_rand_score 0.09417972886331091
adjusted_mutual_info_score 0.281003829281312


dim_red_nmf 20 normalize
homogeneity_score 0.273521164601938
completeness_score 0.36274719494038016
v_measure_score 0.31187794812726327
adjusted_rand_score 0.08655020655783453
adjusted_mutual_info_score 0.27114196323392725


dim_red_nmf 20 log_transform
homogeneity_score 0.3644197782299894
completeness_score 0.40394915903841


In [0]:
# Persist the scores of every evaluated configuration. `scores_log` holds the
# whole sweep, whereas `scores_list` is only the loop variable left over from
# the final iteration. The context manager closes the file even if dumping
# fails. `pickle` is already imported in the notebook's first cell.
with open("list_scores1", "wb") as pickle_out:
    pickle.dump(scores_log, pickle_out)

In [0]:
# Report the best SVD configuration ("<r> <transform>") and its V-measure;
# the space before 'Score is' separates the two parts of the message.
print('Best technique of dim_red_svd: ' + best_tech['dim_red_svd'][1] + ' Score is ' + str(best_tech['dim_red_svd'][0]))

Best technique of dim_red_svd: 250 log_transformScore is 0.4959037050491699


In [0]:
# Report the best NMF configuration ("<r> <transform>") and its V-measure;
# the space before 'Score is' separates the two parts of the message.
print('Best technique of dim_red_nmf: ' + best_tech['dim_red_nmf'][1] + ' Score is ' + str(best_tech['dim_red_nmf'][0]))

Best technique of dim_red_nmf: 50 normalize_logScore is 0.5114307814724552


# Results:

|Dim Redn|r value|Transformation|V-measure|
|---|---|---|---|
| SVD | 250 | Non linear Log | 0.4959|
| NMF | 50 | Normalize-Non-linear Log | 0.5114|

In [0]:
# Each row of `scores_log` mixes a 2-D contingency matrix with scalar scores,
# so the rows are ragged. dtype=object must be given explicitly: NumPy >= 1.24
# raises ValueError when asked to build a ragged array implicitly. The result
# is written to "scores.npy" as a pickled object array, so reading it back
# requires np.load(..., allow_pickle=True).
np.save("scores", np.array(scores_log, dtype=object))

**Table of best transforms for each value of r and dimensionality reduction.**

|Dim Redn|r value|Transformation|V-measure|
|---|---|---|---|
|svd | 1 | Nonlinear log | 0.02920737791423757|
|svd | 2 | Nonlinear log | 0.22137431035889787|
|svd | 3 | Nonlinear log | 0.24556890227908298|
|svd | 5 | Nonlinear log | 0.33620301926167195|
|svd | 20 | Nonlinear log | 0.3780618791036021|
|svd | 50 | Nonlinear log | 0.40744154838791874|
|svd | 100 | Nonlinear log | 0.4278652555578429|
|svd | 150 | Nonlinear log | 0.43932371643652257|
|svd | 200 | log normalize | 0.42117212169141577|
|**svd** | **250** | **Nonlinear log** | **0.4959037050491699**|
|svd | 300 | Nonlinear log | 0.4028692911432682|
|nmf | 1 | plain | 0.028991012059297015|
|nmf | 2 | normalize log | 0.18078926495899358|
|nmf | 3 | Nonlinear log | 0.20889804017403682|
|nmf | 5 | Nonlinear log | 0.31405437018215376|
|nmf | 20 | Nonlinear log | 0.3831676576523215|
|**nmf** | **50** | **normalize log** | **0.5114307814724552**|
|nmf | 100 | normalize log | 0.4782193561264264|
|nmf | 150 | normalize log | 0.458026053925147|
|nmf | 200 | normalize log | 0.4759353007156798|
|nmf | 250 | normalize log | 0.48040482154957237|
|nmf | 300 | normalize log | 0.4901858365152337|


**The best result for SVD has been for r=250 and Log transform(data).**

**The best result for NMF has been for r=50 and Normalize(Log transform(data)).**