**Корректность проверена на Python 3.6:**
+ numpy 1.15.4
+ sklearn 0.20.2

# Пример: кластеризация текстов

In [None]:
import warnings
# Silence every warning category for the rest of the session
# (presumably to keep the notebook output free of library warnings).
warnings.filterwarnings('ignore')

## Выборка

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the training split of the 20 Newsgroups corpus with all categories
# and print the category names so a subset can be chosen below.
train_all = fetch_20newsgroups(subset='train')
print(train_all.target_names)

In [None]:
# "Simple" sample: training documents restricted to three categories
# taken from different top-level groups (comp / soc / rec).
simple_dataset = fetch_20newsgroups(
    categories=[
        'comp.sys.mac.hardware',
        'soc.religion.christian',
        'rec.sport.hockey',
    ],
    subset='train',
)

In [None]:
# Show the raw text of the first document in the sample.
print(simple_dataset.data[0])

In [None]:
# Display the class labels of the sample (shown as the cell's output value).
simple_dataset.target

In [None]:
# Show the raw text of the last document in the sample.
print(simple_dataset.data[-1])

In [None]:
# Show the raw text of the second-to-last document in the sample.
print(simple_dataset.data[-2])

In [None]:
# Number of documents in the sample.
print(len(simple_dataset.data))

## Признаки

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features. With integer thresholds scikit-learn interprets them as
# absolute document counts: terms occurring in more than 500 documents or in
# fewer than 10 documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(max_df=500, min_df=10)
# Learn the vocabulary and build the document-term matrix in one step.
matrix = vectorizer.fit_transform(simple_dataset.data)

In [None]:
# Shape of the document-term matrix produced by the vectorizer.
print(matrix.shape)

## Агломеративная кластеризация (neighbour joining)

In [None]:
# Import from the public package path: `sklearn.cluster.hierarchical` is a
# private module that later scikit-learn releases deprecated and removed,
# while `sklearn.cluster` exposes the class in the pinned 0.20.2 as well.
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering into three groups using cosine distance and
# complete linkage.
# NOTE: newer scikit-learn versions rename `affinity` to `metric`; the
# keyword is kept as-is for the sklearn 0.20.2 this notebook targets.
model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
# The matrix is converted to a dense array before fitting.
preds = model.fit_predict(matrix.toarray())

In [None]:
# Print the cluster id assigned to every document (as a plain list so that
# the output is not abbreviated the way a NumPy array repr would be).
print(list(preds))

In [None]:
# Print the TF-IDF row of the first document.
print(matrix[0])

In [None]:
# Display the vocabulary terms, ordered by feature (column) index.
vectorizer.get_feature_names()

In [None]:
# Look up the term behind feature index 877
# (presumably one of the column indices seen in the row printed above).
vectorizer.get_feature_names()[877]

In [None]:
# Display the first document as the cell's output value (its repr, unlike
# the print() call used earlier).
simple_dataset.data[0]

## KMeans

In [None]:
from sklearn.cluster import KMeans

# KMeans with three clusters; random_state is fixed so that the cluster ids
# are reproducible between runs.
model = KMeans(n_clusters=3, random_state=1)
# NOTE(review): the matrix is densified before fitting; kept as-is because
# the hard-coded cluster-to-class mapping used later depends on these results.
preds = model.fit_predict(matrix.toarray())
print(preds)

In [None]:
# Print the true class labels for comparison with the cluster ids above.
print(simple_dataset.target)

In [None]:
# Correspondence between KMeans cluster ids (keys) and class labels (values).
# NOTE(review): the pairs are hard-coded, presumably read off the two
# printouts above; they are only valid for this exact clustering run.
mapping = {0: 0, 1: 2, 2: 1}
mapped_preds = [mapping[cluster_id] for cluster_id in preds]
# Comparing the list with the target array is evaluated elementwise,
# so summing the result counts the misassigned documents.
mismatches = sum(mapped_preds != simple_dataset.target)
print(float(mismatches) / len(simple_dataset.target))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Supervised reference point: mean cross-validated score of a logistic
# regression trained on the same TF-IDF features with the true labels.
clf = LogisticRegression()
fold_scores = cross_val_score(clf, matrix, simple_dataset.target)
print(fold_scores.mean())

## Более сложная выборка

In [None]:
# Harder sample: all three categories come from the same `comp.*` group.
dataset = fetch_20newsgroups(
    categories=[
        'comp.sys.mac.hardware',
        'comp.os.ms-windows.misc',
        'comp.graphics',
    ],
    subset='train',
)

In [None]:
# Refit the same vectorizer on the new sample; `matrix` now refers to the
# document-term matrix of `dataset`, replacing the earlier one.
matrix = vectorizer.fit_transform(dataset.data)
model = KMeans(n_clusters=3, random_state=42)
preds = model.fit_predict(matrix.toarray())
# Print cluster ids and true labels one after another for visual comparison.
print(preds)
print(dataset.target)

In [None]:
# Correspondence between KMeans cluster ids (keys) and class labels (values).
# NOTE(review): hard-coded, presumably read off the printouts above; only
# valid for this exact clustering run.
mapping = {0: 2, 1: 1, 2: 0}
mapped_preds = [mapping[cluster_id] for cluster_id in preds]
# The list-vs-array comparison is elementwise; the sum counts the mismatches.
mismatches = sum(mapped_preds != dataset.target)
print(float(mismatches) / len(dataset.target))

In [None]:
# Supervised reference point on the harder sample: mean cross-validated
# score of a logistic regression on the same features.
clf = LogisticRegression()
fold_scores = cross_val_score(clf, matrix, dataset.target)
print(fold_scores.mean())

## SVD + KMeans

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 1000 components with truncated SVD and run
# KMeans on the reduced features instead of the full matrix.
model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
# Print cluster ids and true labels one after another for visual comparison.
print(preds)
print(dataset.target)

In [None]:
# Correspondence between KMeans cluster ids (keys) and class labels (values).
# NOTE(review): hard-coded, presumably read off the printouts above; only
# valid for this exact SVD + KMeans run.
mapping = {2: 1, 1: 0, 0: 2}
mapped_preds = [mapping[cluster_id] for cluster_id in preds]
# The list-vs-array comparison is elementwise; the sum counts the mismatches.
mismatches = sum(mapped_preds != dataset.target)
print(float(mismatches) / len(dataset.target))

In [None]:
# Same pipeline as before, but with only 200 SVD components.
model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=200, random_state=123)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
# Print cluster ids and true labels one after another for visual comparison.
print(preds)
print(dataset.target)

In [None]:
import itertools
def validate_with_mappings(preds, target, dataset=None, labels=(0, 1, 2)):
    """Print the error rate for every assignment of cluster ids to classes.

    Cluster ids produced by a clustering algorithm are arbitrary, so every
    permutation of ``labels`` is tried as a cluster-to-class mapping and the
    share of mismatched items is printed for each one (one value per line).

    Args:
        preds: Sequence of predicted cluster ids; every id must be in
            ``labels``. Plain lists and NumPy arrays are both accepted.
        target: Sequence of true class labels, same length as ``preds``.
        dataset: Unused; kept (now optional) so existing calls still work.
        labels: Label values shared by clusters and classes. The default
            reproduces the original three-cluster behaviour and print order.

    Raises:
        ValueError: If ``target`` is empty or the lengths differ.
        KeyError: If ``preds`` contains an id that is not in ``labels``.
    """
    labels = list(labels)
    if len(preds) != len(target):
        raise ValueError('preds and target must have the same length')
    if len(target) == 0:
        raise ValueError('target must not be empty')

    # Keys are listed in descending order so that, for the default labels,
    # each permutation (a, b, c) yields {2: a, 1: b, 0: c} as before.
    cluster_ids = labels[::-1]
    for assigned in itertools.permutations(labels):
        mapping = dict(zip(cluster_ids, assigned))
        # Pairwise comparison works for lists as well as arrays; comparing
        # two whole lists with != would give a single bool, not a mask.
        errors = sum(mapping[pred] != true for pred, true in zip(preds, target))
        print(float(errors) / len(target))
        
# Print the error rate of the current clustering under every possible
# cluster-to-class mapping instead of hand-picking a single one.
validate_with_mappings(preds, dataset.target, dataset)

In [None]:
# Same 200-component pipeline, but with a different SVD random_state
# (presumably to see how much the result depends on the SVD seed).
model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=200, random_state=321)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
# Print cluster ids and true labels one after another for visual comparison.
print(preds)
print(dataset.target)
# Error rate under every possible cluster-to-class mapping.
validate_with_mappings(preds, dataset.target, dataset)

## Итоги

1. Получили интерпретируемый результат на обеих выборках
1. Реальность, однако, намного более жестока
1. Попробовали использовать AgglomerativeClustering, KMeans и связку TruncatedSVD + KMeans