# _All the news_

https://www.kaggle.com/snapcrack/all-the-news

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import sklearn
sklearn.__version__

'0.20.2'

## 1. _Data preparation_

_"Проверьте данные на наличие аномалий"_ - я не очень понял, что здесь имеется в виду.

In [3]:
articles = pd.DataFrame()

for i in range(3):
    articles = articles.append(pd.read_csv('articles{}.csv'.format(1+i), index_col=0))

In [4]:
articles.tail()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
146028,218078,An eavesdropping Uber driver saved his 16-year...,Washington Post,Avi Selk,2016-12-30,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,Uber driver Keith Avila picked up a p...
146029,218079,Plane carrying six people returning from a Cav...,Washington Post,Sarah Larimer,2016-12-30,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,Crews on Friday continued to search L...
146030,218080,After helping a fraction of homeowners expecte...,Washington Post,Renae Merle,2016-12-30,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,When the Obama administration announced a...
146031,218081,"Yes, this is real: Michigan just banned bannin...",Washington Post,Chelsea Harvey,2016-12-30,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,This story has been updated. A new law in...
146032,218082,What happened in Washington state after voters...,Washington Post,Christopher Ingraham,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,The nation’s first recreational marijuana...


In [5]:
articles.shape

(142570, 9)

In [6]:
articles = articles[['title', 'content']]

In [7]:
articles.isnull().sum()

title      2
content    0
dtype: int64

In [8]:
articles['title'][articles['title'].isnull()] = ''

In [9]:
titles = articles['title']

texts = articles['content']

## 2. _Clusterization_ 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation

from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

from sklearn import metrics

Ограничим размеры словарей (для сокращения времени работы алгоритмов).

In [11]:
titles_sparse = TfidfVectorizer(max_features=4096).fit_transform(titles)
texts_sparse = TfidfVectorizer(max_features=4096).fit_transform(texts)

In [12]:
titles_sparse.shape, texts_sparse.shape

((142570, 4096), (142570, 4096))

In [13]:
def get_title_metrics(X_train, _, y_pred):
    return { 'Silhouette' : metrics.silhouette_score(X_train, y_pred) }

def get_content_metrics(X_train, y_test, y_pred):
    result = {
        'ARI': metrics.adjusted_rand_score(y_test, y_pred),
        'AMI': metrics.adjusted_mutual_info_score(y_test, y_pred, average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y_test, y_pred),
        'Completeness': metrics.completeness_score(y_test, y_pred),
        'V-measure': metrics.v_measure_score(y_test, y_pred)}
    result.update(get_title_metrics(X_train, y_test, y_pred))
    return result

def print_metrics(get_metrics, dataset_name, algo_name, X_train, y_test, y_pred):
    print('{} for {}:'.format(algo_name, dataset_name))
    for name, val in get_metrics(X_train, y_test, y_pred).items():
        print('{}: {:.3f}'.format(name, val))
    print(40 * '*')

def print_title_metrics(algo_name, X_train, y_pred):
    print_metrics(get_title_metrics, 'titles', algo_name, X_train, None, y_pred)

def print_content_metrics(algo_name, X_train, y_test, y_pred):
    print_metrics(get_content_metrics, 'content', algo_name, X_train, y_test, y_pred)

### 2.1 _AffinityPropagation_

У меня **AffinityPropagation** не работает на большом датасете - не хватает памяти ([`MemoryError`](https://docs.python.org/3/library/exceptions.html)), пришлось его сильно уменьшить.

In [14]:
ix = np.random.choice(range(titles_sparse.shape[0]), 4096, replace=False)

X_train = titles_sparse[ix]
X_test = texts_sparse[ix]

In [15]:
%%time

y_test = AffinityPropagation().fit_predict(X_train)

print_title_metrics('AffinityPropagation', X_train, y_test)

AffinityPropagation for titles:
Silhouette: -0.031
****************************************
Wall time: 2min 29s


In [16]:
%%time

print_content_metrics('AffinityPropagation', X_train, y_test, AffinityPropagation().fit_predict(X_test))

AffinityPropagation for content:
ARI: 0.001
AMI: 0.002
Homogenity: 0.055
Completeness: 0.022
V-measure: 0.031
Silhouette: -0.023
****************************************
Wall time: 2min 39s


### 2.2 _DBSCAN_

Так же пришлось уменьшить датасет (20% от исходных данных), а то долго считает.

In [17]:
_, X_train, _, X_test = train_test_split(titles_sparse, texts_sparse, test_size=0.2, random_state=299)

In [18]:
X_train.shape, X_test.shape

((28514, 4096), (28514, 4096))

In [19]:
%%time

y_test = DBSCAN().fit_predict(X_train)

print_title_metrics('DBSCAN', X_train, y_test)

DBSCAN for titles:
Silhouette: -0.284
****************************************
Wall time: 1min 4s


In [20]:
%%time

print_content_metrics('DBSCAN', X_train, y_test, DBSCAN().fit_predict(X_test))

DBSCAN for content:
ARI: -0.003
AMI: -0.001
Homogenity: 0.000
Completeness: 0.000
V-measure: 0.000
Silhouette: -0.009
****************************************
Wall time: 5min 11s


# 3. _PCA Clusterization_ 

In [21]:
titles_pca = TruncatedSVD(n_components=128).fit_transform(titles_sparse)
texts_pca = TruncatedSVD(n_components=128).fit_transform(texts_sparse)

titles_pca.shape, texts_pca.shape

((142570, 128), (142570, 128))

### 3.1 _AffinityPropagation_

In [22]:
X_train = titles_pca[ix]
X_test = texts_pca[ix]

In [23]:
%%time

y_test = AffinityPropagation().fit_predict(X_train)

print_title_metrics('PCA / AffinityPropagation', X_train, y_test)

PCA / AffinityPropagation for titles:
Silhouette: 0.008
****************************************
Wall time: 35.5 s


In [24]:
%%time

print_content_metrics('PCA / AffinityPropagation', X_train, y_test, AffinityPropagation().fit_predict(X_test))

PCA / AffinityPropagation for content:
ARI: 0.016
AMI: 0.063
Homogenity: 0.454
Completeness: 0.433
V-measure: 0.443
Silhouette: -0.198
****************************************
Wall time: 51.7 s


### 3.2 _DBSCAN_

In [25]:
_, X_train, _, X_test = train_test_split(titles_pca, texts_pca, test_size=0.2, random_state=299)

In [26]:
%%time

y_test = DBSCAN().fit_predict(X_train)

print_title_metrics('PCA / DBSCAN', X_train, y_test)

PCA / DBSCAN for titles:
Silhouette: 0.265
****************************************
Wall time: 6min 48s


In [27]:
%%time

print_connent_metrics('PCA / DBSCAN', X_train, y_test, DBSCAN().fit_predict(X_test))

NameError: name 'print_connent_metrics' is not defined

### 3.3 _AgglomerativeClustering_

In [28]:
%%time

y_test = AgglomerativeClustering().fit_predict(X_train)

print_title_metrics('PCA / AgglomerativeClustering', X_train, y_test)

PCA / AgglomerativeClustering for titles:
Silhouette: 0.113
****************************************
Wall time: 2min 43s


In [29]:
%%time

print_content_metrics('PCA / AgglomerativeClustering', X_train, y_test, AgglomerativeClustering().fit_predict(X_test))

PCA / AgglomerativeClustering for content:
ARI: 0.058
AMI: 0.015
Homogenity: 0.028
Completeness: 0.011
V-measure: 0.016
Silhouette: 0.036
****************************************
Wall time: 2min 30s
