In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
dataset = pd.read_csv('/content/20newsgroup_preprocessed.csv', delimiter=';')

In [4]:
dataset = dataset.dropna()

In [5]:
# Extract columns that are needed
dataset = dataset[['target', 'text_cleaned']]

In [6]:
dataset_shuffle = dataset.sample(frac=1, random_state=0)


In [7]:
reduced_dataset = pd.DataFrame([], columns=['target', 'text', 'text_cleaned'])

# 各ジャンル40文書抽出
for gunre in dataset['target'].unique():
    dataset_gunre = dataset_shuffle[dataset_shuffle['target']==gunre]
    dataset_gunre = dataset_gunre.iloc[0:40, :]
    reduced_dataset =  pd.concat([reduced_dataset, dataset_gunre], axis=0)

dataset = reduced_dataset.reset_index(drop=True)


In [8]:
# Tokenization
text_cleaned = list(dataset['text_cleaned'])

def tokenize(text):
    text = text.split(' ')
    return text

text_tokenized = [tokenize(text) for text in text_cleaned]

In [9]:
# Steming, Lemmatize
from nltk.corpus import wordnet as wn

def lemmatize_word(word):
    word=word.lower()
    
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma


# Remove stop words and too short words
def remove_stopwords(word, stopwordset):
    if word in stopwordset or len(word) <= 2:
        return None
    else:
        return word


en_stop = nltk.corpus.stopwords.words('english')

text_lemmatized = []
documents = []

# Aparently, this dataset does not include 'the', 'of', and so on, 
# but remove stop words just to be sure
for text in text_tokenized:
    text_lemmatized = [remove_stopwords(lemmatize_word(word), en_stop) for word in text]
    documents.append([word for word in text_lemmatized if word is not None])

In [48]:
len(documents)

800

In [10]:
word2id = {}
counter = 0
for document in documents:
    for word in document:
        if word not in word2id.keys():
            word2id[word] = counter
            counter += 1


In [11]:
collection = nltk.TextCollection(documents)
terms = list(set(collection))
tf_idf = []
for index, doc in enumerate(documents):
    tmp_vec = np.zeros(len(word2id))
    for term in word2id.keys():
        tmp_vec[word2id[term]] = collection.tf_idf(term, doc)
    tf_idf.append(list(tmp_vec))

In [17]:
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

# トピック数に設定
num_clusters = 20
km20 = KMeansClusterer(num_clusters, distance=cosine_distance, repeats=10)

In [18]:
tf_idf = np.array(tf_idf)

In [19]:
clusters = km20.cluster(tf_idf, assign_clusters=True)

In [20]:
dataset20_cluster_num = pd.concat([dataset, pd.Series(clusters)], axis=1)

d20cn = dataset20_cluster_num.rename(columns={0: "cluster_num"})

In [21]:
d20cn["cluster_num"].value_counts()

0     89
3     83
10    60
14    55
2     49
9     49
7     48
13    46
17    45
19    37
15    34
8     33
18    29
5     26
16    23
1     22
12    21
6     19
11    18
4     14
Name: cluster_num, dtype: int64

In [22]:
for i in range(20):
    print("Result of cluster No.{}".format(i))
    print(d20cn[d20cn["cluster_num"]==i]["target"].value_counts())
    print("\n")

Result of cluster No.0
soc.religion.christian    23
talk.religion.misc        19
alt.atheism               13
talk.politics.mideast      9
talk.politics.guns         8
talk.politics.misc         6
sci.med                    3
sci.space                  2
rec.sport.baseball         2
misc.forsale               1
comp.graphics              1
rec.motorcycles            1
rec.autos                  1
Name: target, dtype: int64


Result of cluster No.1
comp.sys.mac.hardware       4
talk.politics.mideast       2
misc.forsale                2
sci.med                     2
comp.graphics               2
rec.sport.baseball          2
comp.sys.ibm.pc.hardware    2
sci.electronics             2
rec.sport.hockey            1
rec.autos                   1
talk.politics.guns          1
alt.atheism                 1
Name: target, dtype: int64


Result of cluster No.2
sci.med                    6
sci.space                  5
talk.politics.guns         5
rec.sport.hockey           5
alt.atheism         

In [23]:
dataset_11topic = dataset.copy()

まずは，　token化
steming
tf_idf
cosin
kmeans

In [24]:
# 似ているカテゴリをまとめる
# num_topics 20 -> 11
dataset_11topic.loc[dataset_11topic['target'].isin(['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x']), 'target'] = 'comp_pc'
dataset_11topic.loc[dataset_11topic['target'].isin(['rec.autos', 'rec.motorcycles']), 'target'] = 'rec.vehicle'
dataset_11topic.loc[dataset_11topic['target'].isin(['rec.sport.baseball', 'rec.sport.hockey']), 'target'] = 'rec_sport'
dataset_11topic.loc[dataset_11topic['target'].isin(['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']), 'target'] = 'talk.politics'
dataset_11topic.loc[dataset_11topic['target'].isin(['alt.atheism', 'soc.religion.christian', 'talk.religion.misc']), 'target'] = 'religion'

In [25]:
dataset_11topic['target'].unique().size

11

In [26]:
# まとめた後のトピック数
num_clusters = 11
km11 = KMeansClusterer(num_clusters, distance=cosine_distance, repeats=10)

In [27]:
clusters = km11.cluster(tf_idf, assign_clusters=True)

In [28]:
dataset11_cluster_num = pd.concat([dataset_11topic, pd.Series(clusters)], axis=1)

d11cn = dataset11_cluster_num.rename(columns={0: "cluster_num"})

In [29]:
d11cn["cluster_num"].value_counts()

0     279
3     133
1      78
5      74
6      52
7      38
2      37
10     36
9      32
4      26
8      15
Name: cluster_num, dtype: int64

In [30]:
for i in range(11):
    print("Result of cluster No.{}".format(i))
    print(d11cn[d11cn["cluster_num"]==i]["target"].value_counts())
    print("\n")

Result of cluster No.0
talk.politics      72
religion           62
sci.crypt          29
comp_pc            24
rec_sport          23
rec.vehicle        20
sci.space          17
sci.med            14
sci.electronics    10
comp.graphics       5
misc.forsale        3
Name: target, dtype: int64


Result of cluster No.1
comp_pc            17
rec.vehicle        14
religion           11
misc.forsale       11
rec_sport           7
sci.crypt           4
sci.electronics     4
talk.politics       3
comp.graphics       3
sci.med             2
sci.space           2
Name: target, dtype: int64


Result of cluster No.2
religion           14
talk.politics       9
misc.forsale        3
comp_pc             3
sci.crypt           2
sci.med             2
sci.space           2
rec_sport           1
sci.electronics     1
Name: target, dtype: int64


Result of cluster No.3
comp_pc            71
comp.graphics      20
misc.forsale       11
rec_sport           7
sci.electronics     7
sci.med             5
religio

In [103]:
d11cn["target"].unique()

array(['religion', 'comp.graphics', 'comp_pc', 'misc.forsale',
       'rec.vehicle', 'rec_sport', 'sci.crypt', 'sci.electronics',
       'sci.med', 'sci.space', 'talk.politics'], dtype=object)

In [31]:
dataset_7topic = dataset_11topic.copy()

# num_topics 11 -> 7
dataset_7topic.loc[dataset_7topic["target"].isin(["comp.graphics", "comp_pc", "sci.crypt", "sci.electronics"]), "target"] = "comp"
dataset_7topic.loc[dataset_7topic["target"].isin(["rec.vehicle", "rec_sport"]), "target"] = "rec"


In [32]:
# 大きなまとまりでのトピック数でクラスタリング
num_clusters = 7
km7 = KMeansClusterer(num_clusters, distance=cosine_distance, repeats=10)

In [33]:
clusters = km7.cluster(tf_idf, assign_clusters=True)

In [36]:
dataset7_cluster_num = pd.concat([dataset_7topic, pd.Series(clusters)], axis=1)

d7cn = dataset7_cluster_num.rename(columns={0: "cluster_num"})

In [37]:
d7cn["cluster_num"].value_counts()

1    164
4    155
3    148
0    136
2     91
5     84
6     22
Name: cluster_num, dtype: int64

In [38]:
for i in range(7):
    print("Result of cluster No.{}".format(i))
    print(d7cn[d7cn["cluster_num"]==i]["target"].value_counts())
    print("\n")

Result of cluster No.0
religion         75
talk.politics    19
comp             17
rec              12
sci.med           7
misc.forsale      5
sci.space         1
Name: target, dtype: int64


Result of cluster No.1
comp             55
talk.politics    49
religion         21
rec              20
sci.med           9
misc.forsale      5
sci.space         5
Name: target, dtype: int64


Result of cluster No.2
rec              32
comp             31
talk.politics     9
sci.space         9
religion          6
misc.forsale      2
sci.med           2
Name: target, dtype: int64


Result of cluster No.3
comp             73
rec              24
talk.politics    22
sci.space        10
religion          7
misc.forsale      7
sci.med           5
Name: target, dtype: int64


Result of cluster No.4
comp             84
rec              20
misc.forsale     14
sci.med          12
sci.space        11
religion          7
talk.politics     7
Name: target, dtype: int64


Result of cluster No.5
rec              

In [46]:
def labels_array(df):
    return df['target'].unique() 


def max_n_ij(df, labels_array, cluster_num):
    df_cn = df.loc[df["cluster_num"]==cluster_num]
    
    precision_list = []
    for j, label in enumerate(labels_array):
        intersection_size = len(df_cn.loc[df["target"]==label])
        precision_list.append(intersection_size)

    return np.array(precision_list).max()


def max_recall(df, i, labels_array):
    recall_list = []
    for label in labels_array:
        df_label = df.loc[df["target"]==label]
        df_label_size = len(df_label)

        intersection_size = len(df_label.loc[df_label['cluster_num']==i])
        
        recall_list.append(intersection_size/df_label_size)
    
    return np.array(recall_list).max()


def purity(df, labels_array, num_clusters):
    n = len(df)

    purity_part_list = []
    for i in range(num_clusters):
        max_n_ij_val = max_n_ij(df, labels_array, i)
        purity_part_list.append(max_n_ij_val)

    return sum(purity_part_list) / n


def inverse_purity(df, labels_array, num_clusters):
    n = len(df)

    inverse_purity_part_list = []

    for i in range(num_clusters):
        max_recall_val = max_recall(df, i, labels_array)
        cluster_size = len(df.loc[df['cluster_num']==i])
        inverse_purity_part_list.append(cluster_size * max_recall_val)

    return sum(inverse_purity_part_list) / n


def F_measure(purity_val, inverse_purity_val):
    return (2 * purity_val * inverse_purity_val) / (purity_val + inverse_purity_val)


In [43]:
# Purity when num_clusters is 20, 11, 7
purity20 = purity(d20cn, list(d20cn['target'].unique()), 20)
purity11 = purity(d11cn, list(d11cn['target'].unique()), 11)
purity7 = purity(d7cn, list(d7cn['target'].unique()), 7)

In [44]:
# Inverse purity when num_clusters is 20, 11, 7
inverse_purity20 = inverse_purity(d20cn, list(d20cn['target'].unique()), 20)
inverse_purity11 = inverse_purity(d11cn, list(d11cn['target'].unique()), 11)
inverse_purity7 = inverse_purity(d7cn, list(d7cn['target'].unique()), 7)

In [47]:
# F-measure
F_measure20 = F_measure(purity20, inverse_purity20)
F_measure11 = F_measure(purity11, inverse_purity11)
F_measure7 = F_measure(purity7, inverse_purity7)

In [49]:
# Result
print("The purity when num_cluster is 20 is: ", round(purity20, 4))
print("The purity when num_cluster is 11 is: ", round(purity11, 4))
print("The purity when num_cluster is 7 is: ", round(purity7, 7))

print("The inverse purity when num_cluster is 20 is:", round(inverse_purity20, 4))
print("The inverse purity when num_cluster is 11 is:", round(inverse_purity11, 4))
print("The inverse purity when num_cluster is 7 is:", round(inverse_purity7, 4))

print("The F-measure when num_cluster is 20 is:", round(F_measure20, 4))
print("The F-measure when num_cluster is 11 is:", round(F_measure11, 4))
print("The F-measure when num_cluster is 7 is:", round(F_measure7, 4))

The purity when num_cluster is 20 is:  0.24
The purity when num_cluster is 11 is:  0.3137
The purity when num_cluster is 7 is:  0.4725
The inverse purity when num_cluster is 20 is: 0.2958
The inverse purity when num_cluster is 11 is: 0.4263
The inverse purity when num_cluster is 7 is: 0.3665
The F-measure when num_cluster is 20 is: 0.265
The F-measure when num_cluster is 11 is: 0.3615
The F-measure when num_cluster is 7 is: 0.4128
