In [56]:
import pandas as pd
import nltk
import re
import string
import numpy as np
from nltk import word_tokenize
#nltk.download('stopwords')
#nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [42]:
#---------------STARTING WITH R8----------------------------
##------------BUILDING THE DATASET
X_train = pd.read_csv('datasets/r8-train-all-terms.txt', sep="\t", header=None)
X_test = pd.read_csv('datasets/r8-test-all-terms.txt', sep="\t", header=None)
data_r8=pd.concat([X_train,X_test], ignore_index=True)
data_r8.columns = ["class", "text"]
classes_count = data_r8.groupby('class').count().sort_values(by=['text'],ascending=False)
classes_count

Unnamed: 0_level_0,text
class,Unnamed: 1_level_1
earn,3923
acq,2292
crude,374
trade,326
money-fx,293
interest,271
ship,144
grain,51


In [18]:
##------------PRE-PROCESSING START FROM HERE-------------##
def nlp_preprocessing(text):
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # Replace nweline by some space
    text = text.replace('\r\n', ' ').replace('\n', ' ')
    word_tokens = word_tokenize(text)  # n_rows 1971
    stems = ''
    for word in word_tokens:
        stemed_word = stemmer.stem(word)
        if ((stemed_word not in stopwords) and (re.search('[a-zA-Z]', stemed_word)) and stemed_word.isalpha() and len(stemed_word) > 3):
            stems = stems + ' ' + stemed_word

    return stems[1:]  # to remove the first space of the file

In [38]:
data_r8_processed = data_r8
for index ,row in data_r8_processed.iterrows():
    print(index, end="\r")
    row['text'] = nlp_preprocessing(row['text'])

7673

In [54]:
len(nlp_preprocessing(data_r8.iloc[5]["text"]))
len(data_r8.iloc[5]["text"])

1346

In [52]:
print(len(data_r8_processed.iloc[5]["text"]))
print(len(data_r8.iloc[5]["text"]))

853
1346


In [71]:
##------------CO-CLUSTERING START FROM HERE-------------##
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(np.unique(data_r8_processed['class']))
data = data_r8_processed['text']
labels = list(le.transform(data_r8_processed['class'].tolist()))

In [87]:
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_rand_score as ari
from coclust.coclustering import CoclustInfo, CoclustMod, CoclustSpecMod
from coclust.evaluation.external import accuracy
from scipy.io import loadmat


def execute_coclustering(tf_idf, method, n_clusters, return_pred_rows=True, max_iteration=300):
    global model
    print("---executing ",method)
    if(method=="CoclustInfo"):
        model = CoclustInfo(n_row_clusters=n_clusters, n_col_clusters=n_clusters, n_init=10, max_iter=max_iteration)
    elif(method=="CoclustMod"):
        model = CoclustMod(n_clusters=n_clusters, n_init=10, max_iter=max_iteration)
    elif(method=="CoclustModFuzzy"):
        model = CoclustSpecMod(n_clusters=n_clusters, n_init=10, max_iter=max_iteration)
    model.fit(tf_idf)
    pred_row_labels = model.row_labels_
    pred_column_labels = model.column_labels_
    if(return_pred_rows):
        return  pred_row_labels
    else:
        return pred_column_labels

In [73]:
# Evaluate the results
def clustering_quality(true_row_labels, predicted_row_labels):
    nmi_ = nmi(true_row_labels, predicted_row_labels)
    ari_ = ari(true_row_labels, predicted_row_labels)
    acc_ = accuracy(true_row_labels, predicted_row_labels)
    print("NMI : {}\nARI : {}\nAccuracy : {}".format(nmi_, ari_, acc_))
    return nmi_, ari_, acc_

In [88]:
def execute_clustering_evaluation(raw_data, true_labels, row_labels=True,use_words_thresh=True, max_iteration=300):
    global tfidf_vectorizer
    clustering_eval = []
    n_labels = len(np.unique(true_labels))
    if(use_words_thresh):
        tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=5000, max_df=0.7, min_df=0.001)
    elif(not use_words_thresh):
        tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(raw_data)
    colustering_methods = ["CoclustInfo", "CoclustMod", "CoclustModFuzzy"]
    for algo in colustering_methods:
        pred_labels = execute_coclustering(tfidf_matrix, algo, n_labels, return_pred_rows=row_labels, max_iteration=max_iteration)
        nmi_, ari_, acc_ = clustering_quality(true_labels, pred_labels)
        clustering_eval += [[algo, nmi_, ari_, acc_]]
    return clustering_eval

In [90]:
evaluation_list = execute_clustering_evaluation(data, labels, use_words_thresh=True, max_iteration=100)

---executing  CoclustInfo
NMI : 0.4173487731492575
ARI : 0.2573006538465157
Accuracy : 0.40474328902788637
---executing  CoclustMod
NMI : 0.4226733813737461
ARI : 0.318515137141091
Accuracy : 0.4756320041699244
---executing  CoclustModFuzzy
NMI : 0.47719071468224405
ARI : 0.34927249797899385
Accuracy : 0.6215793588741204


  warn_on_dtype=False, estimator=None)
  warn_on_dtype=False, estimator=None)
  warn_on_dtype=False, estimator=None)


In [89]:
evaluation_df = pd.DataFrame(evaluation_list, columns=[ "method", "nmi", "ari", "acc"])