In [None]:
import sklearn
import numpy as np
import re
import string
import pandas as pd
import nltk
import os
import sys
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from datetime import datetime
from nltk.stem.snowball import SnowballStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances

# Shared English Snowball stemmer used by the tokenizer defined later in the notebook.
stemmer = SnowballStemmer("english")

# Fetch the NLTK resources needed by this notebook (no-op when already present).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence models from 'punkt_tab' instead of
# the pickled 'punkt' package; without it sent_tokenize/word_tokenize raise
# LookupError on a fresh environment. Older releases simply ignore the extra data.
nltk.download('punkt_tab')



In [None]:

def tokenize_and_stem(text):
    """Clean a raw newsgroup post and return its stemmed word tokens.

    Processing order:

    1. drop ``From:``, ``Subject:``, ``Archive-name:``, ``Last-modified:`` and
       ``Version:`` header lines;
    2. lowercase and strip the text, then remove URLs and e-mail addresses;
    3. expand contractions, the abbreviations ``i.e.`` / ``e.g.`` and the
       symbols ``$``, ``&``, ``%`` into words;
    4. strip the remaining ASCII punctuation and all digit runs;
    5. tokenize with NLTK (sentences, then words) and stem every token that
       contains at least one ASCII letter with the module-level ``stemmer``.

    Step 3 has to run *before* step 4: once punctuation is stripped none of the
    contraction / abbreviation / symbol patterns can match any more.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens, in document order.
    """
    # --- 1. header lines ---------------------------------------------------
    text = re.sub(r'(From:\s+[^\n]+\n)', '', text)
    text = re.sub(r'(Subject:[^\n]+\n)', '', text)
    text = re.sub(r'(([\sA-Za-z0-9\-]+)?[A|a]rchive-name:[^\n]+\n)', '', text)
    text = re.sub(r'(Last-modified:[^\n]+\n)', '', text)
    text = re.sub(r'(Version:[^\n]+\n)', '', text)

    # --- 2. normalise case, drop URLs and e-mail addresses --------------------
    text = text.lower()
    text = text.strip()
    re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
    # NOTE(review): kept byte-for-byte. This is a non-raw literal, so Python
    # resolves the \x.. escapes before the regex engine sees the pattern;
    # confirm that is intended before converting it to a raw string.
    re_email = re.compile('(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')
    text = re.sub(re_url, '', text)
    text = re.sub(re_email, '', text)

    # --- 3a. collapse runs of repeated punctuation ----------------------------
    # The marks themselves are stripped in step 4; the point here is the space
    # that keeps the neighbouring words apart ("wow!!!great" -> "wow great").
    text = re.sub(r"!+", "! ", text)
    text = re.sub(r" \.+ ", ". ", text)
    text = re.sub(r" \?+ ", "? ", text)
    text = re.sub(r" \*+ ", "* ", text)
    text = re.sub(r"\ >+", "> ", text)
    text = re.sub(r"\ <+", "< ", text)

    # --- 3b. contractions (ASCII ' and typographic ’ alike) --------------------
    apos = "['’]"
    # "can't" must be handled before the generic "n't" rule, otherwise it
    # becomes "ca not". The text is already lowercase, hence "i'm" not "I'm".
    text = re.sub(rf"\bcan\s?{apos}t\b", "can not", text)
    text = re.sub(rf"\bi{apos}m\b", "i am", text)
    text = re.sub(rf"n{apos}t\b", " not ", text)
    # The look-behind / word-boundary pair restricts these to real suffixes so
    # that an opening quote ('something') is left alone.
    text = re.sub(rf"(?<=\w){apos}s\b", " ", text)
    text = re.sub(rf"(?<=\w){apos}ve\b", " have ", text)
    text = re.sub(rf"(?<=\w){apos}re\b", " are ", text)
    text = re.sub(rf"(?<=\w){apos}ll\b", " will ", text)
    text = re.sub(rf"(?<=\w){apos}d\b", " would ", text)

    # --- 3c. abbreviations and symbols ---------------------------------------
    text = re.sub(r"\bi\.e\.", "id est ", text)
    text = re.sub(r"\be\.g\.", " for example ", text)
    text = re.sub("e- mail ", " email ", text)
    text = re.sub(r"\$", " dollar ", text)
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"%", " percent ", text)

    # --- 4. strip what is left of punctuation, digits, extra whitespace -------
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # --- 5. tokenize (sentence -> word), keep alphabetic tokens, stem ---------
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    return [stemmer.stem(token) for token in tokens if re.search('[a-zA-Z]', token)]

In [None]:
def splite_data(data,target,size):
    """Split ``data`` and ``target`` into train/test parts and print a short report.

    ``size`` is forwarded to ``train_test_split`` as ``test_size``; the split is
    seeded with ``random_state=42``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_docs, test_docs, train_labels, test_labels = train_test_split(
        data, target, test_size=size, random_state=42
    )

    # Report the requested proportions and the resulting sample counts.
    train_percent = (1-size)*100
    test_percent = size*100
    print("Data Train", train_percent, '%#################', "Data Test", test_percent, '%')
    print('Train Size:', len(train_docs), 'Test Size:', len(test_docs))
    print('#####################################')

    return train_docs, test_docs, train_labels, test_labels

In [None]:
# Load the complete 20 Newsgroups corpus (train + test subsets), shuffled with a fixed seed.
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

# Build the 80/20, 70/30 and 60/40 train/test splits, in that order.
(
    (X_train_80, X_test_20, y_train_80, y_test_20),
    (X_train_70, X_test_30, y_train_70, y_test_30),
    (X_train_60, X_test_40, y_train_60, y_test_40),
) = [
    splite_data(dataset.data, dataset.target, test_fraction)
    for test_fraction in (0.20, 0.30, 0.40)
]

In [None]:
def clusters_visualization(PCA,clusters,group,dataset,test_type):
    """Print per-cluster purity and draw a 3-D scatter plot of the clustered documents.

    Each cluster is plotted as one series whose legend entry is the name of the
    news group that occurs most often inside that cluster.

    Parameters
    ----------
    PCA : 2-D array, read as ``PCA[:, 0]``, ``PCA[:, 1]``, ``PCA[:, 2]``
        Projected coordinates; the first three columns become x, y and z.
        (Inside this function the name shadows sklearn's ``PCA`` class; it is
        kept unchanged for caller compatibility.)
    clusters : sequence
        Cluster label of every document.
    group : sequence of int
        True news-group index of every document; used to index
        ``dataset.target_names``.
    dataset : object exposing ``target_names``
        Source of the news-group names.
    test_type : str
        Caption printed at the top of the report.

    Returns
    -------
    None
    """
    print("Cluster Type",test_type)

    # One row per document: coordinates, assigned cluster and true news group.
    frame = pd.DataFrame(dict(x=PCA[:,0], y=PCA[:, 1], z=PCA[:, 2], cluster=clusters, news_group=group, title=[dataset.target_names[i] for i in group]))

    # Number of documents per (cluster, news_group) pair, then regrouped by
    # cluster with the news-group id left in the index so idxmax() returns it.
    label_counts = frame.groupby(['cluster', 'news_group'])['news_group'].size()
    counts_by_cluster = label_counts.reset_index(level='cluster').groupby('cluster')['news_group']
    dominant_group = counts_by_cluster.idxmax()  # cluster -> most frequent news-group id

    print ("########External Evaluation IntraCluster (Purity)#########")

    # Purity = share of a cluster taken by its most frequent news group.
    # Indexed by cluster id so rows stay meaningful when a cluster is absent.
    purity = (counts_by_cluster.max() / counts_by_cluster.sum()).rename('purity')
    print(purity)

    print ("###################################################")

    # set up plot
    fig = plt.figure(figsize=(15, 9))
    ax = fig.add_subplot(111, projection='3d')
    ax.margins(0.07) # Optional, just adds 7% padding to the autoscaling

    # Use dataset.target_names for the legend instead of a duplicated literal list.
    for cluster_id, members in frame.groupby('cluster'):
        ax.plot(members.x, members.y, members.z, marker='o', linestyle='', ms=10,
                label=dataset.target_names[dominant_group.loc[cluster_id]], mec='none')

    # Axis styling is independent of the cluster, so apply it once.
    # tick_params expects booleans: the string 'off' is truthy and would leave
    # the ticks switched on.
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # tick labels are off
    ax.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,         # ticks along the left edge are off
        labelleft=False)    # tick labels are off

    ax.legend(numpoints=1)  #show legend with only 1 point

    plt.show() #show the plot
    plt.close(fig)

In [None]:
def _project_to_3d(sparse_matrix, random_state):
    """Return a fresh 3-component PCA projection of ``sparse_matrix``."""
    # Densify once: PCA needs a dense array and the matrix is large.
    dense = sparse_matrix.toarray()
    return PCA(n_components=3, random_state=random_state).fit_transform(dense)


def compute_KMEAN_cluster(X_train, X_test, y_train, y_test,num_clusters,dataset,random_state=42):
    """Cluster TF-IDF vectors with K-Means and report on the train and test sets.

    The vectorizer and K-Means model are fitted on ``X_train`` only. Train
    documents use the fitted labels, test documents use ``kmeans.predict``.
    Each set is projected to 3-D with its own PCA and passed to
    ``clusters_visualization``. Finally the pairwise Euclidean distances between
    the cluster centres are summarised (max / mean / min).

    Parameters
    ----------
    X_train, X_test : sequence of str
        Raw documents.
    y_train, y_test : sequence of int
        True news-group indices, forwarded to ``clusters_visualization``.
    num_clusters : int
        Number of K-Means clusters.
    dataset : object exposing ``target_names``
        Forwarded to ``clusters_visualization``.
    random_state : int or None, default 42
        Seed for K-Means initialisation and for PCA, so repeated runs give the
        same clusters and plots. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    None
    """
    # ---- Train ---------------------------------------------------------------
    # NOTE(review): stop_words='english' is combined with a tokenizer that emits
    # stems, so some stop words may not match their stemmed form - confirm.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.55, max_features=20000,
                                 min_df=1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem)

    tfidf_train = tfidf_vectorizer.fit_transform(X_train)

    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    n_init=1, random_state=random_state).fit(tfidf_train)
    train_clusters = kmeans.labels_.tolist()

    clusters_visualization(_project_to_3d(tfidf_train, random_state), train_clusters, y_train, dataset,
                           'Train'+' & clusters ('+str(num_clusters)+')' )

    # ---- Test ----------------------------------------------------------------
    tfidf_test = tfidf_vectorizer.transform(X_test)
    test_clusters = kmeans.predict(tfidf_test)

    # The test set gets its own PCA fit, exactly as before; the axes of the
    # train and test plots are therefore not directly comparable.
    clusters_visualization(_project_to_3d(tfidf_test, random_state), test_clusters, y_test, dataset,
                           'Test'+' & clusters ('+str(num_clusters)+')' )

    # ---- InterCluster Distance ('Distance Between Clusters') -----------------
    centre_distances = euclidean_distances(kmeans.cluster_centers_)
    # Strict upper triangle = every unordered pair of centres exactly once.
    tri_dists = centre_distances[np.triu_indices(num_clusters, 1)]
    print("InterCluster Distance  ('Distance Between Clusters')")
    if tri_dists.size:
        print( "Complete Linkage Distance",tri_dists.max())
        print( "Average Linkage Distance", tri_dists.mean())
        print( "Single Linkage Distance", tri_dists.min())
    else:
        # A single cluster has no pairs; max()/min() would raise on an empty array.
        print("Not defined for fewer than two clusters")
    print("InterCluster Distance  ('Distance Between Clusters') END")

In [None]:
# Run the full clustering report for every cluster count on each of the
# 80/20, 70/30 and 60/40 splits (in that order).
for cluster in [3,6,12,20]:
    for split in (
        (X_train_80, X_test_20, y_train_80, y_test_20),
        (X_train_70, X_test_30, y_train_70, y_test_30),
        (X_train_60, X_test_40, y_train_60, y_test_40),
    ):
        compute_KMEAN_cluster(*split, cluster, dataset)