In [2]:
# Shell escape: install the stanza package from inside the notebook (no version is pinned).
!pip install stanza

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import gensim
from gensim.parsing.preprocessing import remove_stopwords
import re

In [3]:
import stanza
# Download the English ('en') stanza resources.
stanza.download('en')
# Pipeline restricted to tokenisation and lemmatisation, with GPU use requested.
# NOTE(review): pos_batch_size looks like an option for the POS processor, which is
# not listed in `processors` -- confirm whether it has any effect here.
nlp = stanza.Pipeline(lang='en',use_gpu=True, processors='tokenize, lemma',pos_batch_size=3000)

In [5]:
# Directory whose files are listed and read as the documents of the corpus.
DATA_DIR = '../input/article-classification-assignment/dataset'

In [6]:
# Names of all entries found directly inside DATA_DIR.
files = os.listdir(DATA_DIR)
# Empty frame with one column for the raw text and one for the preprocessed text.
data = pd.DataFrame(columns=['text','clean_text'])

In [8]:
# Read every listed file as raw bytes and decode it leniently: undecodable
# bytes become U+FFFD instead of aborting the whole load.
records = []
for file_name in files:
    file_path = os.path.join(DATA_DIR, file_name)
    with open(file_path, 'rb') as f:
        records.append({'text': f.read().decode(errors='replace')})

# Add all rows in a single step. DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every iteration.
# Columns missing from the records (clean_text) are left as NaN.
new_rows = pd.DataFrame(records, columns=data.columns)
data = new_rows if data.empty else pd.concat([data, new_rows], ignore_index=True)

In [9]:
def clean_document(text):
    """Normalise quotes and line breaks in a raw document.

    Every double quote is turned into a single quote, every newline into a
    space, and leading/trailing whitespace is stripped. Existing single
    quotes are left as they are.
    """
    # A single translation pass performs both character substitutions.
    substitutions = str.maketrans({'"': "'", '\n': ' '})
    return text.translate(substitutions).strip()

# Normalise quotes and newlines of every raw document in place of the original text.
data['text'] = data['text'].apply(clean_document)

In [12]:
def preprocess_text(text):
    """Lemmatise a document, drop stop words, and strip punctuation.

    The text is run through the module-level stanza pipeline ``nlp``; the
    lemma of every word is joined with single spaces, stop words are removed
    with gensim's ``remove_stopwords``, every character that is neither a
    word character nor whitespace is deleted, and runs of whitespace are
    collapsed to single spaces.
    """
    parsed = nlp(text)
    lemmas = []
    for sentence in parsed.sentences:
        for token in sentence.words:
            lemmas.append(token.lemma)
    without_stopwords = remove_stopwords(' '.join(lemmas))
    # Delete punctuation, then split/join to squeeze the whitespace left behind.
    without_punctuation = re.sub(r'[^\w\s]','',without_stopwords)
    return ' '.join(without_punctuation.split())

# Store the lemmatised, stop-word-free, punctuation-free version of each document.
data['clean_text'] = data['text'].apply(preprocess_text)

In [None]:
# The original documents and their preprocessed text were merged into a single DataFrame.

In [15]:
# Load the table used for the rest of the analysis (the cells below read its
# 'text' and 'clean_text' columns).
# NOTE(review): this reads a separate CSV rather than the `data` frame built
# above -- presumably a saved copy of it; confirm.
df = pd.read_csv('../input/preprocessedsl/final.csv')
# Preview the first rows.
df.head()

In [16]:
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Splitting is done on any run of whitespace, so repeated spaces, tabs and
    leading/trailing blanks never produce empty "words"; an empty or blank
    string counts as 0. Non-string input (for example the NaN that pandas
    yields for an empty CSV cell) also counts as 0 instead of raising.
    """
    if not isinstance(text, str):
        return 0
    # split() without an argument discards empty tokens, unlike split(' ').
    return len(text.split())

def count_distinct_words(text):
    """Return the number of distinct whitespace-separated words in ``text``.

    Splitting is done on any run of whitespace, so the empty string is never
    counted as a word; an empty or blank string gives 0. Non-string input
    (for example the NaN that pandas yields for an empty CSV cell) also
    gives 0 instead of raising. Comparison is case-sensitive.
    """
    if not isinstance(text, str):
        return 0
    # split() without an argument discards empty tokens, unlike split(' ').
    return len(set(text.split()))

In [19]:
# Total word counts of the raw and the preprocessed text.
df['original_len'] = df['text'].apply(count_words)
df['processed_len'] = df['clean_text'].apply(count_words)
# Distinct word counts of the raw and the preprocessed text.
df['original_unique_len'] = df['text'].apply(count_distinct_words)
df['processed_unique_len'] = df['clean_text'].apply(count_distinct_words)

In [22]:
# Turn the cleaned documents into a TF-IDF matrix with one row per document.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['clean_text'])
# NOTE(review): TfidfVectorizer's default norm is already 'l2', so this extra
# row normalisation looks redundant -- confirm.
X_normalized = preprocessing.normalize(X,norm='l2')
# For unit-length rows, 2 - 2*cos(a, b) equals the squared Euclidean distance ||a - b||^2.
# NOTE(review): this builds an n_docs x n_docs matrix, and neither X_normalized
# nor X_adjusted is used by the cells visible below, which cluster on X.
X_adjusted = 2 - 2*cosine_similarity(X_normalized)

In [37]:
# Elbow search: fit K-Means on the TF-IDF matrix for k = 5..20 and record the
# inertia of each fit.
n_clusters = np.arange(5,21)
inertia = []
for n_cluster in n_clusters:
    # K-Means starts from random centroids; a fixed seed makes the curve
    # reproducible from one run of the notebook to the next.
    model = KMeans(n_clusters=n_cluster, random_state=42)
    model.fit(X)
    inertia.append(model.inertia_)

In [38]:
# Elbow plot. The x values reuse n_clusters from the search cell instead of
# repeating the range literal, so the two cannot drift apart.
plt.plot(n_clusters, inertia)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('K-Means elbow curve')
plt.show()

In [43]:
# Final clustering with 15 clusters (presumably chosen from the elbow plot --
# confirm). The fixed seed keeps the cluster assignment stable across runs.
model = KMeans(n_clusters=15, random_state=42)
model.fit(X)
# Attach each document's cluster index to its row.
df['label'] = model.labels_

In [46]:
# Draw one word cloud per cluster from the cleaned text of its documents.
# Iterating over the labels actually present avoids hard-coding the cluster count.
for num_cluster in sorted(df['label'].unique()):
    # Dedicated name, so the `data` DataFrame built by the loading cells is
    # not overwritten by this loop.
    cluster_docs = df[df['label'] == num_cluster]
    # Keep tokens of two or more characters. A filtering comprehension is used
    # because removing items from a list while iterating over it skips the
    # element after each removal, which let some one-character tokens through.
    tokens = [
        word
        for doc in cluster_docs['clean_text']
        for word in doc.split()
        if len(word) >= 2
    ]
    words = ' '.join(tokens)

    print(f'---------------------Cluster Number: {num_cluster}-----------------------')
    if not words:
        # WordCloud.generate raises ValueError when there is nothing to plot.
        print('No words to plot for this cluster.')
        continue

    wordcloud = WordCloud(width = 600, height = 600,
                background_color ='white',
                min_font_size = 10).generate(words)
    plt.figure(figsize = (6, 6), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 2)
    plt.show()