In [1]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from string import punctuation
import re
import nltk
import urllib

# Extracting 'technewsworld' post data
extracted_text = []
for limit in range(0, 501, 25):
    url = 'http://www.technewsworld.com/perl/archives/tnw/?init=%s' % limit
    print("Fetching URL: %s" %url)
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page,"html.parser" )
    div_block = soup.select("div.teaser")
    for index in range(len(div_block)):
        extracted_text.append(div_block[index].text.replace("[More...]",""))

Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=0
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=25
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=50
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=75
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=100
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=125
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=150
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=175
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=200
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=225
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=250
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=275
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=300
Fetching URL: http://www.technewsworld.com/perl/archives/tnw/?init=325
Fetching UR

In [2]:
# TF-IDF is used to give important to the rare words than the frequently occuring words. To understand more about the TF-IDF, watch this video 'https://www.youtube.com/watch?v=4vT4fzjkGCQ'
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df and min_df are used to set up threshold to the words that needs to be considered. To understand more about these attributes, read this 'https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer/35615151#35615151'
vectorizer = TfidfVectorizer(max_df=0.75, min_df=2, stop_words='english')
# If you print X, you will see XxY which means X posts and Y unique words found in all the posts
X = vectorizer.fit_transform(extracted_text)

In [3]:
number_of_clusters = 4

km = KMeans(n_clusters = number_of_clusters, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)

Initialization complete
Iteration  0, inertia 994.763
Iteration  1, inertia 509.738
Iteration  2, inertia 509.075
Iteration  3, inertia 508.653
Iteration  4, inertia 508.386
Iteration  5, inertia 508.208
Iteration  6, inertia 508.041
Iteration  7, inertia 507.822
Iteration  8, inertia 507.659
Iteration  9, inertia 507.528
Iteration 10, inertia 507.420
Iteration 11, inertia 507.375
Iteration 12, inertia 507.344
Converged at iteration 12: center shift 0.000000e+00 within tolerance 4.082702e-08


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [4]:
from scipy.stats import itemfreq

# To check how many posts each cluster have
itemfreq(km.labels_)

array([[  0, 131],
       [  1,  90],
       [  2, 173],
       [  3, 131]], dtype=int64)

In [5]:
# Categorising text to clusters
text_categorised ={}

for i,cluster in enumerate(km.labels_):
    text = extracted_text[i]
    if cluster not in text_categorised.keys():
        text_categorised[cluster] = text
    else:
        text_categorised[cluster] += text

In [6]:
# Buidling 'filter words' to remove unnecessary words that causes noise in the result

special_words = ["''", "``", "--", "'ve", "'m", "'re", 'g', "'s", 'even', "using", "however", "year", "could", 'really', 'recently', 'actually']
filter_words = set(stopwords.words('english') + list(punctuation) + special_words)

In [7]:
# Tokenising text in each cluster and removing the 'filter words'. Finally in 'counts' frequency distribution of each word in cluster is stored and in 'keywords' top 100 words in the cluster is stored 
keywords = {}
counts={}

for cluster in range(number_of_clusters):
    word_sent = word_tokenize(text_categorised[cluster].lower())
    word_sent = [word for word in word_sent if ((word not in filter_words) and (not(re.match(r'^[0-9]+$', word))))]
    freq = FreqDist(word_sent)
    keywords[cluster] = sorted(freq, key=freq.get, reverse=True)[:100]
    counts[cluster] = freq

In [8]:
# Storing 10 unique cluster specific keywords in each cluster
unique_keys={}

for cluster in range(number_of_clusters):   
    other_clusters_keywords = set()
    other_clusters = list(range(0, number_of_clusters))
    other_clusters.remove(cluster)
    for other_cluster in other_clusters:
        other_clusters_keywords.update(keywords[other_cluster])
    unique = set(keywords[cluster]) - other_clusters_keywords
#     print("For Cluster %s, unique keywords are %s" %(cluster, unique))
    unique_keys[cluster] = sorted(unique, key=counts[cluster].get, reverse=True)[:10]

In `unique_keys`, we can know that

1st cluster data is about **Video Games**,

2nd cluster data is about **Mobile Phones**,

3rd cluster data is about **Politics**, and

4th cluster data is about **Aritificial Intelligence and Machine Learning**

In [9]:
unique_keys

{0: ['video',
  'app',
  'game',
  'twitter',
  'nintendo',
  'accounts',
  'ios',
  'gaming',
  'videos',
  'service'],
 1: ['iphone',
  'samsung',
  'galaxy',
  'device',
  'smartphone',
  'phone',
  'pixel',
  'display',
  'design',
  'touch'],
 2: ['government',
  'internet',
  'trump',
  'presidential',
  'think',
  'russian',
  'companies',
  'president',
  'much',
  'likely'],
 3: ['microsoft',
  'group',
  'amazon',
  'intelligence',
  'echo',
  'artificial',
  'research',
  'nvidia',
  'ai',
  'machine']}

In [10]:
# Naive Bayes Classifier algorithm is used to train the obtained data
from sklearn.naive_bayes import GaussianNB

# To check whether the data is correctly classified, we use sample_text
sample_text = "Machine learning is awesome"
test = vectorizer.transform([sample_text])
model = GaussianNB()
model.fit(X.todense(), km.labels_)
model.predict(test.toarray())

array([3])