In [1]:
# for operations on data
import pandas as pd
import numpy as np
# for regexp
import re
# for nlp
import nltk
# for stopwords and punctuations
from string import punctuation
from nltk.corpus import stopwords
# tokenizing words
from nltk import word_tokenize
# for lemmatizing and stemming of text
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# for word normalizing 
from sklearn.feature_extraction.text import TfidfVectorizer
# our model
from sklearn.cluster import KMeans
# our score
from sklearn.metrics import silhouette_score

In [2]:
# loaded the data
df = pd.read_csv('news.csv')

In [3]:
# combined the headline and text column
df['full_news'] = df['headline'] + ' ' + df['text']

In [4]:
# these are unwanted words and stopwords
unwanted_words = ['/br', '--', "'s", "'ve", "'re", "'ll", "'ly"]
stop_list = stopwords.words('english')
# print(stop_list)
# print(list(punctuation))

In [5]:
# this function tokenize the string
def customTokenizer(text): 
    temp_list = []
    # break into words
    tokens = word_tokenize(text)
    # remove stopwords and punctuations
    tokens = [w for w in tokens if w.lower() not in (stop_list + list(punctuation) + unwanted_words)]
    # remove digits and floats
    for token in tokens:
        # getting rid of digits
        if re.search(r'[a-zA-Z]', token):
            temp_list.append(token)
    temp_list = remove_wierd(temp_list)
    temp_list = my_lemmatizer(temp_list)
    return temp_list

In [6]:
# this lemmatizer function lemmatize and stem the corresponding words
def my_lemmatizer(tokens):
    ''' this lemmatizer will lemmatize all the words including
        adjectives, adverb, verbs and noun.
    '''
    # lemmatizing
    wnl = WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(w) for w in tokens]
    
    # stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(w) for w in lemmatized_tokens]
    
    return stemmed_tokens

In [7]:
# this block of code removes the wierd ? symbol from the words
def remove_wierd(tokens):
    temp_list = []
    for token in tokens:
        try:
            temp = []
            for c in list(token.encode('utf-8')):
                if c < 128:
                    temp.append(chr(c))
            temp_list.append(''.join(temp))
        except Exception as e:
            print(e, 'word:', token)
    return temp_list

In [8]:
# this function is for finding best tfidf parameters for the data
def find_parameters(corpus, max_clusters, max_ngrams):
    best_prams = tuple()
    best_score = 0
    not_allowed_prams = []
    for max_ng in range(2, max_ngrams):
        for mdf in [0.8, 0.9]:
            for midf in [0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59]:
                # convert features into tfidfs
                try:
                    tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,max_ng), max_df=mdf, min_df=midf, use_idf=True, tokenizer=customTokenizer, max_features=200000)
                    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
                except Exception as e:
                    print(f'{e} : not allowed ({max_ng}, {mdf}, {midf})')
                    not_allowed_prams.append((max_ng, mdf, midf))
                    continue
                # clustering
                min_c = 29
                for clusters in range(min_c, max_clusters):
                    kmeans = KMeans(n_clusters=clusters, random_state=0)
                    kmeans.fit_predict(tfidf_matrix)
                    score = silhouette_score(tfidf_matrix, kmeans.labels_)
                    print('cluster no:',clusters, 'score:', score)
                    # updating best parameters
                    if score > best_score:
                        best_score = score
                        best_prams = ((max_ng, mdf, midf), clusters)
                print(f'current best {best_prams}:{best_score}:{max_ng}:{mdf}:{midf}')
    return best_prams, best_score, not_allowed_prams

# Do not Run this cell!!!

In [250]:
# this cell is looking for the best hyper parameters
# the function will check from min_c to 33 clusters 
# change min_c in find_parameters function for checking for more clusters
text_corpus = df['full_news']
max_clusters = 33
max_ngrams = 3

best_prams, best_score, not_allowed_prams = find_parameters(text_corpus, max_clusters, max_ngrams)

  return_n_iter=True)


cluster no: 29 score: 1.0


  return_n_iter=True)


cluster no: 30 score: 1.0


  return_n_iter=True)


cluster no: 31 score: 1.0


  return_n_iter=True)


cluster no: 32 score: 1.0
current best ((2, 0.8, 0.51), 29):1.0:2:0.8:0.51
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.52)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.53)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.54)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.55)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.56)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.57)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.58)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.8, 0.59)


  return_n_iter=True)


cluster no: 29 score: 1.0


  return_n_iter=True)


cluster no: 30 score: 1.0


  return_n_iter=True)


cluster no: 31 score: 1.0


  return_n_iter=True)


cluster no: 32 score: 1.0
current best ((2, 0.8, 0.51), 29):1.0:2:0.9:0.51
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.52)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.53)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.54)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.55)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.56)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.57)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.58)
After pruning, no terms remain. Try a lower min_df or a higher max_df. : not allowed (2, 0.9, 0.59)


In [9]:
# these are the best parameters
print(best_prams)

NameError: name 'best_prams' is not defined

In [10]:
# ((2, 0.8, 0.51), 29)
# min_df = 0.51 will give a warning try min_df = 0.5 
# cleaned text is vectorized with tfidfvectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2), max_df=0.8, min_df=0.51, use_idf=True, tokenizer=customTokenizer, max_features=20000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['full_news'])
# idf = idf.toarray()

In [11]:
# for checking optimal no of clusters
%time kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(tfidf_matrix)
print('cluster no:', 2, 'score:', silhouette_score(tfidf_matrix, kmeans.labels_))

Wall time: 0 ns
cluster no: 2 score: 1.0


In [12]:
# these are the labels corresponding to the ids
kmeans.labels_

array([1, 0, 1, ..., 0, 0, 1])

In [289]:
# this is a breif description of the pos tags available in nltk
'''
Description
1.	CC	Coordinating conjunction
2.	CD	Cardinal number
3.	DT	Determiner
4.	EX	Existential there
5.	FW	Foreign word
6.	IN	Preposition or subordinating conjunction
7.	JJ	Adjective
8.	JJR	Adjective, comparative
9.	JJS	Adjective, superlative
10.	LS	List item marker
11.	MD	Modal
12.	NN	Noun, singular or mass
13.	NNS	Noun, plural
14.	NNP	Proper noun, singular
15.	NNPS	Proper noun, plural
16.	PDT	Predeterminer
17.	POS	Possessive ending
18.	PRP	Personal pronoun
19.	PRP$	Possessive pronoun
20.	RB	Adverb
21.	RBR	Adverb, comparative
22.	RBS	Adverb, superlative
23.	RP	Particle
24.	SYM	Symbol
25.	TO	to
26.	UH	Interjection
27.	VB	Verb, base form
28.	VBD	Verb, past tense
29.	VBG	Verb, gerund or present participle
30.	VBN	Verb, past participle
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
35.	WP$	Possessive wh-pronoun
36.	WRB	Wh-adverb
'''

# enter a pos_tag for more info
tag = ''
nltk.help.upenn_tagset(tag)

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

# Making Files For Submission

In [20]:
# making files for submission
np.savetxt('submission_file_4.txt', tfidf_matrix.toarray(), delimiter=" ")

In [18]:
new_df = df.drop(columns=['headline', 'text', 'full_news'])
new_df['cluster'] = kmeans.labels_
new_df.head()

Unnamed: 0,id,cluster
0,uid-1,1
1,uid-2,0
2,uid-3,1
3,uid-4,1
4,uid-5,0


In [19]:
new_df.to_csv('submission_csv_4.csv', index=False)