In [3]:
import os.path
import joblib
# Load the corpus: every line of the input file is treated as one document,
# with surrounding whitespace removed.
with open("dogu.csv", "r", encoding="utf8") as fin:
    raw_documents = [row.strip() for row in fin]
# keep a short snippet of up to 100 characters as a title for each article
snippets = [doc[:100] for doc in raw_documents]
print("Read %d raw text documents" % len(raw_documents))


Read 21879 raw text documents


In [4]:
# Read the custom stop-word list: one entry per line, whitespace stripped.
# It is kept as a plain list and later passed as `stop_words` to the vectorizers.
with open("stopwords.txt", "r", encoding="utf-8") as fin:
    custom_stop_words = [entry.strip() for entry in fin]
print("Stopword list has %d entries" % len(custom_stop_words))

Stopword list has 764 entries


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Build raw term counts: drop the custom stop words and ignore any term that
# appears in fewer than 20 documents.
vectorizer = CountVectorizer(min_df=20, stop_words=custom_stop_words)
A = vectorizer.fit_transform(raw_documents)
# A.shape is a (documents, terms) tuple, so it can feed both %d fields directly.
print("Created %d X %d document-term matrix" % A.shape)


Created 21879 X 2741 document-term matrix


In [13]:
# Extract the vocabulary learned by the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps `terms` a plain Python list, as the old API returned.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 2741 distinct terms


In [14]:
import joblib
# Persist the count matrix together with its vocabulary and the article snippets.
joblib.dump(value=(A, terms, snippets), filename="articles-raw.pkl")

['articles-raw.pkl']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same preprocessing as the count-based model (custom stop words, terms must
# occur in at least 20 documents), but with TF-IDF weighting.
vectorizer = TfidfVectorizer(min_df=20, stop_words=custom_stop_words)
A = vectorizer.fit_transform(raw_documents)
# A.shape is a (documents, terms) tuple, so it can feed both %d fields directly.
print("Created %d X %d TF-IDF-normalized document-term matrix" % A.shape)

Created 21879 X 2741 TF-IDF-normalized document-term matrix


In [16]:
# extract the resulting vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps `terms` a plain Python list, as the old API returned.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 2741 distinct terms


In [17]:
import operator
def rank_terms( A, terms ):
    """Rank terms by their total weight summed over all rows of ``A``.

    Parameters
    ----------
    A : matrix-like
        Matrix with one column per term. It must support ``A.sum(axis=0)``;
        SciPy sparse matrices/arrays and dense NumPy arrays/matrices all work.
    terms : sequence of str
        ``terms[col]`` is the label for column ``col`` of ``A``.

    Returns
    -------
    list of (term, weight) tuples
        Sorted by weight, largest first. Terms with equal weight keep their
        original relative order.

    Raises
    ------
    IndexError
        If ``terms`` has more entries than ``A`` has columns.
    """
    import numpy as np

    # Column sums come back as a 1 x n matrix for scipy sparse matrices and
    # np.matrix, but as a 1-D array for ndarrays and sparse arrays. Flatten
    # to 1-D so both cases can be indexed by column.
    col_sums = np.asarray(A.sum(axis=0)).ravel()
    # map weights to the terms
    weights = {term: col_sums[col] for col, term in enumerate(terms)}
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

In [18]:
# Show the ten highest-weighted terms, numbered from 01.
ranking = rank_terms(A, terms)
for position, (term, weight) in enumerate(ranking[:10], start=1):
    print("%02d. %s (%.2f)" % (position, term, weight))

01. war (1144.45)
02. palestine (1077.11)
03. israel (1035.62)
04. afghanistan (825.57)
05. syria (585.99)
06. ukraine (487.54)
07. iraq (394.80)
08. russia (367.46)
09. people (350.60)
10. only (320.57)


In [29]:
# Persist the TF-IDF matrix together with its vocabulary and the article snippets.
joblib.dump(value=(A, terms, snippets), filename="articles-tfidf.pkl")

['articles-tfidf.pkl']